##### Bag-of-words

In [1]:
sent1 = 'they are playing on the ground from four hours'
sent2 = 'I dont know for how many hours they will be playing'

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [5]:
cv = CountVectorizer()

In [6]:
x_new = cv.fit_transform([sent1, sent2])

In [9]:
x_new

<2x16 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [11]:
df = pd.DataFrame(data=x_new.toarray(), columns=cv.get_feature_names_out())

In [12]:
df

Unnamed: 0,are,be,dont,for,four,from,ground,hours,how,know,many,on,playing,the,they,will
0,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0
1,0,1,1,1,0,0,0,1,1,1,1,0,1,0,1,1


##### Continuous Bag-of-Words

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
lines = ['It was a nice rainy day.','The things are so beatiful in his point.',
         'When your focus is clear, you won.','Many many happy returns of the day.']

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

In [38]:
tokenizer.word_docs;

In [39]:
tokenizer.word_index;

In [40]:
mat = tokenizer.texts_to_matrix(lines)
mat;

In [41]:
seq =  tokenizer.texts_to_sequences(lines)
seq;

In [42]:
padded = pad_sequences(seq, maxlen=10, padding= 'pre' )
padded;

In [43]:
# Dataset: twitter-sentiments.csv
# Location: https://mitu.co.in/dataset

In [44]:
# Importing libraries
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import re
import numpy as np

In [45]:
# Load the tweet dataset. Column names are supplied explicitly via `names`,
# so the CSV is presumably header-less — TODO confirm against the file.
# NOTE(review): the dataset note in the earlier cell spells the file
# "twitter-sentiments.csv" (hyphen) while this call reads
# "twitter_sentiments.csv" (underscore) — confirm which name is correct.
data = pd.read_csv("twitter_sentiments.csv", names = ['id','loc','label','text'])

In [46]:
data.shape

(74682, 4)

In [47]:
data

Unnamed: 0,id,loc,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [48]:
data.dtypes

id        int64
loc      object
label    object
text     object
dtype: object

In [49]:
# Any missing tweet is NaN; a bare astype(str) would turn it into the literal
# word "nan", which would then be cleaned and tokenized as if it were real text.
# Replace missing values with an empty string before casting.
data['text'] = data['text'].fillna('').astype(str)

In [52]:
# Text cleaning (consider implementing more advanced cleaning techniques)
def clean_text(text):
    """Normalise a raw tweet before tokenization.

    The text is lowercased, then every run of characters other than ASCII
    letters (digits, punctuation, whitespace, symbols) is collapsed into a
    single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading and trailing spaces produced by the
        substitution are not stripped.
    """
    lowered = text.lower()
    # The pattern keeps letters only, so digits are removed as well.
    return re.sub(r"[^a-zA-Z]+", " ", lowered)

In [55]:
data["text"] = data["text"].apply(clean_text)

In [56]:
data

Unnamed: 0,id,loc,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
...,...,...,...,...
74677,9200,Nvidia,Positive,just realized that the windows partition of my...
74678,9200,Nvidia,Positive,just realized that my mac window partition is ...
74679,9200,Nvidia,Positive,just realized the windows partition of my mac ...
74680,9200,Nvidia,Positive,just realized between the windows partition of...


In [57]:
# Feature and target preparation
comments = data["text"].tolist()
targets = data['label'].values

In [58]:
np.unique(targets)

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [59]:
# Tokenization and padding
# NOTE: this rebinds `tokenizer`, replacing the demo tokenizer that was fitted
# on `lines` in the earlier section.
# num_words=5000 has to stay in sync with the Embedding input size used in the
# model definition cell.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
# Every sequence is brought to length 200, giving an (n_comments, 200) array;
# the same maxlen must be used when padding new text at prediction time.
padded_sequences = pad_sequences(sequences, maxlen=200)

In [61]:
padded_sequences.shape

(74682, 200)

In [64]:
# Output data preparation
from sklearn.preprocessing import LabelEncoder
# Imported from tensorflow.keras (not standalone keras) so it comes from the
# same Keras namespace as the Tokenizer, layers and model used in this notebook.
from tensorflow.keras.utils import to_categorical

# Map the string labels to integer class ids. `le` is kept so predictions can
# be mapped back to label names with inverse_transform.
le = LabelEncoder()
y = le.fit_transform(targets)

# One-hot encode for categorical_crossentropy. num_classes is pinned to the
# encoder's class count so the width always matches the set of known labels.
y_new = to_categorical(y, num_classes=len(le.classes_))

In [63]:
y_new

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [66]:
from collections import Counter
Counter(targets)

Counter({'Negative': 22542,
         'Positive': 20832,
         'Neutral': 18318,
         'Irrelevant': 12990})

In [69]:
# Train-test split / cross validation
# random_state fixes the shuffle so the split (and the reported accuracy) is
# repeatable across kernel restarts. stratify uses the integer labels `y` so
# the uneven class proportions are the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y_new, test_size=0.2, random_state=42, stratify=y)

In [70]:
X_train.shape

(59745, 200)

In [71]:
X_test.shape

(14937, 200)

In [72]:
# Model definition (customize architecture as needed)
model = Sequential()
# input_dim (5000) must match Tokenizer(num_words=5000) used to build the
# sequences. The `input_length` argument is deprecated in Keras 3 and only
# triggers a warning there, so it is omitted; the sequence length is taken
# from the data on the first fit/predict call.
model.add(Embedding(5000, 128))
model.add(LSTM(64))
# Single-label, 4-class output: softmax produces one probability distribution
# over the four label-encoded classes (paired with categorical_crossentropy).
model.add(Dense(4, activation="softmax"))



In [73]:
# Model compilation
model.compile(loss="categorical_crossentropy", 
              optimizer="adam", metrics=["accuracy"])

In [75]:
# Model training
model.fit(X_train, y_train, epochs=3, batch_size=32, 
          validation_data=(X_test, y_test))

Epoch 1/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 85ms/step - accuracy: 0.7043 - loss: 0.7538 - val_accuracy: 0.7484 - val_loss: 0.6651
Epoch 2/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 84ms/step - accuracy: 0.7973 - loss: 0.5283 - val_accuracy: 0.7835 - val_loss: 0.5731
Epoch 3/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 83ms/step - accuracy: 0.8504 - loss: 0.3936 - val_accuracy: 0.8104 - val_loss: 0.5210


<keras.src.callbacks.history.History at 0x7193c83390f0>

In [82]:
# Prediction on unseen comment (replace 'new_comment' with your actual comment)
new_comment = "I hate him."
# Apply the same cleaning function, fitted tokenizer and padding length (200)
# that were used for the training data.
new_sequence = tokenizer.texts_to_sequences([clean_text(new_comment)])
padded_new_sequence = pad_sequences(new_sequence, maxlen=200)
# A single comment was passed in, so take the first (only) row of class scores.
prediction = model.predict(padded_new_sequence)[0]
# Map the highest-scoring class index back to its original string label.
le.inverse_transform([np.argmax(prediction)])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


array(['Negative'], dtype=object)