In [1]:
# Two toy sentences for the bag-of-words demo; they share the words "it" and "good".
sent1, sent2 = (
    'It is good for our progress',
    'They have decided that it was not good',
)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Bag-of-words vectorizer created with its default settings.
cvt = CountVectorizer()

In [4]:
# Learn the vocabulary from both sentences and build the per-sentence term-count matrix.
vect = cvt.fit_transform([sent1, sent2])

In [5]:
import pandas as pd

In [8]:
# Densify the count matrix and label each column with its vocabulary term,
# so every row reads as "how often does each word occur in this sentence".
df = pd.DataFrame(
    data=vect.toarray(),
    columns=cvt.get_feature_names_out(),
)
df

Unnamed: 0,decided,for,good,have,is,it,not,our,progress,that,they,was
0,0,1,1,0,1,1,0,1,1,0,0,0
1,1,0,1,1,0,1,1,0,0,1,1,1


##### Continuous Bag of Words

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Small toy corpus for the Keras Tokenizer demo. The strings are kept verbatim
# (typos included) because the vocabulary shown further down is built from them.
lines = ['It was a nice rainy day.',
         'The things are so beatiful in his point.',
         'When your focus is clear, you won.',
         'Many many happy returns of the day.']

In [16]:
# Naive whitespace split: case and trailing punctuation are kept (e.g. 'It', 'day.').
lines[0].split()

['It', 'was', 'a', 'nice', 'rainy', 'day.']

In [17]:
# Build the word -> integer-id vocabulary from the toy corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

In [20]:
tokenizer.word_counts;

In [19]:
tokenizer.word_index

{'day': 1,
 'the': 2,
 'many': 3,
 'it': 4,
 'was': 5,
 'a': 6,
 'nice': 7,
 'rainy': 8,
 'things': 9,
 'are': 10,
 'so': 11,
 'beatiful': 12,
 'in': 13,
 'his': 14,
 'point': 15,
 'when': 16,
 'your': 17,
 'focus': 18,
 'is': 19,
 'clear': 20,
 'you': 21,
 'won': 22,
 'happy': 23,
 'returns': 24,
 'of': 25}

In [22]:
# One row per sentence, one column per word id (column 0 stays zero in the output).
# A cell is 1.0 when the word occurs in the sentence, no matter how many times.
mat = tokenizer.texts_to_matrix(lines)
mat

array([[0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [23]:
# Replace every word with its vocabulary id, keeping the original word order.
seq =  tokenizer.texts_to_sequences(lines)
seq

[[4, 5, 6, 7, 8, 1],
 [2, 9, 10, 11, 12, 13, 14, 15],
 [16, 17, 18, 19, 20, 21, 22],
 [3, 3, 23, 24, 25, 2, 1]]

In [24]:
# Left-pad ('pre') each id sequence with zeros so that all rows have length 10.
padded = pad_sequences(seq, maxlen=10, padding= 'pre' )
padded

array([[ 0,  0,  0,  0,  4,  5,  6,  7,  8,  1],
       [ 0,  0,  2,  9, 10, 11, 12, 13, 14, 15],
       [ 0,  0,  0, 16, 17, 18, 19, 20, 21, 22],
       [ 0,  0,  0,  3,  3, 23, 24, 25,  2,  1]], dtype=int32)

In [25]:
# Dataset: twitter_sentiments.csv
# Location: https://mitu.co.in/dataset

In [26]:
# Importing libraries
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import re
import numpy as np

##### Data Preparation

In [28]:
# Load the tweets. Column names are supplied explicitly, so the first line of the
# file is treated as data rather than as a header.
# NOTE(review): a column called 'loc' shares its name with DataFrame.loc, so it is
# only reachable as data['loc'], never as data.loc.
data = pd.read_csv(
    "twitter_sentiments.csv",
    names=["id", "loc", "label", "text"],
)

In [29]:
data

Unnamed: 0,id,loc,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [30]:
data.shape

(74682, 4)

In [31]:
data.dtypes

id        int64
loc      object
label    object
text     object
dtype: object

In [32]:
# Any missing tweet (NaN) would be turned into the literal word "nan" by a bare
# astype(str) and later be tokenized as real vocabulary. Replace missing values
# with an empty string first, then force every entry to str so that clean_text
# can safely call .lower() on it.
data['text'] = data['text'].fillna('').astype(str)

In [34]:
# Text cleaning
def clean_text(text):
    """Lowercase ``text`` and collapse each run of non-letter characters into one space.

    Digits, punctuation and whitespace all count as non-letters, so numbers are
    dropped as well. A leading or trailing run becomes a single space; the
    result is not stripped.
    """
    return re.sub(r"[^a-zA-Z]+", " ", text.lower())

In [36]:
# Quick check on a sample string: punctuation and digits vanish, and a trailing
# space is left where the final run of non-letters used to be.
clean_text("Hello friends! How are you???? Welcome.. 62782!!!")

'hello friends how are you welcome '

In [37]:
# Run every tweet through clean_text, overwriting the raw text column in place.
data["text"] = data["text"].map(clean_text)

In [38]:
data

Unnamed: 0,id,loc,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
...,...,...,...,...
74677,9200,Nvidia,Positive,just realized that the windows partition of my...
74678,9200,Nvidia,Positive,just realized that my mac window partition is ...
74679,9200,Nvidia,Positive,just realized the windows partition of my mac ...
74680,9200,Nvidia,Positive,just realized between the windows partition of...


In [40]:
# Split the frame into model inputs and labels.
comments = data["text"].tolist()   # cleaned tweets (input variable), as a Python list
targets = data["label"].values     # sentiment labels (output variable), as a NumPy array

In [41]:
# Class balance of the labels. `targets` is a 1-D array, so count it as a Series
# directly instead of wrapping it in a one-column DataFrame (which yields a
# MultiIndex-ed result for the same numbers).
pd.Series(targets).value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

##### b. Generate training data

In [44]:
# Tokenization and padding
# Rebinds `tokenizer` (previously fitted on the toy corpus) to one fitted on the tweets.
# num_words=5000 is the vocabulary cap; it must agree with the Embedding input size (5000).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
# Fixed length of 200 ids per tweet; shorter tweets are zero-padded on the left,
# as the displayed array shows.
padded_sequences = pad_sequences(sequences, maxlen=200)

In [45]:
padded_sequences.shape

(74682, 200)

In [46]:
padded_sequences

array([[   0,    0,    0, ..., 1695,   12,   26],
       [   0,    0,    0, ...,  424,   12,   26],
       [   0,    0,    0, ...,  424,   12,   26],
       ...,
       [   0,    0,    0, ...,  302,   15, 2055],
       [   0,    0,    0, ...,  302,   15, 2055],
       [   0,    0,    0, ...,  302,   15, 2055]], dtype=int32)

##### Output data preparation

In [50]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(targets)

# One-hot encoding
from keras.utils import to_categorical
y_new = to_categorical(y)

In [51]:
y_new

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [56]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    y_new,
    test_size=0.2,
    random_state=0,
)

In [57]:
x_train.shape

(59745, 200)

In [58]:
x_test.shape

(14937, 200)

##### c. Train model

In [61]:
# Model definition (customize architecture as needed)
model = Sequential()
# 5000 = vocabulary cap used by the tweet Tokenizer; each word id becomes a 128-d vector.
# The deprecated `input_length` argument is dropped: Keras 3 only warns about it,
# and the sequence length is taken from the data the model is fitted on.
model.add(Embedding(5000, 128))
model.add(LSTM(64))
# Single-label, 4-class output: softmax yields one probability per sentiment class.
# (This is multi-class classification, not a multi-label/sigmoid head.)
model.add(Dense(4, activation="softmax"))

In [62]:
# Model compilation: Adam optimizer, cross-entropy over the one-hot targets,
# accuracy reported during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [63]:
# Model training: 3 passes over the training split in mini-batches of 32.
# NOTE(review): the test split doubles as validation data here, so the val_* figures
# are the only held-out metrics reported in the visible part of this notebook.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_data=(x_test, y_test),
)

Epoch 1/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 89ms/step - accuracy: 0.5405 - loss: 1.0674 - val_accuracy: 0.7089 - val_loss: 0.7433
Epoch 2/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 93ms/step - accuracy: 0.7581 - loss: 0.6246 - val_accuracy: 0.7652 - val_loss: 0.6160
Epoch 3/3
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 94ms/step - accuracy: 0.8301 - loss: 0.4504 - val_accuracy: 0.7923 - val_loss: 0.5614


<keras.src.callbacks.history.History at 0x78254dd82a70>

##### d. Output

In [76]:
# Score an unseen comment with the same clean -> ids -> pad(200) steps used for training.
new_comment = "I hate him."
new_sequence = tokenizer.texts_to_sequences([clean_text(new_comment)])
padded_new_sequence = pad_sequences(new_sequence, maxlen=200)
# predict() returns one row per input; keep the single row of class probabilities.
class_probabilities = model.predict(padded_new_sequence)
prediction = class_probabilities[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


In [77]:
prediction

array([0.22064301, 0.5586227 , 0.19831812, 0.02241617], dtype=float32)

In [78]:
# Index of the most probable class, mapped back to its label string by the fitted encoder.
predicted_class_id = np.argmax(prediction)
le.inverse_transform([predicted_class_id])[0]

'Negative'