##### Dataset

In [1]:
# twitter_sentiments.csv
# Location: https://mitu.co.in/dataset

##### Import the libraries

In [6]:
import re

import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

##### Read the dataset

In [7]:
# Column labels are supplied explicitly; because `names` is given and no `header`
# argument is passed, pandas treats the first line of the CSV as data.
COLUMN_NAMES = ['id', 'company', 'label', 'text']
df = pd.read_csv('twitter_sentiments.csv', names=COLUMN_NAMES)

In [8]:
df.shape

(74682, 4)

In [9]:
df

Unnamed: 0,id,company,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [10]:
df.dtypes

id          int64
company    object
label      object
text       object
dtype: object

In [12]:
# Any missing tweet is read by pandas as NaN; blank it first so that astype(str)
# does not turn it into the literal word "nan" (which would then be tokenized and
# learned as a real token). Afterwards every value in the column is a str, which
# clean_text relies on when it calls .lower().
df['text'] = df['text'].fillna('').astype(str)

In [13]:
df.dtypes

id          int64
company    object
label      object
text       object
dtype: object

##### Text Cleaning

In [14]:
# Matches one or more consecutive characters that are not ASCII letters or digits.
_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z0-9]+")


def clean_text(text):
    """Normalise a piece of text before tokenization.

    The input is lower-cased, then every run of characters outside
    ``[a-zA-Z0-9]`` (punctuation, whitespace, non-ASCII symbols) is collapsed
    into a single space. Leading and trailing spaces are not stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The lower-cased string with non-alphanumeric runs replaced by " ".
    """
    return _NON_ALPHANUMERIC.sub(" ", text.lower())

In [16]:
# Run clean_text over every tweet and overwrite the text column with the result.
cleaned_text = df["text"].apply(clean_text)
df["text"] = cleaned_text

In [17]:
df

Unnamed: 0,id,company,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,just realized that the windows partition of my...
74678,9200,Nvidia,Positive,just realized that my mac window partition is ...
74679,9200,Nvidia,Positive,just realized the windows partition of my mac ...
74680,9200,Nvidia,Positive,just realized between the windows partition of...


##### Separate the input and output data

In [21]:
# Model inputs and outputs:
#   targets  - array of the raw sentiment label of each row
#   comments - plain Python list of the cleaned tweet strings
targets = df['label'].values
comments = df["text"].tolist()

In [23]:
# Bare expression; the trailing ';' suppresses the notebook's display of the full list.
comments;

In [24]:
# Sample six-word sentence — presumably the text behind the hand-written id list
# shown in the following cell.
com = 'it is so nice of you'

In [36]:
# Hand-typed list of six integers — presumably illustrating the word-id sequence a
# tokenizer would produce for a six-word sentence; it is not computed from the data.
[67, 890, 12, 4567, 29, 1067]

[67, 890, 12, 4567, 29, 1067]

In [26]:
# Sample three-word sentence — presumably the text behind the padded id list shown
# in the following cell.
com1 = 'all the best'

In [31]:
# Hand-typed list: seven zeros followed by three ids — presumably illustrating how a
# three-word sequence is left-padded with 0 up to a fixed length of 10.
[0,0,0,0,0,0,0,789, 458, 2045]

[0, 0, 0, 0, 0, 0, 0, 789, 458, 2045]

##### Tokenization and Padding

In [33]:
# Tokenization and padding
VOCAB_SIZE = 5000  # value passed to Tokenizer(num_words=...)
MAX_LEN = 200      # fixed length of every padded sequence

# The word index is built from the whole corpus (this cell runs before the
# train/test split), then each comment becomes a list of integer word ids.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)

# Shorter sequences are filled with leading zeros up to MAX_LEN.
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)

In [34]:
padded_sequences

array([[   0,    0,    0, ..., 1740,   12,   26],
       [   0,    0,    0, ...,  429,   12,   26],
       [   0,    0,    0, ...,  429,   12,   26],
       ...,
       [   0,    0,    0, ...,  308,   15, 2117],
       [   0,    0,    0, ...,  308,   15, 2117],
       [   0,    0,    0, ...,  308,   15, 2117]], dtype=int32)

In [35]:
padded_sequences.shape

(74682, 200)

In [37]:
set(targets)

{'Irrelevant', 'Negative', 'Neutral', 'Positive'}

In [38]:
targets

array(['Positive', 'Positive', 'Positive', ..., 'Positive', 'Positive',
       'Positive'], dtype=object)

In [40]:
# Label Encode
# Maps each sentiment string to an integer class id. The ids follow the sorted
# order of the label names, which is why 'Positive' becomes 3 among the four
# classes {'Irrelevant', 'Negative', 'Neutral', 'Positive'}.
# `le` is kept so predictions can be mapped back with le.inverse_transform.
# (LabelEncoder is imported in the notebook's top import cell.)
le = LabelEncoder()
y = le.fit_transform(targets)

In [41]:
y

array([3, 3, 3, ..., 3, 3, 3])

In [42]:
# one hot encode
# Turns the integer class ids into one row per sample with a 1.0 in the column of
# its class — the target format expected by the categorical cross-entropy loss.
# (to_categorical is already imported in the notebook's top import cell.)
y_new = to_categorical(y)

In [43]:
y_new

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [44]:
y_new.shape

(74682, 4)

##### Train-test split (hold-out validation)

In [46]:
# Train-test split: 80% of the rows for training, 20% held out.
# random_state pins the shuffle so the split (and every metric derived from it) is
# reproducible after a kernel restart; stratify=y keeps the four class proportions
# the same in both parts.
# NOTE(review): the preview of `df` shows runs of near-identical tweets sharing one
# `id`; a row-level split can place variants of the same tweet in both sets and
# inflate the held-out score — consider a group-aware split on `id`.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    y_new,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [47]:
X_train.shape

(59745, 200)

In [49]:
X_test.shape

(14937, 200)

##### Build the model

In [53]:
# Model definition (customize architecture as needed)
# NOTE(review): newer Keras releases deprecate the `input_length` argument of
# Embedding — confirm against the installed version.
model = Sequential([
    # Word ids from a 5000-entry vocabulary -> 128-dimensional vectors, 200 ids per sample
    Embedding(5000, 128, input_length=200),
    # Recurrent layer with 64 units
    LSTM(64),
    # One probability per sentiment class: single-label, multi-class output via softmax
    Dense(4, activation="softmax"),
])

##### Compile the model

In [55]:
# Model compilation
# categorical_crossentropy matches the one-hot encoded targets; accuracy is
# reported alongside the loss during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

##### Train the model

In [57]:
# Model training
# A single epoch in mini-batches of 32. The held-out split is passed as
# validation data, so the reported val_* metrics are computed on X_test / y_test.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=1,
    validation_data=(X_test, y_test),
)



<keras.src.callbacks.History at 0x74b21410f220>

In [58]:
# Push one unseen sentence through the same pipeline as the training data:
# clean -> word ids -> pad to 200 -> class probabilities.
new_comment = "It was a great day that made be pleasing!"
cleaned_comment = clean_text(new_comment)
new_sequence = tokenizer.texts_to_sequences([cleaned_comment])
padded_new_sequence = pad_sequences(new_sequence, maxlen=200)

# predict returns one row per input; only one sentence was passed, so take row 0.
class_probabilities = model.predict(padded_new_sequence)
prediction = class_probabilities[0]



In [59]:
# Index of the highest probability, mapped back to its sentiment label.
predicted_class_id = np.argmax(prediction)
le.inverse_transform([predicted_class_id])[0]

'Positive'

In [61]:
def predict_probabilities(comment, max_len=200):
    """Return the model's class-probability vector for one raw comment.

    The comment goes through the same steps as the training data: ``clean_text``,
    the fitted ``tokenizer``, padding to ``max_len`` ids, then ``model.predict``.

    Args:
        comment: Raw, uncleaned comment string.
        max_len: Padded sequence length; must match the length the model was
            trained with (200 in this notebook).

    Returns:
        The first (and only) row of the prediction output: one probability per
        sentiment class, in the class order used by ``le``.
    """
    cleaned = clean_text(comment)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=max_len)
    return model.predict(padded)[0]


new_comment = "I think it was possible for me."
prediction = predict_probabilities(new_comment)
# Highest-probability class index, mapped back to its sentiment label.
le.inverse_transform([np.argmax(prediction)])[0]



'Positive'