In [1]:
"""
The dataset has three sentiments namely, negative(-1), neutral(0), and positive(+1).
It contains two fields for the tweet and label. Our goal is to do sentiment analysis and divide it in
two category.

I apply LSTM on this dataset to perform Sentiment Analysis.
"""

'\nThe dataset has three sentiments namely, negative(-1), neutral(0), and positive(+1).\nIt contains two fields for the tweet and label. Our goal is to do sentiment analysis and divide it in\ntwo category.\n\nI apply LSTM on this dataset to perform Sentiment Analysis.\n'

In [2]:
# import libraries and read data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the raw tweet/label table from the CSV file and preview it.
csv_path = "Twitter_Data.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [4]:
# Data cleaning and preprocessing

In [5]:
# Inspect the raw frame before cleaning: schema, missing values per column,
# and the number of fully duplicated rows.
# df.info() writes its report directly to stdout and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df.info()
print("----------------------------------------------")
print(df.isna().sum())
print("----------------------------------------------")
print(df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB
None
----------------------------------------------
clean_text    4
category      7
dtype: int64
----------------------------------------------
1


In [6]:
# Remove rows with missing values, exact duplicate rows, and neutral tweets
# (category == 0), leaving only the negative and positive classes.
# NOTE: run this before the label-encoding cell. Once the labels have been
# re-encoded, 0 no longer means "neutral", so re-running this filter would
# discard a real class.
df = (
    df
    .dropna()
    .drop_duplicates()
    .loc[lambda frame: frame['category'] != 0]
)

In [7]:
# Confirm that cleaning left no missing values and no duplicate rows.
divider = "----------------------------------------------"
print(df.isna().sum(), divider, df.duplicated().sum(), sep="\n")

clean_text    0
category      0
dtype: int64
----------------------------------------------
0


In [8]:
# Encode categorical labels as consecutive integer class ids.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# `df` is the result of a boolean row filter, and writing a column into such a
# frame in place can trigger pandas' SettingWithCopyWarning. .assign() returns
# a fresh frame with the replaced column instead, so no warning is raised.
df = df.assign(category=label_encoder.fit_transform(df['category']))

In [9]:
# Features (x) are the tweet texts; targets (y) are the encoded sentiment labels.
x, y = df['clean_text'], df['category']

In [10]:
# Tokenize and pad sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Upper bound on the vocabulary size; also reused as the Embedding input size.
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=" ")

# Read the text from the frame instead of consuming and overwriting `x`:
# the cell then no longer depends on its own previous output, so re-running
# it does not feed the padded integer matrix back into the tokenizer.
texts = df['clean_text']
tokenizer.fit_on_texts(texts)

# Each tweet becomes a list of integer word ids ...
sequences = tokenizer.texts_to_sequences(texts)
# ... and the lists are padded into one 2-D array (no maxlen is given, so
# pad_sequences uses its default target length).
x = pad_sequences(sequences)

In [11]:
# Split data into train and test sets (80% / 20%).
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so the same partition is produced on every
# run; stratify=y keeps the class proportions equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
# Build and train the LSTM classifier

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils import to_categorical

embed_dim = 128  # size of the embedding vector learned for each token
lstm_out = 196   # number of units in the LSTM layer

# One output neuron per category: the highest encoded label plus one.
n_classes = np.max(y) + 1

# Embedding -> dropout over embedding channels -> LSTM -> softmax over classes.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=x.shape[1]),
    SpatialDropout1D(0.3),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(n_classes, activation='softmax'),
])

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train, reporting loss/accuracy on the held-out split after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1e19f4f7910>

In [15]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class probabilities predicted for every test sample
y_pred_prob = model.predict(x_test)

# The predicted class is the index of the largest probability in each row
y_pred = y_pred_prob.argmax(axis=1)

# Fraction of test samples whose predicted class equals the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9229305864884929
