# Sentiment Analysis using LSTM

In [1]:
pip install Keras

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing Libraries
import re
import pandas as pd
import numpy as np
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score
import nltk

In [4]:
data = pd.read_csv('IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
# Function to clean text
def remove_tags(string):
    # Removing HTML tags
    result = re.sub(r'<.*?>', '', string)
    # Removing URLs
    result = re.sub(r'https?://\S+|www\.\S+', '', result)
    # Removing non-alphanumeric characters
    result = re.sub(r'[^a-zA-Z\s]', '', result)
    result = result.lower()
    return result

# Apply text cleaning
data['review'] = data['review'].apply(lambda cw: remove_tags(cw))


In [6]:
# Download NLTK stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Lemmatization function
nltk.download('wordnet')
nltk.download('omw-1.4')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to C:\Users\mohamed
[nltk_data]     wajith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\mohamed
[nltk_data]     wajith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\mohamed
[nltk_data]     wajith\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

# Apply lemmatization before removing stopwords
data['review'] = data['review'].apply(lemmatize_text)

# Remove stopwords
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [8]:
# Calculate average review length
avg_len = data['review'].apply(lambda x: len(x.split())).mean()
print("Average length of each review:", avg_len)

Average length of each review: 121.0772


In [9]:
# Sentiment distribution
pos = sum(data['sentiment'] == 'positive')
neg = data.shape[0] - pos
print(f"Percentage of reviews with positive sentiment: {pos / data.shape[0] * 100}%")
print(f"Percentage of reviews with negative sentiment: {neg / data.shape[0] * 100}%")

Percentage of reviews with positive sentiment: 50.0%
Percentage of reviews with negative sentiment: 50.0%


In [10]:
# Preparing the data
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify=encoded_labels)


In [11]:
# Hyperparameters
vocab_size = 10000  # Increased vocab size
oov_tok = '<OOV>'
embedding_dim = 100
max_length = 200
padding_type = 'post'
trunc_type = 'post'

In [12]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)


In [13]:
# Model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Dropout(0.5),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()



In [14]:
# Early stopping to prevent overfitting
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [15]:
# Train the model
num_epochs = 10
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, 
                    validation_split=0.1, 
                    callbacks=[early_stopping], 
                    verbose=1)

Epoch 1/10
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 280ms/step - accuracy: 0.7284 - loss: 0.5172 - val_accuracy: 0.8371 - val_loss: 0.3654
Epoch 2/10
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 274ms/step - accuracy: 0.8916 - loss: 0.2861 - val_accuracy: 0.8603 - val_loss: 0.3136
Epoch 3/10
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 271ms/step - accuracy: 0.9280 - loss: 0.2013 - val_accuracy: 0.8648 - val_loss: 0.3315
Epoch 4/10
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 274ms/step - accuracy: 0.9482 - loss: 0.1508 - val_accuracy: 0.8619 - val_loss: 0.3336
Epoch 5/10
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 278ms/step - accuracy: 0.9598 - loss: 0.1232 - val_accuracy: 0.8661 - val_loss: 0.3967


In [17]:
# Predictions on test set
prediction = model.predict(test_padded)
pred_labels = (prediction >= 0.5).astype(int)

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 94ms/step


In [19]:
accuracy = accuracy_score(test_labels, pred_labels)
print(f"Accuracy of prediction on test set: {accuracy * 100:.2f}%")

Accuracy of prediction on test set: 86.14%


In [20]:
# Predict sentiment of new sentences
sentence = ["The movie was very touching and heartwarming", 
            "I have never seen a terrible movie like this", 
            "The movie plot is terrible but it had good acting"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, padding=padding_type, maxlen=max_length)


In [21]:

# Predict sentiment
prediction = model.predict(padded)
pred_labels = (prediction >= 0.5).astype(int)

for i in range(len(sentence)):
    print(sentence[i])
    sentiment = 'Positive' if pred_labels[i] == 1 else 'Negative'
    print("Predicted sentiment:", sentiment)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
The movie was very touching and heartwarming
Predicted sentiment: Positive
I have never seen a terrible movie like this
Predicted sentiment: Positive
The movie plot is terrible but it had good acting
Predicted sentiment: Negative
