#Sentiment Analysis

## Installing Dependencies

In [1]:
!pip install pandas==2.2.3
!pip install scikit-learn==1.6.1
# For GPU users
!pip install tensorflow[and-cuda]
# For CPU users
# !pip install tensorflow

Collecting pandas==2.2.3
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.3


## Importing Dependencies

In [2]:
import os
import re
import pandas as pd
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/IMDB-Dataset.csv


In [7]:
df = pd.read_csv('./data/IMDB-Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Data Cleaning

In [8]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [9]:
df.duplicated().sum()

np.int64(418)

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.duplicated().sum()

np.int64(0)

In [12]:
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<,*?>'), '', raw_text)
    return cleaned_text

In [13]:
df['review'] = df['review'].apply(remove_tags)

In [14]:
df['review'] = df['review'].apply(lambda x: x.lower())

In [15]:
df.shape

(49582, 2)

In [16]:
min_count = min(df['sentiment'].value_counts())
print(min_count)

24698


In [17]:
# sampling equal amount of positive and negative sentiments
positive_samples = df[df['sentiment']=='positive'].sample(min_count, random_state=42)
negative_samples = df[df['sentiment']=='negative'].sample(min_count, random_state=42)
df = pd.concat([positive_samples, negative_samples])

In [18]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,24698
negative,24698


In [19]:
# Encoding the labels
df.replace({'sentiment': {'positive': 1, 'negative':0}}, inplace=True)
df.head()

  df.replace({'sentiment': {'positive': 1, 'negative':0}}, inplace=True)


In [20]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [21]:
train_data.shape, test_data.shape

((39516, 2), (9880, 2))

In [23]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data['review'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']), maxlen=200)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']), maxlen=200)

In [24]:
y_train = train_data['sentiment']
y_test = test_data['sentiment']

## Building & Training The Model

In [25]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [26]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 744ms/step - accuracy: 0.7353 - loss: 0.5267 - val_accuracy: 0.8319 - val_loss: 0.3944
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 656ms/step - accuracy: 0.8585 - loss: 0.3433 - val_accuracy: 0.8424 - val_loss: 0.3665
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 658ms/step - accuracy: 0.8551 - loss: 0.3557 - val_accuracy: 0.8540 - val_loss: 0.3417
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 652ms/step - accuracy: 0.8851 - loss: 0.2905 - val_accuracy: 0.8653 - val_loss: 0.3582
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 669ms/step - accuracy: 0.8991 - loss: 0.2536 - val_accuracy: 0.8740 - val_loss: 0.3205


<keras.src.callbacks.history.History at 0x79622438cdd0>

## Model Evaluation

In [28]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 109ms/step - accuracy: 0.8669 - loss: 0.3209
Test Loss: 0.31481656432151794
Test Accuracy: 0.8711538314819336


In [29]:
def predict_sentiment(review):
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [30]:
new_reivew = "That movie was the best film this year"
sentiment = predict_sentiment(new_reivew)
print(f"The Sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464ms/step
The Sentiment of the review is: positive


## Saving The Model

In [32]:
## If you wish to save the model you can do so by using the following code
## You may use the above tokenization and testing method to utilize the model
model.save('./model/sentiment_analysis_model.h5')

