In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM, Dropout

2024-06-19 21:20:53.379756: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 21:20:53.380024: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 21:20:53.463379: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-19 21:20:53.659754: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
file_path = 'IMDB Dataset.csv'

In [3]:
# Load the dataset
data = pd.read_csv(file_path)

In [4]:
print(data.head(5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


## Data Cleaning

In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
# Function to clean the text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphabetic characters and lower the text
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [8]:
data['review'] = data['review'].apply(clean_text)

After cleansing

In [9]:
print(data.head(5))

                                              review sentiment
0  one reviewers mentioned watching oz episode ho...  positive
1  wonderful little production filming technique ...  positive
2  thought wonderful way spend time hot summer we...  positive
3  basically family little boy jake thinks zombie...  negative
4  petter mattei love time money visually stunnin...  positive


Label encode the target feature

In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

In [11]:
print(data.head(5))

                                              review  sentiment
0  one reviewers mentioned watching oz episode ho...          1
1  wonderful little production filming technique ...          1
2  thought wonderful way spend time hot summer we...          1
3  basically family little boy jake thinks zombie...          0
4  petter mattei love time money visually stunnin...          1


In [12]:
# Parameters
max_words = 10000  # Maximum number of words to consider, only the most frequent words will be kept
max_len = 100      # Maximum length of each sequence

Tokenize

In [13]:
# Tokenize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['review'])
sequences = tokenizer.texts_to_sequences(data['review'])

Padding sequences

In [14]:
# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=max_len)

In [15]:
print(padded_sequences[0])

[4494 2295 1047 6778 2742  267  183 4705 2743  395  225   31 7911 4847
 7538 2284  208 8832 7032 8428   34  116  131    8   48  167 1179   38
  549   90  154  161 2851  687   80 1156 3979 2330 1078  687 1284  687
  733 2915  805   82   19  275   42  104 3074 1466 2078   48 1429  177
 1268 1120 2915   88 9882  183 1876 1973  434  434 7422 6925 4817 2812
 6643  371  500   15  131   12 7362  511  566 6643  522 1047  549  439
  759 1859 1047  424   55 2915   94  304 3657 3142  664 1444   15 1065
 3915  356]


Train Test split

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['sentiment'], test_size=0.2, random_state=42)

In [17]:
# Model parameters
embedding_dim = 100

In [18]:
# Build the model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))



In [19]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
# Print the model summary
model.summary()

In [21]:
y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [22]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
[1m  13/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14s[0m 15ms/step - accuracy: 0.5668 - loss: 0.6857

I0000 00:00:1718810905.282757     106 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.6562 - loss: 0.5897 - val_accuracy: 0.8432 - val_loss: 0.4067
Epoch 2/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.8947 - loss: 0.2671 - val_accuracy: 0.8595 - val_loss: 0.3335
Epoch 3/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9018 - loss: 0.2447 - val_accuracy: 0.8601 - val_loss: 0.3889
Epoch 4/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9720 - loss: 0.0881 - val_accuracy: 0.8306 - val_loss: 0.4862
Epoch 5/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9866 - loss: 0.0448 - val_accuracy: 0.8225 - val_loss: 0.6156
Epoch 6/20
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9897 - loss: 0.0338 - val_accuracy: 0.7997 - val_loss: 0.7440
Epoch 7/20
[1m1000/1000

In [23]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7929 - loss: 1.0322
Test Accuracy: 79.77%


## Training LSTM

In [24]:
# Build the model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [25]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
# Print the model summary
model.summary()

In [27]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - accuracy: 0.7836 - loss: 0.4459 - val_accuracy: 0.8711 - val_loss: 0.3100
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9119 - loss: 0.2333 - val_accuracy: 0.8691 - val_loss: 0.3243
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9431 - loss: 0.1563 - val_accuracy: 0.8711 - val_loss: 0.3454
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9646 - loss: 0.1074 - val_accuracy: 0.8648 - val_loss: 0.4212
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9796 - loss: 0.0695 - val_accuracy: 0.8583 - val_loss: 0.4522
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9867 - loss: 0.0484 - val_accuracy: 0.8518 - val_loss: 0.5340
Epoc

In [28]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8488 - loss: 0.7045
Test Accuracy: 85.10%


Got better accuracy than simple rnn model

In [29]:
sample_reviews = [
    "I really loved the movie! It was fantastic and thrilling.",
    "The movie was terrible, I did not enjoy it at all.",
    "It was an average movie, nothing special.",
    "Absolutely wonderful experience, would watch again!",
    "Not my cup of tea, found it quite boring."
]

In [30]:
# Clean and preprocess the sample reviews
sample_reviews_cleaned = [clean_text(review) for review in sample_reviews]
sample_sequences = tokenizer.texts_to_sequences(sample_reviews_cleaned)
sample_padded_sequences = pad_sequences(sample_sequences, maxlen=max_len)

In [31]:
predictions = model.predict(sample_padded_sequences)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


In [32]:
print(predictions)

[[9.8923510e-01]
 [1.0948971e-02]
 [3.0000351e-02]
 [9.9039817e-01]
 [8.0816715e-04]]


In [33]:
decoded_predictions = ["positive" if pred > 0.5 else "negative" for pred in predictions]

In [34]:
print(decoded_predictions)

['positive', 'negative', 'negative', 'positive', 'negative']
