In [10]:
!pip install kagglehub tensorflow keras pandas numpy nltk

import kagglehub
import pandas as pd
import numpy as np
import tensorflow as tf
tfk = tf.keras
Sequential = tfk.models.Sequential
Dense = tfk.layers.Dense
Embedding = tfk.layers.Embedding
SimpleRNN = tfk.layers.SimpleRNN
Adam = tfk.optimizers.Adam
Tokenizer = tfk.preprocessing.text.Tokenizer
pad_sequences = tfk.preprocessing.sequence.pad_sequences
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")


print("Path to dataset files:", path)


file_path = f"{path}/IMDB Dataset.csv"
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
except Exception as e:
    print("Error loading dataset:", e)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1
Dataset loaded successfully.


In [11]:
print(df.columns)


Index(['review', 'sentiment'], dtype='object')


In [12]:
df = df.dropna(subset=['review']).drop_duplicates(subset=['review'])

stop_words = set(stopwords.words('english'))

def preprocess_text(sentence):
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence).lower().split()
    sentence = [word for word in sentence if word not in stop_words]
    return ' '.join(sentence)


df['cleaned_text'] = df['review'].apply(preprocess_text)

df = df[df['sentiment'].isin(['positive', 'negative'])]
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})

df = df.dropna(subset=['sentiment'])

print("Distinct sentiment labels:", df['sentiment'].unique())
print("Sentiment distribution:\n", df['sentiment'].value_counts())


Distinct sentiment labels: [1 0]
Sentiment distribution:
 sentiment
1    24884
0    24698
Name: count, dtype: int64


  df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})


In [17]:
vectorizer = Tokenizer(num_words=5000, oov_token="<OOV>")
vectorizer.fit_on_texts(df['cleaned_text'])

text_sequences = vectorizer.texts_to_sequences(df['cleaned_text'])
padded_texts = pad_sequences(text_sequences, maxlen=100, padding='post')

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    padded_texts, df['sentiment'], test_size=0.2, random_state=42
)


In [18]:
print("Padded sequences shape:", padded_texts.shape)

Padded sequences shape: (49582, 100)


In [19]:
rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    SimpleRNN(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])


rnn_optimizer = Adam(learning_rate=1e-5)
rnn_model.compile(optimizer=rnn_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

rnn_model.build(input_shape=(None, 100))
rnn_model.summary()




In [20]:
rnn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), validation_split=0.2, verbose=1)

test_loss, test_acc = rnn_model.evaluate(X_test, y_test)
print(f"Evaluation Results - Loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}")


Epoch 1/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 40ms/step - accuracy: 0.4985 - loss: 0.6971 - val_accuracy: 0.5026 - val_loss: 0.6932
Epoch 2/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 41ms/step - accuracy: 0.5143 - loss: 0.6920 - val_accuracy: 0.5073 - val_loss: 0.6925
Epoch 3/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 40ms/step - accuracy: 0.5328 - loss: 0.6894 - val_accuracy: 0.5204 - val_loss: 0.6919
Epoch 4/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 40ms/step - accuracy: 0.5525 - loss: 0.6850 - val_accuracy: 0.5640 - val_loss: 0.6804
Epoch 5/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 41ms/step - accuracy: 0.6111 - loss: 0.6574 - val_accuracy: 0.6159 - val_loss: 0.6543
Epoch 6/20
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 38ms/step - accuracy: 0.6591 - loss: 0.6228 - val_accuracy: 0.6475 - val_loss: 0.6295
Epoc

In [22]:
sample_review = ["The movie had an amazing storyline and brilliant acting!"]
processed_seq = vectorizer.texts_to_sequences(sample_review)
padded_input = pad_sequences(processed_seq, maxlen=100)

prediction = rnn_model.predict(padded_input)
sentiment_label = "Positive" if prediction[0][0] > 0.5 else "Negative"

print(f"Predicted Sentiment: {sentiment_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step
Predicted Sentiment: Positive
