In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.utils import to_categorical

In [19]:
!pip install kaggle

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [20]:
df = pd.read_csv('IMDB Dataset.csv')

reviews = df['review'].values
sentiments = df['sentiment'].values

sentiment_mapping = {'positive': 1, 'negative': 0}
sentiments = np.array([sentiment_mapping[sentiment] for sentiment in sentiments])

In [21]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
max_length = 200
padded_sequences = pad_sequences(sequences, maxlen=max_length)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, sentiments, test_size=0.2, random_state=42)

In [22]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_length))
model.add(GRU(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

Epoch 1/5




[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.7370 - loss: 0.4997 - val_accuracy: 0.8831 - val_loss: 0.2849
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - accuracy: 0.9111 - loss: 0.2295 - val_accuracy: 0.8940 - val_loss: 0.2645
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9505 - loss: 0.1371 - val_accuracy: 0.8890 - val_loss: 0.2960
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.9744 - loss: 0.0799 - val_accuracy: 0.8845 - val_loss: 0.3563
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9840 - loss: 0.0510 - val_accuracy: 0.8749 - val_loss: 0.4410
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8759 - loss: 0.4232
Test accuracy: 0.8773999810218811


In [23]:
predictions = model.predict(X_test[:5])
predicted_labels = (predictions > 0.5).astype(int)

for i in range(5):
  print(f"Text: {reviews[np.where(padded_sequences == X_test[i])[0][0]][:100]}...")
  print(f"Predicted label: {predicted_labels[i][0]}")
  print(f"True Label: {y_test[i]}")
  print("---")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 562ms/step
Text: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The...
Predicted label: 1
True Label: 1
---
Text: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The...
Predicted label: 1
True Label: 1
---
Text: A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B...
Predicted label: 0
True Label: 0
---
Text: A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B...
Predicted label: 1
True Label: 1
---
Text: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The...
Predicted label: 0
True Label: 0
---
