**Preprocessing**

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!unzip '/content/drive/MyDrive/ColabNotebooks/IMDB Dataset.zip' -d '/content/sample_data'

Archive:  /content/drive/MyDrive/ColabNotebooks/IMDB Dataset.zip
  inflating: /content/sample_data/IMDB Dataset.csv  


In [None]:
dataset = pd.read_csv('/content/sample_data/IMDB Dataset.csv')

# Clean the review texts
dataset['cleaned_review'] = dataset['review'].apply(clean_text)

# Encode labels: 'positive' -> 1, 'negative' -> 0
dataset['label'] = dataset['sentiment'].map({'positive': 1, 'negative': 0})

**Tokenizer**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 10000  # Adjust based on your dataset
max_length = 100    # Adjust based on your dataset

# Initialize tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(dataset['cleaned_review'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(dataset['cleaned_review'])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

**Neural Network**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Define the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model architecture
model.summary()



In [None]:
from sklearn.model_selection import train_test_split

# Features and labels
X = padded_sequences
y = dataset['label'].values

# Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X, y, epochs=5,  verbose=1, validation_split=0.2)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 71ms/step - accuracy: 0.5743 - loss: 0.6426 - val_accuracy: 0.8654 - val_loss: 0.3160
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 70ms/step - accuracy: 0.8999 - loss: 0.2546 - val_accuracy: 0.8752 - val_loss: 0.3049
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 73ms/step - accuracy: 0.9346 - loss: 0.1761 - val_accuracy: 0.8709 - val_loss: 0.3390
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 71ms/step - accuracy: 0.9618 - loss: 0.1158 - val_accuracy: 0.8613 - val_loss: 0.4286
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 71ms/step - accuracy: 0.9790 - loss: 0.0657 - val_accuracy: 0.8557 - val_loss: 0.4698


In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print(f'Test Loss: {loss:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8441 - loss: 0.7026
Test Accuracy: 84.39%
Test Loss: 0.7188


In [None]:
for text in ["Storyline is rubbish but movie direction is best", "really bad", "It is not okay ", "Absolutely brilliant"]:
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, padding='post')
    pred = model.predict(pad)[0][0]
    label = "positive" if pred > 0.5 else "negative"
    print(f"{text} → {label} ({pred:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Storyline is rubbish but movie direction is best → negative (0.06)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
really bad → negative (0.27)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
It is not okay  → positive (0.72)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Absolutely brilliant → positive (1.00)


# **Testing**

In [None]:
test_sentences = ["I was really looking forward to this amazing cast and hoping they would bring a fantastic performance and story... Honestly shocking! Why would they sign up for this utterly boring movie. There was no saving grace anywhere in those 2+ hrs.", "It was  good but not much",'What a film story']
test_seq = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(test_seq,padding='post')
predictions = model.predict(test_pad)

# Show results with positive/negative
for sentence, pred in zip(test_sentences, predictions):
    sentiment = "positive" if pred > 0.5 else "negative"
    print(f"{sentence} -> {sentiment} ({pred[0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step
I was really looking forward to this amazing cast and hoping they would bring a fantastic performance and story... Honestly shocking! Why would they sign up for this utterly boring movie. There was no saving grace anywhere in those 2+ hrs. -> negative (0.01)
It was  good but not much -> negative (0.48)
What a film story -> negative (0.46)


**model saving**

In [None]:
model.save('/content/drive/MyDrive/ColabNotebooks/gru_model.keras')

In [None]:
import pickle

# Save tokenizer
with open('/content/drive/MyDrive/ColabNotebooks/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)