# Home Exercise 1 on Text Classification

Implement a **Recurrent Neural Network model** (**Vanilla RNN, GRU, and LSTM**) to predict whether a review is positive or negative.

- **Data**: [IMDB Dataset of 50K Movie Reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) (the last 10% of rows serve as the test set).
- **Compare** the performance of the three models.

**Note**: Submit only a **single Jupyter Notebook file** that can handle all tasks, including data downloading, preprocessing, model training, and model evaluation. *(Submissions that do not follow the guidelines will receive a score of 0.)*

**Grading Criteria**

For valid submissions, scores will be assigned based on the **leaderboard ranking** (**strictly greater**):

- **Top 25%** → **10 points**
- **25% - 50%** → **9.0 points**
- **50% - 75%** → **8.0 points**
- **75% - 100%** → **7.0 points**


# Import Libs

In [3]:
%pip install numpy pandas tensorflow scikit-learn


Note: you may need to restart the kernel to use updated packages.




In [4]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import kagglehub
import os
import time 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define constants
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 100
BATCH_SIZE = 64
EPOCHS = 5


  from .autonotebook import tqdm as notebook_tqdm


In [None]:


# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

# Define the dataset path (update this if needed)
dataset_file = os.path.join(path, "IMDB Dataset.csv")  # Ensure correct file name

# Load the dataset
df = pd.read_csv(dataset_file)

# Convert sentiments to binary labels
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(df['review'], df['sentiment'], test_size=0.1, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Convert labels to numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)


Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


In [6]:
# Define function to build models
def build_rnn_model():
    model = Sequential([
        Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        SimpleRNN(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_gru_model():
    model = Sequential([
        Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        GRU(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_lstm_model():
    model = Sequential([
        Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        LSTM(64, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [7]:
# Train and evaluate models
models = {
    "Vanilla RNN": build_rnn_model(),
    "GRU": build_gru_model(),
    "LSTM": build_lstm_model()
}



In [8]:
# Train, evaluate and compare
results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_train = time.time()
    history = model.fit(
        train_padded, train_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(test_padded, test_labels),
        verbose=1
    )
    end_train = time.time()

    # Evaluation
    start_pred = time.time()
    predictions = (model.predict(test_padded) > 0.5).astype("int32")
    end_pred = time.time()

    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    loss = history.history['val_loss'][-1]

    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Loss": loss,
        "Training Time (s)": end_train - start_train,
        "Inference Time (s)": end_pred - start_pred
    })


Training Vanilla RNN...
Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 156ms/step - accuracy: 0.5067 - loss: 0.7094 - val_accuracy: 0.4994 - val_loss: 0.6937
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 113ms/step - accuracy: 0.5381 - loss: 0.6859 - val_accuracy: 0.5002 - val_loss: 0.6946
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 84ms/step - accuracy: 0.6384 - loss: 0.6107 - val_accuracy: 0.5036 - val_loss: 0.7560
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 177ms/step - accuracy: 0.7207 - loss: 0.4507 - val_accuracy: 0.5158 - val_loss: 0.8559
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 149ms/step - accuracy: 0.7462 - loss: 0.3978 - val_accuracy: 0.5096 - val_loss: 0.9913
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step

Training GRU...
Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [14]:
# Compile results into a DataFrame
results_df = pd.DataFrame(results)
results_df[["Accuracy", "Precision", "Recall", "F1-Score"]]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
0,0.5096,0.510317,0.657801,0.574749
1,0.8828,0.89019,0.875347,0.882706
2,0.862,0.896059,0.821358,0.857084


In [10]:
results_json_path = os.path.join(os.getcwd(), "imdb_results.json")

with open(results_json_path, "w") as f:
    json.dump(results, f, indent=4)

print(f"Results saved to {results_json_path}")


Results saved to e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3085_NLP\BT\Lab09\imdb_results.json
