## Import Libraries

In [43]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import zipfile
import re
import string
import nltk
import torch
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GlobalAveragePooling1D,GRU,LSTM,Bidirectional,Dropout,BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping



## Data Downloading

In [44]:
# zip_path = "/content/IMDB Dataset.csv.zip"
# extract_to = "data/"
# # Open the zip file
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     # Extract specific files
#     zip_ref.extract("IMDB Dataset.csv", extract_to)

In [45]:
train_data = pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv")
train_df, test_df = train_test_split(train_data, test_size=0.1, random_state=42)

In [46]:
# Tải stopwords nếu chưa có
nltk.download("stopwords")

# Khai báo stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preprocessing

In [47]:
def preprocess_text(text):
    text = text.lower()  # Chuyển thành chữ thường
    text = re.sub(r"\d+", "", text)  # Xóa số
    text = text.translate(str.maketrans("", "", string.punctuation))  # Xóa dấu câu
    text = " ".join([word for word in text.split() if word not in stop_words])  # Xóa stopwords
    text = re.sub(r"\s+", " ", text).strip()  # Xóa khoảng trắng dư thừa
    return text

# ✅ ÁP DỤNG TIỀN XỬ LÝ CHO DỮ LIỆU
train_df["review"] = train_df["review"].apply(preprocess_text)
test_df["review"] = test_df["review"].apply(preprocess_text)

KeyboardInterrupt: 

In [None]:
# Tách cột review và sentiment
X_train = train_df["review"].values
y_train = train_df["sentiment"].values
X_test = test_df["review"].values
y_test = test_df["sentiment"].values

# Kiểm tra kiểu dữ liệu của nhãn
print("y_train type:", type(y_train))
print("Unique values in y_train:", np.unique(y_train))

# Nếu nhãn là chuỗi, chuyển thành số (1 cho 'positive', 0 cho 'negative')
if y_train.dtype == 'O':  # 'O' (object) có thể chứa chuỗi
    label_mapping = {'positive': 1, 'negative': 0}
    y_train = np.array([label_mapping[label] for label in y_train], dtype=np.float32)
    y_test = np.array([label_mapping[label] for label in y_test], dtype=np.float32)
else:
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

# Số lượng từ tối đa trong vocab
MAX_WORDS = 10000
# Độ dài tối đa của mỗi review
MAX_LEN = 500

EMBEDDING_DIMS = 128

# Tokenize dữ liệu văn bản
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

## Build Model

#### Vanilla RNN Model

In [59]:
# Vanilla RNN model
def create_rnn_model():
    model = Sequential([
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIMS, mask_zero=True),  # mask_zero=True 
        SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False),  # return_sequences=False -> output 2D
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

#### GRU Model

In [62]:
# GRU model
# def create_gru_model():
#     model = Sequential()
#     model.add(Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIMS, input_length=MAX_LEN))  # Embedding layer
#     model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))  # GRU layer (no return_sequences)
#     model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model
from tensorflow.keras.layers import GRU, Dense, Dropout, Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential

def create_gru_model():
    model = Sequential([
        # Embedding Layer
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIMS, input_length=MAX_LEN),
        
        # First GRU Layer (returns sequences for stacking)
        Bidirectional(GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.1)),
        BatchNormalization(),

        # Second GRU Layer
        Bidirectional(GRU(32, dropout=0.1, recurrent_dropout=0.05)),
        Dropout(0.2),

        # Fully Connected Layers
        Dense(16, activation="relu", kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        
        # Output Layer
        Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy']
    )
    return model



#### LSTM Model

In [48]:
# LSTM model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# def create_lstm_model():
#     # model = Sequential([
#     #     Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIMS, input_length=MAX_LEN),
#     #     LSTM(128, dropout=0.2, recurrent_dropout=0.2),
#     #     Dense(1, activation='sigmoid')
#     # ])
#     model = Sequential([
#     Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),

#     Bidirectional(GRU(64, return_sequences=True)),  
#     Dropout(0.3),
#     BatchNormalization(),

#     Bidirectional(GRU(32)),  
#     Dropout(0.3),
#     BatchNormalization(),

#     Dense(32, activation="relu"),
#     Dropout(0.3),

#     Dense(1, activation="sigmoid")  
# ])
#     model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
#     return model

from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense
from tensorflow.keras.regularizers import l2

def create_lstm_model():
    model = Sequential([
        # Embedding with a slightly smaller dimension
        Embedding(input_dim=MAX_WORDS, output_dim=64, mask_zero=True),

        # Optimized bidirectional LSTM
        Bidirectional(LSTM(32, return_sequences=True)),  
        BatchNormalization(),

        Bidirectional(LSTM(16)),  
        Dropout(0.1),  

        # Dense layers with reduced regularization
        Dense(16, activation="relu"),  
        BatchNormalization(),

        # Output layer
        Dense(1, activation="sigmoid")
    ])
    
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

## Train Model

#### Train Vanilla RNN Model

In [57]:
# Convert text data to sequences of integers using the tokenizer
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequence
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")


In [60]:
# Train Vanilla RNN model
rnn_model = create_rnn_model()
X_train_tensor = np.array(X_train_pad, dtype=np.int32)  
y_train_tensor = np.array(y_train, dtype=np.float32)  
X_test_tensor = np.array(X_test_pad, dtype=np.int32)
y_test_tensor = np.array(y_test, dtype=np.float32)

# train model
history_rnn = rnn_model.fit(
    X_train_tensor, y_train_tensor,
    validation_data=(X_test_tensor, y_test_tensor),
    epochs=10, batch_size=64
)


Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 53ms/step - accuracy: 0.5147 - loss: 0.7059 - val_accuracy: 0.5584 - val_loss: 0.6777
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 0.5741 - loss: 0.6701 - val_accuracy: 0.6096 - val_loss: 0.6427
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 0.6381 - loss: 0.6243 - val_accuracy: 0.6510 - val_loss: 0.6178
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 0.6866 - loss: 0.5789 - val_accuracy: 0.6856 - val_loss: 0.6013
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 0.7594 - loss: 0.5031 - val_accuracy: 0.7812 - val_loss: 0.5000
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 0.7548 - loss: 0.5071 - val_accuracy: 0.7036 - val_loss: 0.5839
Epoch 7/10
[1m7

#### Train GRU Model

In [61]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding để các chuỗi có cùng độ dài
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")


In [None]:
# Train GRU model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
gru_model = create_gru_model()
history_gru = gru_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test),
                            epochs=10, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
[1m307/704[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m20:49[0m 3s/step - accuracy: 0.5159 - loss: 0.8099

In [None]:
# Define tokenizer (adjust num_words if needed)
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)  # Fit tokenizer on training data

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Now pad the sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

In [49]:
# Train LSTM model
lstm_model = create_lstm_model()
history_lstm = lstm_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test),
                              epochs=10, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 59ms/step - accuracy: 0.5793 - loss: 0.7269 - val_accuracy: 0.8246 - val_loss: 0.3963
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 59ms/step - accuracy: 0.8566 - loss: 0.3401 - val_accuracy: 0.8796 - val_loss: 0.2849
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.9127 - loss: 0.2274 - val_accuracy: 0.8856 - val_loss: 0.2781
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.9309 - loss: 0.1857 - val_accuracy: 0.8800 - val_loss: 0.2856
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 61ms/step - accuracy: 0.9463 - loss: 0.1518 - val_accuracy: 0.8806 - val_loss: 0.3031
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.9572 - loss: 0.1250 - val_accuracy: 0.8790 - val_loss: 0.3422


### Plot results

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy
plt.plot(history_rnn.history['val_accuracy'], label="Vanilla RNN")
plt.plot(history_gru.history['val_accuracy'], label="GRU")
plt.plot(history_lstm.history['val_accuracy'], label="LSTM")
plt.xlabel("Epochs")
plt.ylabel("Validation Accuracy")
plt.legend()
plt.title("Model Accuracy Comparison")
plt.show()
