In [None]:
import os
import re
import warnings
from datetime import datetime
from collections import Counter

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
tf.keras.mixed_precision.set_global_policy('mixed_float16')

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,
    ConfusionMatrixDisplay
)

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

warnings.filterwarnings('ignore')
nltk.download('stopwords')
%matplotlib inline

# Seed every framework that draws random numbers in this notebook so reruns are
# comparable: NumPy (pandas' DataFrame.sample falls back to NumPy's global state),
# TensorFlow/Keras, and PyTorch (used for the DistilBERT model).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)

# **Project Overview**
**1. Objective**: The goal of this project is to build and compare three sentiment analysis models — a dense neural network, a BiLSTM, and a fine-tuned DistilBERT transformer — that classify IMDb movie reviews as positive or negative. The project aims to automate sentiment classification, enabling businesses to analyze customer feedback on movies at scale.

**2. Dataset**: The data is the [IMDB movie reviews](https://www.kaggle.com/datasets/pawankumargunjan/imdb-review) dataset on Kaggle, which contains 50,000 labeled reviews (25,000 for training, 25,000 for testing). Each review is labeled as either positive (1) or negative (0). Both the train and test sets are balanced, with a 50-50 split between the two classes. Each set has 3 columns: sentence (the film review in English), sentiment (the rating), and polarity (the sentiment label).

**3. Model Description**:
- DistilBERT: offers a balance between computational efficiency and performance. It is a smaller version of BERT that retains about 97% of BERT's language understanding. The model consists of DistilBERT followed by a classification head that predicts the sentiment (positive or negative). The model is fine-tuned for 3 epochs on the IMDB training dataset.

- BiLSTM: is a type of recurrent neural network (RNN) that processes input data in both forward and backward directions. This allows the model to capture context from both the past and the future in sequence data, which is particularly useful for tasks like sentiment classification. It improves accuracy compared to a unidirectional LSTM.

- Dense Network: consists of fully connected layers in which each neuron is connected to every neuron in the previous layer. It processes the input features (e.g., word embeddings) through these layers, and the final Dense layer outputs the sentiment prediction.

**4. Result**:
- The DistilBERT achieved **93% F1, and 93% Accuracy** on the balanced test set, indicating strong performance for sentiment analysis.
- BiLSTM: 86% F1 and 86% Accuracy, which is a solid result.
- Dense: 1.00 Recall but only 50% Accuracy and 66.68% F1. On a balanced test set, this pattern means the model labels every review as positive.


# **I. Import Data**

In [None]:
import kagglehub

# Download latest version
# The returned value is kept in `path`; later cells walk this location to find the data.
path = kagglehub.dataset_download("pawankumargunjan/imdb-review")

print("Path to dataset files:", path)

In [None]:
# Print every directory below the download location to reveal the folder layout.
for dirname, _, filenames in os.walk(path):
    print(dirname)

In [None]:
def load_directory_data(directory):
    """Read every review file in a local ``directory`` into a DataFrame.

    Review files are named ``<id>_<rating>.txt``; the rating is taken from the
    file name. Entries whose name does not start with that pattern are skipped
    instead of raising ``AttributeError`` on a failed regex match.

    Args:
        directory: Path of a folder containing one text file per review.

    Returns:
        pd.DataFrame with two columns: ``sentence`` (the file content) and
        ``sentiment`` (the rating digits as a string, exactly as captured from
        the file name). Rows follow the sorted file-name order.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")

    data = {"sentence": [], "sentiment": []}
    # os.listdir returns entries in arbitrary order; sorting gives a stable row
    # order, so a seeded shuffle downstream is reproducible across machines.
    for file_name in sorted(os.listdir(directory)):
        matched = name_pattern.match(file_name)
        if matched is None:
            continue
        # Plain text files on local disk: the built-in open() is sufficient.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(matched.group(1))
    return pd.DataFrame.from_dict(data)

def load_dataset(directory):
    """Build one shuffled DataFrame from the ``pos`` and ``neg`` sub-folders.

    Reviews under ``pos`` get ``polarity`` 1 and reviews under ``neg`` get
    ``polarity`` 0. The combined rows are shuffled and re-indexed from zero.
    """
    frames = []
    for folder, polarity in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, folder))
        frame["polarity"] = polarity
        frames.append(frame)
    shuffled = pd.concat(frames).sample(frac=1)
    return shuffled.reset_index(drop=True)


In [None]:
# Get the train and test data.
# Build the folders from `path` (returned by kagglehub.dataset_download) instead of a
# hard-coded "/root/.cache/.../versions/3" location, which exists on one machine only
# and breaks as soon as the dataset version or cache directory changes.
DATA_ROOT = os.path.join(path, 'aclImdb')
train = load_dataset(os.path.join(DATA_ROOT, 'train'))
test = load_dataset(os.path.join(DATA_ROOT, 'test'))

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

# **II. EDA**

**Film Rating Distribution**

In [None]:
# Rating distribution of the Train and Test sets, shown as two pie charts

# Ratings were read from file names as strings, so cast them to integers first
for frame in (train, test):
    frame['sentiment'] = frame['sentiment'].astype(int)

# Number of reviews per rating value
rating_train_agg = train.groupby('sentiment')['sentiment'].count().reset_index(name='count')
rating_test_agg = test.groupby('sentiment')['sentiment'].count().reset_index(name='count')

# One pie per data set, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_panels = (
    (rating_train_agg, 'Training Data'),
    (rating_test_agg, 'Test Data'),
)
for ax, (rating_counts, panel_title) in zip(axes, pie_panels):
    ax.pie(rating_counts['count'], labels=rating_counts['sentiment'], autopct='%1.1f%%', startangle=90)
    ax.axis('equal')
    ax.set_title(panel_title)

plt.suptitle("Rating for movies on Train and Test set")
plt.tight_layout()
plt.show()

The rating distributions of the train and test data are similar.

**Word Length Distribution**

In [None]:
# Histogram of review length (number of whitespace-separated words) for Train and Test

# Add the text length column to both frames
for frame in (train, test):
    frame['word_length'] = frame['sentence'].apply(lambda review: len(review.split()))

# One histogram per data set; the test panel is drawn in red
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_panels = (
    (train, 'Training Data', None),
    (test, 'Test Data', 'red'),
)
for ax, (frame, panel_title, bar_color) in zip(axes, hist_panels):
    sns.histplot(frame['word_length'], bins=50, kde=True, color=bar_color, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Word Length')
    ax.set_ylabel('Frequency')

plt.suptitle("Distribution of Word Lengths in Training & Test Data")
plt.tight_layout()
plt.show()

Most reviews have fewer than 500 words; however, there are some very long reviews with around 2,500 words.

**Top 10 Words of Negative and Positive Review**

In [None]:
# Stop-word set used when ranking the most common words per label:
# NLTK's English list plus contraction fragments and the "br" left over from <br /> tags
nltk.download('stopwords')
extra_stopwords = {"'s", "n't", "'m", "'re", "br"}
stop_words = set(stopwords.words('english')) | extra_stopwords

def get_top_words_by_label(dataframe, label, stop_words, n=10):
    """Return the ``n`` most frequent informative words for one polarity label.

    Reviews whose ``polarity`` equals ``label`` are lower-cased and tokenised.
    Tokens are dropped when they are stop words, have two characters or fewer,
    consist only of digits, or belong to a fixed list of frequent but
    uninformative words.

    Returns:
        pd.DataFrame with a ``word`` column and a ``count_positive`` (truthy
        label) or ``count_negative`` (falsy label) column, most frequent first.
    """
    # Frequent words that carry no sentiment signal, so they are removed as well
    uninformative = {"movie", "film", "one", "really", "time", "story", "would", "see"}

    reviews = dataframe.loc[dataframe['polarity'] == label, 'sentence']
    corpus = ' '.join(reviews.tolist()).lower()

    tally = Counter()
    for token in re.findall(r"\b[\w']+\b", corpus):
        if len(token) <= 2 or token.isdigit():
            continue
        if token in stop_words or token in uninformative:
            continue
        tally[token] += 1

    count_column = f'count_{"positive" if label else "negative"}'
    return pd.DataFrame(tally.most_common(n), columns=['word', count_column])

top_positive = get_top_words_by_label(train, 1, stop_words)
top_negative = get_top_words_by_label(train, 0, stop_words)

# Show both tables, each under its own heading
top_word_tables = (
    ("Top Words in Positive Reviews:", top_positive),
    ("\nTop Words in Negative Reviews:", top_negative),
)
for heading, table in top_word_tables:
    print(heading)
    display(table)

In [None]:
# Side-by-side bar charts of the most common words per label
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

bar_panels = (
    (ax1, top_positive, 'count_positive', 'green', 'Top Words - Positive Reviews'),
    (ax2, top_negative, 'count_negative', 'red', 'Top Words - Negative Reviews'),
)
for ax, table, count_column, bar_color, panel_title in bar_panels:
    sns.barplot(data=table, x='word', y=count_column, ax=ax, color=bar_color)
    ax.set_title(panel_title)
    ax.set_ylabel('Count')
    # Tilt the word labels so they do not overlap
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

**Word Cloud for both Negative and Positive Review**

In [None]:
# Get the Word Cloud for two labels in Train set
from wordcloud import WordCloud
def generate_label_wordcloud(dataframe, label, stop_words, title):
    """Draw a word cloud of the reviews whose ``polarity`` equals ``label``.

    The review text is lower-cased and tokenised. Stop words, tokens of two
    characters or fewer, pure digits, and a fixed list of frequent but
    uninformative words are removed before the cloud is generated. The figure
    is shown with ``title`` as its heading; nothing is returned.
    """
    uninformative = {"movie", "film", "one", "really", "time", "story", "would", "see"}

    reviews = dataframe.loc[dataframe['polarity'] == label, 'sentence']
    corpus = ' '.join(reviews.tolist()).lower()

    kept_tokens = []
    for token in re.findall(r"\b[\w']+\b", corpus):
        if token in stop_words or token in uninformative:
            continue
        if len(token) <= 2 or token.isdigit():
            continue
        kept_tokens.append(token)
    text = ' '.join(kept_tokens)

    # A different colour map per label keeps the two clouds easy to tell apart
    palette = 'viridis' if label else 'plasma'
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=palette,
    ).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

# Generate word clouds for both labels
# Label 1 (positive) is drawn first, then a separator line, then label 0 (negative).
generate_label_wordcloud(train, 1, stop_words,
                        "Positive Reviews (Label 1) Word Cloud - Training Set")
print("="*100)
generate_label_wordcloud(train, 0, stop_words,
                        "Negative Reviews (Label 0) Word Cloud - Training Set")

In [None]:
# Summary statistics of the review length in both data sets
length_summaries = (
    ('Distribution of word length in train data', train),
    ('Distribution of word length in test data', test),
)
for position, (heading, frame) in enumerate(length_summaries):
    if position:
        print()  # blank line between the two summaries
    print(heading)
    print(frame['word_length'].describe())

75% of the examples in both the train and test sets have a length of 284 words or fewer. However, the maximum length is 2470 words, far beyond the rest of the data. I will remove examples longer than 284 words from both sets to reduce computational cost (fewer training examples and shorter token sequences) while keeping most of the dataset.

In [None]:
# Keep only reviews of at most 284 words in both sets and renumber the rows
MAX_REVIEW_WORDS = 284
train = train.loc[train['word_length'] <= MAX_REVIEW_WORDS].reset_index(drop=True)
test = test.loc[test['word_length'] <= MAX_REVIEW_WORDS].reset_index(drop=True)

In [None]:
# Have a look at customer review on films
# Raise the column display width so more of each review text is visible.
pd.options.display.max_colwidth = 200
train.head()

In [None]:
# Have a closer look at a review
# Note: the 'sentiment' column holds the rating parsed from the file name;
# the binary positive/negative label is stored in 'polarity'.
print(f"Sentiment Label for this review: {train.loc[0, 'sentiment']}")
print('-'*50)
print(train.loc[0, 'sentence'])

In [None]:
# Share of each polarity value (normalize=True gives fractions, not counts)
# in the train and test data after the length filter.
print('Percentage of pos and neg in train data')
print()
print(train.polarity.value_counts(normalize=True).reset_index())
print()
print()
print('Percentage of pos and neg in test data')
print()
print(test.polarity.value_counts(normalize=True).reset_index())

After removing the long reviews, both the train and test data remain balanced between negative and positive reviews, so there is no need to resample the training data.

In [None]:
# Number of rows left in each data set (fixes the misspelled "Numner" in the output).
print(f"Number of examples in Train data: {train.shape[0]}")
print(f"Number of examples in Test data: {test.shape[0]}")

# **III. Preprocessing**

**Prepare Data for DistilBERT**

In [None]:
class SentimentDataset(Dataset):
    """Torch dataset that tokenises all texts once and serves tensors per item."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Tokenise ``texts``, truncating and padding every entry to ``max_length``.

        ``tokenizer`` is called once on the whole list; its result must expose
        ``input_ids`` and ``attention_mask`` entries indexable by position.
        ``labels`` is stored as given.
        """
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_length
        )
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# Initialize the tokenizer first: its model_max_length is needed to cap the padding length.
bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Get the max length for padding purpose.
# Longest review in words across both sets, plus a 20% margin because the tokenizer
# may split one word into several tokens.
max_length = max(train['word_length'].max(), test['word_length'].max())

# Never pad beyond what the model accepts. Without this cap, a looser length filter
# upstream would produce sequences the model cannot process.
max_length_set = min(int(max_length * 1.2), bert_tokenizer.model_max_length)
print(f"Max length: {max_length}")
print(f"Max length set: {max_length_set}")

In [None]:
# Tokenised DistilBERT inputs for the train and test reviews
train_dataset = SentimentDataset(
    texts=train['sentence'].tolist(),
    labels=train['polarity'].tolist(),
    tokenizer=bert_tokenizer,
    max_length=max_length_set,
)

test_dataset = SentimentDataset(
    texts=test['sentence'].tolist(),
    labels=test['polarity'].tolist(),
    tokenizer=bert_tokenizer,
    max_length=max_length_set,
)

In [None]:
# Have a look at train_dataset and its original sentence
# Indexing the dataset returns a dict with 'input_ids', 'attention_mask' and 'labels'.
sample = train_dataset[0]

print("Input IDs:", sample['input_ids'])
print("Attention Mask:", sample['attention_mask'])
print("Label:", sample['labels'])

# Get original sentence
# Decode the ids back to text; special tokens are left out of the decoded string.
decoded_text = bert_tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print("\nDecoded Text:", decoded_text)

**Prepare Data for BiLSTM and Dense Neural Network**

In [None]:
# Parameters
MAX_NB_WORDS = 20000        # vocabulary size kept by the tokenizer
MAX_SEQUENCE_LENGTH = 256   # every review is padded / truncated to this many tokens
EMBED_DIM = 64
BATCH_SIZE = 256
EPOCHS = 20

# Tokenization: the vocabulary is fitted on the training reviews only
keras_tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    oov_token='<OOV>',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
)
keras_tokenizer.fit_on_texts(train['sentence'])

# Convert both sets to fixed-length integer sequences (padding and cutting at the end)
padded_sets = []
for frame in (train, test):
    sequences = keras_tokenizer.texts_to_sequences(frame['sentence'])
    padded_sets.append(
        pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    )
X_train, X_test = padded_sets
y_train, y_test = train['polarity'].values, test['polarity'].values

# **IV. Training**

**First Model: Dense**

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Callbacks setup: stop once val_loss has not improved for 5 epochs and
# roll the model back to the weights of its best epoch
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Baseline model: average the word embeddings of a review, then classify with an MLP.
model_dense = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_NB_WORDS + 1, 128, input_length=MAX_SEQUENCE_LENGTH),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # The notebook enables the global 'mixed_float16' policy. Keep the output layer in
    # float32 so the sigmoid probability and the loss are not computed in half
    # precision, as recommended by the TensorFlow mixed-precision guide.
    tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')
])

model_dense.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

# Save the weights of the epoch with the highest validation accuracy.
dense_checkpoint = ModelCheckpoint(
    'best_dense.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

In [None]:
# Training
# Early stopping and checkpoint selection must not look at the test set, otherwise
# the test metrics reported later are optimistically biased. Keras holds out the
# last 10% of the training arrays instead (the rows were shuffled at load time).
history_dense = model_dense.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop, dense_checkpoint],
    verbose=1
)

**Second Model: BiLSTM**

In [None]:
# Model definition: two stacked bidirectional LSTM layers followed by a small MLP head.
model_bilstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_NB_WORDS + 1, EMBED_DIM, input_length=MAX_SEQUENCE_LENGTH),
    # The first recurrent layer returns the full sequence so the second one can consume it.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # The notebook enables the global 'mixed_float16' policy. Keep the output layer in
    # float32 so the sigmoid probability and the loss are not computed in half
    # precision, as recommended by the TensorFlow mixed-precision guide.
    tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')
])

model_bilstm.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

# Save the weights of the epoch with the highest validation accuracy.
bilstm_checkpoint = ModelCheckpoint(
    'best_bilstm.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

In [None]:
# Training
# Early stopping and checkpoint selection must not look at the test set, otherwise
# the test metrics reported later are optimistically biased. Keras holds out the
# last 10% of the training arrays instead (the rows were shuffled at load time).
history_bilstm = model_bilstm.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop, bilstm_checkpoint],
    verbose=1
)

**Third Model: DistilBERT**

In [None]:
from datetime import datetime
# Get DistilBERT

# Version control setup
MODEL_VERSION = "v1"
TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M")
SAVE_ROOT = './saved_models'
RUN_PREFIX = f"sentiment_model_{MODEL_VERSION}_"

# A freshly generated timestamp never matches a directory written by an earlier run,
# so testing only the timestamped path would retrain on every execution. Look for
# saved runs of this model version instead. A run counts only if it contains a
# config.json, which skips directories left behind by an interrupted save.
saved_runs = sorted(
    name
    for name in (os.listdir(SAVE_ROOT) if os.path.isdir(SAVE_ROOT) else [])
    if name.startswith(RUN_PREFIX)
    and os.path.isfile(os.path.join(SAVE_ROOT, name, 'config.json'))
)

# The "%Y%m%d_%H%M" suffix sorts chronologically, so the last entry is the newest run.
MODEL_SAVE_NAME = saved_runs[-1] if saved_runs else f"{RUN_PREFIX}{TIMESTAMP}"
MODEL_SAVE_PATH = os.path.join(SAVE_ROOT, MODEL_SAVE_NAME)

# Check for saved model, else get pre_trained model
if os.path.exists(MODEL_SAVE_PATH):
    print(f"Loading existing model: {MODEL_SAVE_NAME}")
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)
else:
    print(f"Initializing new model: {MODEL_SAVE_NAME}")
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )

In [None]:
def compute_metrics(p):
    """Metric callback for the Hugging Face ``Trainer``.

    ``p.predictions`` holds one row of class scores per example and
    ``p.label_ids`` the true labels. The predicted class is the arg-max of each
    row. Returns a dict with accuracy, F1, precision and recall computed by
    scikit-learn.
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {metric_name: scorer(gold, predicted) for metric_name, scorer in scorers}

In [None]:
# Pick the compute device and report what will be used for training
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')
if not has_gpu:
    print("⚠️ Warning: No GPU detected, training will be slow!")
else:
    print(f"🎯 Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"💻 Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


In [None]:
# Only train if model doesn't exist
if not os.path.exists(MODEL_SAVE_PATH):
    # Hold out 10% of the training reviews for the per-epoch evaluation. Because
    # load_best_model_at_end picks a checkpoint from these evaluations, running them
    # on the test set would leak test information into model selection.
    n_val = int(0.1 * len(train_dataset))
    train_split, val_split = torch.utils.data.random_split(
        train_dataset,
        [len(train_dataset) - n_val, n_val],
        generator=torch.Generator().manual_seed(42),  # fixed split across reruns
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # Directory for checkpoints
        num_train_epochs=3,
        per_device_train_batch_size=64,  # Batch size in training
        per_device_eval_batch_size=128,  # Batch size in evaluation
        warmup_ratio=0.1,                # 10% of training steps for learning rate warmup
        weight_decay=0.01,               # Regularization
        learning_rate=2e-5,              # Small rate for fine-tuning
        eval_strategy='epoch',           # Evaluate after each epoch
        save_strategy='epoch',           # Save checkpoint after each epoch
        load_best_model_at_end=True,
        fp16=torch.cuda.is_available(),  # Half precision needs a GPU; fall back to fp32 on CPU
        report_to='none',                # Disable external services logging
        optim='adamw_torch',
        gradient_accumulation_steps=1,   # No accumulation needed with current batch size
    )

    # Trainer initialization
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics
    )

    # Training process and saving model
    print("Starting training...")
    trainer.train()
    print("Training completed! Saving model...")
    # Save the tokenizer next to the weights so the directory can be reloaded on its own.
    model.save_pretrained(MODEL_SAVE_PATH)
    bert_tokenizer.save_pretrained(MODEL_SAVE_PATH)

# **V. Evaluation**

In [None]:
def evaluate_models(X_test, y_test, test_texts, test_labels, max_length_set):
    '''Score the three saved models on the test data and plot their confusion matrices.

    The Dense and BiLSTM checkpoints are loaded from 'best_dense.h5' and
    'best_bilstm.h5'; DistilBERT and its tokenizer are loaded from the
    notebook-level MODEL_SAVE_PATH.

    Args:
        X_test, y_test: padded sequences and labels for the two Keras models.
        test_texts, test_labels: raw texts and labels for DistilBERT.
        max_length_set: padding / truncation length for the DistilBERT tokenizer.

    Returns:
        (metrics_df, predictions): a DataFrame indexed by model name with Accuracy,
        Precision, Recall and F1 columns, and a dict mapping 'Dense', 'BiLSTM' and
        'DistilBERT' to the predicted labels.
    '''
    def _score_row(model_name, truth, predicted):
        # One row of the comparison table.
        return {
            'Model': model_name,
            'Accuracy': accuracy_score(truth, predicted),
            'Precision': precision_score(truth, predicted),
            'Recall': recall_score(truth, predicted),
            'F1': f1_score(truth, predicted)
        }

    metrics = []
    predictions = {}
    truths = {}  # ground truth used for each model's confusion matrix

    # --- Keras models: threshold the sigmoid output at 0.5 ---
    keras_models = (
        ('Dense Network', 'Dense', 'best_dense.h5'),
        ('BiLSTM', 'BiLSTM', 'best_bilstm.h5'),
    )
    for display_name, key, checkpoint_path in keras_models:
        keras_model = tf.keras.models.load_model(checkpoint_path)
        hard_labels = (keras_model.predict(X_test) > 0.5).astype(int)
        metrics.append(_score_row(display_name, y_test, hard_labels))
        predictions[key] = hard_labels
        truths[key] = y_test

    # --- DistilBERT: reload model and tokenizer, then predict batch by batch ---
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)
    bert_tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_SAVE_PATH)

    bert_dataset = SentimentDataset(test_texts, test_labels, bert_tokenizer, max_length_set)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    test_loader = DataLoader(bert_dataset, batch_size=64, shuffle=False)
    bert_preds = []
    bert_labels = []

    with torch.no_grad():
        for batch in test_loader:
            gold = batch['labels']
            features = {key: tensor.to(device) for key, tensor in batch.items() if key != 'labels'}

            logits = model(**features).logits
            batch_preds = torch.argmax(logits, dim=1)

            bert_preds.extend(batch_preds.cpu().numpy())
            bert_labels.extend(gold.cpu().numpy())

    metrics.append(_score_row('DistilBERT', bert_labels, bert_preds))
    predictions['DistilBERT'] = bert_preds
    truths['DistilBERT'] = bert_labels

    # Comparison table, one row per model
    metrics_df = pd.DataFrame(metrics).set_index('Model')

    # One confusion matrix per model, side by side
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))
    for ax, (name, pred) in zip(axes, predictions.items()):
        cm = confusion_matrix(truths[name], pred)
        ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
        ax.set_title(f'{name} Confusion Matrix')

    plt.tight_layout()
    plt.show()

    return metrics_df, predictions

In [None]:
# Run evaluation
# DistilBERT needs the raw texts and labels as plain lists; the Keras models use X_test / y_test.
test_texts = test['sentence'].tolist()
test_labels = test['polarity'].tolist()
metrics_df, predictions = evaluate_models(X_test, y_test, test_texts, test_labels, max_length_set)

# Display metrics
# Each column is shaded separately (axis=0), with darker cells for higher values.
print("Model Comparison Metrics:")
display(metrics_df.style.background_gradient(cmap='Blues', axis=0))