# Project: IMDb 3-Class Sentiment Analysis
# Notebook: 512-Token Model Training and Analysis

This notebook contains the complete workflow for the 512-token-length model:
1.  **Data Preparation:** Loads the raw IMDb dataset, tokenizes all text to `max_length=512`, and saves the processed dataset to Google Drive.
2.  **Model Training:** Fine-tunes the `bert-base-uncased` model on the 512-token data.
3.  **Binary Evaluation:** Evaluates the trained model for binary accuracy.
4.  **Phase 2 Analysis:** Sets up and runs the 3-class heuristic systems (Ratio, Logit, Weighting) on a 1,000-review sample to generate the final comparison report.

## 1. Setup: Install and Import Libraries

In [None]:
# Install required libraries
!pip install transformers datasets accelerate torch

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from google.colab import drive
import time
import datetime
import os
import numpy as np
import nltk
from sklearn.metrics import classification_report
from tqdm.auto import tqdm # For progress bars
import pandas as pd
import math

In [None]:
# Helper function to format elapsed time
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting;
    durations of a day or more are rendered by ``datetime.timedelta`` with a
    leading ``"N day(s), "`` prefix.
    """
    whole_seconds = int(round(elapsed))
    return f"{datetime.timedelta(seconds=whole_seconds)}"

## 2. Phase 1: Data Preparation (512-Token)

This step loads the raw IMDb dataset, tokenizes all text to a `max_length` of 512, and saves the processed dataset to Google Drive to avoid re-computing.

In [None]:
# Mount Google Drive so project files can be read from and written to it
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Destination directory on Drive for the tokenized dataset
save_path = '/content/drive/My Drive/BERT_IMDB_Processed_512'

# Load the raw 'imdb' dataset from Hugging Face
print("Loading IMDb dataset...")
imdb = load_dataset("imdb")

# The tokenizer is kept at module level: preprocess_function_512 reads it as a global
print("Loading bert-base-uncased tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function_512(examples):
    """Tokenize a batch of reviews into fixed-length BERT inputs.

    Reads ``examples["text"]`` and passes it to the module-level ``tokenizer``,
    truncating longer inputs and padding shorter ones so that every encoded
    sequence is exactly 512 tokens long.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,  # BERT maximum
    }
    return tokenizer(examples["text"], **encode_options)

print("Tokenizing and preprocessing dataset (max_length=512)...")
# Tokenize in batches, then drop the raw text and rename the target column to
# "labels", the keyword the training loop passes to the model.
tokenized_imdb = (
    imdb.map(preprocess_function_512, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Have the dataset hand back torch tensors when indexed
tokenized_imdb.set_format("torch")
print("Preprocessing complete.")

# Persist the processed dataset to Google Drive so later cells can reload it
# instead of re-tokenizing.
print(f"Saving tokenized dataset to: {save_path}")
tokenized_imdb.save_to_disk(save_path)

print(f"--- Successfully saved to Google Drive! ---")

## 3. Phase 1: Model Training (512-Token)

This section loads the pre-processed 512-token data, defines the training parameters, and runs the fine-tuning loop.

In [None]:
# --- 1. Define Key Parameters ---
BATCH_SIZE = 32
NUM_WORKERS = 4
# NOTE(review): 3e-6 is roughly an order of magnitude below the 2e-5 to 5e-5
# range commonly used for BERT fine-tuning; confirm this is intentional.
LEARNING_RATE = 3e-6
EPOCHS = 2

# Where the tokenized data lives and where the fine-tuned model will be written
data_load_path = '/content/drive/My Drive/BERT_IMDB_Processed_512'
model_save_path = '/content/drive/My Drive/BERT_IMDB_Model_Trained_512'

# --- 2. Load Data from Drive ---
print(f"Loading processed data from: {data_load_path}")
tokenized_imdb = load_from_disk(data_load_path)

# --- 3. Set Up DataLoaders ---
print("Setting up DataLoaders...")
# Options common to both loaders; only the training loader shuffles.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
train_dataloader = DataLoader(tokenized_imdb["train"], shuffle=True, **loader_options)
test_dataloader = DataLoader(tokenized_imdb["test"], **loader_options)
print("DataLoaders are ready.")
print(f"Using Batch Size: {BATCH_SIZE}")
print(f"Training batches: {len(train_dataloader)}")

# --- 4. Define the Model ---
# Start from the pretrained bert-base-uncased checkpoint with a 2-label classification head.
print("Loading BertForSequenceClassification model...")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2 # Binary classification
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model moved to {device}")

# --- 5. Set up Training Parameters ---
# Use PyTorch's own AdamW: the transformers.AdamW class is deprecated and has
# been removed from recent transformers releases. eps and weight_decay are
# pinned to the defaults transformers.AdamW used (1e-6 and 0.0) so the
# optimisation hyper-parameters stay the same; torch's own defaults are
# 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)
# Total number of optimizer steps over the whole run (no learning-rate
# scheduler consumes this value in the training loop below).
total_steps = len(train_dataloader) * EPOCHS

# --- 6. The Training Loop ---
print(f"\nStarting training for {EPOCHS} epochs...")

# The loader length does not change between epochs, so look it up once.
batches_per_epoch = len(train_dataloader)

for epoch_i in range(EPOCHS):
    print(f"\n======== Epoch {epoch_i + 1} / {EPOCHS} ========")
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()  # enable training-mode behaviour (e.g. dropout)

    for step, batch in enumerate(train_dataloader):
        # Progress line every 100 batches, skipping the very first batch
        if step != 0 and step % 100 == 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step:>5,}  of  {batches_per_epoch:>5,}.    Elapsed: {elapsed}.')

        # Move the tensors the model needs onto the training device
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=True,
        )
        loss = result.loss
        total_train_loss += loss.item()

        # Backward pass, gradient-norm clipping at 1.0, then the parameter update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / batches_per_epoch
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time}")

print("\n--- Training complete! ---")

# --- 7. Save the trained model and tokenizer to your Drive ---
print(f"Saving model to {model_save_path}")
model.save_pretrained(model_save_path)

# The tokenizer is re-created from the stock bert-base-uncased checkpoint and
# written next to the model, so later cells can load both from one directory.
print(f"Saving tokenizer to {model_save_path}...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained(model_save_path)

print("Model and tokenizer saved.")

## 4. Phase 1: Binary Model Evaluation (512-Token)

This section evaluates the 512-token model on the 25,000-review test set to get its baseline accuracy, precision, and recall.

In [None]:
# --- 1. Define Key Parameters ---
EVAL_BATCH_SIZE = 128 # Use a larger batch size for (faster) evaluation
NUM_WORKERS = 8 # DataLoader worker processes (re-binds the value set in the training cell)
PREFETCH_FACTOR = 4 # Forwarded to DataLoader(prefetch_factor=...) below

# --- 2. Define Paths ---
# Same Drive locations the data-preparation and training cells wrote to.
data_load_path = '/content/drive/My Drive/BERT_IMDB_Processed_512'
model_load_path = '/content/drive/My Drive/BERT_IMDB_Model_Trained_512'

# --- 3. Load Model and Tokenizer ---
# Both are loaded from the same directory; the tokenizer is used by the
# sentence-level helper functions defined later in the notebook.
print(f"Loading trained model from: {model_load_path}")
model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(model_load_path)

# --- 4. Load Test Data ---
print(f"Loading processed data from: {data_load_path}")
tokenized_imdb = load_from_disk(data_load_path)

# --- 5. Create OPTIMIZED Test DataLoader ---
# No shuffle argument is passed: evaluation reads the test split in its stored order.
print("Setting up optimized Test DataLoader...")
test_dataloader = DataLoader(
    tokenized_imdb["test"],
    batch_size=EVAL_BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    prefetch_factor=PREFETCH_FACTOR
)
print(f"Test DataLoader is ready. Batch size: {EVAL_BATCH_SIZE}")

# --- 6. Set Up Evaluation ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval() # Put model in evaluation mode

# Apply PyTorch 2.0 compile for faster inference.
# Compilation is best-effort: any exception raised by torch.compile() is
# printed and evaluation continues with the uncompiled model.
# NOTE(review): torch.compile() usually defers the actual compilation work to
# the first forward pass, so some failures may surface inside the evaluation
# loop rather than in this try block. Confirm.
print("Compiling model with torch.compile()... (This may take a moment)")
try:
    model = torch.compile(model)
    print("Model compiled successfully.")
except Exception as e:
    print(f"torch.compile() failed: {e}. Continuing without compilation.")

print(f"Model moved to {device}. Starting evaluation...")

# Per-example predictions and ground-truth labels, accumulated across batches
all_preds = []
all_labels = []

# --- 7. Optimized Evaluation Loop ---
print("Starting evaluation...")
t0 = time.time()

# Mixed precision is only requested when running on CUDA. On CPU the autocast
# context is disabled, which avoids the "CUDA is not available" warning that
# the deprecated torch.cuda.amp.autocast() emits there.
use_amp = device.type == "cuda"

# torch.no_grad() disables gradient tracking (less memory, faster inference);
# torch.autocast() is the device-agnostic replacement for the deprecated
# torch.cuda.amp.autocast() and runs eligible ops in reduced precision.
with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_amp):
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host afterwards, so they stay on the
        # CPU instead of making a round trip through the GPU.
        b_labels = batch['labels']

        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

        # Final prediction per example: index of the larger logit (0 or 1)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(b_labels.numpy())

eval_time = format_time(time.time() - t0)
print(f"Evaluation complete. Total time: {eval_time}")

# --- 8. Display Results ---
# Turn the accumulated per-example values into arrays for scikit-learn
all_preds, all_labels = np.array(all_preds), np.array(all_labels)

class_names = ["Negative (0)", "Positive (1)"]
report = classification_report(all_labels, all_preds, target_names=class_names)

print("\n--- Phase 1: Binary Classification Report (512-Token Model) ---")
print(report)

## 5. Phase 2: 3-Class System Comparison (512-Token Model)

This section defines all three "mixed" classification systems and runs them on a 1,000-review sample to compare their performance.

In [None]:
# --- 1. Setup: NLTK and Load Model ---


print("Downloading NLTK sentencizer (punkt)...")
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource, while older ones use the pickled 'punkt' models. Fetch both so
# sent_tokenize() works regardless of the installed NLTK version.
# nltk.download() reports a failed download by returning False rather than
# raising, so an unavailable resource does not abort the cell.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize
print("NLTK ready.")

# --- 2. Define Helper Functions for Model Prediction ---
TOKENIZER_MAX_LENGTH = 512 # Set to 512 for this model

# Helper for Systems 1 (Ratio) and 3 (Weighting)
def predict_sentence_sentiment_binary(sentence_text):
    """Classify one piece of text with the binary model.

    The text is tokenized, truncated/padded to ``TOKENIZER_MAX_LENGTH`` and
    passed through the module-level ``model`` on ``device``. Returns the index
    of the largest output logit as a Python int; callers treat 1 as positive.
    """
    encoded = tokenizer(
        sentence_text,
        max_length=TOKENIZER_MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(
            encoded['input_ids'].to(device),
            token_type_ids=None,
            attention_mask=encoded['attention_mask'].to(device),
        )
    predicted_class = torch.argmax(outputs.logits, dim=-1)
    return predicted_class.cpu().item()

# Helper for System 2 (Logit)
def get_sentence_logits(sentence_text):
    """Return the binary model's raw output logits for one piece of text.

    The text is tokenized, truncated/padded to ``TOKENIZER_MAX_LENGTH`` and
    run through the module-level ``model`` without gradient tracking. The
    logits are squeezed and returned as a NumPy array on the CPU; the caller
    reads index 0 as the negative score and index 1 as the positive score.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=TOKENIZER_MAX_LENGTH,
    )
    encoded = tokenizer(sentence_text, **tokenizer_options)
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        raw_logits = model(
            ids, token_type_ids=None, attention_mask=mask, return_dict=True
        ).logits
    return raw_logits.squeeze().cpu().numpy()


# --- 3. Define All Three Classification Systems ---

# SYSTEM 1: Simple Ratio
def classify_review_ratio(full_review_text, pos_threshold=0.7, neg_threshold=0.7):
    """Classify a review from the share of positive vs. negative sentences.

    Args:
        full_review_text: Raw review text.
        pos_threshold: Minimum fraction of sentences predicted positive for
            the review to be labelled "POSITIVE".
        neg_threshold: Minimum fraction of sentences predicted negative for
            the review to be labelled "NEGATIVE".

    Returns:
        "NEUTRAL" when the text yields no sentences; otherwise "POSITIVE",
        "NEGATIVE", or "MIXED" (neither threshold reached). Reviews with
        fewer than three sentences are classified as a whole and are never
        labelled "MIXED".
    """
    sentences = sent_tokenize(full_review_text)

    # Text with no sentences carries no sentiment evidence. Return NEUTRAL,
    # as Systems 2 and 3 do, instead of running the model on empty text.
    # (This guard used to sit after the short-review branch, where it could
    # never be reached.)
    if not sentences:
        return "NEUTRAL"

    # Classify short reviews based on the whole text: with one or two
    # sentences a ratio is too coarse to be meaningful.
    if len(sentences) < 3:
        pred = predict_sentence_sentiment_binary(full_review_text)
        return "POSITIVE" if pred == 1 else "NEGATIVE"

    sentence_predictions = [predict_sentence_sentiment_binary(s) for s in sentences]

    num_sentences = len(sentence_predictions)
    num_positive = sum(sentence_predictions)  # predictions are 0/1, so the sum counts positives
    num_negative = num_sentences - num_positive

    # Both ratios are computed from integer counts so neither picks up the
    # extra rounding error of a `1 - ratio` subtraction.
    if num_positive / num_sentences >= pos_threshold:
        return "POSITIVE"
    if num_negative / num_sentences >= neg_threshold:
        return "NEGATIVE"
    return "MIXED"

# SYSTEM 2: Logit-Based "Neutral Zone"
def classify_review_logit(full_review_text, neutral_threshold=1.0, min_sentences=2):
    """Classify a review by counting confidently positive/negative sentences.

    A sentence counts as "confident" when the absolute gap between its
    positive and negative logits is at least ``neutral_threshold``; other
    sentences are ignored. The review is "MIXED" when both polarities have at
    least ``min_sentences`` confident sentences. Otherwise the polarity with
    more confident sentences wins, and a tie (including no confident
    sentences at all) or an empty review gives "NEUTRAL".
    """
    sentences = sent_tokenize(full_review_text)
    if not sentences:
        return "NEUTRAL"

    confident_pos_count = 0
    confident_neg_count = 0
    for sentence in sentences:
        logits = get_sentence_logits(sentence)
        neg_score, pos_score = logits[0], logits[1]

        # Only sentences outside the "neutral zone" get a vote
        if abs(pos_score - neg_score) >= neutral_threshold:
            if pos_score > neg_score:
                confident_pos_count += 1
            else:
                confident_neg_count += 1

    # Rule: enough confident sentences of BOTH polarities means "Mixed"
    if confident_pos_count >= min_sentences and confident_neg_count >= min_sentences:
        return "MIXED"

    # Otherwise it is a majority vote among confident sentences. This also
    # covers the cases where only one polarity has any confident sentences.
    if confident_pos_count > confident_neg_count:
        return "POSITIVE"
    if confident_neg_count > confident_pos_count:
        return "NEGATIVE"
    return "NEUTRAL"

# SYSTEM 3: Positional Weighting + Co-occurrence
def classify_review_weighting(full_review_text, positional_weight=2,
                            mixed_pos_threshold=2, mixed_neg_threshold=2):
    """Classify a review from position-weighted sentence votes.

    Each sentence votes positive or negative with weight 1, except the first
    and last sentences, which vote with ``positional_weight``. The review is
    "MIXED" when the positive total reaches ``mixed_pos_threshold`` and the
    negative total reaches ``mixed_neg_threshold``. Otherwise the larger total
    decides, and equal totals or an empty review give "NEUTRAL".
    """
    sentences = sent_tokenize(full_review_text)
    if not sentences:
        return "NEUTRAL"

    last_index = len(sentences) - 1
    weighted_pos_score = 0
    weighted_neg_score = 0

    for position, sentence in enumerate(sentences):
        # Opening and closing sentences get the extra positional weight
        is_edge_sentence = position == 0 or position == last_index
        vote_weight = positional_weight if is_edge_sentence else 1

        if predict_sentence_sentiment_binary(sentence) == 1:
            weighted_pos_score += vote_weight
        else:
            weighted_neg_score += vote_weight

    both_polarities_present = (
        weighted_pos_score >= mixed_pos_threshold
        and weighted_neg_score >= mixed_neg_threshold
    )
    if both_polarities_present:
        return "MIXED"
    if weighted_pos_score > weighted_neg_score:
        return "POSITIVE"
    if weighted_neg_score > weighted_pos_score:
        return "NEGATIVE"
    return "NEUTRAL"


# --- 4. Load Raw Test Data and Run Comparison ---
print("\nLoading original (non-tokenized) IMDb test set...")
imdb_raw = load_dataset("imdb")

# Fixed-seed shuffle so the same sample of reviews is drawn on every run
SAMPLE_SIZE = 1000
test_reviews = imdb_raw['test'].shuffle(seed=42).select(range(SAMPLE_SIZE))

true_labels_map = {0: "NEGATIVE", 1: "POSITIVE"}

# Result column -> classifier; insertion order fixes both the call order and
# the column order of the results table.
classifiers = {
    "system_1_ratio": classify_review_ratio,
    "system_2_logit": classify_review_logit,
    "system_3_weighting": classify_review_weighting,
}

results = []

print(f"Running all 3 systems on {len(test_reviews)} test reviews...")
t0 = time.time()

for review in tqdm(test_reviews, desc="Analyzing Test Set"):
    text = review['text']

    # One row per review: gold label, each system's verdict, then the text
    row = {"true_label": true_labels_map[review['label']]}
    for column_name, classify in classifiers.items():
        row[column_name] = classify(text)
    row["text"] = text
    results.append(row)

total_time = format_time(time.time() - t0)
print(f"Analysis complete. Total time: {total_time}")

# --- 5. Generate and Print Comparison Report ---
df = pd.DataFrame(results)

section_rule = "=" * 50


def _print_section_header(title):
    """Print a report section title framed by two horizontal rules."""
    print("\n" + section_rule)
    print(title)
    print(section_rule)


def _count_hard_errors(frame, prediction_column):
    """Count rows whose prediction is the opposite polarity of the true label.

    MIXED and NEUTRAL predictions are not counted as hard errors.
    """
    truth = frame['true_label']
    predicted = frame[prediction_column]
    positives_called_negative = (truth == "POSITIVE") & (predicted == "NEGATIVE")
    negatives_called_positive = (truth == "NEGATIVE") & (predicted == "POSITIVE")
    return int(positives_called_negative.sum() + negatives_called_positive.sum())


# (result column, display name used in the first two report sections)
report_systems = [
    ('system_1_ratio', "System 1 (Ratio)"),
    ('system_2_logit', "System 2 (Logit/Neutral)"),
    ('system_3_weighting', "System 3 (Weighting)"),
]

_print_section_header(f" COMPARISON 1: OVERALL PREDICTION DISTRIBUTION (Sample size={SAMPLE_SIZE})")
for column, display_name in report_systems:
    print(f"\n{display_name} Distribution:")
    # Share of each predicted label, as a percentage string
    print(df[column].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

_print_section_header(f" COMPARISON 2: AGREEMENT WITH TRUE LABELS (Sample size={SAMPLE_SIZE})")
for column, display_name in report_systems:
    print(f"\n{display_name} vs. True Labels:")
    print(pd.crosstab(df['true_label'], df[column]))

_print_section_header(f" COMPARISON 3: HARD ERROR COUNT (Sample size={SAMPLE_SIZE})")
s1_errors = _count_hard_errors(df, 'system_1_ratio')
print(f"System 1 (Ratio) Hard Errors: {s1_errors}")
s2_errors = _count_hard_errors(df, 'system_2_logit')
print(f"System 2 (Logit) Hard Errors: {s2_errors}")
s3_errors = _count_hard_errors(df, 'system_3_weighting')
print(f"System 3 (Weighting) Hard Errors: {s3_errors}")

# --- 6. Save Full Results to CSV for Qualitative Analysis ---
# The CSV keeps the review text next to each system's label for manual inspection.
results_save_path = f'/content/drive/My Drive/BERT_IMDB_Model_Trained_512/sample_comparison_results_512.csv'
print(f"\nSaving sample results to {results_save_path} for manual inspection...")
df.to_csv(results_save_path, index=False)
print("Done.")