# Project: IMDb 3-Class Sentiment Analysis
# Notebook: Analysis and Results

This notebook demonstrates the final, trained 3-class sentiment analysis systems.

It will:
1.  Download the two pre-trained BERT models (256-token and 512-token) from public Google Drive links.
2.  Load the **512-token model** and run the **1,000-sample 3-system comparison**.
3.  Load the **256-token model** and run the **10,000-sample validation** of the winning heuristic (System 3).

## Final Project Results (Summary)

This notebook reproduces the key findings of the project.

### 1. Binary Model Performance (Phase 1)
* **256-Token Model Accuracy:** 92.0%
* **512-Token Model Accuracy:** 93.0%

### 2. 3-System Heuristic Comparison (1,000-Sample Test, 512-Token Model)

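The 'Hard Error' count measures complete misclassifications: a review whose true label is 'Positive' being predicted as 'Negative', or vice versa. Predictions of 'Mixed' or 'Neutral' are not counted as hard errors.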

* **System 1 (Ratio) Hard Errors:** 27
* **System 2 (Logit) Hard Errors:** 31
* **System 3 (Weighting) Hard Errors:** **8 (Winner)**

### 3. System 3 Validation (10,000-Sample Test, 256-Token Model)

The winning heuristic (System 3) was validated on a larger sample using the more efficient 256-token model.

* **Total Hard Errors:** 126
* **Final Hard Error Rate:** **1.26%**

Analysis data for the results above (Google Drive links):

Analysis Data (1,000-Sample 3-System Comparison):
https://drive.google.com/file/d/1qxyaRpJLQiE9UoxsO1MraKjSIST10lNI/view?usp=sharing

Analysis Data (10,000-Sample Validation of System 3):
https://drive.google.com/file/d/1UTiKFeBe31ZZUwi41xVKAPtS5Hv5dWG-/view?usp=sharing

---
## 1. Setup: Install and Import Libraries

In [None]:
# Install all required libraries
!pip install transformers datasets accelerate torch gdown pandas nltk

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader
from datasets import load_dataset
import gdown
import zipfile
import os
import time
import datetime
import numpy as np
import nltk
from tqdm.auto import tqdm
import pandas as pd
import math

# Helper function to format elapsed time
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting;
    durations of a day or more are prefixed with the day count by
    ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## 2. Download and Unzip Pre-Trained Models

This step downloads the two `.zip` files containing the trained models from public Google Drive links.

In [None]:



# Google Drive share link for the zipped 256-token model
# (downloaded below as "model_256.zip").
URL_256_MODEL = "https://drive.google.com/file/d/1GOigFk5XDsUiju5AuTesWMS_lxp77N8u/view?usp=sharing"


# Google Drive share link for the zipped 512-token model
# (downloaded below as "model_512.zip").
URL_512_MODEL = "https://drive.google.com/file/d/1kebIYqseo_2X7FeHnk64tXVpOMuyidro/view?usp=sharing"

# --- End of Action Required ---

def download_and_unzip(url, zip_name, extract_path):
    """Download a zip archive (optionally from Google Drive) and extract it.

    Args:
        url: Direct download URL or a Google Drive link. For Drive links the
            file id is taken either from a ``/d/<id>/`` path segment
            (``.../file/d/<id>/view``) or from an ``id=<id>`` query parameter
            (``.../uc?id=<id>``, ``.../open?id=<id>``).
        zip_name: Local path the archive is downloaded to. The file is
            deleted again once extraction has been attempted.
        extract_path: Directory the archive contents are extracted into.

    Raises:
        ValueError: If ``url`` is a Google Drive link with no recognisable
            file id.
        RuntimeError: If ``gdown`` reports a failed download by returning
            ``None``.
    """
    import re
    from urllib.parse import parse_qs, urlparse

    print(f"Downloading {zip_name}...")
    if "drive.google.com" in url:
        parsed = urlparse(url)
        # Share links carry the id in the path; direct links carry it in the
        # query string. Positional splitting on '/' only handled the former.
        path_match = re.search(r"/d/([^/]+)", parsed.path)
        if path_match:
            file_id = path_match.group(1)
        else:
            file_id = parse_qs(parsed.query).get("id", [None])[0]
        if not file_id:
            raise ValueError(
                f"Could not extract a Google Drive file id from URL: {url}"
            )
        url = f'https://drive.google.com/uc?id={file_id}'

    downloaded_path = gdown.download(url, zip_name, quiet=False)
    if downloaded_path is None:
        # Fail here with a clear message rather than later inside zipfile.
        raise RuntimeError(
            f"Download of {zip_name} failed; check that the link is public."
        )
    print(f"Download complete. Unzipping to {extract_path}...")

    try:
        with zipfile.ZipFile(zip_name, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Clean up the zip file even when extraction fails.
        os.remove(zip_name)

    print("Model successfully unzipped.")

# Fetch and extract each model archive in turn: (share link, temporary zip
# name, extraction directory).
for model_url, archive_name, target_dir in (
    (URL_256_MODEL, "model_256.zip", "./model_256/"),
    (URL_512_MODEL, "model_512.zip", "./model_512/"),
):
    download_and_unzip(model_url, archive_name, target_dir)

print("\nAll models are downloaded and ready.")

## 3. Define the 3-Class Heuristic Systems

These are the three systems (Ratio, Logit, Weighting) that will be tested. They rely on helper functions to get predictions from the currently loaded model.

In [None]:
# --- 1. Setup: NLTK and Global Variables ---
print("Downloading NLTK sentencizer (punkt)...")
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource, while older releases use 'punkt'. NLTK is installed unpinned, so
# request both; a resource unknown to the installed version is only reported
# by nltk.download, it does not raise.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
from nltk.tokenize import sent_tokenize
print("NLTK ready.")


# Module-level state shared by the prediction helpers below. `model` and
# `tokenizer` are placeholders until an analysis cell loads a checkpoint;
# TOKENIZER_MAX_LENGTH is reassigned there to match the loaded model.
model = None
tokenizer = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER_MAX_LENGTH = 512

# --- 2. Define Helper Functions for Model Prediction ---

def predict_sentence_sentiment_binary(sentence_text):
    """Classify one piece of text with the currently loaded model.

    The text is tokenized with truncation and padding to
    ``TOKENIZER_MAX_LENGTH`` and run through the global ``model`` without
    gradient tracking.

    Returns:
        The index of the largest logit as a Python int. The callers in this
        notebook treat 1 as positive and anything else as negative.
    """
    encoded = tokenizer(
        sentence_text,
        max_length=TOKENIZER_MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        result = model(ids, token_type_ids=None, attention_mask=mask)
    predicted_class = result.logits.argmax(dim=-1)
    return predicted_class.cpu().item()

def get_sentence_logits(sentence_text):
    """Return the raw model logits for one piece of text.

    Uses the same tokenization as ``predict_sentence_sentiment_binary``
    (truncation and padding to ``TOKENIZER_MAX_LENGTH``) and the globally
    loaded ``model``.

    Returns:
        A NumPy array holding the squeezed logits. ``classify_review_logit``
        reads index 0 as the negative score and index 1 as the positive score.
    """
    encoded = tokenizer(
        sentence_text,
        max_length=TOKENIZER_MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        result = model(
            ids,
            token_type_ids=None,
            attention_mask=mask,
            return_dict=True,
        )
    squeezed_logits = result.logits.squeeze()
    return squeezed_logits.cpu().numpy()


# --- 3. Define All Three Classification Systems ---

# SYSTEM 1: Simple Ratio
def classify_review_ratio(full_review_text, pos_threshold=0.7, neg_threshold=0.7):
    """Classify a review from the share of positive vs. negative sentences.

    Args:
        full_review_text: The complete review text.
        pos_threshold: Minimum fraction of positive sentences required to
            return "POSITIVE".
        neg_threshold: Minimum fraction of negative sentences required to
            return "NEGATIVE".

    Returns:
        "NEUTRAL" when the text contains no sentences; otherwise "POSITIVE",
        "NEGATIVE", or "MIXED" when neither threshold is met. Reviews with
        fewer than three sentences are classified as a whole by the binary
        model and therefore never return "MIXED".
    """
    sentences = sent_tokenize(full_review_text)
    # The empty check must come before the short-review shortcut: placed after
    # it, it was unreachable and empty text was sent to the model.
    if not sentences:
        return "NEUTRAL"
    if len(sentences) < 3:
        pred = predict_sentence_sentiment_binary(full_review_text)
        return "POSITIVE" if pred == 1 else "NEGATIVE"

    sentence_predictions = [predict_sentence_sentiment_binary(s) for s in sentences]
    # Predictions are 0/1, so their sum is the number of positive sentences.
    positive_ratio = sum(sentence_predictions) / len(sentence_predictions)
    if positive_ratio >= pos_threshold:
        return "POSITIVE"
    if (1 - positive_ratio) >= neg_threshold:
        return "NEGATIVE"
    return "MIXED"

# SYSTEM 2: Logit-Based "Neutral Zone"
def classify_review_logit(full_review_text, neutral_threshold=1.0, min_sentences=2):
    """Classify a review by counting confidently positive/negative sentences.

    A sentence counts as confident when the absolute gap between its positive
    and negative logits is at least ``neutral_threshold``; it is then tallied
    for whichever side has the larger logit (negative on an exact tie).

    Returns:
        "NEUTRAL" for text with no sentences or equal tallies, "MIXED" when
        both tallies reach ``min_sentences``, otherwise "POSITIVE" or
        "NEGATIVE" according to the larger tally.
    """
    sentences = sent_tokenize(full_review_text)
    if not sentences:
        return "NEUTRAL"

    pos_votes = 0
    neg_votes = 0
    for sentence in sentences:
        logits = get_sentence_logits(sentence)
        neg_score = logits[0]
        pos_score = logits[1]
        if abs(pos_score - neg_score) < neutral_threshold:
            # Not confident enough either way; the sentence casts no vote.
            continue
        if pos_score > neg_score:
            pos_votes += 1
        else:
            neg_votes += 1

    if pos_votes >= min_sentences and neg_votes >= min_sentences:
        return "MIXED"
    # A one-sided tally (only positive or only negative votes) is a special
    # case of the plain majority comparison below.
    if pos_votes > neg_votes:
        return "POSITIVE"
    if neg_votes > pos_votes:
        return "NEGATIVE"
    return "NEUTRAL"

# SYSTEM 3: Positional Weighting + Co-occurrence
def classify_review_weighting(full_review_text, positional_weight=2,
                            mixed_pos_threshold=2, mixed_neg_threshold=2):
    """Classify a review from position-weighted sentence votes.

    Every sentence is classified by the binary model. The first and last
    sentences contribute ``positional_weight`` to their side's score and all
    other sentences contribute 1.

    Returns:
        "NEUTRAL" for text with no sentences or tied scores, "MIXED" when the
        positive score reaches ``mixed_pos_threshold`` and the negative score
        reaches ``mixed_neg_threshold``, otherwise "POSITIVE" or "NEGATIVE"
        according to the larger score.
    """
    sentences = sent_tokenize(full_review_text)
    if not sentences:
        return "NEUTRAL"

    last_index = len(sentences) - 1
    pos_total = 0
    neg_total = 0
    for idx, sentence in enumerate(sentences):
        # A one-sentence review is both first and last; it is weighted once.
        weight = positional_weight if idx in (0, last_index) else 1
        if predict_sentence_sentiment_binary(sentence) == 1:
            pos_total += weight
        else:
            neg_total += weight

    if pos_total >= mixed_pos_threshold and neg_total >= mixed_neg_threshold:
        return "MIXED"
    if pos_total > neg_total:
        return "POSITIVE"
    if neg_total > pos_total:
        return "NEGATIVE"
    return "NEUTRAL"

print("All 3 systems defined.")

## 4. Run Analysis 1: 3-System Comparison (1,000 Samples, 512-Token Model)

This section loads the 512-token model and runs it on a 1,000-review sample to generate the first set of comparison tables.

In [None]:
# --- 1. Load the 512-Token Model ---
print("Loading 512-token model for analysis...")
model_load_path = "./model_512/BERT_IMDB_Model_Trained_512"
# Keep the tokenizer length used by the prediction helpers in step with the
# model being loaded.
TOKENIZER_MAX_LENGTH = 512

model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(model_load_path)
model.to(device)
model.eval()


# torch.compile is an optional speed-up; fall back to the eager model if it
# is unavailable, but report why instead of silently discarding the error.
# NOTE(review): compilation is typically deferred to the first forward pass,
# so failures raised at that point are not caught here - confirm if relevant.
try:
    model = torch.compile(model)
    print("Model compiled successfully.")
except Exception as e:
    print(f"torch.compile failed ({type(e).__name__}: {e})")
    print("Continuing without compilation.")

# --- 2. Load Raw Test Data ---
print("\nLoading original (non-tokenized) IMDb test set...")
imdb_raw = load_dataset("imdb")

SAMPLE_SIZE = 1000
# Shuffle with a fixed seed, then keep the first SAMPLE_SIZE reviews.
shuffled_test = imdb_raw['test'].shuffle(seed=42)
test_reviews = shuffled_test.select(range(SAMPLE_SIZE))

# Dataset labels are integers; map them to the strings the systems return.
true_labels_map = {0: "NEGATIVE", 1: "POSITIVE"}
results_1k = []

print(f"Running all 3 systems on {len(test_reviews)} test reviews (512-token model)...")
t0 = time.time()

# --- 3. Run Analysis Loop ---
for review in tqdm(test_reviews, desc="Analyzing 1k Sample (512-model)"):
    text = review['text']
    # Build the row key by key so the column order and the order of model
    # calls (ratio, logit, weighting) stay fixed.
    row = {"true_label": true_labels_map[review['label']]}
    row["system_1_ratio"] = classify_review_ratio(text)
    row["system_2_logit"] = classify_review_logit(text)
    row["system_3_weighting"] = classify_review_weighting(text)
    row["text"] = text
    results_1k.append(row)

total_time = format_time(time.time() - t0)
print(f"Analysis complete. Total time: {total_time}")

# --- 4. Generate and Print Comparison Report ---
df_1k = pd.DataFrame(results_1k)


def _print_banner_1k(title):
    """Print a report section title framed by two 50-character rules."""
    print("\n" + "="*50)
    print(title)
    print("="*50)


def _hard_errors_1k(prediction_column):
    """Count df_1k rows where POSITIVE was predicted NEGATIVE or vice versa."""
    truth = df_1k['true_label']
    guess = df_1k[prediction_column]
    pos_called_neg = (truth == "POSITIVE") & (guess == "NEGATIVE")
    neg_called_pos = (truth == "NEGATIVE") & (guess == "POSITIVE")
    return int(pos_called_neg.sum()) + int(neg_called_pos.sum())


# Share of each predicted label per system, as percentage strings.
_print_banner_1k(f" 512-MODEL: 3-SYSTEM COMPARISON (Sample size={SAMPLE_SIZE})")
for heading, column in (
    ("\nSystem 1 (Ratio) Distribution:", 'system_1_ratio'),
    ("\nSystem 2 (Logit/Neutral) Distribution:", 'system_2_logit'),
    ("\nSystem 3 (Weighting) Distribution:", 'system_3_weighting'),
):
    print(heading)
    percentages = df_1k[column].value_counts(normalize=True).mul(100).round(2)
    print(percentages.astype(str) + '%')

# Cross-tabulation of true labels against each system's predictions.
_print_banner_1k(f" 512-MODEL: AGREEMENT WITH TRUE LABELS (Sample size={SAMPLE_SIZE})")
for heading, column in (
    ("\nSystem 1 (Ratio) vs. True Labels:", 'system_1_ratio'),
    ("\nSystem 2 (Logit/Neutral) vs. True Labels:", 'system_2_logit'),
    ("\nSystem 3 (Weighting) vs. True Labels:", 'system_3_weighting'),
):
    print(heading)
    print(pd.crosstab(df_1k['true_label'], df_1k[column]))

# Hard errors: only POSITIVE<->NEGATIVE confusions count; MIXED and NEUTRAL
# predictions are not included.
_print_banner_1k(f" 512-MODEL: HARD ERROR COUNT (Sample size={SAMPLE_SIZE})")
s1_errors = _hard_errors_1k('system_1_ratio')
print(f"System 1 (Ratio) Hard Errors: {s1_errors}")
s2_errors = _hard_errors_1k('system_2_logit')
print(f"System 2 (Logit) Hard Errors: {s2_errors}")
s3_errors = _hard_errors_1k('system_3_weighting')
print(f"System 3 (Weighting) Hard Errors: {s3_errors}")

## 5. Run Analysis 2: System 3 Validation (10,000 Samples, 256-Token Model)

This section validates the winning heuristic (System 3) on a larger 10,000-review sample using the faster 256-token model.

In [None]:
# --- 1. Load the 256-Token Model ---
print("\n" + "="*60)
print("Loading 256-token model for validation...")
model_load_path = "./model_256/BERT_IMDB_Model_Trained_256_FP16_Optimized"
# Switch the tokenizer length used by the prediction helpers to 256 so it
# matches the model being loaded.
TOKENIZER_MAX_LENGTH = 256

model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(model_load_path)
model.to(device)
model.eval()


# torch.compile is an optional speed-up; fall back to the eager model if it
# is unavailable, but report why instead of silently discarding the error.
# NOTE(review): compilation is typically deferred to the first forward pass,
# so failures raised at that point are not caught here - confirm if relevant.
try:
    model = torch.compile(model)
    print("Model compiled successfully.")
except Exception as e:
    print(f"torch.compile failed ({type(e).__name__}: {e})")
    print("Continuing without compilation.")

# --- 2. Load Raw Test Data ---
print("\nLoading original (non-tokenized) IMDb test set...")
imdb_raw = load_dataset("imdb")

SAMPLE_SIZE = 10000
# Shuffle with a fixed seed, then keep the first SAMPLE_SIZE reviews.
shuffled_test = imdb_raw['test'].shuffle(seed=42)
test_reviews = shuffled_test.select(range(SAMPLE_SIZE))

# Dataset labels are integers; map them to the strings System 3 returns.
true_labels_map = {0: "NEGATIVE", 1: "POSITIVE"}
results_10k = []

print(f"Running System 3 on {len(test_reviews)} test reviews (256-token model)...")
t0 = time.time()

# --- 3. Run Analysis Loop (System 3 Only) ---
for review in tqdm(test_reviews, desc="Analyzing 10k Sample (256-model)"):
    text = review['text']
    # Build the row key by key so the column order stays fixed.
    row = {"true_label": true_labels_map[review['label']]}
    row["system_3_weighting"] = classify_review_weighting(text)
    row["text"] = text
    results_10k.append(row)

total_time = format_time(time.time() - t0)
print(f"Analysis complete. Total time: {total_time}")

# --- 4. Generate and Print Comparison Report ---
df_10k = pd.DataFrame(results_10k)

print("\n" + "="*50)
print(f" 256-MODEL: SYSTEM 3 VALIDATION (Sample size={SAMPLE_SIZE})")
print("="*50)

# Share of each predicted label, as percentage strings.
print("\nSystem 3 (Weighting) Distribution:")
label_percentages = df_10k['system_3_weighting'].value_counts(normalize=True).mul(100).round(2)
print(label_percentages.astype(str) + '%')

print("\nSystem 3 (Weighting) vs. True Labels:")
print(pd.crosstab(df_10k['true_label'], df_10k['system_3_weighting']))

# Hard errors: only POSITIVE<->NEGATIVE confusions count; MIXED and NEUTRAL
# predictions are not included.
truth_10k = df_10k['true_label']
guess_10k = df_10k['system_3_weighting']
pos_called_neg = (truth_10k == "POSITIVE") & (guess_10k == "NEGATIVE")
neg_called_pos = (truth_10k == "NEGATIVE") & (guess_10k == "POSITIVE")
s3_errors_10k = int(pos_called_neg.sum()) + int(neg_called_pos.sum())
print(f"\nSystem 3 (Weighting) Hard Errors: {s3_errors_10k}")
print(f"Hard Error Rate: {s3_errors_10k / SAMPLE_SIZE * 100:.2f}%")

## 6. Save Final CSVs

This cell saves the DataFrames generated above as `.csv` files for local use.

In [None]:
def _save_results_csv(frame, csv_path, label):
    """Write one results DataFrame to CSV without the index and report it."""
    frame.to_csv(csv_path, index=False)
    print(f"Saved {label} results to {csv_path}")


print("Saving final analysis CSVs...")

# 1k-sample comparison of all three systems (512-token model).
results_1k_path = 'sample_comparison_results_1k_512model.csv'
_save_results_csv(df_1k, results_1k_path, "1k sample")

# 10k-sample validation of System 3 (256-token model).
results_10k_path = 'final_system3_validation_10k_256model.csv'
_save_results_csv(df_10k, results_10k_path, "10k validation")

print("Done.")