<a href="https://colab.research.google.com/github/nagwaelmobark/educational-reviews-imbalance-study/blob/main/notebooks/03_imbalance_techniques_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Debug Version - Let's see what's happening step by step

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

print("🔍 DEBUG MODE - Step by Step Analysis")
print("=" * 50)

# Load data
!wget -q https://raw.githubusercontent.com/nagwaelmobark/educational-reviews-imbalance-study/main/data/raw/reviews.csv
df = pd.read_csv('reviews.csv')

# Quick preprocessing
df = df.dropna(subset=['Review', 'Label'])
df['Review_cleaned'] = df['Review'].astype(str).str.lower()

print(f"✅ Data loaded: {len(df):,} reviews")

# Train-test split
X = df['Review_cleaned']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"✅ Features: {X_train_tfidf.shape}")

# Check class distribution
print(f"\n📊 Class Distribution:")
class_counts = y_train.value_counts().sort_index()
for rating, count in class_counts.items():
    percentage = (count / len(y_train)) * 100
    print(f"Rating {rating}: {count:,} ({percentage:.1f}%)")

# =============================================================================
# BASELINE TEST
# =============================================================================

print(f"\n🔬 BASELINE TEST...")
# Unweighted linear-kernel SVM; every later technique reports its improvement
# relative to this model's macro F1.
baseline_model = SVC(kernel='linear', random_state=42)
baseline_model.fit(X_train_tfidf, y_train)
baseline_pred = baseline_model.predict(X_test_tfidf)
baseline_f1 = f1_score(y_test, baseline_pred, average='macro')

print(f"Baseline Macro F1: {baseline_f1:.4f}")
print(f"Baseline Accuracy: {accuracy_score(y_test, baseline_pred):.4f}")

# Per-class baseline performance.
# labels= pins the returned array to the ratings present in y_test, in sorted
# order. Without it f1_score uses the union of true and predicted labels, so a
# predicted rating missing from y_test would shift every index in the loop.
test_ratings = np.unique(y_test)
baseline_f1_per_class = f1_score(
    y_test, baseline_pred, average=None, labels=test_ratings
)
print(f"\nBaseline per-class F1:")
for rating, class_f1 in zip(test_ratings, baseline_f1_per_class):
    print(f"  Rating {rating}: {class_f1:.4f}")

# =============================================================================
# TEST 1: CLASS WEIGHTS
# =============================================================================

print(f"\n🧪 TEST 1: Class-Weighted SVM...")

# Forget a score left over from an earlier run of this cell. The summary lists
# this technique whenever `weighted_f1` exists, so if the test fails below a
# stale value would otherwise be reported as if it were current.
if 'weighted_f1' in locals():
    del weighted_f1

try:
    # Calculate weights. These are printed for inspection only; the model
    # itself is configured with class_weight='balanced'.
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    weight_dict = dict(zip(classes, weights))

    print(f"Class weights calculated:")
    for rating, weight in weight_dict.items():
        print(f"  Rating {rating}: {weight:.2f}")

    # Train weighted model
    weighted_svm = SVC(kernel='linear', class_weight='balanced', random_state=42)
    weighted_svm.fit(X_train_tfidf, y_train)
    weighted_pred = weighted_svm.predict(X_test_tfidf)
    weighted_f1 = f1_score(y_test, weighted_pred, average='macro')

    print(f"✅ Weighted SVM Macro F1: {weighted_f1:.4f}")
    print(f"✅ Improvement: {((weighted_f1 - baseline_f1)/baseline_f1)*100:+.1f}%")

    # Per-class performance.
    # labels= keeps one score per rating present in y_test, in sorted order, so
    # index i always refers to the rating printed on that line.
    test_ratings = np.unique(y_test)
    weighted_f1_per_class = f1_score(
        y_test, weighted_pred, average=None, labels=test_ratings
    )
    print(f"\nWeighted SVM per-class F1:")
    for i, rating in enumerate(test_ratings):
        # A baseline F1 of 0 has no meaningful relative change; report 0.
        improvement = ((weighted_f1_per_class[i] - baseline_f1_per_class[i]) / baseline_f1_per_class[i]) * 100 if baseline_f1_per_class[i] > 0 else 0
        print(f"  Rating {rating}: {weighted_f1_per_class[i]:.4f} ({improvement:+.1f}%)")

except Exception as e:
    # Best-effort: report the failure and let the remaining tests run.
    print(f"❌ Error in Test 1: {e}")

# =============================================================================
# TEST 2: LOGISTIC REGRESSION WITH WEIGHTS
# =============================================================================

print(f"\n🧪 TEST 2: Weighted Logistic Regression...")

# Forget a score left over from an earlier run of this cell. The summary lists
# this technique whenever `lr_f1` exists, so if the test fails below a stale
# value would otherwise be reported as if it were current.
if 'lr_f1' in locals():
    del lr_f1

try:
    weighted_lr = LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
        # NOTE(review): the training matrix here has ~85k rows, so a
        # "small dataset" rationale for liblinear does not apply. Recent
        # scikit-learn releases also appear to deprecate liblinear for
        # multiclass targets - confirm against the installed version.
        solver='liblinear'
    )
    weighted_lr.fit(X_train_tfidf, y_train)
    lr_pred = weighted_lr.predict(X_test_tfidf)
    lr_f1 = f1_score(y_test, lr_pred, average='macro')

    print(f"✅ Weighted Logistic Macro F1: {lr_f1:.4f}")
    # Relative to the unweighted SVM baseline, so this figure mixes the effect
    # of switching model with the effect of class weighting.
    print(f"✅ Improvement: {((lr_f1 - baseline_f1)/baseline_f1)*100:+.1f}%")

    # Per-class performance.
    # labels= keeps one score per rating present in y_test, in sorted order, so
    # index i always refers to the rating printed on that line.
    test_ratings = np.unique(y_test)
    lr_f1_per_class = f1_score(
        y_test, lr_pred, average=None, labels=test_ratings
    )
    print(f"\nWeighted Logistic per-class F1:")
    for i, rating in enumerate(test_ratings):
        # A baseline F1 of 0 has no meaningful relative change; report 0.
        improvement = ((lr_f1_per_class[i] - baseline_f1_per_class[i]) / baseline_f1_per_class[i]) * 100 if baseline_f1_per_class[i] > 0 else 0
        print(f"  Rating {rating}: {lr_f1_per_class[i]:.4f} ({improvement:+.1f}%)")

except Exception as e:
    # Best-effort: report the failure and let the remaining tests run.
    print(f"❌ Error in Test 2: {e}")

# =============================================================================
# TEST 3: RANDOM FOREST WITH WEIGHTS
# =============================================================================

print(f"\n🧪 TEST 3: Weighted Random Forest...")

# Forget a score left over from an earlier run of this cell. The summary lists
# this technique whenever `rf_f1` exists, so if the test fails below a stale
# value would otherwise be reported as if it were current.
if 'rf_f1' in locals():
    del rf_f1

try:
    weighted_rf = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=50,
        random_state=42,
        n_jobs=-1  # use all available cores
    )
    weighted_rf.fit(X_train_tfidf, y_train)
    rf_pred = weighted_rf.predict(X_test_tfidf)
    rf_f1 = f1_score(y_test, rf_pred, average='macro')

    print(f"✅ Weighted RF Macro F1: {rf_f1:.4f}")
    # Relative to the unweighted SVM baseline, so this figure mixes the effect
    # of switching model with the effect of class weighting.
    print(f"✅ Improvement: {((rf_f1 - baseline_f1)/baseline_f1)*100:+.1f}%")

    # Per-class performance.
    # labels= keeps one score per rating present in y_test, in sorted order, so
    # index i always refers to the rating printed on that line.
    test_ratings = np.unique(y_test)
    rf_f1_per_class = f1_score(
        y_test, rf_pred, average=None, labels=test_ratings
    )
    print(f"\nWeighted RF per-class F1:")
    for i, rating in enumerate(test_ratings):
        # A baseline F1 of 0 has no meaningful relative change; report 0.
        improvement = ((rf_f1_per_class[i] - baseline_f1_per_class[i]) / baseline_f1_per_class[i]) * 100 if baseline_f1_per_class[i] > 0 else 0
        print(f"  Rating {rating}: {rf_f1_per_class[i]:.4f} ({improvement:+.1f}%)")

except Exception as e:
    # Best-effort: report the failure and let the remaining tests run.
    print(f"❌ Error in Test 3: {e}")

# =============================================================================
# TEST 4: SIMPLE OVERSAMPLING
# =============================================================================

print(f"\n🧪 TEST 4: Simple Oversampling...")

# Forget a score left over from an earlier run of this cell. The summary lists
# this technique whenever `oversample_f1` exists, so if the test fails below a
# stale value would otherwise be reported as if it were current.
if 'oversample_f1' in locals():
    del oversample_f1

try:
    # Find minority classes (1 and 2)
    minority_mask_1 = y_train == 1
    minority_mask_2 = y_train == 2

    # Get minority samples
    minority_reviews_1 = X_train[minority_mask_1]
    minority_labels_1 = y_train[minority_mask_1]
    minority_reviews_2 = X_train[minority_mask_2]
    minority_labels_2 = y_train[minority_mask_2]

    # Append 5 extra copies of every minority sample. The originals stay in
    # X_train, so each minority review ends up present 6 times in total.
    oversample_factor = 5

    X_train_oversampled = pd.concat([
        X_train,
        pd.concat([minority_reviews_1] * oversample_factor),
        pd.concat([minority_reviews_2] * oversample_factor)
    ])

    # Labels are concatenated in the same order as the reviews above so the
    # two series stay row-aligned.
    y_train_oversampled = pd.concat([
        y_train,
        pd.concat([minority_labels_1] * oversample_factor),
        pd.concat([minority_labels_2] * oversample_factor)
    ])

    print(f"Original training size: {len(X_train):,}")
    print(f"Oversampled training size: {len(X_train_oversampled):,}")

    # Show new distribution
    new_counts = y_train_oversampled.value_counts().sort_index()
    print(f"New distribution:")
    for rating, count in new_counts.items():
        percentage = (count / len(y_train_oversampled)) * 100
        print(f"  Rating {rating}: {count:,} ({percentage:.1f}%)")

    # Extract features for oversampled data with the already-fitted vectorizer
    # (transform only, no refit), so the columns match X_test_tfidf.
    X_train_oversampled_tfidf = vectorizer.transform(X_train_oversampled)

    # Train model on oversampled data
    oversample_svm = SVC(kernel='linear', random_state=42)
    oversample_svm.fit(X_train_oversampled_tfidf, y_train_oversampled)
    oversample_pred = oversample_svm.predict(X_test_tfidf)
    oversample_f1 = f1_score(y_test, oversample_pred, average='macro')

    print(f"✅ Oversampled SVM Macro F1: {oversample_f1:.4f}")
    print(f"✅ Improvement: {((oversample_f1 - baseline_f1)/baseline_f1)*100:+.1f}%")

    # Per-class performance.
    # labels= keeps one score per rating present in y_test, in sorted order, so
    # index i always refers to the rating printed on that line.
    test_ratings = np.unique(y_test)
    oversample_f1_per_class = f1_score(
        y_test, oversample_pred, average=None, labels=test_ratings
    )
    print(f"\nOversampled SVM per-class F1:")
    for i, rating in enumerate(test_ratings):
        # A baseline F1 of 0 has no meaningful relative change; report 0.
        improvement = ((oversample_f1_per_class[i] - baseline_f1_per_class[i]) / baseline_f1_per_class[i]) * 100 if baseline_f1_per_class[i] > 0 else 0
        print(f"  Rating {rating}: {oversample_f1_per_class[i]:.4f} ({improvement:+.1f}%)")

except Exception as e:
    # Best-effort: report the failure and still print the summary.
    print(f"❌ Error in Test 4: {e}")

# =============================================================================
# SUMMARY
# =============================================================================

print(f"\n📊 SUMMARY OF ALL TECHNIQUES:")
print("=" * 40)

# Collect all results. The baseline is always listed; every other technique is
# listed only when its score variable exists in the namespace, i.e. when the
# corresponding test got as far as computing it.
all_techniques = ['Baseline SVM']
all_f1_scores = [baseline_f1]

optional_results = (
    ('Weighted SVM', 'weighted_f1'),
    ('Weighted Logistic', 'lr_f1'),
    ('Weighted Random Forest', 'rf_f1'),
    ('Oversampled SVM', 'oversample_f1'),
)
for display_name, score_var in optional_results:
    if score_var in locals():
        all_techniques.append(display_name)
        all_f1_scores.append(locals()[score_var])

# Find best (argmax returns the first index on ties, so the baseline wins a tie)
best_idx = np.argmax(all_f1_scores)
best_technique = all_techniques[best_idx]
best_f1 = all_f1_scores[best_idx]

print(f"🏆 BEST TECHNIQUE: {best_technique}")
print(f"📈 Best Macro F1: {best_f1:.4f}")
print(f"📈 Improvement over baseline: {((best_f1 - baseline_f1)/baseline_f1)*100:+.1f}%")

print(f"\n🎯 All Results:")
for technique, f1 in zip(all_techniques, all_f1_scores):
    if technique != 'Baseline SVM':
        improvement = ((f1 - baseline_f1)/baseline_f1)*100
    else:
        # The baseline is the reference point, so its own change is zero.
        improvement = 0
    print(f"  {technique}: {f1:.4f} ({improvement:+.1f}%)")

print(f"\n✅ DEBUG COMPLETE!")
beats_baseline = best_f1 > baseline_f1
if beats_baseline:
    print(f"💡 SUCCESS: Found improvement with {best_technique}!")
else:
    print(f"⚠️  Need to investigate further - trying different approaches...")

🔍 DEBUG MODE - Step by Step Analysis
✅ Data loaded: 107,018 reviews
✅ Features: (85614, 5000)

📊 Class Distribution:
Rating 1: 1,975 (2.3%)
Rating 2: 1,801 (2.1%)
Rating 3: 4,057 (4.7%)
Rating 4: 14,443 (16.9%)
Rating 5: 63,338 (74.0%)

🔬 BASELINE TEST...
Baseline Macro F1: 0.3691
Baseline Accuracy: 0.7653

Baseline per-class F1:
  Rating 1: 0.4187
  Rating 2: 0.1433
  Rating 3: 0.1881
  Rating 4: 0.2151
  Rating 5: 0.8804

🧪 TEST 1: Class-Weighted SVM...
Class weights calculated:
  Rating 1: 8.67
  Rating 2: 9.51
  Rating 3: 4.22
  Rating 4: 1.19
  Rating 5: 0.27
✅ Weighted SVM Macro F1: 0.3651
✅ Improvement: -1.1%

Weighted SVM per-class F1:
  Rating 1: 0.3088 (-26.2%)
  Rating 2: 0.1647 (+14.9%)
  Rating 3: 0.2280 (+21.2%)
  Rating 4: 0.3309 (+53.8%)
  Rating 5: 0.7931 (-9.9%)

🧪 TEST 2: Weighted Logistic Regression...
✅ Weighted Logistic Macro F1: 0.4087
✅ Improvement: +10.7%

Weighted Logistic per-class F1:
  Rating 1: 0.3774 (-9.9%)
  Rating 2: 0.1997 (+39.3%)
  Rating 3: 0.2651 