# Baseline Models

Train and evaluate traditional machine learning models for emotion classification.


## Import Libraries


In [None]:
import pandas as pd
import numpy as np
import json
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide setup: hide library warnings, choose plotting defaults and
# create the output folders that later cells write into.
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

for output_dir in ('../results/models', '../results/figures'):
    # exist_ok=True keeps re-running this cell harmless.
    os.makedirs(output_dir, exist_ok=True)


## Load Data


In [None]:
# Read the three pre-processed splits, in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train_processed.csv',
        '../data/processed/val_processed.csv',
        '../data/processed/test_processed.csv',
    )
)

# One-line summary of (rows, columns) per split.
print(" | ".join(
    f"{split_label}: {frame.shape}"
    for split_label, frame in (("Train", train_df), ("Val", val_df), ("Test", test_df))
))


## Load Emotion Mapping and Fit Label Encoder


In [None]:
# Load the stored emotion <-> id mapping (JSON text is UTF-8, so say so
# explicitly instead of relying on the platform default encoding).
with open('../data/processed/emotion_mapping.json', 'r', encoding='utf-8') as f:
    emotion_mapping = json.load(f)

# The integer labels used throughout this notebook come from this encoder,
# which is fitted on the training labels -- not from the JSON file.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['emotion'])

print("Emotion to ID mapping:")
for emotion, idx in emotion_mapping['emotion_to_id'].items():
    print(f"  {emotion}: {idx}")

# The id of a class is its position in label_encoder.classes_. If that
# disagrees with the stored mapping, the listing above would be misleading,
# so report the ids that are actually in effect.
encoder_ids = {
    str(label): position
    for position, label in enumerate(label_encoder.classes_)
}
if encoder_ids != emotion_mapping['emotion_to_id']:
    print("\nWARNING: emotion_mapping.json differs from the fitted LabelEncoder.")
    print("IDs actually used in this notebook:")
    for emotion, idx in encoder_ids.items():
        print(f"  {emotion}: {idx}")


## Prepare Data


In [None]:
def _texts_and_labels(frame):
    """Return ``(texts, labels)`` arrays for one split.

    Missing texts are replaced by empty strings and every entry is cast to
    ``str``: with pandas' default NA handling an empty CSV field is read back
    as NaN, and a float NaN is not a document the vectorizer can tokenize.
    Labels are encoded with the notebook-level ``label_encoder``.
    """
    texts = frame['text'].fillna('').astype(str).values
    labels = label_encoder.transform(frame['emotion'])
    return texts, labels


X_train, y_train = _texts_and_labels(train_df)
X_val, y_val = _texts_and_labels(val_df)
X_test, y_test = _texts_and_labels(test_df)

print(f"Train: {len(X_train)} samples")
print(f"Val: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")


## Feature Extraction


In [None]:
# Unigram + bigram TF-IDF with a 10,000-term cap and min_df=2.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=10000,
)

# Fit on the training texts only; the other splits are just transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(documents) for documents in (X_val, X_test)
)

n_features = X_train_tfidf.shape[1]
print(f"TF-IDF features: {n_features}")
for split_label, matrix in (
    ("Train", X_train_tfidf),
    ("Val", X_val_tfidf),
    ("Test", X_test_tfidf),
):
    print(f"{split_label} shape: {matrix.shape}")


## Calculate Class Weights


In [None]:
# 'balanced' weights for the classes present in the training labels.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=train_classes,
    y=y_train
)

# Key each weight by the class id it was computed for, rather than by its
# position in the result array.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights))

print("Class weights:")
for class_id, weight in class_weight_dict.items():
    # Name the class through the encoder that produced y_train, so the label
    # shown always belongs to the weight shown (the ids in
    # emotion_mapping.json are not guaranteed to match the encoder's ids).
    emotion = label_encoder.classes_[class_id]
    print(f"  {emotion:10s}: {weight:.3f}")


## Train Models


In [None]:
# Settings shared by every estimator that accepts them (MultinomialNB takes
# neither, so it is built with its defaults).
shared_kwargs = {'class_weight': 'balanced', 'random_state': 42}

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, **shared_kwargs),
    'SVM': SVC(**shared_kwargs),
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, **shared_kwargs),
}

# Fit each baseline on the TF-IDF training matrix, keeping the fitted
# estimator under its display name.
trained_models = {}
for model_name, estimator in models.items():
    print(f"Training {model_name}...")
    estimator.fit(X_train_tfidf, y_train)
    trained_models[model_name] = estimator
    print(f"  {model_name} trained")

print("\nAll models trained")


## Evaluate Models


In [None]:
def _weighted_scores(y_true, y_hat):
    """Return accuracy and weighted precision / recall / F1 as a dict.

    ``zero_division=0`` scores a class with no predicted (or no true) samples
    as 0 explicitly instead of depending on the warning filter.
    """
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'precision': precision_score(y_true, y_hat, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_hat, average='weighted', zero_division=0),
        'f1': f1_score(y_true, y_hat, average='weighted', zero_division=0),
    }


results = {}

for name, model in trained_models.items():
    # Test-set metrics: these are the numbers that get reported.
    y_pred = model.predict(X_test_tfidf)
    results[name] = _weighted_scores(y_test, y_pred)

    # Validation F1 is used only to rank the models, so that the "best" model
    # is not picked on the same data its reported score comes from.
    y_val_pred = model.predict(X_val_tfidf)
    results[name]['val_f1'] = f1_score(
        y_val, y_val_pred, average='weighted', zero_division=0
    )

results_df = pd.DataFrame(results).T
results_df = results_df.round(4)
# Row 0 is the best model according to the validation split.
results_df = results_df.sort_values('val_f1', ascending=False)

print("Model Performance on Test Set:")
print(results_df)
print("\n(rows are ranked by val_f1, the weighted F1 on the validation split)")


## Visualize Results


In [None]:
# Two side-by-side panels: all four metrics per model, and F1 on its own.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
metrics_ax, f1_ax = axes

# Left panel: grouped bars, one group per model.
metric_columns = ['accuracy', 'precision', 'recall', 'f1']
results_df[metric_columns].plot.bar(ax=metrics_ax)
metrics_ax.set_title('Model Performance Comparison', fontsize=14)
metrics_ax.set(xlabel='Model', ylabel='Score')
metrics_ax.legend(loc='lower right')
metrics_ax.tick_params(axis='x', rotation=45)

# Right panel: horizontal bars of the F1 column only.
results_df['f1'].plot.barh(ax=f1_ax, color='steelblue')
f1_ax.set_title('F1-Score Comparison', fontsize=14)
f1_ax.set(xlabel='F1-Score', ylabel='Model')

plt.tight_layout()
plt.savefig(
    '../results/figures/baseline_models_comparison.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()


## Detailed Classification Report


In [None]:
# results_df is sorted best-first, so the first index entry is the winner.
best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

y_pred = best_model.predict(X_test_tfidf)

# Pass the full list of class ids explicitly. Without `labels`, the report
# infers them from the values present in y_test / y_pred, and a class that
# appears in neither would leave target_names with the wrong length.
class_ids = np.arange(len(label_encoder.classes_))

print(f"Best Model: {best_model_name}\n")
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))


## Confusion Matrix


In [None]:
# Build the matrix over every known class id so that it always has one
# row/column per entry of label_encoder.classes_. Without `labels`, a class
# missing from both y_test and y_pred would be dropped and the tick labels
# below would no longer line up with the cells.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('../results/figures/confusion_matrix_baseline.png', dpi=300, bbox_inches='tight')
plt.show()


## Save Results


In [None]:
# Persist the metrics table (model names are written as the index column).
results_path = '../results/models/baseline_models_results.csv'
results_df.to_csv(results_path)

# Short recap of where the file went and which model came out on top.
best_f1 = results_df.at[best_model_name, 'f1']
print(f"Results saved to: {results_path}")
print(f"\nBest model: {best_model_name}")
print(f"Best F1-score: {best_f1:.4f}")
