# Phase 3: Model & Strategy Comparison
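This phase compares different classifiers (Logistic Regression, Linear SVM, Naive Bayes) and vectorizers (TF-IDF, Count) on `clarity_label`, and tests the hypothesis that grouping the fine-grained `evasion_label` classes into three coarse classes improves performance over using all classes.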

## Imports & Data Loading

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from src.preprocessing import get_preprocessed_data
from src.evaluate import run_full_evaluation

# Seed NumPy's global RNG; SEED is also passed to the classifiers as random_state.
SEED = 42
np.random.seed(SEED)

# Output folders for plots and result tables (no error if they already exist).
for output_dir in ('figures', 'results'):
    os.makedirs(output_dir, exist_ok=True)

# Load the three data splits together with the id <-> label lookups.
train_df, val_df, test_df, id2label, label2id = get_preprocessed_data()

n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train: {n_train}, Val: {n_val}, Test: {n_test}")
print(f"Evasion labels: {id2label}")

## Helper Functions for Grouping Labels

In [None]:
def group_evasion_labels(series, label_names=None):
    """
    Group fine-grained evasion labels into 3 main classes:
    - Clear: Explicit
    - Ambivalent Reply: General, Implicit, Dodging, Deflection, Partial/half-answer
    - Clear non-reply: Claims ignorance, Clarification, Declining to answer

    Parameters:
    - series: pandas Series of evasion label ids
    - label_names: optional mapping from label id to label string; when
      omitted, the notebook-level `id2label` mapping is used

    Returns:
    - pandas Series of grouped class names, index-aligned with `series`.
      With dict mappings, any id or label string that has no entry
      becomes NaN (standard `Series.map` behaviour).
    """
    if label_names is None:
        # Fall back to the mapping loaded together with the data splits.
        label_names = id2label

    # First convert ids back to string labels
    string_labels = series.map(label_names)

    # Then apply grouping mapping
    mapping = {
        'Explicit': 'Clear',
        'General': 'Ambivalent Reply',
        'Implicit': 'Ambivalent Reply',
        'Dodging': 'Ambivalent Reply',
        'Deflection': 'Ambivalent Reply',
        'Partial/half-answer': 'Ambivalent Reply',
        'Claims ignorance': 'Clear non-reply',
        'Clarification': 'Clear non-reply',
        'Declining to answer': 'Clear non-reply',
    }
    return string_labels.map(mapping)

# Create grouped versions
# group_evasion_labels resolves label ids through the notebook-level id2label
# mapping itself, so only the label column is passed.
for df in (train_df, val_df, test_df):
    df['evasion_label_grouped'] = group_evasion_labels(df['evasion_label'])

# Compare the class balance before and after grouping (training split only).
print("Evasion label distribution (all classes):")
print(train_df['evasion_label'].value_counts())
print("\nEvasion label distribution (grouped):")
print(train_df['evasion_label_grouped'].value_counts())

## Prepare Best Features from Phase 2
Based on Phase 2 results, we use:
- Sub-question + Answer context
- Meta-features (multiple_questions, affirmative_questions, inaudible)
- Trigrams (1, 3) which performed best

In [None]:
# Columns holding the meta-features; they are cast to int below.
meta_cols = ['multiple_questions', 'affirmative_questions', 'inaudible']

for split_df in (train_df, val_df, test_df):
    # Text input: question and answer joined by a separator token, with
    # missing values replaced by empty strings.
    question_text = split_df['question'].fillna('')
    answer_text = split_df['interview_answer'].fillna('')
    split_df['sub_q_context'] = question_text + " [SEP] " + answer_text

    # Cast every meta-feature column to int.
    for meta_col in meta_cols:
        split_df[meta_col] = split_df[meta_col].astype(int)

print("Features prepared!")
print("Text feature: sub_q_context")
print(f"Meta features: {meta_cols}")

## Experiment Runner Function

In [None]:
_KNOWN_CLASSIFIERS = ('logistic', 'svm', 'naive_bayes')
_KNOWN_VECTORIZERS = ('tfidf', 'count')


def _build_vectorizer(vectorizer_name, ngram_range):
    """Create the TF-IDF or count vectorizer with the shared vocabulary settings."""
    vec_params = {
        'max_features': 2000,
        'ngram_range': ngram_range,
        'min_df': 2,
        'max_df': 0.95
    }
    if vectorizer_name == 'tfidf':
        return TfidfVectorizer(**vec_params)
    return CountVectorizer(**vec_params)


def _build_classifier(clf_name):
    """Create the (unfitted) classifier selected by `clf_name`."""
    if clf_name == 'logistic':
        # `multi_class` is deprecated in recent scikit-learn releases; with the
        # default lbfgs solver the multinomial formulation is already used
        # whenever there are 3 or more classes, so it is not passed explicitly.
        return LogisticRegression(max_iter=1000, random_state=SEED)
    if clf_name == 'svm':
        return LinearSVC(max_iter=2000, class_weight='balanced', random_state=SEED, dual='auto')
    return MultinomialNB()


def run_classifier_experiment(
    clf_name,
    vectorizer_name,
    label_column,
    train_df,
    val_df,
    use_meta_features=True,
    ngram_range=(1, 2)
):
    """
    Run a single classifier experiment: fit on `train_df`, score on `val_df`.

    Parameters:
    - clf_name: 'logistic', 'svm', or 'naive_bayes'
    - vectorizer_name: 'tfidf' or 'count'
    - label_column: 'clarity_label', 'evasion_label', or 'evasion_label_grouped'
    - train_df, val_df: DataFrames containing 'sub_q_context', `label_column`
      and (when use_meta_features is True) the columns listed in `meta_cols`
    - use_meta_features: whether to include meta-features
    - ngram_range: tuple for n-gram range

    Returns:
    - (metrics, pipeline, y_pred): a dict of validation scores plus the
      experiment settings, the fitted sklearn Pipeline, and the predictions
      for `val_df`.

    Raises:
    - ValueError: if `clf_name` or `vectorizer_name` is not a supported value.
    """
    # Fail fast on typos instead of silently falling back to a default model.
    if clf_name not in _KNOWN_CLASSIFIERS:
        raise ValueError(
            f"Unknown clf_name {clf_name!r}; expected one of {_KNOWN_CLASSIFIERS}"
        )
    if vectorizer_name not in _KNOWN_VECTORIZERS:
        raise ValueError(
            f"Unknown vectorizer_name {vectorizer_name!r}; expected one of {_KNOWN_VECTORIZERS}"
        )

    # Get labels (the caller's label_column is used as given)
    y_train = train_df[label_column]
    y_val = val_df[label_column]

    vectorizer = _build_vectorizer(vectorizer_name, ngram_range)
    classifier = _build_classifier(clf_name)

    # Build pipeline
    if use_meta_features:
        # Vectorize the text column and append the meta columns unchanged.
        preprocessor = ColumnTransformer(
            transformers=[
                ('text', vectorizer, 'sub_q_context'),
                ('meta', 'passthrough', meta_cols)
            ]
        )
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', classifier)
        ])
        X_train = train_df
        X_val = val_df
    else:
        # Keep the vectorizer inside the returned pipeline so that it can be
        # applied directly to raw text, like the meta-feature variant.
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        X_train = train_df['sub_q_context']
        X_val = val_df['sub_q_context']

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_val)

    # Calculate metrics (zero_division=0 scores classes that are never predicted as 0)
    metrics = {
        'classifier': clf_name,
        'vectorizer': vectorizer_name,
        'label_column': label_column,
        'use_meta': use_meta_features,
        'f1_macro': f1_score(y_val, y_pred, average='macro', zero_division=0),
        'f1_weighted': f1_score(y_val, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_val, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_val, y_pred, average='macro', zero_division=0),
        'accuracy': accuracy_score(y_val, y_pred)
    }

    return metrics, pipeline, y_pred

print("Experiment runner ready!")

## Experiment 1: Classifier Comparison on Clarity Label

In [None]:
results_clarity = []

# Model grid shared by the experiments in this notebook.
classifiers = ['logistic', 'svm', 'naive_bayes']
vectorizers = ['tfidf', 'count']

print("\n" + "="*80)
print("EXPERIMENT 1: Classifier Comparison on Clarity Label")
print("="*80)

for clf_name in classifiers:
    for vec_name in vectorizers:
        print(f"\nTesting: {clf_name} + {vec_name}...")

        # This experiment targets the clarity task, so it trains on
        # 'clarity_label' (assumes the preprocessed splits expose that
        # column, as the runner's docstring lists it -- confirm).
        metrics, _, _ = run_classifier_experiment(
            clf_name=clf_name,
            vectorizer_name=vec_name,
            label_column='clarity_label',
            train_df=train_df,
            val_df=val_df,
            use_meta_features=True
        )

        results_clarity.append(metrics)
        print(f"  F1 Macro: {metrics['f1_macro']:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")

# Rank the configurations by macro-F1, best first.
df_clarity = pd.DataFrame(results_clarity)
df_clarity_sorted = df_clarity.sort_values('f1_macro', ascending=False)

print("\n" + "="*80)
print("RESULTS SUMMARY - Clarity Label")
print("="*80)
print(df_clarity_sorted[['classifier', 'vectorizer', 'f1_macro', 'accuracy']].to_string(index=False))

## Experiment 2: Grouped vs. All Classes for Evasion Label

In [None]:
results_evasion_comparison = []

# (class_strategy tag, 'classes' tag read by later cells, label column)
label_strategies = [
    ('all_classes', 'all_7', 'evasion_label'),
    ('grouped', 'grouped_3', 'evasion_label_grouped'),
]

print("\n" + "="*80)
print("EXPERIMENT 2: Grouped vs. All Classes for Evasion Label")
print("="*80)

# Test each classifier with both grouped and all classes
for clf_name in classifiers:
    for vec_name in vectorizers:
        print(f"\n{clf_name} + {vec_name}:")

        metrics_by_strategy = {}
        for class_strategy, classes_tag, label_column in label_strategies:
            strategy_metrics, _, _ = run_classifier_experiment(
                clf_name=clf_name,
                vectorizer_name=vec_name,
                label_column=label_column,
                train_df=train_df,
                val_df=val_df,
                use_meta_features=True
            )
            strategy_metrics['class_strategy'] = class_strategy
            # Later cells select and label rows through a 'classes' column.
            strategy_metrics['classes'] = classes_tag
            metrics_by_strategy[class_strategy] = strategy_metrics
            results_evasion_comparison.append(strategy_metrics)

        metrics_all = metrics_by_strategy['all_classes']
        metrics_grouped = metrics_by_strategy['grouped']
        improvement = metrics_grouped['f1_macro'] - metrics_all['f1_macro']

        print(f"  All classes:     F1 = {metrics_all['f1_macro']:.4f}")
        print(f"  Grouped classes: F1 = {metrics_grouped['f1_macro']:.4f}")
        print(f"  Improvement:     {improvement:+.4f}")

df_evasion = pd.DataFrame(results_evasion_comparison)

# Per-strategy views of the results; the comparison, best-model and export
# cells further down read these two frames.
df_all_classes = df_evasion[df_evasion['class_strategy'] == 'all_classes'].reset_index(drop=True)
df_grouped = df_evasion[df_evasion['class_strategy'] == 'grouped'].reset_index(drop=True)

print("\n" + "="*80)
print("RESULTS SUMMARY - Evasion Label Comparison")
print("="*80)
print(df_evasion[['classifier', 'vectorizer', 'class_strategy', 'f1_macro', 'accuracy']].to_string(index=False))

## Experiment 3: Direct Comparison - Grouped vs. All Classes

In [None]:
comparison_data = []

print("\n" + "="*80)
print("EXPERIMENT 3: Grouped (3) vs. All (7) Classes - Direct Comparison")
print("="*80)

# Macro-F1 per (classifier, vectorizer), one lookup per labelling strategy,
# taken from the Experiment 2 results stored in df_evasion.
f1_all_by_model = (
    df_evasion[df_evasion['class_strategy'] == 'all_classes']
    .set_index(['classifier', 'vectorizer'])['f1_macro']
)
f1_grouped_by_model = (
    df_evasion[df_evasion['class_strategy'] == 'grouped']
    .set_index(['classifier', 'vectorizer'])['f1_macro']
)

for clf_name in classifiers:
    for vec_name in vectorizers:
        model_name = f"{clf_name} + {vec_name}"
        print(f"\n{model_name}:")

        # Get F1 scores from previous experiments
        f1_all = f1_all_by_model.loc[(clf_name, vec_name)]
        f1_grouped = f1_grouped_by_model.loc[(clf_name, vec_name)]

        improvement = f1_grouped - f1_all
        # Relative gain; reported as 0 when the baseline F1 is 0 to avoid dividing by zero.
        improvement_pct = (improvement / f1_all) * 100 if f1_all > 0 else 0

        print(f"  All 7 classes:  F1 = {f1_all:.4f}")
        print(f"  Grouped 3:      F1 = {f1_grouped:.4f}")
        print(f"  Improvement:    {improvement:+.4f} ({improvement_pct:+.1f}%)")

        comparison_data.append({
            'model': model_name,
            'classifier': clf_name,
            'vectorizer': vec_name,
            'f1_all_7': f1_all,
            'f1_grouped_3': f1_grouped,
            'improvement': improvement,
            'improvement_pct': improvement_pct
        })

df_comparison = pd.DataFrame(comparison_data)

print("\n" + "="*80)
print("COMPARISON SUMMARY")
print("="*80)
print(df_comparison[['model', 'f1_all_7', 'f1_grouped_3', 'improvement', 'improvement_pct']].to_string(index=False))

# Calculate average improvement
avg_improvement = df_comparison['improvement'].mean()
print(f"\nAverage improvement from grouping: {avg_improvement:+.4f}")

## Visualization: Grouped vs. All Classes

In [None]:
# One row per classifier/vectorizer pair with the macro-F1 of both labelling
# strategies, read from the Experiment 2 results in df_evasion.
comparison_data = []
strategy_col = df_evasion['class_strategy']

for clf in classifiers:
    for vec in vectorizers:
        same_model = (df_evasion['classifier'] == clf) & (df_evasion['vectorizer'] == vec)

        f1_all = df_evasion.loc[same_model & (strategy_col == 'all_classes'), 'f1_macro'].values[0]
        f1_grouped = df_evasion.loc[same_model & (strategy_col == 'grouped'), 'f1_macro'].values[0]

        row = {'model': f"{clf}\n{vec}"}
        row['all_classes'] = f1_all
        row['grouped'] = f1_grouped
        row['improvement'] = f1_grouped - f1_all
        comparison_data.append(row)

df_comparison = pd.DataFrame(comparison_data)

# Two panels: side-by-side F1 bars (left) and the per-model difference (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: grouped bar chart, one pair of bars per model.
x = np.arange(len(df_comparison))
width = 0.35
half_width = width / 2

ax1.bar(x - half_width, df_comparison['all_classes'], width,
        label='All Classes', alpha=0.8, color='steelblue')
ax1.bar(x + half_width, df_comparison['grouped'], width,
        label='Grouped (3 classes)', alpha=0.8, color='coral')

ax1.set_title('Grouped vs. All Classes - Evasion Label')
ax1.set_xlabel('Model')
ax1.set_ylabel('F1 Macro Score')
ax1.set_xticks(x)
ax1.set_xticklabels(df_comparison['model'], rotation=45, ha='right')
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', alpha=0.3)
ax1.legend()

# Right panel: green where grouping helped, red otherwise.
colors = []
for delta in df_comparison['improvement']:
    colors.append('green' if delta > 0 else 'red')

ax2.barh(df_comparison['model'], df_comparison['improvement'], color=colors, alpha=0.7)
ax2.axvline(x=0, color='black', linestyle='--', linewidth=1)
ax2.set_title('Grouping Impact per Model')
ax2.set_xlabel('F1 Improvement (Grouped - All)')
ax2.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('figures/phase3_grouped_vs_all_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Summarise the average effect of grouping across all models.
mean_improvement = df_comparison['improvement'].mean()

print("\nConclusion:")
if mean_improvement > 0:
    print(f"✓ Grouping classes improves performance on average by {mean_improvement:.4f}")
else:
    print(f"✗ Grouping classes decreases performance on average by {abs(mean_improvement):.4f}")

## Overall Best Model with Detailed Metrics

In [None]:
# Combine all results: every evasion run from Experiment 2, tagged with the
# 'classes' label used for reporting below.
all_results = df_evasion.copy()
all_results['classes'] = all_results['class_strategy'].map(
    {'all_classes': 'all_7', 'grouped': 'grouped_3'}
)
all_results_sorted = all_results.sort_values('f1_macro', ascending=False)

# Highest macro-F1 configuration.
best_overall = all_results_sorted.iloc[0]

print("\n" + "="*80)
print("OVERALL BEST MODEL FOR EVASION DETECTION")
print("="*80)
print(f"Classifier:   {best_overall['classifier']}")
print(f"Vectorizer:   {best_overall['vectorizer']}")
print(f"Classes:      {best_overall['classes']}")
print(f"Meta Features: {best_overall['use_meta']}")
print(f"\nPerformance:")
print(f"  F1 Macro:     {best_overall['f1_macro']:.4f}")
print(f"  F1 Weighted:  {best_overall['f1_weighted']:.4f}")
print(f"  Precision:    {best_overall['precision']:.4f}")
print(f"  Recall:       {best_overall['recall']:.4f}")
print(f"  Accuracy:     {best_overall['accuracy']:.4f}")

# Train best model and show confusion matrix
use_grouped = (best_overall['classes'] == 'grouped_3')
best_label_column = 'evasion_label_grouped' if use_grouped else 'evasion_label'

# The runner returns (metrics, pipeline, predictions); the true labels are
# read from the validation split directly.
_, best_pipeline, y_pred = run_classifier_experiment(
    clf_name=best_overall['classifier'],
    vectorizer_name=best_overall['vectorizer'],
    label_column=best_label_column,
    train_df=train_df,
    val_df=val_df,
    use_meta_features=True
)
y_true = val_df[best_label_column]

# Plot confusion matrix (rows normalised, so each row sums to 1)
labels = sorted(y_true.unique())
cm = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')

# Draw on an explicit axes; otherwise disp.plot() opens its own figure and
# the one created beforehand stays empty.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(ax=ax, cmap='Blues', values_format='.2f')
ax.set_title(f"Best Model: {best_overall['classifier']} + {best_overall['vectorizer']}\n"
             f"{best_overall['classes']} | F1 Macro: {best_overall['f1_macro']:.4f}",
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('figures/phase3_best_model_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

## Classifier Comparison Visualization

In [None]:
# 2x2 dashboard: mean F1 by classifier, by vectorizer, by class strategy,
# and the five best individual configurations.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_clf, ax_vec, ax_strategy, ax_top = axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1]
title_style = {'fontsize': 12, 'fontweight': 'bold'}

# By classifier
clf_comparison = all_results.groupby('classifier')['f1_macro'].mean().sort_values()
clf_comparison.plot(kind='barh', ax=ax_clf, color='steelblue', alpha=0.7)
ax_clf.set_title('Average F1 by Classifier', **title_style)
ax_clf.set_xlabel('F1 Macro')
ax_clf.grid(axis='x', alpha=0.3)

# By vectorizer
vec_comparison = all_results.groupby('vectorizer')['f1_macro'].mean().sort_values()
vec_comparison.plot(kind='barh', ax=ax_vec, color='coral', alpha=0.7)
ax_vec.set_title('Average F1 by Vectorizer', **title_style)
ax_vec.set_xlabel('F1 Macro')
ax_vec.grid(axis='x', alpha=0.3)

# By class strategy (group order follows the sorted 'classes' values)
class_comparison = all_results.groupby('classes')['f1_macro'].mean()
class_comparison.plot(kind='bar', ax=ax_strategy, color=['skyblue', 'salmon'], alpha=0.7)
ax_strategy.set_title('Average F1 by Class Strategy', **title_style)
ax_strategy.set_ylabel('F1 Macro')
ax_strategy.set_xticklabels(['All 7 Classes', 'Grouped 3 Classes'], rotation=0)
ax_strategy.grid(axis='y', alpha=0.3)

# Overall ranking: best configuration at the top.
top_5 = all_results_sorted.head(5)
top_5_labels = []
for _, config in top_5.iterrows():
    top_5_labels.append(f"{config['classifier']}\n{config['vectorizer']}\n{config['classes']}")
bar_positions = range(len(top_5))

ax_top.barh(bar_positions, top_5['f1_macro'], color='lightgreen', alpha=0.7)
ax_top.set_yticks(bar_positions)
ax_top.set_yticklabels(top_5_labels, fontsize=9)
ax_top.set_title('Top 5 Configurations', **title_style)
ax_top.set_xlabel('F1 Macro')
ax_top.grid(axis='x', alpha=0.3)
ax_top.invert_yaxis()

plt.tight_layout()
plt.savefig('figures/phase3_overall_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Export Results

In [None]:
# Write every result table to its CSV file (row index omitted).
csv_exports = [
    (df_all_classes, 'results/phase3_all_7_classes.csv'),
    (df_grouped, 'results/phase3_grouped_3_classes.csv'),
    (df_comparison, 'results/phase3_comparison_grouped_vs_all.csv'),
    (all_results_sorted, 'results/phase3_all_results.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

# Plain-text summary of the best configuration and its validation scores.
best_model_lines = [
    "BEST MODEL FOR EVASION DETECTION\n",
    "="*50 + "\n\n",
    f"Classifier:    {best_overall['classifier']}\n",
    f"Vectorizer:    {best_overall['vectorizer']}\n",
    f"Classes:       {best_overall['classes']}\n",
    f"Meta Features: {best_overall['use_meta']}\n",
    "\nPerformance:\n",
    f"  F1 Macro:    {best_overall['f1_macro']:.4f}\n",
    f"  F1 Weighted: {best_overall['f1_weighted']:.4f}\n",
    f"  Precision:   {best_overall['precision']:.4f}\n",
    f"  Recall:      {best_overall['recall']:.4f}\n",
    f"  Accuracy:    {best_overall['accuracy']:.4f}\n",
]
with open('results/phase3_best_model.txt', 'w') as f:
    f.writelines(best_model_lines)

banner = "="*80
print("\n" + banner)
print("Results exported to results/ directory")
print("Figures saved to figures/ directory")
print("\nPhase 3 complete! ✓")
print(banner)