# In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import warnings
# Suppress all Python warnings globally so they do not clutter the console
# output printed by the functions below.
warnings.filterwarnings('ignore')

# Set style
# seaborn 'whitegrid' theme plus a default figure size (width, height in
# inches) for any figure created without an explicit figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# =============================================================================
# 1. LOAD RESULTS
# =============================================================================

def load_catboost_results(results_dir):
    """Read the CatBoost metrics and predictions saved in *results_dir*.

    Expects two files inside the directory: ``catboost_metrics.json`` (a JSON
    object with a ``'test'`` entry) and ``catboost_predictions.csv``.

    Args:
        results_dir: Directory (str or Path) holding the two result files.

    Returns:
        dict with keys ``'name'`` (always ``'CatBoost ML'``), ``'metrics'``
        (the ``'test'`` entry of the metrics JSON) and ``'predictions'``
        (the CSV as a DataFrame).

    Raises:
        FileNotFoundError: If either file is missing.
        KeyError: If the metrics JSON has no ``'test'`` entry.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("LOADING CATBOOST RESULTS")
    print(rule)

    base = Path(results_dir)

    # Metrics come first so a missing JSON fails before the CSV is touched.
    metrics_file = base / 'catboost_metrics.json'
    with metrics_file.open('r') as handle:
        all_metrics = json.load(handle)
    print(f" Loaded metrics from {metrics_file}")

    predictions_file = base / 'catboost_predictions.csv'
    test_predictions = pd.read_csv(predictions_file)
    print(f" Loaded predictions from {predictions_file}")
    print(f"  Test samples: {len(test_predictions)}")

    loaded = {'name': 'CatBoost ML'}
    loaded['metrics'] = all_metrics['test']
    loaded['predictions'] = test_predictions
    return loaded


def load_symbolic_results(results_dir):
    """Load symbolic-reasoning predictions and compute classification metrics.

    The first existing file among a fixed list of candidate locations is
    used. Two column layouts are accepted for the (true, predicted) pair:
    ``true_diagnosis``/``predicted_diagnosis`` or ``Type``/``diagnosis``.
    The columns are renamed to ``true_label``/``predicted_label``.

    Args:
        results_dir: Project root (str or Path) under which the result CSVs
            are searched. A path relative to the current working directory
            is tried as a last resort.

    Returns:
        dict with keys ``'name'`` (``'Symbolic Reasoning'``), ``'metrics'``
        (accuracy plus macro/weighted precision, recall and F1) and
        ``'predictions'`` (two-column DataFrame).

    Raises:
        FileNotFoundError: If none of the candidate files exists.
        ValueError: If the first existing file has neither column layout.
    """
    print(f"\n{'='*80}")
    print("LOADING SYMBOLIC REASONING RESULTS")
    print(f"{'='*80}")

    results_dir = Path(results_dir)

    # Every candidate must be a Path: the loop calls .exists() on each one,
    # and a plain str would raise AttributeError instead of being checked.
    possible_paths = [
        results_dir / 'evaluation_results/symbolic/evaluation_results.csv',
        results_dir / 'ichd3_diagnoses_final.csv',
        Path('data/diagnoses/ichd3_diagnoses_final.csv'),
    ]

    # Accepted (true column, predicted column) layouts, in priority order.
    column_layouts = [
        ('true_diagnosis', 'predicted_diagnosis'),
        ('Type', 'diagnosis'),
    ]

    for path in possible_paths:
        if not path.exists():
            continue

        print(f" Found symbolic results: {path}")
        df = pd.read_csv(path)

        # Extract predictions using the first layout the file satisfies.
        for true_col, pred_col in column_layouts:
            if true_col in df.columns and pred_col in df.columns:
                predictions = df[[true_col, pred_col]].copy()
                predictions.columns = ['true_label', 'predicted_label']
                break
        else:
            print(f" Column names: {df.columns.tolist()}")
            raise ValueError("Could not find diagnosis columns")

        print(f"  Predictions: {len(predictions)}")

        # Calculate metrics (key order matches the original report layout).
        y_true = predictions['true_label']
        y_pred = predictions['predicted_label']
        metrics = {'accuracy': accuracy_score(y_true, y_pred)}
        scorers = (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
        for metric_name, scorer in scorers:
            for average in ('macro', 'weighted'):
                metrics[f'{metric_name}_{average}'] = scorer(
                    y_true, y_pred, average=average
                )

        return {
            'name': 'Symbolic Reasoning',
            'metrics': metrics,
            'predictions': predictions
        }

    print(f" No symbolic results found in any of these paths:")
    for path in possible_paths:
        print(f"   - {path}")
    raise FileNotFoundError("Symbolic results not found. Run evaluate_symbolic_reasoning.py first!")


# =============================================================================
# 2. COMPARISON METRICS
# =============================================================================

def compare_metrics(catboost_results, symbolic_results):
    """Compare the headline metrics of both approaches and print a verdict.

    Accuracy and the weighted precision, recall and F1 are read from each
    result's ``'metrics'`` mapping. The per-metric winner is decided by the
    sign of ``catboost - symbolic``.

    Args:
        catboost_results: dict with a ``'metrics'`` mapping for CatBoost.
        symbolic_results: dict with a ``'metrics'`` mapping for the symbolic
            approach.

    Returns:
        DataFrame with one row per metric and the columns ``metric``,
        ``catboost``, ``symbolic``, ``difference`` and ``winner``.
    """
    heavy_rule = '=' * 80
    light_rule = '─' * 80

    print(f"\n{heavy_rule}")
    print("METRICS COMPARISON")
    print(heavy_rule)

    tracked = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
    rows = []

    for name in tracked:
        ml_score = catboost_results['metrics'][name]
        rule_score = symbolic_results['metrics'][name]
        gap = ml_score - rule_score

        # Positive gap favours CatBoost, negative favours the symbolic system.
        if gap > 0:
            leader = 'CatBoost'
        elif gap < 0:
            leader = 'Symbolic'
        else:
            leader = 'Tie'

        rows.append({
            'metric': name,
            'catboost': ml_score,
            'symbolic': rule_score,
            'difference': gap,
            'winner': leader,
        })

        print(f"\n{name.upper().replace('_', ' ')}:")
        print(f"  CatBoost: {ml_score:.4f}")
        print(f"  Symbolic: {rule_score:.4f}")
        print(f"  Difference: {gap:+.4f}")
        print(f"  Winner: {leader}")

    # Overall winner: whoever leads on more of the tracked metrics.
    leaders = [row['winner'] for row in rows]
    ml_wins = leaders.count('CatBoost')
    rule_wins = leaders.count('Symbolic')

    print(f"\n{light_rule}")
    print("OVERALL WINNER:")
    print(light_rule)
    print(f"CatBoost wins: {ml_wins}/{len(tracked)} metrics")
    print(f"Symbolic wins: {rule_wins}/{len(tracked)} metrics")

    if ml_wins > rule_wins:
        verdict = f"\n WINNER: CatBoost ML Model"
    elif rule_wins > ml_wins:
        verdict = f"\n WINNER: Symbolic Reasoning"
    else:
        verdict = f"\n Both approaches perform equally"
    print(verdict)

    return pd.DataFrame(rows)


# =============================================================================
# 3. VISUALIZATIONS
# =============================================================================

def plot_metrics_comparison(comparison_df, output_path):
    """Save a grouped bar chart of CatBoost vs symbolic scores per metric.

    Args:
        comparison_df: DataFrame with ``metric``, ``catboost`` and
            ``symbolic`` columns (as returned by ``compare_metrics``).
        output_path: Destination image file passed to ``plt.savefig``.
    """
    metric_names = comparison_df['metric'].tolist()
    ml_scores = comparison_df['catboost'].tolist()
    rule_scores = comparison_df['symbolic'].tolist()

    fig, ax = plt.subplots(figsize=(12, 6))

    positions = np.arange(len(metric_names))
    bar_width = 0.35
    outline = {'alpha': 0.85, 'edgecolor': 'black', 'linewidth': 1}

    # Okabe-Ito colorblind-safe palette; the two series sit side by side
    # around each tick position.
    ml_bars = ax.bar(positions - bar_width / 2, ml_scores, bar_width,
                     label='CatBoost ML', color='#E69F00', **outline)
    rule_bars = ax.bar(positions + bar_width / 2, rule_scores, bar_width,
                       label='Symbolic Reasoning', color='#56B4E9', **outline)

    ax.set_xlabel('Metric', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title('ML vs Symbolic Reasoning: Performance Comparison',
                 fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    pretty_names = [name.replace('_', ' ').title() for name in metric_names]
    ax.set_xticklabels(pretty_names)
    ax.legend(fontsize=11, framealpha=0.9)
    # Headroom above 1.0 leaves space for the value labels.
    ax.set_ylim([0, 1.1])
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above its top edge.
    for bar in (*ml_bars, *rule_bars):
        top = bar.get_height()
        centre = bar.get_x() + bar.get_width() / 2.
        ax.text(centre, top, f'{top:.3f}',
                ha='center', va='bottom', fontsize=9, fontweight='bold')

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {output_path}")
    plt.close()


def plot_difference_chart(comparison_df, output_path):
    """Save a horizontal bar chart of ``catboost - symbolic`` per metric.

    Bars are coloured by which approach leads (or a third colour for a tie)
    and annotated with the signed difference.

    Args:
        comparison_df: DataFrame with ``metric`` and ``difference`` columns.
        output_path: Destination image file passed to ``plt.savefig``.
    """
    from matplotlib.patches import Patch

    # Okabe-Ito colours keyed by the sign of the difference.
    ml_colour, rule_colour, tie_colour = '#E69F00', '#56B4E9', '#009E73'

    def _colour_for(delta):
        if delta > 0:
            return ml_colour
        if delta < 0:
            return rule_colour
        return tie_colour

    metric_names = comparison_df['metric'].tolist()
    deltas = comparison_df['difference'].tolist()
    bar_colours = [_colour_for(delta) for delta in deltas]

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.barh(metric_names, deltas, color=bar_colours, alpha=0.85,
            edgecolor='black', linewidth=1)

    ax.set_xlabel('Difference (CatBoost - Symbolic)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Metric', fontsize=12, fontweight='bold')
    ax.set_title('Performance Difference: CatBoost vs Symbolic',
                 fontsize=14, fontweight='bold')
    ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5)
    ax.grid(axis='x', alpha=0.3)

    # Value labels sit at the bar tip, growing away from the zero line.
    for row, delta in enumerate(deltas):
        if delta > 0:
            anchor = 'left'
        else:
            anchor = 'right'
        ax.text(delta, row, f'{delta:+.3f}', va='center', ha=anchor,
                fontsize=10, fontweight='bold')

    # Legend explaining the three bar colours.
    legend_entries = [
        Patch(facecolor=colour, alpha=0.85, label=caption)
        for colour, caption in (
            (ml_colour, 'CatBoost Better'),
            (rule_colour, 'Symbolic Better'),
            (tie_colour, 'Tie'),
        )
    ]
    ax.legend(handles=legend_entries, loc='best', framealpha=0.9)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {output_path}")
    plt.close()


def plot_confusion_matrices_comparison(catboost_preds, symbolic_preds, output_path):
    """Save side-by-side row-normalised confusion matrices for both models.

    Both matrices are built over one shared, sorted label set: every class
    that appears as a true or predicted label in either frame. This keeps
    the tick labels aligned with the matrix rows and columns, even when a
    model predicts a class that never occurs as a true label, and makes the
    two panels directly comparable. Rows for classes with no true samples
    are shown as zeros rather than dividing by zero.

    Args:
        catboost_preds: DataFrame with ``true_label`` and
            ``predicted_label`` columns for the CatBoost model.
        symbolic_preds: DataFrame with the same two columns for the
            symbolic-reasoning system.
        output_path: Destination image file passed to ``plt.savefig``.
    """
    # Union of all labels seen anywhere, so both matrices share axes.
    labels = sorted(
        set(catboost_preds['true_label'])
        | set(catboost_preds['predicted_label'])
        | set(symbolic_preds['true_label'])
        | set(symbolic_preds['predicted_label'])
    )

    panels = [
        (catboost_preds, 'CatBoost ML Model'),
        (symbolic_preds, 'Symbolic Reasoning (ICHD-3)'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, (preds, title) in zip(axes, panels):
        # Passing labels= pins the row/column order to the tick labels.
        cm = confusion_matrix(
            preds['true_label'],
            preds['predicted_label'],
            labels=labels
        )

        # Normalise each row by its true-class count; rows with a zero
        # count stay at 0 instead of producing NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_norm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0
        )

        # Use inferno colormap for both
        sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='inferno',
                    xticklabels=labels, yticklabels=labels,
                    ax=ax, cbar_kws={'label': 'Proportion'})
        ax.set_title(title, fontsize=13, fontweight='bold', pad=10)
        ax.set_xlabel('Predicted', fontweight='bold')
        ax.set_ylabel('True', fontweight='bold')

    plt.suptitle('Confusion Matrix Comparison', fontsize=15, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {output_path}")
    plt.close()


# =============================================================================
# 4. DETAILED ANALYSIS
# =============================================================================

def analyze_per_class_performance(catboost_preds, symbolic_preds, output_path):
    """Compute, print and save per-class accuracy for both approaches.

    For each class that appears as a true label in either frame, accuracy is
    the share of that class's rows whose prediction equals the true label
    (0 when the class has no rows in a frame).

    Args:
        catboost_preds: DataFrame with ``true_label``/``predicted_label``.
        symbolic_preds: DataFrame with ``true_label``/``predicted_label``.
        output_path: CSV file the per-class table is written to.

    Returns:
        DataFrame with the columns ``class``, ``samples``,
        ``catboost_accuracy``, ``symbolic_accuracy`` and ``difference``.
    """
    def _class_stats(frame, label):
        # Returns (accuracy, row count) restricted to rows whose true label
        # is `label`.
        is_label = frame['true_label'] == label
        subset = frame[is_label]
        support = is_label.sum()
        hits = (subset['true_label'] == subset['predicted_label']).sum()
        accuracy = hits / support if support > 0 else 0
        return accuracy, support

    print(f"\n{'='*80}")
    print("PER-CLASS ANALYSIS")
    print(f"{'='*80}")

    # Every class that occurs as a true label in at least one frame.
    label_universe = sorted(
        set(catboost_preds['true_label'].unique()).union(
            symbolic_preds['true_label'].unique()
        )
    )

    records = []
    for label in label_universe:
        ml_acc, ml_support = _class_stats(catboost_preds, label)
        rule_acc, rule_support = _class_stats(symbolic_preds, label)

        # The two frames may hold different row counts; report the larger.
        support = max(ml_support, rule_support)
        gap = ml_acc - rule_acc

        records.append({
            'class': label,
            'samples': support,
            'catboost_accuracy': ml_acc,
            'symbolic_accuracy': rule_acc,
            'difference': gap,
        })

        print(f"\n{label}:")
        print(f"  Samples: {support}")
        print(f"  CatBoost: {ml_acc:.3f}")
        print(f"  Symbolic: {rule_acc:.3f}")
        print(f"  Difference: {gap:+.3f}")

    # Save to CSV
    table = pd.DataFrame(records)
    table.to_csv(output_path, index=False)
    print(f"\n Saved per-class analysis: {output_path}")

    return table


# =============================================================================
# 5. GENERATE COMPARISON REPORT
# =============================================================================

def generate_comparison_report(catboost_results, symbolic_results, output_dir):
    """Produce every comparison artefact and return a summary dict.

    Creates *output_dir* if needed, then writes into it:
    ``metrics_comparison.csv``, ``metrics_comparison.png``,
    ``performance_difference.png``, ``confusion_matrices_comparison.png``,
    ``per_class_comparison.csv`` and ``comparison_summary.json``.

    Args:
        catboost_results: dict with ``'metrics'`` and ``'predictions'`` for
            the CatBoost model.
        symbolic_results: dict with ``'metrics'`` and ``'predictions'`` for
            the symbolic-reasoning system.
        output_dir: Directory (str or Path) that receives the artefacts.

    Returns:
        dict with ``'catboost'``, ``'symbolic'`` and ``'comparison'``
        sections. ``comparison['winner']`` is ``'CatBoost'``, ``'Symbolic'``
        or ``'Tie'``, decided by accuracy.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*80}")
    print("GENERATING COMPARISON REPORT")
    print(f"{'='*80}")

    # 1. Compare metrics
    comparison_df = compare_metrics(catboost_results, symbolic_results)

    # Save comparison table
    comparison_path = output_dir / 'metrics_comparison.csv'
    comparison_df.to_csv(comparison_path, index=False)
    print(f"\n✓ Saved comparison table: {comparison_path}")

    # 2. Generate visualizations
    print("\nGenerating visualizations...")

    plot_metrics_comparison(
        comparison_df,
        output_dir / 'metrics_comparison.png'
    )

    plot_difference_chart(
        comparison_df,
        output_dir / 'performance_difference.png'
    )

    plot_confusion_matrices_comparison(
        catboost_results['predictions'],
        symbolic_results['predictions'],
        output_dir / 'confusion_matrices_comparison.png'
    )

    # 3. Per-class analysis (called for its printed output and CSV; the
    # returned table is not needed here).
    analyze_per_class_performance(
        catboost_results['predictions'],
        symbolic_results['predictions'],
        output_dir / 'per_class_comparison.csv'
    )

    # 4. Generate summary report
    catboost_metrics = catboost_results['metrics']
    symbolic_metrics = symbolic_results['metrics']
    headline_keys = ['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted']

    accuracy_difference = catboost_metrics['accuracy'] - symbolic_metrics['accuracy']
    f1_difference = catboost_metrics['f1_weighted'] - symbolic_metrics['f1_weighted']

    # An exact accuracy tie is reported as 'Tie', matching the per-metric
    # logic in compare_metrics, rather than being credited to one side.
    if accuracy_difference > 0:
        winner = 'CatBoost'
    elif accuracy_difference < 0:
        winner = 'Symbolic'
    else:
        winner = 'Tie'

    summary = {
        'catboost': {
            'name': 'CatBoost ML Model',
            **{key: catboost_metrics[key] for key in headline_keys}
        },
        'symbolic': {
            'name': 'Symbolic Reasoning (ICHD-3)',
            **{key: symbolic_metrics[key] for key in headline_keys}
        },
        'comparison': {
            'accuracy_difference': accuracy_difference,
            'f1_difference': f1_difference,
            'winner': winner
        }
    }

    summary_path = output_dir / 'comparison_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f" Saved summary: {summary_path}")

    print(f"\n All comparison results saved to: {output_dir}")

    return summary


# =============================================================================
# 6. MAIN
# =============================================================================

def run_comparison(catboost_dir='/Users/M1HR/Desktop/MIGRAINE/evaluation_results/catboost',
                  symbolic_dir='/Users/M1HR/Desktop/MIGRAINE/',
                  output_dir='/Users/M1HR/Desktop/MIGRAINE/evaluation_results/comparison'):
    """Run the full CatBoost-vs-symbolic comparison and print the outcome.

    Loads both result sets, generates the comparison report in *output_dir*
    and prints the headline accuracy/F1 figures, their differences and the
    winner.

    Args:
        catboost_dir: Directory holding the CatBoost metrics and predictions.
        symbolic_dir: Root directory searched for symbolic-reasoning results.
        output_dir: Directory that receives the comparison artefacts.

    Returns:
        The summary dict produced by ``generate_comparison_report``.
    """
    banner = "=" * 80

    print("\n" + banner)
    print("ML vs SYMBOLIC REASONING COMPARISON")
    print(banner)

    # Load both result sets, then build the report from them.
    ml_results = load_catboost_results(catboost_dir)
    rule_results = load_symbolic_results(symbolic_dir)
    summary = generate_comparison_report(ml_results, rule_results, output_dir)

    print("\n" + banner)
    print(" COMPARISON COMPLETE!")
    print(banner)

    print(f"\n FINAL RESULTS:")

    # Same two headline figures for each approach.
    sections = (("CatBoost ML", 'catboost'), ("Symbolic Reasoning", 'symbolic'))
    for heading, key in sections:
        scores = summary[key]
        print(f"\n{heading}:")
        print(f"  Accuracy: {scores['accuracy']:.4f}")
        print(f"  F1-Score: {scores['f1_weighted']:.4f}")

    outcome = summary['comparison']
    print(f"\nDifference:")
    print(f"  Accuracy: {outcome['accuracy_difference']:+.4f}")
    print(f"  F1-Score: {outcome['f1_difference']:+.4f}")

    print(f"\n Winner: {outcome['winner']}")

    print(f"\n Results: {output_dir}")

    return summary


# Script entry point: run the comparison with the default directories and keep
# the returned summary in a module-level name for interactive inspection.
if __name__ == "__main__":
    
    summary = run_comparison()