In [None]:
pip install catboost

In [6]:
import os
# Show the working directory the notebook kernel started in (printed before
# the directory change below).
print(os.getcwd())
# NOTE(review): hard-coded, machine-specific absolute path. The relative data
# and output paths used later in this file are resolved against this directory,
# so this line must be adjusted when running on another machine.
os.chdir("/Users/M1HR/Desktop/MIGRAINE")

/Users/M1HR/Desktop/MIGRAINE/data


In [None]:

import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from catboost import CatBoostClassifier, Pool
import warnings
# NOTE(review): this silences every warning for the rest of the process,
# including any emitted by the metric and plotting calls below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style
# Global plotting defaults: seaborn "whitegrid" theme and a 12x8 inch default
# figure size (functions that pass their own figsize override the latter).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# =============================================================================
# 1. LOAD AND PREPARE DATA
# =============================================================================

def load_data(csv_path):
    """Read the patient table from a CSV file and print a short overview.

    The overview printed to stdout contains the row and column counts, the
    column names, the first rows, and a per-column count of missing values
    (only columns that actually have missing values are listed).

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV
            file to load.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    banner = '=' * 80
    print("\n" + banner)
    print("LOADING DATA")
    print(banner)

    frame = pd.read_csv(csv_path)
    n_rows, n_cols = frame.shape

    print(f"✓ Loaded {n_rows} patients")
    print(f"✓ Features: {n_cols} columns")

    # Quick look at the schema and the leading rows
    print(f"\nColumns: {list(frame.columns)}")
    print("\nFirst few rows:")
    print(frame.head())

    # Report only the columns that contain at least one missing cell
    null_counts = frame.isnull().sum()
    columns_with_gaps = null_counts[null_counts > 0]
    if columns_with_gaps.empty:
        print("\n✓ No missing values")
    else:
        print("\n Missing values:")
        print(columns_with_gaps)

    return frame


def prepare_data(df):
    """Split the raw table into a feature matrix and a target vector.

    The ``Type`` column is the target. ``Type``, ``Patient_ID`` and ``Age``
    are removed from the feature matrix. A summary of the feature columns,
    the class distribution and a simple imbalance check (largest class count
    divided by smallest class count, flagged when above 3) is printed.

    Args:
        df: DataFrame containing at least the columns ``Type``,
            ``Patient_ID`` and ``Age``.

    Returns:
        Tuple ``(X, y)`` with the feature DataFrame and the target Series.
    """
    rule = '=' * 80
    print("\n" + rule)
    print("PREPARING DATA")
    print(rule)

    # Features are everything except the target and the excluded columns
    excluded = ['Type', 'Patient_ID', 'Age']
    X = df.drop(columns=excluded)
    y = df['Type']

    print(f" Features (X): {X.shape}")
    print(f" Target (y): {y.shape}")

    # Numbered listing of the feature columns
    print(f"\nFeature columns ({len(X.columns)}):")
    for position, column_name in enumerate(X.columns, start=1):
        print(f"  {position:2d}. {column_name}")

    # Class frequencies, ordered by class label
    print("\nTarget distribution:")
    print(y.value_counts().sort_index())

    # Imbalance check: ratio of the largest to the smallest class
    per_class = y.value_counts()
    largest = per_class.max()
    smallest = per_class.min()
    imbalance_ratio = largest / smallest

    if imbalance_ratio > 3:
        print(f"\n Class imbalance detected (ratio: {imbalance_ratio:.1f})")
        print(f"   Most common: {per_class.idxmax()} ({largest} samples)")
        print(f"   Least common: {per_class.idxmin()} ({smallest} samples)")
    else:
        print(f"\n Classes reasonably balanced (ratio: {imbalance_ratio:.1f})")

    return X, y


# =============================================================================
# 2. TRAIN-TEST SPLIT
# =============================================================================

def split_data(X, y, test_size=0.2, random_state=42):
    """Create a stratified train/test split and print its summary.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``; also used for stratification so
            both partitions keep the class distribution.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Seed forwarded to ``train_test_split`` for
            reproducibility.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print(f"\n{'='*80}")
    print("TRAIN-TEST SPLIT")
    print(f"{'='*80}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y  # Maintain class distribution
    )

    # Report the proportions that were actually realised rather than deriving
    # them from ``test_size``: the split sizes are rounded, and ``test_size``
    # may also be given as an absolute number of samples.
    n_total = len(X_train) + len(X_test)
    print(f"Train size: {len(X_train)} ({len(X_train) / n_total * 100:.0f}%)")
    print(f"Test size:  {len(X_test)} ({len(X_test) / n_total * 100:.0f}%)")

    # Check distributions
    print("\nTrain distribution:")
    print(y_train.value_counts().sort_index())

    print("\nTest distribution:")
    print(y_test.value_counts().sort_index())

    return X_train, X_test, y_train, y_test


# =============================================================================
# 3. TRAIN CATBOOST MODEL
# =============================================================================

def train_catboost(X_train, y_train, X_test, y_test, val_size=0.2):
    """Fit a multi-class CatBoost classifier with early stopping.

    Early stopping is monitored on a validation split carved out of the
    training data, not on the test set. Using the test set to decide when to
    stop training leaks test information into the model and makes the test
    metrics reported afterwards optimistically biased.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features. Only used as the evaluation set when
            ``val_size`` is ``None`` or ``0`` (legacy behaviour).
        y_test: Test labels, used under the same condition as ``X_test``.
        val_size: Share (or absolute number) of the training data held out
            for early stopping, forwarded to ``train_test_split``. Pass
            ``None`` or ``0`` to monitor the test set instead.

    Returns:
        The fitted ``CatBoostClassifier``.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the training
            labels cannot be stratified (for example a class with a single
            sample).
    """
    print(f"\n{'='*80}")
    print("TRAINING CATBOOST MODEL")
    print(f"{'='*80}")

    # Initialize model
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        random_seed=42,
        verbose=100,  # Print every 100 iterations
        early_stopping_rounds=50,
        task_type='CPU'
    )

    if val_size:
        # Hold out part of the training data for early stopping so the test
        # set stays untouched until the final evaluation.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train,
            test_size=val_size,
            random_state=42,
            stratify=y_train
        )
        print(f"Validation split for early stopping: "
              f"{len(X_fit)} fit / {len(X_val)} validation samples")
    else:
        # Legacy behaviour: the test set doubles as the early-stopping set.
        X_fit, y_fit = X_train, y_train
        X_val, y_val = X_test, y_test

    # Create pools for efficient training
    train_pool = Pool(X_fit, y_fit)
    eval_pool = Pool(X_val, y_val)

    print("\nTraining...")

    # Train
    model.fit(
        train_pool,
        eval_set=eval_pool,
        plot=False
    )

    print("\n✓ Training complete!")
    print(f"  Best iteration: {model.get_best_iteration()}")
    print(f"  Best score: {model.get_best_score()['validation']['Accuracy']:.4f}")

    return model


# =============================================================================
# 4. EVALUATE MODEL
# =============================================================================

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted model on the train and test partitions.

    For each partition the accuracy plus macro- and weighted-averaged
    precision, recall and F1 are computed and printed, followed by the
    classification report for the test partition.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Tuple ``(results, y_train_pred, y_test_pred)`` where ``results`` maps
        ``'train'`` / ``'test'`` to a dict of metric name -> score and the two
        prediction arrays are flattened to one dimension.
    """
    separator = '=' * 80
    print("\n" + separator)
    print("MODEL EVALUATION")
    print(separator)

    # Predictions
    y_train_pred = model.predict(X_train).flatten()
    y_test_pred = model.predict(X_test).flatten()

    # (result key, scoring function, averaging mode) in reporting order
    averaged_scorers = (
        ('precision_macro', precision_score, 'macro'),
        ('precision_weighted', precision_score, 'weighted'),
        ('recall_macro', recall_score, 'macro'),
        ('recall_weighted', recall_score, 'weighted'),
        ('f1_macro', f1_score, 'macro'),
        ('f1_weighted', f1_score, 'weighted'),
    )

    def _score_partition(truth, predicted):
        # Accuracy first, then every averaged metric from the table above
        scores = {'accuracy': accuracy_score(truth, predicted)}
        for key, scorer, averaging in averaged_scorers:
            scores[key] = scorer(truth, predicted, average=averaging)
        return scores

    results = {
        'train': _score_partition(y_train, y_train_pred),
        'test': _score_partition(y_test, y_test_pred),
    }

    # Line prefixes paired with the metric they report
    report_rows = (
        ("  Accuracy:           ", 'accuracy'),
        ("  Precision (macro):  ", 'precision_macro'),
        ("  Precision (weighted): ", 'precision_weighted'),
        ("  Recall (macro):     ", 'recall_macro'),
        ("  Recall (weighted):  ", 'recall_weighted'),
        ("  F1 (macro):         ", 'f1_macro'),
        ("  F1 (weighted):      ", 'f1_weighted'),
    )

    # Print results
    for heading, partition in (("\nTRAIN METRICS:", 'train'),
                               ("\nTEST METRICS:", 'test')):
        print(heading)
        partition_scores = results[partition]
        for prefix, key in report_rows:
            print(f"{prefix}{partition_scores[key]:.4f}")

    # Classification report
    print("\nDETAILED CLASSIFICATION REPORT (Test Set):")
    print(classification_report(y_test, y_test_pred))

    return results, y_train_pred, y_test_pred


# =============================================================================
# 5. VISUALIZATIONS
# =============================================================================

def plot_confusion_matrix(y_true, y_pred, title, output_path):
    """Save a two-panel confusion-matrix figure (raw counts and row-normalised).

    Args:
        y_true: True labels (pandas Series, list or array).
        y_pred: Predicted labels, same length as ``y_true``.
        title: Title prefix used for both panels.
        output_path: Destination of the image file.
    """
    # Build the label set from BOTH vectors and hand it to confusion_matrix
    # explicitly. Deriving the tick labels from y_true alone puts the wrong
    # labels on the axes whenever the model predicts a class that is absent
    # from y_true, because the matrix then has more rows/columns than labels.
    observed = np.concatenate([np.asarray(y_true).ravel(),
                               np.asarray(y_pred).ravel()])
    labels = np.unique(observed).tolist()

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Raw counts
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels,
                ax=axes[0], cbar_kws={'label': 'Count'})
    axes[0].set_title(f'{title} - Raw Counts')
    axes[0].set_xlabel('Predicted')
    axes[0].set_ylabel('True')

    # Normalized per true class. Rows without any true samples (a label that
    # only appears in the predictions) stay at 0 instead of becoming NaN
    # through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm, row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0
    )
    sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=labels, yticklabels=labels,
                ax=axes[1], cbar_kws={'label': 'Proportion'})
    axes[1].set_title(f'{title} - Normalized')
    axes[1].set_xlabel('Predicted')
    axes[1].set_ylabel('True')

    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"Saved: {output_path}")
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)


def plot_feature_importance(model, feature_names, output_path):
    """Save a horizontal bar chart of the model's feature importances.

    Bars are ordered by ascending importance and coloured with the viridis
    colormap according to importance relative to the largest value.

    Args:
        model: Fitted model exposing ``get_feature_importance()``.
        feature_names: Feature names, in the same order as the importances.
        output_path: Destination of the image file.
    """
    # Pair each feature with its score and order for plotting
    scores = model.get_feature_importance()
    ranked = pd.DataFrame({'feature': feature_names, 'importance': scores})
    ranked = ranked.sort_values('importance', ascending=True)

    # Figure height grows with the number of features (minimum 8 inches)
    fig_height = max(8, len(feature_names) * 0.3)
    fig, ax = plt.subplots(figsize=(10, fig_height))

    # Colour each bar by its importance relative to the maximum
    relative = ranked['importance'] / ranked['importance'].max()
    bar_colors = plt.cm.viridis(relative)

    positions = range(len(ranked))
    ax.barh(positions, ranked['importance'], color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(ranked['feature'])
    ax.set_xlabel('Importance')
    ax.set_title('CatBoost Feature Importance')
    ax.grid(axis='x', alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"Saved: {output_path}")
    plt.close(fig)


def plot_metrics_comparison(results, output_path):
    """Save a grouped bar chart comparing train and test scores.

    Accuracy and the weighted precision, recall and F1 scores are shown side
    by side for the two partitions, each bar annotated with its value.

    Args:
        results: Dict with ``'train'`` and ``'test'`` entries, each mapping
            metric names to scores.
        output_path: Destination of the image file.
    """
    # Result key -> label shown on the x axis
    shown = {
        'accuracy': 'Accuracy',
        'precision_weighted': 'Precision',
        'recall_weighted': 'Recall',
        'f1_weighted': 'F1-Score',
    }
    keys = list(shown)

    train_scores = [results['train'][key] for key in keys]
    test_scores = [results['test'][key] for key in keys]

    centers = np.arange(len(keys))
    bar_width = 0.35
    half = bar_width / 2

    fig, ax = plt.subplots(figsize=(10, 6))

    train_bars = ax.bar(centers - half, train_scores, bar_width,
                        label='Train', color='skyblue')
    test_bars = ax.bar(centers + half, test_scores, bar_width,
                       label='Test', color='coral')

    ax.set_xlabel('Metric')
    ax.set_ylabel('Score')
    ax.set_title('CatBoost Model Performance: Train vs Test')
    ax.set_xticks(centers)
    ax.set_xticklabels([shown[key] for key in keys])
    ax.legend()
    ax.set_ylim([0, 1.1])
    ax.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for rect in [*train_bars, *test_bars]:
        top = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., top,
                f'{top:.3f}',
                ha='center', va='bottom', fontsize=9)

    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {output_path}")
    plt.close(fig)


# =============================================================================
# 6. SAVE RESULTS
# =============================================================================

def save_results(model, results, y_test, y_test_pred, feature_names, output_dir):
    """Persist the model, metrics, predictions, importances and plots.

    Files written into ``output_dir`` (created if missing):
    ``catboost_model.cbm``, ``catboost_metrics.json``,
    ``catboost_predictions.csv``, ``feature_importance.csv``,
    ``confusion_matrix.png``, ``feature_importance.png`` and
    ``metrics_comparison.png``.

    Args:
        model: Fitted model exposing ``save_model`` and
            ``get_feature_importance``.
        results: Metrics dict to serialise as JSON.
        y_test: True test labels.
        y_test_pred: Predicted test labels.
        feature_names: Feature names, in the order the model was trained on.
        output_dir: Target directory.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    divider = '=' * 80
    print("\n" + divider)
    print("SAVING RESULTS")
    print(divider)

    # 1. Save model
    model_file = target_dir / 'catboost_model.cbm'
    model.save_model(str(model_file))
    print(f"Model saved: {model_file}")

    # 2. Save metrics
    metrics_file = target_dir / 'catboost_metrics.json'
    with open(metrics_file, 'w') as handle:
        json.dump(results, handle, indent=2)
    print(f"Metrics saved: {metrics_file}")

    # 3. Save predictions
    predictions_file = target_dir / 'catboost_predictions.csv'
    prediction_table = pd.DataFrame({
        'true_label': y_test,
        'predicted_label': y_test_pred
    })
    prediction_table.to_csv(predictions_file, index=False)
    print(f"Predictions saved: {predictions_file}")

    # 4. Save feature importance
    importance_file = target_dir / 'feature_importance.csv'
    importance_table = pd.DataFrame({
        'feature': feature_names,
        'importance': model.get_feature_importance()
    })
    importance_table = importance_table.sort_values('importance', ascending=False)
    importance_table.to_csv(importance_file, index=False)
    print(f"Feature importance saved: {importance_file}")

    # 5. Generate visualizations
    print("\nGenerating visualizations...")

    # Confusion matrix
    plot_confusion_matrix(
        y_test,
        y_test_pred,
        'CatBoost Confusion Matrix (Test Set)',
        target_dir / 'confusion_matrix.png',
    )

    # Feature importance
    plot_feature_importance(
        model,
        feature_names,
        target_dir / 'feature_importance.png',
    )

    # Metrics comparison
    plot_metrics_comparison(results, target_dir / 'metrics_comparison.png')

    print(f"\nAll results saved to: {target_dir}")


# =============================================================================
# 7. MAIN PIPELINE
# =============================================================================

def run_catboost_pipeline(data_path, output_dir='evaluation_results/catboost'):
    """Run the full workflow: load, prepare, split, train, evaluate, save.

    Args:
        data_path: Path of the input CSV file.
        output_dir: Directory receiving the model, metrics and plots.

    Returns:
        Tuple ``(model, results)`` with the fitted model and the metrics dict
        produced by ``evaluate_model``.
    """
    bar = "=" * 80

    print("\n" + bar)
    print("CATBOOST MODEL TRAINING PIPELINE")
    print(bar)

    # Load the table and separate features from the target
    raw = load_data(data_path)
    X, y = prepare_data(raw)

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2)

    # Fit, then score on both partitions
    model = train_catboost(X_train, y_train, X_test, y_test)
    results, _train_pred, y_test_pred = evaluate_model(
        model, X_train, y_train, X_test, y_test
    )

    # Write every artifact to disk
    feature_names = X.columns.tolist()
    save_results(model, results, y_test, y_test_pred, feature_names, output_dir)

    print("\n" + bar)
    print("CATBOOST TRAINING COMPLETE")
    print(bar)

    test_scores = results['test']
    print("\nSUMMARY:")
    print(f"  Test Accuracy: {test_scores['accuracy']:.4f}")
    print(f"  Test F1 (weighted): {test_scores['f1_weighted']:.4f}")
    print(f"\nResults saved to: {output_dir}")
    print("\nNext step: Compare with symbolic reasoning results!")

    return model, results


# =============================================================================
# 8. MAIN
# =============================================================================

# Script entry point: run the whole pipeline with the default output directory.
if __name__ == "__main__":
    
    # Relative path: resolved against the current working directory.
    data_path = 'data/migraine_with_id.csv'

    # Keep the fitted model and the metrics dict available at module level.
    model, results = run_catboost_pipeline(data_path)