# 04 - Supervised Learning - Classification Models

This notebook covers:
1. Loading feature-selected data
2. Training multiple classification models:
   - Logistic Regression
   - Decision Tree
   - Random Forest
   - Support Vector Machine (SVM)
3. Model evaluation and comparison
4. Performance metrics and visualizations


In [None]:
# Import necessary libraries
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, confusion_matrix, 
                           classification_report, roc_curve)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this silences *every* warning for the rest of the session,
# which would also hide library diagnostics (e.g. solver convergence or
# undefined-metric warnings) from the cells below. Narrow the filter if those
# diagnostics matter.
warnings.filterwarnings('ignore')

# Set style for better plots.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the un-prefixed name and raise OSError for the new
# one, so fall back instead of failing in the very first cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Load feature-selected data
# NOTE(review): joblib.load unpickles its input, and unpickling can execute
# arbitrary code -- only point these paths at artifacts produced by this
# project's own earlier notebooks.
print("Loading feature-selected data...")
X_train = joblib.load('../data/X_train_selected.pkl')
X_test = joblib.load('../data/X_test_selected.pkl')
y_train = joblib.load('../data/y_train.pkl')
y_test = joblib.load('../data/y_test.pkl')
selected_features = joblib.load('../models/selected_features.pkl')

# Fail fast, with a readable message, if the artifacts are out of sync --
# otherwise the mismatch only surfaces later as an opaque error (or a silently
# wrong score) inside model fitting / prediction.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels")
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)} labels")
assert X_train.shape[1] == X_test.shape[1], (
    f"X_train has {X_train.shape[1]} features but X_test has {X_test.shape[1]}")
if hasattr(X_train, 'columns') and hasattr(X_test, 'columns'):
    assert list(X_train.columns) == list(X_test.columns), (
        "Train and test feature columns differ in name or order")

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Selected features: {selected_features}")

# Display first few rows
print("\nFirst 5 rows of training data:")
print(X_train.head())


In [None]:
# Initialize models (fixed random_state so reruns give the same fits)
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42, probability=True)
}

# Dictionary to store results: model name -> fitted model, predictions, metrics
results = {}

print("Training and evaluating models...")
print("=" * 50)

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Hard class predictions for the threshold-based metrics
    y_pred = model.predict(X_test)

    # Continuous scores for ROC / AUC. Prefer the probability of the class in
    # column 1 (the second entry of model.classes_); if the estimator has no
    # predict_proba, fall back to decision_function, whose un-thresholded
    # scores are equally valid input for roc_auc_score / roc_curve. The key
    # name 'y_pred_proba' is kept so the later cells keep working.
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = None

    # Calculate metrics. zero_division=0 states explicitly what the default
    # ('warn') does when a model never predicts the positive class: score 0.
    # The accompanying warning is suppressed globally in this notebook, so
    # the behaviour should not depend on it.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    # Store results
    results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc
    }

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    if auc is not None:
        print(f"AUC: {auc:.4f}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Create comparison DataFrame: one row per model, one column per metric.
# A missing AUC is stored as NaN rather than the string 'N/A': a single string
# would turn the whole column into dtype object, so .round(4) below would skip
# it and the column could no longer be sorted or aggregated numerically.
comparison_data = [
    {
        'Model': name,
        'Accuracy': result['accuracy'],
        'Precision': result['precision'],
        'Recall': result['recall'],
        'F1-Score': result['f1'],
        'AUC': result['auc'] if result['auc'] is not None else np.nan,
    }
    for name, result in results.items()
]

comparison_df = pd.DataFrame(comparison_data)
print("Model Performance Comparison:")
print(comparison_df.round(4))

# Find best model based on F1-score.
# NOTE(review): these scores were computed on the test set, so picking the
# winner here makes its reported score an optimistic estimate; use a
# validation split or cross-validation if the choice feeds further decisions.
best_row = comparison_df.loc[comparison_df['F1-Score'].idxmax()]
best_model_name = best_row['Model']
print(f"\nBest model based on F1-Score: {best_model_name}")
print(f"Best F1-Score: {best_row['F1-Score']:.4f}")


In [None]:
# Visualize model performance: one grouped bar chart of the headline metrics
# followed by one confusion matrix per model. The grid, bar width and tick
# positions are derived from the number of models instead of being hard-coded
# for exactly four; with four models the layout is 2x3 at 15x12 inches.
n_models = len(results)
n_panels = n_models + 1                      # bar chart + one matrix per model
n_cols = 3
n_rows = int(np.ceil(n_panels / n_cols))
fig = plt.figure(figsize=(15, 6 * n_rows))

# 1. Performance metrics comparison
ax = fig.add_subplot(n_rows, n_cols, 1)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
metric_keys = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics))
width = 0.8 / max(n_models, 1)               # each metric group spans 0.8 units

for i, (name, result) in enumerate(results.items()):
    values = [result[key] for key in metric_keys]
    ax.bar(x + i * width, values, width, label=name, alpha=0.8)

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
# Centre each tick under its group of bars, whatever the group size.
ax.set_xticks(x + width * (n_models - 1) / 2)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim(0, 1)

# 2. Confusion matrices (panels 2, 3, ... of the same grid)
for i, (name, result) in enumerate(results.items()):
    ax = fig.add_subplot(n_rows, n_cols, i + 2)
    cm = confusion_matrix(y_test, result['y_pred'])
    # Tick labels assume a binary target -- TODO confirm that the lower
    # class label is the "No Disease" class.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['No Disease', 'Disease'],
                yticklabels=['No Disease', 'Disease'],
                ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

fig.tight_layout()
plt.show()


In [None]:
# ROC curves: one line per model that produced continuous scores, drawn on a
# single set of axes together with the chance diagonal for reference.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

scored_models = {
    label: entry
    for label, entry in results.items()
    if entry['y_pred_proba'] is not None
}
for label, entry in scored_models.items():
    fpr, tpr, _ = roc_curve(y_test, entry['y_pred_proba'])
    roc_ax.plot(fpr, tpr, label=f"{label} (AUC = {entry['auc']:.3f})", linewidth=2)

# Diagonal = performance of a classifier that guesses at random
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparison')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
plt.show()

# Feature importance for tree-based models.
# Models are selected by whether they expose `feature_importances_` rather
# than by a hard-coded list of names, so renaming, removing or adding a model
# in the training cell cannot leave an empty figure or panel behind.
importance_models = {
    name: result['model']
    for name, result in results.items()
    if hasattr(result['model'], 'feature_importances_')
}

if importance_models:
    feature_names = X_train.columns
    n_plots = len(importance_models)
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, n_plots, figsize=(6 * n_plots, 6), squeeze=False)

    for ax, (name, model) in zip(axes[0], importance_models.items()):
        importance = model.feature_importances_

        # Sort by importance, most important first
        indices = np.argsort(importance)[::-1]

        ax.barh(range(len(importance)), importance[indices],
                color='lightgreen', alpha=0.7, edgecolor='black')
        ax.set_yticks(range(len(importance)))
        ax.set_yticklabels([feature_names[j] for j in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title(f'{name} - Feature Importance')
        # barh draws index 0 at the bottom; flip so the top feature is on top.
        ax.invert_yaxis()

    fig.tight_layout()
    plt.show()
else:
    print("No trained model exposes feature_importances_; skipping importance plots.")


In [None]:
# Save trained models and results
# (`os` comes from the import cell at the top of the notebook.)
MODELS_DIR = '../models'
RESULTS_DIR = '../results'

# Create directories if they don't exist
for directory in (MODELS_DIR, RESULTS_DIR):
    os.makedirs(directory, exist_ok=True)

# Save individual models: one pickle per model, file name derived from the
# model's display name (lower-cased, spaces -> underscores).
for name, result in results.items():
    model_filename = f'{MODELS_DIR}/{name.lower().replace(" ", "_")}_model.pkl'
    joblib.dump(result['model'], model_filename)
    print(f"Saved {name} model to {model_filename}")

# Save all results (fitted models, predictions and metrics) plus the
# comparison table.
joblib.dump(results, f'{MODELS_DIR}/supervised_learning_results.pkl')
joblib.dump(comparison_df, f'{RESULTS_DIR}/model_comparison.pkl')

best_f1 = comparison_df['F1-Score'].max()

# Save evaluation metrics to text file. The encoding is given explicitly so
# the report does not depend on the platform's default.
with open(f'{RESULTS_DIR}/evaluation_metrics.txt', 'w', encoding='utf-8') as f:
    f.write("Heart Disease Prediction - Model Evaluation Results\n")
    f.write("=" * 50 + "\n\n")

    for name, result in results.items():
        f.write(f"{name}:\n")
        f.write(f"  Accuracy: {result['accuracy']:.4f}\n")
        f.write(f"  Precision: {result['precision']:.4f}\n")
        f.write(f"  Recall: {result['recall']:.4f}\n")
        f.write(f"  F1-Score: {result['f1']:.4f}\n")
        if result['auc'] is not None:
            f.write(f"  AUC: {result['auc']:.4f}\n")
        f.write("\n")

    f.write(f"Best Model: {best_model_name}\n")
    f.write(f"Best F1-Score: {best_f1:.4f}\n")

print("\nSupervised learning completed and models saved!")
print("Files saved:")
print("- Individual model files in ../models/")
print("- ../models/supervised_learning_results.pkl")
print("- ../results/model_comparison.pkl")
print("- ../results/evaluation_metrics.txt")

# Display final summary: headline numbers taken from the comparison table
top_f1 = comparison_df['F1-Score'].max()
top_accuracy = comparison_df['Accuracy'].max()

summary_lines = [
    "\nFinal Summary:",
    f"- Best performing model: {best_model_name}",
    f"- Best F1-Score: {top_f1:.4f}",
    f"- Best Accuracy: {top_accuracy:.4f}",
    "- All models trained and saved successfully!",
]
for summary_line in summary_lines:
    print(summary_line)
