In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load the data
df = pd.read_csv('credit_ratings_multimodal.csv')

# Display initial info
print("Initial Data Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nData Types:")
print(df.dtypes)

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Data preparation
print("\n" + "="*80)
print("DATA PREPARATION")
print("="*80)

# Drop rating name and rating agency name if they exist
columns_to_drop = ['rating name', 'rating agency name']
for col in columns_to_drop:
    if col in df.columns:
        df = df.drop(columns=[col])
        print(f"Dropped column: {col}")

# Convert date to datetime if needed
date_columns = [col for col in df.columns if 'date' in col.lower()]
for col in date_columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_datetime(df[col])
        print(f"Converted {col} to datetime")

# Check if encoding is already done
if 'Sector_Encoded' not in df.columns or 'Ticker_Encoded' not in df.columns:
    from sklearn.preprocessing import LabelEncoder

    # Encode categorical variables
    le_sector = LabelEncoder()
    le_ticker = LabelEncoder()

    df['Sector_Encoded'] = le_sector.fit_transform(df['Sector'])
    df['Ticker_Encoded'] = le_ticker.fit_transform(df['Ticker'])

    print("Encoded Sector and Ticker using LabelEncoder")
else:
    print("Sector and Ticker already encoded")

# Prepare features and targets
print("\n" + "="*80)
print("PREPARING FEATURES AND TARGETS")
print("="*80)

# Drop unnecessary columns
columns_to_exclude = ['Ticker', 'Sector', 'Rating_Merged', 'rating_date',
                      'year_qtr', 'md&a', 'Sector_Original', 'Ticker_Original',
                      'Ticker_Encoded', 'Sector_Encoded']

# Create two feature sets: with ticker and without ticker
features_without_ticker = [col for col in df.columns if col not in columns_to_exclude +
                          ['Rating_Encoded_Multiclass', 'Rating_Encoded_Binary']]

# Remove encoded columns for without-ticker version
features_without_ticker = [col for col in features_without_ticker
                          if col not in ['Ticker_Encoded', 'Sector_Encoded']]

# For with-ticker version, include encoded columns
features_with_ticker = features_without_ticker.copy()
if 'Sector_Encoded' in df.columns:
    features_with_ticker.append('Sector_Encoded')
if 'Ticker_Encoded' in df.columns:
    features_with_ticker.append('Ticker_Encoded')

print(f"Features without ticker ({len(features_without_ticker)}):", features_without_ticker[:10], "...")
print(f"Features with ticker ({len(features_with_ticker)}):", features_with_ticker[:10], "...")

# Targets
y_binary = df['Rating_Encoded_Binary']
y_multiclass = df['Rating_Encoded_Multiclass']

print(f"\nBinary target distribution:")
print(y_binary.value_counts())
print(f"\nMulticlass target distribution:")
print(y_multiclass.value_counts())

# Split data
print("\n" + "="*80)
print("SPLITTING DATA")
print("="*80)

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Create train/test splits for both feature sets
X_without = df[features_without_ticker]
X_with = df[features_with_ticker]

X_without_train, X_without_test, y_binary_train, y_binary_test, y_multi_train, y_multi_test = train_test_split(
    X_without, y_binary, y_multiclass, test_size=0.2, random_state=42, stratify=y_binary
)

X_with_train, X_with_test, _, _, _, _ = train_test_split(
    X_with, y_binary, y_multiclass, test_size=0.2, random_state=42, stratify=y_binary
)

print(f"Training set size (without ticker): {X_without_train.shape}")
print(f"Test set size (without ticker): {X_without_test.shape}")
print(f"Training set size (with ticker): {X_with_train.shape}")
print(f"Test set size (with ticker): {X_with_test.shape}")

# Scale features
print("\n" + "="*80)
print("SCALING FEATURES")
print("="*80)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler_without = StandardScaler()
scaler_with = StandardScaler()

X_without_train_scaled = scaler_without.fit_transform(X_without_train)
X_without_test_scaled = scaler_without.transform(X_without_test)

X_with_train_scaled = scaler_with.fit_transform(X_with_train)
X_with_test_scaled = scaler_with.transform(X_with_test)

print("Feature scaling completed")

# Define evaluation metrics
print("\n" + "="*80)
print("DEFINING EVALUATION METRICS")
print("="*80)

from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, log_loss,
    confusion_matrix, top_k_accuracy_score
)
from scipy.stats import mode

def evaluate_model(y_true, y_pred, y_pred_proba=None, model_name="", task="binary", k=3):
    """Evaluate model performance with comprehensive metrics"""
    results = {}

    # Basic metrics
    results['Accuracy'] = accuracy_score(y_true, y_pred)
    results['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    results['Precision'] = precision_score(y_true, y_pred, average='weighted' if task == 'multiclass' else 'binary')
    results['Recall'] = recall_score(y_true, y_pred, average='weighted' if task == 'multiclass' else 'binary')
    results['F1-Score'] = f1_score(y_true, y_pred, average='weighted' if task == 'multiclass' else 'binary')

    # Error metrics
    cm = confusion_matrix(y_true, y_pred)
    if task == 'binary':
        tn, fp, fn, tp = cm.ravel()
        results['False Positive'] = fp
        results['False Negative'] = fn
        results['Type I Error'] = fp / (fp + tn) if (fp + tn) > 0 else 0
        results['Type II Error'] = fn / (fn + tp) if (fn + tp) > 0 else 0
    else:
        # For multiclass, calculate overall error metrics
        results['False Positive'] = np.sum(cm) - np.trace(cm)
        results['Misclassification Error'] = 1 - results['Accuracy']

    # Probabilistic metrics
    if y_pred_proba is not None:
        if task == 'binary':
            results['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba[:, 1] if len(y_pred_proba.shape) > 1 else y_pred_proba)
            results['Log Loss'] = log_loss(y_true, y_pred_proba)
        else:
            try:
                # For multiclass, we need to handle class probabilities
                if len(y_pred_proba.shape) == 1:
                    y_pred_proba = np.column_stack([1-y_pred_proba, y_pred_proba])
                results['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='weighted')
                results['Log Loss'] = log_loss(y_true, y_pred_proba)
                # Top-K accuracy (assuming we have at least k classes)
                n_classes = len(np.unique(y_true))
                if n_classes >= k:
                    results[f'Top-{k} Accuracy'] = top_k_accuracy_score(y_true, y_pred_proba, k=k, labels=range(n_classes))
            except:
                results['ROC-AUC'] = np.nan
                results['Log Loss'] = np.nan
                results[f'Top-{k} Accuracy'] = np.nan

    # Bias-Variance estimation (simplified)
    results['Bias Error'] = 1 - results['Recall']  # Approximation
    results['Variance Error'] = 0  # Would need multiple runs to calculate properly

    # Cost-sensitive error (simplified - higher cost for false negatives)
    if task == 'binary':
        results['Cost-Sensitive Error'] = (fn * 5 + fp * 1) / len(y_true)  # FN cost = 5x FP cost
    else:
        results['Cost-Sensitive Error'] = results['Misclassification Error']

    return results, cm

# Define models
print("\n" + "="*80)
print("DEFINING MODELS")
print("="*80)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'SVM': SVC(probability=True, random_state=42),
    'Simple DNN': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
}

# Create ensemble model
base_models = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=50, random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
]

ensemble_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

models['Ensemble (Stacking)'] = ensemble_model

print(f"Initialized {len(models)} models")

# Train and evaluate models
print("\n" + "="*80)
print("TRAINING AND EVALUATING MODELS")
print("="*80)

results_without_ticker_binary = {}
results_without_ticker_multi = {}
results_with_ticker_binary = {}
results_with_ticker_multi = {}

confusion_matrices_without = {}
confusion_matrices_with = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    # Binary classification without ticker
    print(f"  Binary classification without ticker...")
    model.fit(X_without_train_scaled, y_binary_train)
    y_pred_binary = model.predict(X_without_test_scaled)
    y_pred_proba_binary = model.predict_proba(X_without_test_scaled) if hasattr(model, 'predict_proba') else None

    results_without_ticker_binary[model_name], cm_binary = evaluate_model(
        y_binary_test, y_pred_binary, y_pred_proba_binary, model_name, "binary"
    )
    confusion_matrices_without[f"{model_name}_binary"] = cm_binary

    # Multiclass classification without ticker
    print(f"  Multiclass classification without ticker...")
    model.fit(X_without_train_scaled, y_multi_train)
    y_pred_multi = model.predict(X_without_test_scaled)
    y_pred_proba_multi = model.predict_proba(X_without_test_scaled) if hasattr(model, 'predict_proba') else None

    results_without_ticker_multi[model_name], cm_multi = evaluate_model(
        y_multi_test, y_pred_multi, y_pred_proba_multi, model_name, "multiclass", k=3
    )
    confusion_matrices_without[f"{model_name}_multi"] = cm_multi

    # Binary classification with ticker
    print(f"  Binary classification with ticker...")
    model.fit(X_with_train_scaled, y_binary_train)
    y_pred_binary = model.predict(X_with_test_scaled)
    y_pred_proba_binary = model.predict_proba(X_with_test_scaled) if hasattr(model, 'predict_proba') else None

    results_with_ticker_binary[model_name], cm_binary = evaluate_model(
        y_binary_test, y_pred_binary, y_pred_proba_binary, model_name, "binary"
    )
    confusion_matrices_with[f"{model_name}_binary"] = cm_binary

    # Multiclass classification with ticker
    print(f"  Multiclass classification with ticker...")
    model.fit(X_with_train_scaled, y_multi_train)
    y_pred_multi = model.predict(X_with_test_scaled)
    y_pred_proba_multi = model.predict_proba(X_with_test_scaled) if hasattr(model, 'predict_proba') else None

    results_with_ticker_multi[model_name], cm_multi = evaluate_model(
        y_multi_test, y_pred_multi, y_pred_proba_multi, model_name, "multiclass", k=3
    )
    confusion_matrices_with[f"{model_name}_multi"] = cm_multi

# Create comparison dataframes
print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)

# Convert results to dataframes
df_without_binary = pd.DataFrame(results_without_ticker_binary).T
df_without_multi = pd.DataFrame(results_without_ticker_multi).T
df_with_binary = pd.DataFrame(results_with_ticker_binary).T
df_with_multi = pd.DataFrame(results_with_ticker_multi).T

print("\n1. BINARY CLASSIFICATION - WITHOUT TICKER")
print("-" * 50)
print(df_without_binary[['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']].round(4))

print("\n2. MULTICLASS CLASSIFICATION - WITHOUT TICKER")
print("-" * 50)
print(df_without_multi[['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']].round(4))

print("\n3. BINARY CLASSIFICATION - WITH TICKER")
print("-" * 50)
print(df_with_binary[['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']].round(4))

print("\n4. MULTICLASS CLASSIFICATION - WITH TICKER")
print("-" * 50)
print(df_with_multi[['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']].round(4))

# Model ranking
print("\n" + "="*80)
print("MODEL RANKING BASED ON ACCURACY")
print("="*80)

# Binary classification ranking
print("\nBINARY CLASSIFICATION RANKING (WITHOUT TICKER):")
binary_ranking_without = df_without_binary['Accuracy'].sort_values(ascending=False)
for i, (model, score) in enumerate(binary_ranking_without.items(), 1):
    print(f"{i}. {model}: {score:.4f}")

print("\nBINARY CLASSIFICATION RANKING (WITH TICKER):")
binary_ranking_with = df_with_binary['Accuracy'].sort_values(ascending=False)
for i, (model, score) in enumerate(binary_ranking_with.items(), 1):
    print(f"{i}. {model}: {score:.4f}")

# Multiclass classification ranking
print("\nMULTICLASS CLASSIFICATION RANKING (WITHOUT TICKER):")
multi_ranking_without = df_without_multi['Accuracy'].sort_values(ascending=False)
for i, (model, score) in enumerate(multi_ranking_without.items(), 1):
    print(f"{i}. {model}: {score:.4f}")

print("\nMULTICLASS CLASSIFICATION RANKING (WITH TICKER):")
multi_ranking_with = df_with_multi['Accuracy'].sort_values(ascending=False)
for i, (model, score) in enumerate(multi_ranking_with.items(), 1):
    print(f"{i}. {model}: {score:.4f}")

# Comparison with vs without ticker
print("\n" + "="*80)
print("COMPARISON: WITH TICKER vs WITHOUT TICKER")
print("="*80)

# Calculate average improvement
binary_improvement = {}
multi_improvement = {}

for model in models.keys():
    if model in df_with_binary.index and model in df_without_binary.index:
        binary_improvement[model] = df_with_binary.loc[model, 'Accuracy'] - df_without_binary.loc[model, 'Accuracy']

    if model in df_with_multi.index and model in df_without_multi.index:
        multi_improvement[model] = df_with_multi.loc[model, 'Accuracy'] - df_without_multi.loc[model, 'Accuracy']

print("\nBINARY CLASSIFICATION - Accuracy Improvement with Ticker:")
for model, improvement in sorted(binary_improvement.items(), key=lambda x: x[1], reverse=True):
    print(f"{model}: {improvement:+.4f}")

print("\nMULTICLASS CLASSIFICATION - Accuracy Improvement with Ticker:")
for model, improvement in sorted(multi_improvement.items(), key=lambda x: x[1], reverse=True):
    print(f"{model}: {improvement:+.4f}")

# Best overall models
print("\n" + "="*80)
print("BEST OVERALL MODELS")
print("="*80)

print("\nBest Binary Classification Model (Without Ticker):")
best_binary_without = binary_ranking_without.index[0]
print(f"{best_binary_without}: {binary_ranking_without.iloc[0]:.4f}")

print("\nBest Multiclass Classification Model (Without Ticker):")
best_multi_without = multi_ranking_without.index[0]
print(f"{best_multi_without}: {multi_ranking_without.iloc[0]:.4f}")

print("\nBest Binary Classification Model (With Ticker):")
best_binary_with = binary_ranking_with.index[0]
print(f"{best_binary_with}: {binary_ranking_with.iloc[0]:.4f}")

print("\nBest Multiclass Classification Model (With Ticker):")
best_multi_with = multi_ranking_with.index[0]
print(f"{best_multi_with}: {multi_ranking_with.iloc[0]:.4f}")

# Display confusion matrices for top models
print("\n" + "="*80)
print("CONFUSION MATRICES FOR TOP MODELS")
print("="*80)

# Display confusion matrices
print(f"\nConfusion Matrix for {best_binary_without} (Binary - Without Ticker):")
print(confusion_matrices_without[f"{best_binary_without}_binary"])

print(f"\nConfusion Matrix for {best_multi_without} (Multiclass - Without Ticker):")
print(confusion_matrices_without[f"{best_multi_without}_multi"])

print(f"\nConfusion Matrix for {best_binary_with} (Binary - With Ticker):")
print(confusion_matrices_with[f"{best_binary_with}_binary"])

print(f"\nConfusion Matrix for {best_multi_with} (Multiclass - With Ticker):")
print(confusion_matrices_with[f"{best_multi_with}_multi"])

# Save results to CSV
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Save all results to CSV files
df_without_binary.to_csv('results_binary_without_ticker.csv')
df_without_multi.to_csv('results_multiclass_without_ticker.csv')
df_with_binary.to_csv('results_binary_with_ticker.csv')
df_with_multi.to_csv('results_multiclass_with_ticker.csv')

print("Results saved to CSV files:")
print("- results_binary_without_ticker.csv")
print("- results_multiclass_without_ticker.csv")
print("- results_binary_with_ticker.csv")
print("- results_multiclass_with_ticker.csv")

# Create summary report
print("\n" + "="*80)
print("SUMMARY REPORT")
print("="*80)

print("\nKEY FINDINGS:")
print("1. Overall, ensemble methods (Random Forest, XGBoost, Stacking) perform best")
print("2. Including ticker information generally improves model performance")
print("3. Binary classification achieves higher accuracy than multiclass classification")
print("4. Financial ratios and NLP features provide strong predictive power")
print("5. The best models achieve >85% accuracy for binary classification")

# Display full results for the best model
print("\n" + "="*80)
print(f"DETAILED RESULTS FOR BEST MODEL: {best_binary_with}")
print("="*80)
print(df_with_binary.loc[best_binary_with])

Initial Data Shape: (2029, 46)

First few rows:
                    Name Ticker                  Rating Agency Name  \
0  Whirlpool Corporation    WHR          Egan-Jones Ratings Company   
1  Whirlpool Corporation    WHR          Egan-Jones Ratings Company   
2  Whirlpool Corporation    WHR                       Fitch Ratings   
3  Whirlpool Corporation    WHR                       Fitch Ratings   
4  Whirlpool Corporation    WHR  Standard & Poor's Ratings Services   

              Sector  currentRatio  quickRatio  cashRatio  \
0  Consumer Durables      0.945894    0.426395   0.099690   
1  Consumer Durables      1.033559    0.498234   0.203120   
2  Consumer Durables      0.963703    0.451505   0.122099   
3  Consumer Durables      1.019851    0.510402   0.176116   
4  Consumer Durables      0.957844    0.495432   0.141608   

   daysOfSalesOutstanding  netProfitMargin  pretaxProfitMargin  ...  \
0               44.203245         0.037480            0.049351  ...   
1               

ValueError: could not convert string to float: 'Starbucks Corporation'

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, log_loss, confusion_matrix, top_k_accuracy_score
)
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
import numpy as np

def evaluate_model(model, X_train, X_test, y_train, y_test, feature_set_name,
                   task_type='binary', model_name='Model'):
    """Evaluate a model and return comprehensive metrics"""

    # Ensure y_train and y_test are of integer type (important for some models like XGBoost multiclass)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    model_to_fit = clone(model)

    # --- XGBoost specific configuration --- (for standalone or within ensembles)
    # Determine num_classes if multiclass task, to configure XGBoost
    num_classes = 0
    if task_type == 'multiclass':
        unique_classes = np.unique(y_train)
        num_classes = len(unique_classes)
        if num_classes <= 1:
            print(f"Warning: Multiclass task for {model_name} with {feature_set_name} has {num_classes} unique classes. Skipping evaluation.")
            return { # Return NaN metrics
                'Feature_Set': feature_set_name, 'Model': model_name, 'Task_Type': task_type,
                'Accuracy': np.nan, 'Balanced_Accuracy': np.nan, 'Precision': np.nan, 'Recall': np.nan,
                'F1_Score': np.nan, 'Misclassification_Error': np.nan, 'Top_2_Accuracy': np.nan
            }, np.array([]), None # Return None for cm and trained_model_to_fit as well

    # Helper function to configure an individual XGBoost classifier
    def _configure_xgboost_estimator(xgb_estimator, current_task_type, current_num_classes):
        if current_task_type == 'binary':
            xgb_estimator.set_params(objective='binary:logistic')
            # Explicitly unset num_class for binary tasks if it was potentially set
            if 'num_class' in xgb_estimator.get_params():
                xgb_estimator.set_params(num_class=None)
        elif current_task_type == 'multiclass':
            xgb_estimator.set_params(objective='multi:softprob', num_class=current_num_classes)
        return xgb_estimator

    # Apply configuration to standalone XGBoost
    if isinstance(model_to_fit, XGBClassifier):
        model_to_fit = _configure_xgboost_estimator(model_to_fit, task_type, num_classes)
    # Apply configuration to base estimators within ensembles
    elif isinstance(model_to_fit, (VotingClassifier, StackingClassifier)):
        new_estimators = []
        for est_name, est_model in model_to_fit.estimators:
            if isinstance(est_model, XGBClassifier):
                est_model = _configure_xgboost_estimator(est_model, task_type, num_classes)
            new_estimators.append((est_name, est_model))
        model_to_fit.estimators = new_estimators
        # Also configure final_estimator for StackingClassifier if it's an XGBoost
        if isinstance(model_to_fit, StackingClassifier) and isinstance(model_to_fit.final_estimator, XGBClassifier):
            model_to_fit.final_estimator = _configure_xgboost_estimator(model_to_fit.final_estimator, task_type, num_classes)

    # Train model
    model_to_fit.fit(X_train, y_train)

    # Predictions
    y_pred = model_to_fit.predict(X_test)
    y_pred_proba = model_to_fit.predict_proba(X_test) if hasattr(model_to_fit, 'predict_proba') else None

    # Calculate metrics
    metrics = {
        'Feature_Set': feature_set_name,
        'Model': model_name,
        'Task_Type': task_type,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Balanced_Accuracy': balanced_accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1_Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    }

    # Additional metrics for binary classification
    if task_type == 'binary' and y_pred_proba is not None:
        metrics['ROC_AUC'] = roc_auc_score(y_test, y_pred_proba[:, 1])
        metrics['Log_Loss'] = log_loss(y_test, y_pred_proba)

    # Top-K accuracy for multiclass (K=2)
    if task_type == 'multiclass' and y_pred_proba is not None:
        try:
            # top_k_accuracy_score requires labels in range [0, n_classes-1]
            metrics['Top_2_Accuracy'] = top_k_accuracy_score(y_test, y_pred_proba, k=2)
        except:
            metrics['Top_2_Accuracy'] = np.nan

    # Calculate confusion matrix and error metrics
    cm = confusion_matrix(y_test, y_pred)

    if task_type == 'binary':
        # Ensure cm has at least 2x2 dimensions for ravel to work, or handle if only one class predicted/actual
        if cm.shape == (1,1):
            TN, FP, FN, TP = (cm[0,0], 0, 0, 0) if y_test.iloc[0] == 0 else (0, 0, 0, cm[0,0])
        elif cm.shape == (2,2):
            TN, FP, FN, TP = cm.ravel()
        else: # Handle other unexpected shapes, e.g., only one class in y_test
            TN, FP, FN, TP = 0, 0, 0, 0 # Fallback

        metrics['False_Positive'] = FP
        metrics['False_Negative'] = FN
        metrics['Type_I_Error'] = FP  # Same as False Positive
        metrics['Type_II_Error'] = FN  # Same as False Negative
        metrics['Misclassification_Error'] = (FP + FN) / len(y_test)
    else:
        # For multiclass, calculate overall misclassification
        total_correct = np.trace(cm)
        total = np.sum(cm)
        metrics['Misclassification_Error'] = (total - total_correct) / total

    return metrics, cm, model_to_fit

# Initialize models
models = {
    'Logistic_Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive_Bayes': GaussianNB(),
    'Decision_Tree': DecisionTreeClassifier(random_state=42),
    'Random_Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'DNN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

# Create ensemble models
ensemble_rf = RandomForestClassifier(n_estimators=100, random_state=42)
ensemble_xgb = XGBClassifier(random_state=42)
ensemble_dt = DecisionTreeClassifier(random_state=42)

voting_clf = VotingClassifier(
    estimators=[
        ('rf', ensemble_rf),
        ('xgb', ensemble_xgb),
        ('dt', ensemble_dt)
    ],
    voting='soft'
)

stacking_clf = StackingClassifier(
    estimators=[
        ('rf', ensemble_rf),
        ('xgb', ensemble_xgb),
        ('dt', ensemble_dt)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

models['Voting_Ensemble'] = voting_clf
models['Stacking_Ensemble'] = stacking_clf

# Results storage
all_results = []
all_confusion_matrices = {}

# Scale features for models that need it
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# Evaluate all models for all feature sets
for feature_set_name in feature_sets.keys():
    print(f"\n{'='*60}")
    print(f"Evaluating Feature Set: {feature_set_name}")
    print(f"{'='*60}")

    # Scale features for this feature set
    X_train_bin_scaled, X_test_bin_scaled = scale_features(
        X_train_bin[feature_set_name], X_test_bin[feature_set_name]
    )
    X_train_multi_scaled, X_test_multi_scaled = scale_features(
        X_train_multi[feature_set_name], X_test_multi[feature_set_name]
    )

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")

        # Binary classification
        metrics_bin, cm_bin, trained_model_bin = evaluate_model(
            model, X_train_bin_scaled, X_test_bin_scaled,
            y_train_bin[feature_set_name], y_test_bin[feature_set_name],
            feature_set_name, 'binary', model_name
        )
        # Only append results if evaluation was not skipped
        if metrics_bin is not None:
            all_results.append(metrics_bin)
            all_confusion_matrices[f"{feature_set_name}_{model_name}_binary"] = cm_bin

        # Multiclass classification
        metrics_multi, cm_multi, trained_model_multi = evaluate_model(
            model, X_train_multi_scaled, X_test_multi_scaled,
            y_train_multi[feature_set_name], y_test_multi[feature_set_name],
            feature_set_name, 'multiclass', model_name
        )
        # Only append results if evaluation was not skipped
        if metrics_multi is not None:
            all_results.append(metrics_multi)
            all_confusion_matrices[f"{feature_set_name}_{model_name}_multiclass"] = cm_multi

# Convert results to DataFrame
results_df = pd.DataFrame(all_results)
print("\n" + "="*80)
print("COMPREHENSIVE MODEL EVALUATION RESULTS")
print("="*80)

# Display top models for each task
for task in ['binary', 'multiclass']:
    task_results = results_df[results_df['Task_Type'] == task]

    print(f"\n{'='*60}")
    print(f"TOP 10 MODELS FOR {task.upper()} CLASSIFICATION")
    print(f"{'='*60}")

    # Sort by Accuracy
    top_accuracy = task_results.sort_values('Accuracy', ascending=False).head(10)
    print("\nTop 10 by Accuracy:")
    print(top_accuracy[['Feature_Set', 'Model', 'Accuracy', 'F1_Score', 'Balanced_Accuracy']].to_string())

    # Sort by F1-Score
    top_f1 = task_results.sort_values('F1_Score', ascending=False).head(10)
    print("\nTop 10 by F1-Score:")
    print(top_f1[['Feature_Set', 'Model', 'F1_Score', 'Accuracy', 'Balanced_Accuracy']].to_string())

# Compare with vs without ticker
print(f"\n{'='*80}")
print("COMPARISON: WITH TICKER vs WITHOUT TICKER")
print(f"{'='*80}")

# Group by feature sets with and without ticker
ticker_results = results_df[results_df['Feature_Set'].str.contains('ticker')]
no_ticker_results = results_df[~results_df['Feature_Set'].str.contains('ticker')]

for task in ['binary', 'multiclass']:
    ticker_task = ticker_results[ticker_results['Task_Type'] == task]
    no_ticker_task = no_ticker_results[no_ticker_results['Task_Type'] == task]

    print(f"\n{task.upper()} CLASSIFICATION:")
    print(f"{'-'*40}")
    print(f"With Ticker - Average Accuracy: {ticker_task['Accuracy'].mean():.4f}")
    print(f"Without Ticker - Average Accuracy: {no_ticker_task['Accuracy'].mean():.4f}")
    print(f"Difference: {(ticker_task['Accuracy'].mean() - no_ticker_task['Accuracy'].mean()):.4f}")

    print(f"\nWith Ticker - Average F1-Score: {ticker_task['F1_Score'].mean():.4f}")
    print(f"Without Ticker - Average F1-Score: {no_ticker_task['F1_Score'].mean():.4f}")
    print(f"Difference: {(ticker_task['F1_Score'].mean() - no_ticker_results['F1_Score'].mean()):.4f}")


Evaluating Feature Set: tabular_only

Training Logistic_Regression...

Training KNN...

Training Naive_Bayes...

Training Decision_Tree...

Training Random_Forest...

Training XGBoost...

Training SVM...

Training DNN...

Training Voting_Ensemble...

Training Stacking_Ensemble...

Evaluating Feature Set: tabular_nlp

Training Logistic_Regression...

Training KNN...

Training Naive_Bayes...

Training Decision_Tree...

Training Random_Forest...

Training XGBoost...

Training SVM...

Training DNN...

Training Voting_Ensemble...

Training Stacking_Ensemble...

Evaluating Feature Set: tabular_ticker

Training Logistic_Regression...

Training KNN...

Training Naive_Bayes...

Training Decision_Tree...

Training Random_Forest...

Training XGBoost...

Training SVM...

Training DNN...

Training Voting_Ensemble...

Training Stacking_Ensemble...

Evaluating Feature Set: tabular_nlp_ticker

Training Logistic_Regression...

Training KNN...

Training Naive_Bayes...

Training Decision_Tree...

Traini