In [3]:
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance

In [2]:
def bool_to_int(df):
    """Return a copy of ``df`` whose boolean columns are cast to ``int``.

    The input frame is not modified. Columns whose dtype does not compare
    equal to ``bool`` are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with every bool-dtype column converted through
        ``astype(int)``.
    """
    converted = df.copy()
    # Identify the boolean columns up front, then cast each one in turn.
    bool_columns = [name for name in converted.columns if converted[name].dtype == bool]
    for name in bool_columns:
        converted[name] = converted[name].astype(int)
    return converted

In [3]:
def export_model_results(y_test, y_pred, model_name, filename):
    """Compute classification metrics for a set of predictions and pickle them.

    The pickled object is a dict with the keys ``'model'``, ``'accuracy'``,
    ``'precision'``, ``'recall'`` and ``'f1_score'``. Every metric is computed
    by calling the corresponding scikit-learn scorer with only
    ``(y_test, y_pred)``, i.e. with that scorer's default options.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    model_name : str
        Stored under the ``'model'`` key of the result dict.
    filename : str or path-like
        Destination file; opened in binary write mode.
    """
    # (result key, scorer) pairs, listed in the order they appear in the dict.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    summary = {'model': model_name}
    for key, scorer in scorers:
        summary[key] = scorer(y_test, y_pred)

    with open(filename, 'wb') as out_file:
        pickle.dump(summary, out_file)

In [4]:
def export_tree_feature_importances(model, feature_names, model_name):
    """Pickle a model's feature importances, sorted from largest to smallest.

    Parameters
    ----------
    model : object
        Any object exposing a ``feature_importances_`` attribute
        (e.g. a fitted tree-based estimator).
    feature_names : sequence
        Names paired positionally with ``model.feature_importances_``.
    model_name : str
        Prefix of the output file name.

    Notes
    -----
    The result is a DataFrame with ``'Feature'`` and ``'Importance'`` columns,
    written to ``feature_importances/{model_name}_feature_importance.pkl``
    relative to the current working directory. The ``feature_importances``
    directory is created if it does not exist yet.
    """
    importances = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)

    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs("feature_importances", exist_ok=True)
    with open(f"feature_importances/{model_name}_feature_importance.pkl", "wb") as f:
        pickle.dump(feature_df, f)

In [5]:
def export_logreg_coefficients(model, feature_names, model_name):
    """Pickle a linear model's coefficients, sorted by absolute magnitude.

    Only the first row of ``model.coef_`` (``model.coef_[0]``) is exported.

    Parameters
    ----------
    model : object
        Any object exposing a ``coef_`` attribute whose first element holds
        one coefficient per feature (e.g. a fitted logistic regression).
    feature_names : sequence
        Names paired positionally with ``model.coef_[0]``.
    model_name : str
        Prefix of the output file name.

    Notes
    -----
    The result is a DataFrame with ``'Feature'``, ``'Coefficient'`` and
    ``'Abs_Coefficient'`` columns, sorted by ``'Abs_Coefficient'`` descending
    and written to ``feature_importances/{model_name}_coefficients.pkl``
    relative to the current working directory. The ``feature_importances``
    directory is created if it does not exist yet.
    """
    coefs = model.coef_[0]
    coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefs})
    coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
    coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs("feature_importances", exist_ok=True)
    with open(f"feature_importances/{model_name}_coefficients.pkl", "wb") as f:
        pickle.dump(coef_df, f)

In [17]:
def export_permutation_importance(model, X_test, y_test, feature_names, model_name,
                                  n_samples=1000, n_repeats=2):
    """Compute permutation importance on a sample of the test set and pickle it.

    Parameters
    ----------
    model : estimator
        Passed straight to ``sklearn.inspection.permutation_importance``.
    X_test : pandas.DataFrame
        Test features; a random subset of rows is drawn with ``.sample``.
    y_test : pandas.Series or pandas.DataFrame
        Test targets. Rows are selected with ``.loc`` using the index of the
        sampled ``X_test`` rows, so it must be label-indexable by that index.
    feature_names : sequence
        Names paired positionally with the computed mean importances.
    model_name : str
        Prefix of the output file name.
    n_samples : int, default 1000
        Maximum number of test rows to evaluate on. Capped at ``len(X_test)``
        so that small test sets no longer raise ``ValueError``.
    n_repeats : int, default 2
        Forwarded to ``permutation_importance``.

    Notes
    -----
    The result is a DataFrame with ``'Feature'`` and ``'Importance'`` columns,
    sorted by importance descending and written to
    ``feature_importances/{model_name}_permutation.pkl`` relative to the
    current working directory. The ``feature_importances`` directory is
    created if it does not exist yet.
    """
    # DataFrame.sample (without replacement) raises if asked for more rows
    # than exist, so never request more than the test set holds.
    sample_size = min(n_samples, len(X_test))
    X_sample = X_test.sample(sample_size, random_state=42)
    y_sample = y_test.loc[X_sample.index]

    result = permutation_importance(model, X_sample, y_sample, n_repeats=n_repeats, random_state=42)
    df_perm = pd.DataFrame({'Feature': feature_names, 'Importance': result.importances_mean})
    df_perm = df_perm.sort_values(by='Importance', ascending=False)

    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs("feature_importances", exist_ok=True)
    with open(f"feature_importances/{model_name}_permutation.pkl", "wb") as f:
        pickle.dump(df_perm, f)

In [5]:
def plot_logreg_coefficients(model, feature_names, title='Logistic Regression Coefficients'):
    """Draw a bar chart of ``model.coef_[0]``, ordered from largest to smallest.

    Parameters
    ----------
    model : object
        Any object exposing ``coef_``; only its first row is plotted.
    feature_names : sequence
        Labels paired positionally with ``model.coef_[0]``.
    title : str, optional
        Figure title.

    Notes
    -----
    Opens a new 10x6 inch figure at 200 dpi through the pyplot interface and
    leaves it as the current figure; nothing is returned.
    """
    ranked = pd.Series(model.coef_[0], index=feature_names)
    ranked = ranked.sort_values(ascending=False)
    labels, heights = ranked.index, ranked.values

    plt.figure(figsize=(10, 6), dpi=200)
    sns.barplot(x=labels, y=heights)
    plt.title(title)
    plt.xlabel("")
    plt.ylabel("Coefficient")
    # Vertical tick labels keep long feature names readable.
    plt.xticks(rotation=90)
    plt.tight_layout()
    

In [18]:
def plot_permutation_importance(model, X_test, y_test, feature_names, title="Permutation Importance",
                                n_samples=1000, n_repeats=2):
    """Plot permutation importance computed on a sample of the test set.

    Parameters
    ----------
    model : estimator
        Passed straight to ``sklearn.inspection.permutation_importance``.
    X_test : pandas.DataFrame
        Test features; a random subset of rows is drawn with ``.sample``.
    y_test : pandas.Series or pandas.DataFrame
        Test targets. Rows are selected with ``.loc`` using the index of the
        sampled ``X_test`` rows, so it must be label-indexable by that index.
    feature_names : sequence
        Labels paired positionally with the computed mean importances.
    title : str, optional
        Figure title.
    n_samples : int, default 1000
        Maximum number of test rows to evaluate on. Capped at ``len(X_test)``
        so that small test sets no longer raise ``ValueError``.
    n_repeats : int, default 2
        Forwarded to ``permutation_importance``.

    Notes
    -----
    Opens a new 10x6 inch figure at 200 dpi through the pyplot interface and
    leaves it as the current figure; nothing is returned.
    """
    # DataFrame.sample (without replacement) raises if asked for more rows
    # than exist, so never request more than the test set holds.
    sample_size = min(n_samples, len(X_test))
    X_sample = X_test.sample(sample_size, random_state=42)
    y_sample = y_test.loc[X_sample.index]

    result = permutation_importance(model, X_sample, y_sample, n_repeats=n_repeats, random_state=42)
    perm_series = pd.Series(result.importances_mean, index=feature_names).sort_values(ascending=False)

    plt.figure(figsize=(10,6), dpi=200)
    sns.barplot(x=perm_series.index, y=perm_series.values)
    plt.xticks(rotation=90)
    plt.title(title)
    plt.ylabel("Mean Importance Drop")
    plt.xlabel("")
    plt.tight_layout()

In [4]:
def plot_tree_feature_importance(model, feature_names, title='Feature Importance'):
    """Draw a bar chart of ``model.feature_importances_``, largest first.

    Parameters
    ----------
    model : object
        Any object exposing a ``feature_importances_`` attribute.
    feature_names : sequence
        Labels paired positionally with ``model.feature_importances_``.
    title : str, optional
        Figure title.

    Notes
    -----
    Opens a new 10x6 inch figure at 200 dpi through the pyplot interface and
    leaves it as the current figure; nothing is returned.
    """
    ranked = pd.Series(model.feature_importances_, index=feature_names)
    ranked = ranked.sort_values(ascending=False)
    labels, heights = ranked.index, ranked.values

    plt.figure(figsize=(10, 6), dpi=200)
    sns.barplot(x=labels, y=heights)
    plt.title(title)
    plt.xlabel("")
    plt.ylabel('Importance')
    # Vertical tick labels keep long feature names readable.
    plt.xticks(rotation=90)
    plt.tight_layout()

In [8]:
def plot_model_importances(dictionary, model_name, importance_type='importance'):
    """Plot untuned vs. tuned importances side by side as horizontal bars.

    Parameters
    ----------
    dictionary : dict
        Must contain the keys ``'untuned'`` and ``'tuned'``, each mapping to a
        DataFrame with a feature-name column (the first column whose name
        contains "feature", case-insensitively) and an importance column.
    model_name : str
        Used as the figure title.
    importance_type : str, default 'importance'
        Name of the importance column. An exact match is preferred; otherwise
        the name is matched case-insensitively, so the default also finds an
        ``'Importance'`` column. It is also used verbatim as the x-axis label.

    Raises
    ------
    ValueError
        If no column name contains "feature".
    KeyError
        If no column matches ``importance_type``.

    Notes
    -----
    Features are ordered by their mean importance across both frames,
    largest first. Opens a new 12x6 inch figure at 200 dpi through the pyplot
    interface and leaves it as the current figure; nothing is returned. The
    input frames are copied and not modified.
    """
    untuned = dictionary['untuned'].copy()
    tuned = dictionary['tuned'].copy()

    untuned['Model'] = 'Untuned'
    tuned['Model'] = 'Tuned'

    # A fresh index avoids duplicate row labels after stacking the two frames.
    combined = pd.concat([untuned, tuned], ignore_index=True)

    feature_col = next(
        (col for col in combined.columns if 'feature' in str(col).lower()),
        None,
    )
    if feature_col is None:
        raise ValueError(
            "No feature column found: expected a column whose name contains 'feature', "
            f"got {list(combined.columns)}"
        )

    # Resolve the importance column: exact name first, then case-insensitive,
    # so the lowercase default matches a capitalised 'Importance' column.
    if importance_type in combined.columns:
        value_col = importance_type
    else:
        wanted = str(importance_type).lower()
        value_col = next(
            (col for col in combined.columns if str(col).lower() == wanted),
            None,
        )
        if value_col is None:
            raise KeyError(
                f"Column not found: {importance_type!r} (available: {list(combined.columns)})"
            )

    feature_order = (
        combined.groupby(feature_col)[value_col]
        .mean()
        .sort_values(ascending=False)
        .index
    )
    combined[feature_col] = pd.Categorical(combined[feature_col], categories=feature_order, ordered=True)

    plt.figure(figsize=(12,6), dpi=200)
    sns.barplot(data=combined, y=feature_col, x=value_col, hue='Model')
    # model_name was previously accepted but unused; show it as the title.
    plt.title(model_name)
    plt.xlabel(importance_type)
    plt.ylabel('Feature')
    plt.tight_layout()
    