In [10]:
# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)

# Numpy for numerical computing
import numpy as np

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns
sns.set_style('darkgrid')

# display Python object in all frontends
from IPython.display import display

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

# Import SVM classifier 
from sklearn.svm import SVC 

# Import RandomForestClassifier, GradientBoostingClassifier and AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Functions for splitting into training/test sets and for stratified k-fold cross-validation
from sklearn.model_selection import train_test_split, StratifiedKFold

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

# StandardScaler
from sklearn.preprocessing import StandardScaler

# GridSearchCV
from sklearn.model_selection import GridSearchCV

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score

# Compute class weights for imbalanced datasets
from sklearn.utils.class_weight import compute_class_weight

# Ignore some warning messages
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning, UndefinedMetricWarning
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=DataConversionWarning)
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning) 


Using TensorFlow backend.


In [1]:
def fit_tune_CV(pipe, scorer):
    """Grid-search every pipeline in ``pipe`` with stratified 5-fold CV.

    Reads the notebook-level globals ``random_state``, ``hyperparameters``,
    ``X_train`` and ``y_train``.

    Parameters
    ----------
    pipe : dict
        Maps a model name to the pipeline to tune. Each name must also be a
        key of the global ``hyperparameters`` dict, whose value is used as
        the parameter grid for that pipeline.
    scorer
        Forwarded unchanged to ``GridSearchCV(scoring=...)``.

    Returns
    -------
    dict
        Maps each model name to its fitted ``GridSearchCV`` object.
    """
    # One shuffled, seeded splitter shared by all searches so every model is
    # tuned on the same folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    print('skf calculated')

    fit_models = {}

    for name, pipeline in pipe.items():
        print(name, 'Searching best Hyperparametres')
        # NOTE: the former ``iid=True`` argument is deliberately not passed.
        # ``iid`` was deprecated in scikit-learn 0.22 and removed in 0.24, so
        # passing it raises TypeError on current releases.
        model = GridSearchCV(pipeline, hyperparameters[name], cv=skf,
                             scoring=scorer, n_jobs=-1)

        print(name, 'fitting')
        model.fit(X_train, y_train)

        fit_models[name] = model
        print(name, 'has been fitted')

    return fit_models

In [2]:
def evaluation(fit_models):
    """Build a per-model score table from fitted search objects.

    Reads the notebook-level globals ``X_test`` and ``y_test``.

    Parameters
    ----------
    fit_models : dict
        Maps a model name to a fitted object exposing ``predict`` and
        ``best_score_``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model`` with the columns ``CV_score`` (the object's
        ``best_score_``), ``f1_macro`` and ``accuracy`` (both computed from
        predictions on ``X_test`` against ``y_test``).
    """
    score_columns = ['model', 'CV_score', 'f1_macro', 'accuracy']

    def _score_row(name, model):
        # Predict once and reuse the predictions for both test metrics.
        y_pred = model.predict(X_test)
        return [
            name,
            model.best_score_,
            f1_score(y_test, y_pred, average='macro'),
            accuracy_score(y_test, y_pred),
        ]

    rows = [_score_row(name, model) for name, model in fit_models.items()]
    return pd.DataFrame(rows, columns=score_columns).set_index('model')

In [3]:
def eval_plot(eval_df):
    """Draw a grouped bar chart of the evaluation metrics for each model.

    Parameters
    ----------
    eval_df : pandas.DataFrame
        Score table whose index is named ``model`` and whose columns are the
        metrics ``CV_score``, ``f1_macro`` and ``accuracy`` (the palette below
        has a colour for exactly these three).
    """
    # Long format: one row per (model, metric) pair, as expected by catplot.
    long_scores = eval_df.reset_index().melt(
        id_vars='model', var_name='metrics', value_name='score')

    metric_colors = {
        'CV_score': 'red',
        'f1_macro': 'orange',
        'accuracy': 'royalblue',
    }
    sns.catplot(data=long_scores, x='model', y='score', hue='metrics',
                kind='bar', palette=metric_colors)

    plt.title('Evaluation Metrics', fontsize=14)
    plt.xticks(fontsize=12)
    plt.xlabel('Model', size=12)
    plt.ylabel('Score', size=12)
    plt.show()

In [4]:
def plot_conf_mat_w_and_wo_norm(fit_models, model_id, color):
    """Show raw and row-normalized confusion matrices side by side.

    Reads the notebook-level globals ``X_test``, ``y_test``, ``models``
    (display title per model id) and ``target_names`` (tick labels).

    Parameters
    ----------
    fit_models : dict
        Maps a model id to a fitted object exposing ``predict``.
    model_id
        Key selecting the model in both ``fit_models`` and ``models``.
    color
        Passed to ``sns.heatmap`` as ``cmap`` for both panels.
    """
    y_pred = fit_models[model_id].predict(X_test)

    fig, (ax_raw, ax_norm) = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(models[model_id], fontsize=14)
    fig.subplots_adjust(top=0.85, wspace=0.3)

    counts = confusion_matrix(y_test, y_pred)
    # Divide each row by its own total so every row of true labels sums to 1.
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    shares = counts / row_totals

    # Heatmap options common to both panels.
    shared = dict(annot=True, annot_kws=dict(fontsize=14), cbar=True,
                  square=True, cmap=color)

    # (axes, matrix, panel-specific heatmap options, title, x label, y label)
    panels = [
        (ax_raw, counts, dict(fmt='d'),
         'Confusion Matrix w/o Normalization',
         'Predicted Labels', 'True Labels'),
        (ax_norm, shares, dict(fmt='.2f', vmin=0, vmax=1),
         'Normalized Confusion Matrix',
         'Predicted Label', 'True Label'),
    ]

    for ax, matrix, extra, title, x_label, y_label in panels:
        sns.heatmap(matrix, ax=ax, **shared, **extra)
        ax.set_xticklabels(labels=target_names)
        ax.set_yticklabels(labels=target_names, va='center')
        ax.set_title(title)
        ax.set_xlabel(x_label, size=12)
        ax.set_ylabel(y_label, size=12)

    plt.show()

In [5]:
def plot_norm_conf_matrices(fit_models, color):
    """Plot a row-normalized confusion matrix for every fitted model.

    The heatmaps are laid out on a two-column grid, filled row by row in the
    iteration order of ``fit_models``. Reads the notebook-level globals
    ``X_test``, ``y_test``, ``models`` (display title per model id) and
    ``target_names`` (tick labels).

    Parameters
    ----------
    fit_models : dict
        Maps a model id to a fitted object exposing ``predict``.
    color
        Passed to ``sns.heatmap`` as ``cmap``.

    Notes
    -----
    Works for any number of models: an empty dict draws nothing, and an odd
    count leaves the last grid cell hidden.
    """
    n_cols = 2
    n_models = len(fit_models)
    if n_models == 0:
        # Nothing to plot; a zero-row subplot grid would raise.
        return

    # Ceiling division: an odd number of models still needs a final row.
    n_rows = -(-n_models // n_cols)

    # squeeze=False keeps ``axs`` two-dimensional even for a single row, so
    # the layout code below does not depend on the number of models.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(10, n_rows * 4),
                            squeeze=False)
    fig.suptitle('Normalized Confusion Matrices', fontsize=14)
    fig.subplots_adjust(top=0.94, wspace=0.90, hspace=0.2)

    # Row-major flattening reproduces the (row, col) fill order of the grid.
    flat_axes = axs.ravel()

    for ax, (model_id, model) in zip(flat_axes, fit_models.items()):
        pred = model.predict(X_test)
        name = models[model_id]

        mat = confusion_matrix(y_test, pred)
        # Divide each row by its own total so every row sums to 1.
        matn = mat / mat.sum(axis=1)[:, np.newaxis]
        sns.heatmap(matn,
                    annot=True,
                    annot_kws=dict(fontsize=14),
                    fmt='.2f',
                    cbar=False,
                    square=True,
                    cmap=color,
                    vmin=0,
                    vmax=1,
                    ax=ax)

        ax.set_xticklabels(labels=target_names)
        ax.set_yticklabels(labels=target_names, va='center')
        ax.set_title(name)
        ax.set_xlabel('Predicted Label', size=12)
        ax.set_ylabel('True Label', size=12)

    # Hide grid cells left over when the model count is odd.
    for ax in flat_axes[n_models:]:
        ax.set_visible(False)

    plt.show()

In [6]:
def class_rep_cm(fit_models, model_id):
    """Print a confusion-matrix table and classification report for one model.

    Reads the notebook-level globals ``X_test``, ``y_test``, ``models``
    (display title per model id) and ``target_names`` (class labels used for
    the table axes and the report).

    Parameters
    ----------
    fit_models : dict
        Maps a model id to a fitted object exposing ``predict``.
    model_id
        Key selecting the model in both ``fit_models`` and ``models``.
    """
    y_pred = fit_models[model_id].predict(X_test)

    # Header: blank line, tab-indented title, '=' underline of the same length.
    title = models[model_id]
    underline = '=' * len(title)
    print()
    print('\t', title)
    print('\t', underline)

    # Confusion matrix as a labelled table: rows are true classes, columns
    # are predicted classes.
    cm_table = pd.DataFrame(confusion_matrix(y_test, y_pred),
                            index=target_names,
                            columns=target_names)
    cm_table.index.name = 'True Labels'
    cm_table = cm_table.rename_axis('Predicted Labels', axis='columns')
    display(cm_table)

    print()
    report = classification_report(y_test, y_pred, target_names=target_names)
    print(report)

In [8]:
def best_hyp_param(fit_models):
    """Tabulate the best hyperparameters found for every fitted model.

    Reads the notebook-level global ``models`` to translate each key of
    ``fit_models`` into a display name.

    Parameters
    ----------
    fit_models : dict
        Maps a model id to a fitted search object exposing ``best_params_``
        (a dict of hyperparameter name -> selected value).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Model`` with the columns ``Hyperparameter`` and
        ``Value``; one row per (model, hyperparameter) pair, in iteration
        order. An empty ``fit_models`` yields an empty frame with the same
        columns.
    """
    # Collect plain records and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration (quadratic copying, and a
    # FutureWarning in recent pandas when concatenating onto an empty frame).
    records = []
    for name, model in fit_models.items():
        model_name = models[name]
        records.extend(
            (model_name, param, value)
            for param, value in model.best_params_.items()
        )

    bp_df = pd.DataFrame(records, columns=['Model', 'Hyperparameter', 'Value'])
    return bp_df.set_index('Model')
