In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix,recall_score,roc_auc_score,roc_curve,precision_score,f1_score,auc,accuracy_score

In [2]:
def boxplot_func(df,column):
    """Draw one horizontal box plot per column on a shared grid figure.

    Parameters
    ----------
    df : DataFrame-like object indexable as ``df[c]``.
    column : iterable of column labels to plot, in order.

    The grid is 10 plots wide. It keeps the original 5-row layout as a
    minimum and grows by whole rows when more than 50 columns are given,
    instead of failing on the 51st subplot. Grid cells without a column are
    hidden. The figure is shown with ``plt.show()``; nothing is returned.
    """
    column = list(column)
    plots_per_row = 10
    # Ceiling division without importing anything; never fewer than 5 rows.
    n_rows = max(5, -(-len(column) // plots_per_row))

    # Height scales with the row count so each row keeps the original size.
    fig, axs = plt.subplots(n_rows, plots_per_row,
                            figsize=(18, 24 * n_rows / 5), squeeze=False)
    fig.suptitle('Outliers')
    flat_axes = axs.ravel()

    # drawing plots
    for ax, c in zip(flat_axes, column):
        # Pass the data as `x=` explicitly: the meaning of the first
        # positional argument differs between seaborn releases.
        sns.boxplot(x=df[c], ax=ax)
        ax.set_xlabel(c)

    # Hide grid cells that received no column.
    for ax in flat_axes[len(column):]:
        ax.set_visible(False)
    plt.show()

In [9]:
def remove_Outliers(df,columns):
    """Drop rows lying outside the 1.5*IQR fences of each listed column.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the previous columns. The input frame is not
    modified; a filtered frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column labels to filter on.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the out-of-range rows. When ``columns`` is empty the
        original frame is returned unchanged.
    """
    for col in columns:
        values = df.loc[:, col]
        q1 = np.percentile(values, 25)
        q3 = np.percentile(values, 75)
        iqr = q3 - q1
        print("Old Shape: ", df.shape)
        old_rows = df.shape[0]
        # Named lower/upper so the builtins min()/max() are not shadowed.
        lower = q1 - (iqr*1.5)
        upper = q3 + (iqr*1.5)
        # Mask out only rows that compare strictly outside the fences, so
        # rows where the comparison is False (e.g. NaN) are kept.
        outside = (values < lower) | (values > upper)
        df = df.loc[~outside]
        print("New Shape: ", df.shape)
        print("Total number of observations dropped in train set:",old_rows-df.shape[0])
    # Return only after every column has been processed (the return used to
    # sit inside the loop, so only the first column was ever filtered).
    return df

In [4]:
def train_test_split_func(df,target_column):
    """Split ``df`` into train/test parts using a stratified 5-fold splitter.

    The features are every column except ``target_column``; the target is
    that column. A shuffled ``StratifiedKFold`` (5 splits, seed 42) is
    iterated to the end and only its final fold is kept.

    Returns
    -------
    (X_train, X_test, y_train, y_test) for that final fold. Their shapes are
    printed before returning.
    """
    features = df.drop([target_column], axis=1)
    target = df[target_column]

    # Stratified KFold Cross Validation: exhaust the generator and keep the
    # indices of the last fold only.
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    *_, (train_index, test_index) = splitter.split(features, target)

    X_train = features.iloc[train_index]
    X_test = features.iloc[test_index]
    y_train = target.iloc[train_index]
    y_test = target.iloc[test_index]

    report = (
        ('Shape of X_train :', X_train),
        ('Shape of X_test :', X_test),
        ('Shape of y_train :', y_train),
        ('Shape of y_test :', y_test),
    )
    for label, part in report:
        print(label, part.shape)
    return X_train,X_test,y_train,y_test

In [5]:
def metrics(y_true,y_pred):
    """Print classification metrics and plot a ROC curve for one prediction set.

    Prints the confusion matrix, accuracy, precision, F1 and recall, then
    draws the ROC curve with the chance diagonal on a fresh figure and shows
    it. Nothing is returned.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted values, passed unchanged to every scorer and to
        ``roc_curve``.

    NOTE(review): the scorers are called with their default arguments, which
    presumably means binary targets are expected - confirm against callers.
    If ``y_pred`` holds hard class labels rather than scores, the ROC curve
    is built from those labels only, so it is a coarse curve.
    """
    print("Confusion Matrix")
    print(confusion_matrix(y_true,y_pred))

    print("Accuracy:", accuracy_score(y_true,y_pred))
    print("Precision:", precision_score(y_true,y_pred))
    print("F1 Score:", f1_score(y_true,y_pred))
    print("Recall:", recall_score(y_true,y_pred))

    false_positive_rate, true_positive_rate, _ = roc_curve(y_true,y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)

    # Draw on a dedicated figure so the curve never lands on a figure left
    # open by earlier plotting code.
    fig, ax = plt.subplots()
    ax.plot(false_positive_rate, true_positive_rate, 'b')
    ax.plot([0,1],[0,1],'r--')
    # One title carrying both pieces of information: the descriptive title
    # used to be overwritten immediately by the AUC title and never showed.
    ax.set_title("Receiver Operating Characteristics (ROC), AUC=%0.2f" % roc_auc)
    ax.set_ylabel('Recall(True Positive Rate)')
    ax.set_xlabel('False Positive Rate')
    plt.show()

In [10]:
def model_validation(names,classifiers,X_train,X_test,y_train,y_test):
    """Fit each classifier and report its training and testing metrics.

    ``names`` and ``classifiers`` are walked in parallel with ``zip``, so
    extra entries in the longer one are ignored. Each classifier is fitted
    on the training data, then ``metrics`` is called once on the training
    predictions and once on the test predictions.

    Returns
    -------
    (name, model) of the LAST classifier processed, or ``(None, None)`` when
    there was nothing to process.
    """
    # Pre-bind the results so empty inputs return (None, None) instead of
    # raising UnboundLocalError at the return statement.
    name, model = None, None
    for name, clf in zip(names, classifiers):
        print("#"*10,"Model Validation for %s "%name,"#"*10)
        print("Training Metrics of %s"%name)
        model = clf.fit(X_train,y_train)
        metrics(y_train,model.predict(X_train))
        pred = model.predict(X_test)
        print("Testing Metrics of %s"%name)
        metrics(y_test,pred)
    return name,model

In [12]:
def plotImpFeatures(model,columns,ModelName,X=None,y=None):
    """Plot feature importances for a fitted model.

    Models that expose ``feature_importances_`` are plotted from that
    attribute. Models named "Naive Bayes", "LDA" or "QDA", and any model
    without that attribute, are scored with permutation importance instead.

    Parameters
    ----------
    model : fitted estimator.
    columns : feature labels, in the same order as the model's features.
    ModelName : display name used to pick the importance method.
    X, y : optional evaluation data for permutation importance. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used, which
        keeps existing three-argument calls working.

    Nothing is returned; the plot is shown with ``plt.show()``.
    """
    # The name test used to chain `!=` with `or`, which is true for every
    # name and made the permutation branch unreachable.
    names_without_native_importance = ("Naive Bayes", "LDA", "QDA")
    use_native = (ModelName not in names_without_native_importance
                  and hasattr(model, "feature_importances_"))

    #Important Features
    if use_native:
        feature_imp = pd.Series(model.feature_importances_,index=columns).sort_values(ascending=False)
        print("Important model Features:\n",feature_imp)

        plt.figure(figsize=(15,15))
        sns.barplot(x=feature_imp, y=feature_imp.index)
        # Add labels to your graph
        plt.title("Feature Importances")
        plt.ylabel("Features")
        plt.xlabel("Importances")
        plt.show()
    else:
        # Backward compatibility: fall back to the notebook globals when the
        # caller did not pass evaluation data explicitly.
        if X is None:
            X = X_test
        if y is None:
            y = y_test
        imp = permutation_importance(model, X, y)
        feature = pd.DataFrame({"imp":imp.importances_mean,"col":columns})
        # Ascending sort plus the last 30 rows keeps the 30 largest
        # importances, with the largest drawn at the top of the barh plot.
        feature = feature.sort_values(['imp','col'],ascending=[True,False]).iloc[-30:]
        feature.plot(kind='barh',x='col',y='imp',figsize=(10,7),legend=None)
        plt.title("Feature Importances")
        plt.ylabel("Features")
        plt.xlabel("Importances")
        plt.show()