In [1]:
import pandas as pd
import numpy as np
import os
import datetime
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, balanced_accuracy_score #classification_report
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from factor_analyzer import FactorAnalyzer
from pytrie import StringTrie
from collections import Counter
from sklearn.impute import KNNImputer

In [3]:
import warnings
warnings.filterwarnings("ignore")

# All Functions

In [4]:
def var_imp(lm_coef, X,y):
    feature_importance = lm_coef
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    featfig = plt.figure()
    print(pos)
    print(feature_importance[sorted_idx])
    print(np.array(X.columns)[sorted_idx])
    featax = featfig.add_subplot(1, 1, 1)
    featax.barh(pos, feature_importance[sorted_idx], align='center')
    featax.set_yticks(pos)
    featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=8)
    featax.set_xlabel('Relative Feature Importance')
    plt.tight_layout()   
    plt.show()

In [5]:
def var_imp_df(lm_coef, X,y):
    feature_importance = lm_coef
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    label = np.array(X.columns)[sorted_idx]
    value = feature_importance[sorted_idx]
    feat_imp_df = pd.DataFrame({'feature':label,'Imp_index':value})
    return feat_imp_df

In [6]:
# Module to do factor Analysis on the columns
# Function: to find the Strings with first few letters e.g. 'ASE', 'Z_ASE'
def prefixSearch(arr,prefix): 
    trie=StringTrie() 
    for key in arr: 
        trie[key] = key 
    return trie.values(prefix) 

# Function to calculate No of Factors based on Eigen Values
def eigenvalues(data):
    eigvals=np.array(data)
    eigvals=eigvals.T
    corrmat=np.corrcoef(eigvals)
    eigenvalues, eigenvectors = np.linalg.eig(corrmat)
    Count=Counter(eigenvalues>1)[1]
    return Count,eigenvectors

# Function to calculate Factor Scores
def FacterCluster(df,rotation = 'varimax'):
    fa = FactorAnalyzer(n_factors=eigenvalues(df)[0],rotation=rotation,method='ml',use_smc=True)
    df1=pd.DataFrame.from_records(fa.fit_transform(df))
    df1=pd.DataFrame(df1)
    df1 = df1.add_prefix('Factor_')
    df2 = pd.concat([df, df1], axis=1)
    return df2

In [7]:
# Module for resampling Unbalanced data
def up_sample_imbalanced(df,feature='lapse' ):
    y_zero = df[df[feature]== 0]
    y_one = df[df[feature]== 1]
    ##Upsample the 1 cases
    df_resampled = resample(y_one,
                            replace=True, # sample with replacement
                            n_samples=y_zero.shape[0])
    ndf = pd.concat([y_zero, df_resampled]).reset_index()
    return ndf

## Classification Functions

In [8]:
# Logistic Regression
def Logistic_Regression(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Logistic Regression**')
    logmodel = LogisticRegression(random_state=0,class_weight='balanced', solver = 'saga')
    logmodel.fit(X_train,y_train)
    predictions = logmodel.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, logmodel.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(logmodel.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(logmodel.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = var_imp_df(abs(logmodel.coef_[0]), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    model_metrics = {'Model':['Logistic_Regression'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x], 'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                     'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC','F1_Score',
                                                 'Balanced_F1_Score','Train_Accuracy',
                                                 'Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [9]:
def Logistic_Regression_cv(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Logistic Regression Cross Validation**')
    logmodel = LogisticRegressionCV(cv=5,random_state=0,class_weight='balanced')
    logmodel.fit(X_train,y_train)
    predictions = logmodel.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, logmodel.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    #print("Classification Report: \n ", classification_report(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(logmodel.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(logmodel.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = var_imp_df(abs(logmodel.coef_[0]), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    model_metrics = {'Model':['Logistic_Regression_CV'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob,var_imp    

In [10]:
# Random Forest
def random_forest(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Random Forest**')
    ## Configuring parameters and values for searched
    tuned_parameters = [{'max_depth': [10, 15],
                         'n_estimators': [10,20],
                         'max_features': ['sqrt', 'auto', 'log2']}]
    ## Initializing the RF classifier
    radm_clf = RandomForestClassifier()
    ## Configuring search with the tunable parameters
    clf = GridSearchCV(radm_clf,
                       tuned_parameters,
                       cv=15,
                       scoring='roc_auc')
    ## Fitting the training set
    clf.fit(X_train, y_train)
    m = clf.best_params_
    # Using best parameters from grid search
    clf = RandomForestClassifier(max_depth=m['max_depth'], n_estimators=m['n_estimators'],
                                 max_features = m['max_features'],random_state=101)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, clf.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(clf.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(clf.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = var_imp_df(abs(clf.feature_importances_), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    model_metrics = {'Model':['random_forest_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [11]:
# Gradient Boosting
def gradient_boost(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Gradient Boosting Classifier**')
    gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10)
    gboost_clf.fit(X_train, y_train)
    predictions = gboost_clf.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, gboost_clf.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(gboost_clf.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(gboost_clf.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = var_imp_df(abs(gboost_clf.feature_importances_), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    model_metrics = {'Model':['gradient_boost_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [12]:
# Decision Tree
def decision_tree_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Decision Tree Classification**')
    # Grid Search for best Parameters
    tuned_parameters = [{'criterion': ['gini','entropy'], 
                         'max_depth': range(2,10), 
                         'max_features':[None,'auto', 'sqrt', 'log2']}]
    clf_tree = DecisionTreeClassifier()
    clf = GridSearchCV(clf_tree,
                   tuned_parameters,
                   cv=10,
                   scoring='roc_auc')
    clf.fit(X_train, y_train)
    m = clf.best_params_
    # Model with best parameters
    clf_tree = DecisionTreeClassifier(criterion = m['criterion'], 
                                      max_depth = m['max_depth'], max_features=m['max_features'])
    clf_tree.fit( X_train, y_train )
    predictions = clf_tree.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, clf_tree.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(clf_tree.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(clf_tree.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    model_metrics = {'Model':['decision_tree_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    var_imp = var_imp_df(abs(clf_tree.feature_importances_), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    return moddf, pred_prob,var_imp    

In [13]:
# KNN Clasifier
def knn_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**KNN Classifier**')
    # Find Best Parameters
    ## Creating a dictionary with hyperparameters and possible values for searching
    tuned_parameters = [{'n_neighbors': range(5,10),'metric': ['canberra', 'euclidean', 'minkowski']}] 
    ## Configuring grid search
    clf = GridSearchCV(KNeighborsClassifier(),
                       tuned_parameters,
                       cv=10,
                       scoring='roc_auc')
    clf.fit(X_train, y_train )
    m = clf.best_params_
    # Model with best parameters
    knn_clf = KNeighborsClassifier(n_neighbors = m['n_neighbors'], metric = m['metric'])
    knn_clf.fit( X_train, y_train )
    predictions = knn_clf.predict(X_test)
    print("Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    Train_Accuracy = accuracy_score(y_train, knn_clf.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(knn_clf.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(knn_clf.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    model_metrics = {'Model':['knn_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    var_imp = []
    return moddf, pred_prob, var_imp
    #var_imp(abs(knn_clf.feature_importances_), X,y)

In [14]:
# LSVC
def LSVC_Clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**LSVC Classifier**')
    lsvc = LinearSVC()
    lsvc.fit(X_train,y_train)
    predictions = lsvc.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, lsvc.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    var_imp = var_imp_df(abs(lsvc.coef_[0]), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    pred_prob = pd.concat([pd.DataFrame(lsvc.predict(mdf[X_train.columns])).reset_index(drop=True),y], axis=1)
    pred_prob['1'] = abs(pred_prob[0]-1)
    pred_prob['predicted'] = pred_prob['1']
    pred_prob = pred_prob.loc[:,[0,'1','lapse','predicted']]
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    model_metrics = {'Model':['LSVC_Clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [15]:
# SVM
def SVM_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**SVM Classifier**')
    svc = SVC()
    svc.fit(X_train,y_train)
    predictions = svc.predict(X_test)
    print("Train Accuracy :: ", accuracy_score(y_train, svc.predict(X_train)))
    print("Test Accuracy  :: ", accuracy_score(y_test, predictions))
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    print(" Classification Report: \n ", classification_report(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    print("auc = ", aucx)
    print("F1 Score = ", f1x)
    print("Accuracy = ", accuracyx)
    model_metrics = {'Model':['SVM_clf'],'Accuracy':[accuracyx], 'AUC': [aucx], 'F1_Score':[f1x]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC','F1_Score'])
    return moddf    #var_imp(abs(svc.coef_[0]), X,y)

In [16]:
# XG Boost
def xgb_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**XG Boost Classifier**')
    xgbm = XGBClassifier()
    xgbm.fit(X_train,y_train)
    predictions = xgbm.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, xgbm.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(xgbm.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(xgbm.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = var_imp_df(abs(xgbm.feature_importances_), mdf[X_train.columns],mdf[pd.DataFrame(y_train).columns])
    model_metrics = {'Model':['xgb_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [17]:
# MLP (Multi-layer Perceptron classifier)Clf
def mlp_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**MLP Classifier**')
    mlp = MLPClassifier(random_state=1, max_iter=300)
    mlp.fit(X_train,y_train)
    predictions = mlp.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, mlp.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(mlp.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(mlp.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = []
    model_metrics = {'Model':['mlp_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob, var_imp

In [18]:
# LinearDiscriminantAnalysis
def lda_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Linear Discriminant Analysis**')
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train,y_train)
    predictions = lda.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, lda.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.concat([pd.DataFrame(lda.predict_proba(mdf[X_train.columns])).reset_index(drop=True),mdf[pd.DataFrame(y_train).columns]], axis=1)
    pred_prob = pd.concat([pred_prob.reset_index(drop=True),pd.DataFrame(lda.predict(mdf[X_train.columns]))], axis=1)
    pred_prob.columns = ['pred_prob_0','pred_prob_1', 'Actual', 'predicted']
    var_imp = []
    model_metrics = {'Model':['lda_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob,var_imp

In [19]:
# SGDClassifier (stochastic gradient descent)
def sgd_clf(X_train, X_test, y_train, y_test, mdf):
    start = time.time()
    print('**Stochastic Gradient Descent Clf**')
    tuned_parameters = [{'loss': ['log', 'modified_huber']}] 
    ## Configuring grid search
    clf = GridSearchCV(SGDClassifier(),
                       tuned_parameters,
                       cv=10,
                       scoring='roc_auc')
    clf.fit(X_train, y_train )
    m = clf.best_params_
    print(m)
    ####
    sgd = SGDClassifier(loss = m['loss'])
    sgd.fit(X_train,y_train)
    predictions = sgd.predict(X_test)
    Train_Accuracy = accuracy_score(y_train, sgd.predict(X_train))
    Test_Accuracy = accuracy_score(y_test, predictions)
    print(" Confusion matrix: \n ", confusion_matrix(y_test, predictions))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
    aucx = metrics.auc(fpr, tpr)
    f1x = metrics.f1_score(y_test, predictions)
    bal_f1scr = metrics.balanced_accuracy_score(y_test, predictions)
    accuracyx = metrics.accuracy_score(y_test, predictions)
    pred_prob = pd.DataFrame(sgd.predict_proba(mdf[X_train.columns]))
    pred_prob.columns = ['pred_prob_0','pred_prob_1']
    pred_prob['predicted'] = (pred_prob['pred_prob_1'] > 0.65).astype('int')
    var_imp = []
    model_metrics = {'Model':['sgd_clf'],'Accuracy':[accuracyx],
                     'AUC': [aucx], 'F1_Score':[f1x],'Balanced_F1_Score':[bal_f1scr],
                    'Train_Accuracy':[Train_Accuracy], 'Test_Accuracy':[Test_Accuracy],
                    'TimeTaken':[time.time()-start]}
    moddf = pd.DataFrame(model_metrics, columns=['Model','Accuracy','AUC',
                                                 'F1_Score','Balanced_F1_Score',
                                                 'Train_Accuracy','Test_Accuracy','TimeTaken'])
    return moddf, pred_prob,var_imp

# Feature Engineering  - will vary based on context, project

In [20]:
# Read Transaction Data

In [21]:
dft = pd.read_excel("Jli_Surver_data_lapse.xlsx")

In [22]:
dft = dft[dft['INVOICE_DATE']<'12/30/2017']

In [23]:
# Join With Survey Data

In [24]:
dfs = pd.read_csv("survey_data.csv")

In [25]:
sat_df = pd.read_csv("score1.csv")

In [26]:
dfs = dfs.merge(sat_df, how = 'left', left_on='SATISFACTION', right_on='find')
dfs.rename(columns={'replace':'SATISFACTION_scr'},inplace=True)
dfs = dfs.drop(['find'], axis=1)

In [27]:
return_df = pd.read_csv("Score2.csv")

In [28]:
dfs = dfs.merge(return_df, how = 'left', left_on='OIL_CHANGE_RETURN', right_on='find')
dfs.rename(columns={'replace':'OIL_CHANGE_RETURN_scr'},inplace=True)
dfs = dfs.drop(['find'], axis=1)

In [29]:
dfs = dfs.merge(return_df, how = 'left', left_on='ROUTINE_MAINT_RETURN', right_on='find')
dfs.rename(columns={'replace':'ROUTINE_MAINT_RETURN_scr'},inplace=True)
dfs = dfs.drop(['find'], axis=1)

In [30]:
gender_df = pd.read_csv("gender_map.csv")
dfs = dfs.merge(gender_df, how = 'left', left_on='GENDER', right_on='find')
dfs['GENDER'] = dfs['replace']

In [31]:
dfs = dfs.loc[:,['INVOICE_NUMBER', 'SATISFACTION_scr',
                 'OIL_CHANGE_RETURN_scr','ROUTINE_MAINT_RETURN_scr',
                 'KNOWLEDGEABLE', 'FRIENDLY', 'QUALITY', 'TRUSTED_RECOMMENDATION',
                 'TIMELY', 'GOOD_VALUE', 'WELL_TRAINED', 'EQUIPMENT',
                 'GENDER', 'AGE']]

In [32]:
dfs.sort_values("INVOICE_NUMBER", inplace = True) 
dfs.drop_duplicates(subset ="INVOICE_NUMBER", keep = False, inplace = True) 

**Consolidaded Dataset**

Section Covers Multiple aspects for data prep
- Calculation of Lapse based on **Fixed Days**
- Calculation of Lapse based on **Category Purchase Cycle**
- Creation of **Dummy Variables** for Categorical data like Gender
- **Removing Rows** where 50% of columns have missing values
- **KNN Imputation** for missing values
- **Upsampling** to factor for unbalanced dataset 

In [33]:
df = dft.merge(dfs, how = 'left', on='INVOICE_NUMBER')

In [34]:
uuid = 'leadkey'
invoice_dt = 'INVOICE_DATE'
inv_no = 'INVOICE_NUMBER'

In [35]:
# Lapse using Avg. Category Purchase interval X 2
df['previous_visit'] = df.sort_values(by=(invoice_dt)).groupby(uuid)[invoice_dt].shift(1)
df['IPI'] = (df[invoice_dt] - df['previous_visit']).astype('timedelta64[D]')
churn_factor = df['IPI'].mean() * 2
df['lapse_IPI'] = (df['IPI'] > churn_factor).astype(np.int8)

In [36]:
# Tech Team if Lapse is to be defined based on Fixed No. of Days, use this variable input
# Fixed No. of Days- lapse
# Category Purchase interval - lapse_IPI
lapse_days = 60 # Default valye is 365

In [37]:
# Customer Churn- Last Invoice >X Days
ndf = df.groupby(uuid)[invoice_dt].max()
ndf = pd.DataFrame(ndf).reset_index()
ndf.columns = [uuid,'lst_inv_dt']
ndf['lapse'] = (ndf['lst_inv_dt'] + datetime.timedelta(days=lapse_days)) > max(df[invoice_dt])
ndf['lapse'] = ndf['lapse'].astype(np.int8)
ndf['key'] = ndf[uuid].map(str) + ndf['lst_inv_dt'].map(str)
df['key'] = df[uuid].map(str) + df[invoice_dt].map(str)
df=pd.merge(df,ndf.loc[:,['lapse','key']],how = 'left',on='key')
df['lapse'] = df['lapse'].fillna(0)

In [38]:
#Tech Team: This is input parameters  i.e. feature selection for the models
x_input_feat = ['SATISFACTION_scr', 'OIL_CHANGE_RETURN_scr',
                'KNOWLEDGEABLE', 'FRIENDLY', 'QUALITY',
                'TRUSTED_RECOMMENDATION', 'TIMELY', 'GOOD_VALUE', 'WELL_TRAINED',
                'EQUIPMENT', 'GENDER', 'AGE', 'TOTAL_INVOICE_AMOUNT']

In [39]:
# Future Enhancement: Build Alternative Model without using Survey Parameters when values are missing

In [40]:
# Create Dummy Variables - for Categorical Variables - converting to one-hot-encoding
cat_vals = df[x_input_feat].select_dtypes(include='object').columns
df_mod_op = pd.get_dummies(df[x_input_feat], columns = cat_vals, drop_first = True)
x_input_feat_enc = df_mod_op.columns

In [41]:
x_input_feat_enc

Index(['SATISFACTION_scr', 'OIL_CHANGE_RETURN_scr', 'KNOWLEDGEABLE',
       'FRIENDLY', 'QUALITY', 'TRUSTED_RECOMMENDATION', 'TIMELY', 'GOOD_VALUE',
       'WELL_TRAINED', 'EQUIPMENT', 'TOTAL_INVOICE_AMOUNT', 'GENDER_Male',
       'GENDER_Prefer Not to Answer', 'AGE_26-35', 'AGE_36-45', 'AGE_46-55',
       'AGE_56-65', 'AGE_66+', 'AGE_Prefer not to answer',
       'AGE_Prefiero no responder'],
      dtype='object')

In [42]:
# Remove Rows which have >50% Of columns with missing values
df_mod_op = pd.get_dummies(df, columns = cat_vals, drop_first = True)
df_mod_op['na_check'] = (df_mod_op[x_input_feat_enc].isna().sum(axis=1) >  round(len(x_input_feat)/2)).astype('int')
df_mod_op = df_mod_op.loc[df_mod_op['na_check']==0,:]

In [43]:
# Tech Team: This is input parameter, whether lapse has to be based on Fixed No. of Days or
# based in Inter Purchase Interval or Category Purchase Cycle (CPC)
# Fixed Days columne is 'lapse', CPC based is 'lapse_IPI'
y_var = 'lapse_IPI'

In [44]:
# KNN Imputer for Missing Values (this is in the case where the missing values is less than 50% of columns)
imputer = KNNImputer()
df_mod_op2 = pd.DataFrame(imputer.fit_transform(df_mod_op[x_input_feat_enc], df_mod_op[y_var]))
df_mod_op2.columns =x_input_feat_enc
df_mod_op3 = pd.concat([df_mod_op.drop(x_input_feat_enc, axis=1).reset_index(drop=True),df_mod_op2],axis=1)

In [45]:
# Data Upsampling
ndf = up_sample_imbalanced(df_mod_op3, feature=y_var)

In [46]:
# Using Factor Analysis Variable as model input
X_Fact = FacterCluster(ndf[x_input_feat_enc])

# Running the Models

In [47]:
X_train, X_test, y_train, y_test = train_test_split(ndf[x_input_feat_enc], ndf[y_var], test_size=0.2, random_state=101)

In [48]:
pd.DataFrame(y_train).columns

Index(['lapse_IPI'], dtype='object')

In [49]:
model1, Logistic_Regression_proba,Logistic_Regression_varimp  = Logistic_Regression(X_train, X_test, y_train, y_test, ndf)
model2,Logistic_Regression_CV_proba,Logistic_Regression_CV_varimp = Logistic_Regression_cv(X_train, X_test, y_train, y_test, ndf)
model3, random_forest_clf_proba,random_forest_clf_varimp = random_forest(X_train, X_test, y_train, y_test, ndf)
model4,gradient_boost_clf_proba,gradient_boost_clf_varimp = gradient_boost(X_train, X_test, y_train, y_test,ndf)
model5,decision_tree_clf_proba,decision_tree_clf_varimp = decision_tree_clf(X_train, X_test, y_train, y_test,ndf)
model6,knn_clf_proba,knn_clf_varimp = knn_clf(X_train, X_test, y_train, y_test,ndf)
model7, xgb_clf_proba,xgb_clf_varimp= xgb_clf(X_train, X_test, y_train, y_test,ndf)
model8, mlp_clf_proba,mlp_clf_varimp = mlp_clf(X_train, X_test, y_train, y_test,ndf)
model9,lda_clf_proba,lda_clf_varimp = lda_clf(X_train, X_test, y_train, y_test,ndf)
model10,sgd_clf_proba,sgd_clf_varimp = sgd_clf(X_train, X_test, y_train, y_test,ndf)

**Logistic Regression**
Confusion matrix: 
  [[463 462]
 [447 478]]
**Logistic Regression Cross Validation**
Confusion matrix: 
  [[528 397]
 [458 467]]
**Random Forest**
Confusion matrix: 
  [[874  51]
 [132 793]]
**Gradient Boosting Classifier**
Confusion matrix: 
  [[894  31]
 [  0 925]]
**Decision Tree Classification**
Confusion matrix: 
  [[794 131]
 [502 423]]
**KNN Classifier**
Confusion matrix: 
  [[710 215]
 [  3 922]]
**XG Boost Classifier**
 Confusion matrix: 
  [[694 231]
 [303 622]]
**MLP Classifier**
 Confusion matrix: 
  [[409 516]
 [ 51 874]]
**Linear Discriminant Analysis**
 Confusion matrix: 
  [[521 404]
 [456 469]]
**Stochastic Gradient Descent Clf**
{'loss': 'modified_huber'}
 Confusion matrix: 
  [[857  68]
 [863  62]]


In [50]:
cons_model = model1.append([model2,model3,model4,model5,model6,model7,model8,model9,model10]).reset_index()
cons_model = pd.DataFrame(cons_model)

In [51]:
probas = [Logistic_Regression_proba,Logistic_Regression_CV_proba,random_forest_clf_proba,
          gradient_boost_clf_proba,decision_tree_clf_proba,knn_clf_proba,xgb_clf_proba,
         mlp_clf_proba,lda_clf_proba,sgd_clf_proba]

In [52]:
varimp = [Logistic_Regression_varimp, Logistic_Regression_CV_varimp,random_forest_clf_varimp,
          gradient_boost_clf_varimp,decision_tree_clf_varimp,knn_clf_varimp,xgb_clf_varimp,
          mlp_clf_varimp ,lda_clf_varimp ,sgd_clf_varimp]

In [53]:
# Find the model with best AUC Score
best_mod = cons_model[(cons_model['Accuracy']>0.6) & (cons_model['AUC']>0.6) & (cons_model['F1_Score']>0.6)].drop('index',axis=1).sort_values(by=['AUC'],ascending=False)
best_mod

Unnamed: 0,Model,Accuracy,AUC,F1_Score,Balanced_F1_Score,Train_Accuracy,Test_Accuracy,TimeTaken
3,gradient_boost_clf,0.983243,0.983243,0.983519,0.983243,0.996621,0.983243,13.27862
2,random_forest_clf,0.901081,0.901081,0.896552,0.901081,0.918897,0.901081,11.397573
5,knn_clf,0.882162,0.882162,0.894277,0.882162,0.90538,0.882162,24.495088
6,xgb_clf,0.711351,0.711351,0.699663,0.711351,0.739389,0.711351,1.622687
7,mlp_clf,0.693514,0.693514,0.755076,0.693514,0.733712,0.693514,5.006267


In [54]:
# Prediction Probabilities
probas[best_mod.reset_index().iloc[0,0]]

Unnamed: 0,pred_prob_0,pred_prob_1,Actual,predicted
0,0.999955,0.000045,0,0
1,0.999995,0.000005,0,0
2,0.999974,0.000026,0,0
3,0.999924,0.000076,0,0
4,0.999998,0.000002,0,0
...,...,...,...,...
9243,0.000175,0.999825,1,1
9244,0.000094,0.999906,1,1
9245,0.000088,0.999912,1,1
9246,0.000082,0.999918,1,1


In [55]:
#link this back to the original dataset...

In [56]:
# Drivers of Churn - Variable Importance Index
varimp[best_mod.reset_index().iloc[0,0]]

Unnamed: 0,feature,Imp_index
0,AGE_Prefiero no responder,0.875318
1,GENDER_Prefer Not to Answer,1.024431
2,AGE_Prefer not to answer,1.22546
3,AGE_26-35,3.606281
4,AGE_36-45,5.288754
5,AGE_46-55,5.609354
6,SATISFACTION_scr,5.924238
7,AGE_66+,6.335664
8,OIL_CHANGE_RETURN_scr,6.596329
9,AGE_56-65,6.755829
