# Import libraries

In [1]:
# ===== BASIC LIBRARIES =====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()


# ===== MODELS =====
# simpler models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# ensemmble methods
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
import xgboost as xgb; xgb.set_config(verbosity=0) # to deactivate warning about change in library
from xgboost import XGBClassifier


# ===== HYPERPARAMETER TUNING =====
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate


# ===== PERFORMANCE MEASURES =====
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


# ===== EXPORTS MODELS =====
import pickle


pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.3f}'.format

# Load preprocessed data

In [2]:
train_set = pd.read_csv('train_churndata_preprocessed.csv')
print(train_set.shape)
train_set.head()

(7482, 29)


Unnamed: 0,Churn,Tenure,CityTier,WarehouseToHome,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,PreferredLoginDevice_Mobile Phone,PreferredPaymentMode_Debit Card,PreferredPaymentMode_Credit Card,PreferredPaymentMode_Cash on Delivery,PreferredPaymentMode_UPI,PreferredPaymentMode_E wallet,Gender_Female,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Fashion,PreferedOrderCat_Grocery,PreferedOrderCat_Others,MaritalStatus_Single,MaritalStatus_Divorced,MaritalStatus_Married
0,0,-0.667,0.0,-0.193,0.0,0.0,0.5,0.0,1.0,-0.4,0.0,0.0,0.0,-0.277,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0
1,0,0.667,0.0,-0.093,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.2,0.043,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-1.0
2,0,-0.75,0.0,-0.553,-0.066,0.0,0.0,-0.25,0.0,-0.6,-1.0,-0.493,-0.2,-0.665,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0
3,0,0.1,0.0,-0.701,-1.0,-1.0,-0.5,-0.25,1.0,1.6,0.0,-0.493,-0.6,-0.931,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0
4,0,-0.417,0.0,-0.302,-1.0,-1.0,-0.5,-0.5,1.0,-0.6,-1.0,-0.493,0.0,-0.308,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0


In [3]:
X_train = train_set.drop(['Churn'], axis = 1).values
y_train = train_set['Churn'].values

# Setting things up

In [4]:
metrics = ['accuracy','f1','precision','recall']
models = []
scores = []

# Exploring Models

In [5]:
%%time
exp_models = [LogisticRegression(), 
               KNeighborsClassifier(), 
               SVC(random_state = 42),
               SGDClassifier(random_state = 42),
               RandomForestClassifier(random_state = 42),
               ExtraTreesClassifier(random_state = 42),
               AdaBoostClassifier(random_state = 42),
               GradientBoostingClassifier(random_state = 42),
               XGBClassifier(random_state = 42, use_label_encoder=False)]


for model in exp_models:
    model.fit(X_train, y_train)
    
    cross_val = cross_validate(model, X_train, y_train, cv= 10, scoring = metrics)

    models.append(f'{model.__class__.__name__}')
    scores.append(list(cross_val.values())[2:])

Wall time: 54.4 s


In [6]:
data = []
for i in range(len(models)):
    for j in range(len(metrics)):
        data.append(scores[i][j].mean())

data = np.reshape(data, (len(models), len(metrics)))
        
df_models = pd.DataFrame(data, index = models, columns = metrics)
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,accuracy,f1,precision,recall
LogisticRegression,0.823047,0.825959,0.812739,0.840156
KNeighborsClassifier,0.913527,0.920268,0.854341,0.997328
SVC,0.922218,0.924306,0.899414,0.950821
SGDClassifier,0.813957,0.819785,0.795612,0.846567
RandomForestClassifier,0.980897,0.980412,0.983415,0.97836
ExtraTreesClassifier,0.99185,0.991826,0.989894,0.993855
AdaBoostClassifier,0.888021,0.88565,0.885935,0.890186
GradientBoostingClassifier,0.923708,0.919813,0.934453,0.911567
XGBClassifier,0.97903,0.978079,0.985616,0.97222


In [7]:
df_models[df_models.f1 > 0.9]

Unnamed: 0,accuracy,f1,precision,recall
KNeighborsClassifier,0.914,0.92,0.854,0.997
SVC,0.922,0.924,0.899,0.951
RandomForestClassifier,0.981,0.98,0.983,0.978
ExtraTreesClassifier,0.992,0.992,0.99,0.994
GradientBoostingClassifier,0.924,0.92,0.934,0.912
XGBClassifier,0.979,0.978,0.986,0.972


# Fine Tune models

In [10]:
fine_tuned_models = []
new_models = []
new_scores = []

### KNeighbors Classifier

In [8]:
%%time

'''KNeighborsClassifier
parameters: 
    n_neighbors: number of neighbors to use for kneighbors queries; (default = 5)
                    a smaller number can give the model more flexibility, thus lower bias but higher variance (risk overfitting)
                    Samewise, a higher number can increase bias but lower variance (risk underfitting)
    weights: weight function used in prediction; (default = 'uniform')
    
    leaf_size: leaf size passed to BallTree or KDTree (algorithm); (default = 30)
    '''

param_grid = {'n_neighbors':np.arange(2, 15, 1),
              'weights':['uniform', 'distance'],
              'leaf_size':np.arange(15, 40, 2),
             }

knn_clf = GridSearchCV(estimator = KNeighborsClassifier(n_jobs = -1),
                      param_grid = param_grid, 
                      cv = 10, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
knn_clf.fit(X_train, y_train)

Fitting 10 folds for each of 338 candidates, totalling 3380 fits
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.1s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END .......leaf_size=15, n_neighbors=2, weights=uniform; total time=   0.0s
[CV] END ......leaf_size=15, n_neighbors=2, weights=distance; total time=   0.0s
[CV] END ......leaf_size=15, n_neighbors=2, 

GridSearchCV(cv=10, error_score='raise',
             estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid={'leaf_size': array([15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39]),
                         'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                         'weights': ['uniform', 'distance']},
             refit='recall', scoring=['accuracy', 'f1', 'precision', 'recall'],
             verbose=2)

In [11]:
fine_tuned_models.append(knn_clf.best_estimator_)
knn_clf.best_params_

{'leaf_size': 15, 'n_neighbors': 2, 'weights': 'distance'}

### Support Vector Classification (SVC)

In [12]:
%%time

'''SVC -SupportVectorClassification
parameters: 
    C: regularization parameter; inversely proportional to C; (default = 1.0)
                    
    kernel: kernel type to be used in the algorithm (linear, poly, rbf, sigmoid, precoumpted); (default = 'rbf')
    
    degree: degree of polynomial kernel function; (default = 3)
    
    gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'; (default = 'scale' = 1/(n_features * X.var()))
            If set too large, risk of overfitting
    
    tol: tolerance for stopping criterion; (default = 0.001)
    '''

param_grid = {'C':[1.0, 1.5, 2.0, 2.5],
              'kernel':['poly', 'rbf', 'sigmoid'],
              'degree':[2, 3],
              'gamma':['scale', 'auto'],
              'tol':[0.001, 0.003]
             }

svc_clf = GridSearchCV(estimator = SVC(probability = True, class_weight = 'balanced'),
                      param_grid = param_grid, 
                      cv = 5, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
svc_clf.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.001; total time=   4.3s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.001; total time=   4.4s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.001; total time=   4.2s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.001; total time=   4.4s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.001; total time=   4.2s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.003; total time=   4.4s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.003; total time=   4.5s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.003; total time=   4.1s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.003; total time=   4.2s
[CV] END C=1.0, degree=2, gamma=scale, kernel=poly, tol=0.003; total time=   4.5s
[CV] END C=1.0, degree=2, gamma=scale, kernel=rbf, tol=0.001; total time=   5.4s
[CV] END C=1.0, degree=2, gamma=scale

GridSearchCV(cv=5, error_score='raise',
             estimator=SVC(class_weight='balanced', probability=True),
             param_grid={'C': [1.0, 1.5, 2.0, 2.5], 'degree': [2, 3],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['poly', 'rbf', 'sigmoid'],
                         'tol': [0.001, 0.003]},
             refit='recall', scoring=['accuracy', 'f1', 'precision', 'recall'],
             verbose=2)

In [13]:
fine_tuned_models.append(svc_clf.best_estimator_)
svc_clf.best_params_

{'C': 2.5, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf', 'tol': 0.001}

### Extra Trees Classifier

In [14]:
%%time

'''ExtraTreesClassifier
parameters: 
    n_estimators: number of trees in the forest; (default = 100)
                    generally random forests do not overfit as more trees are added, however, the increase on
                    performance does not necessarily justifies the increase on complexity (more time/resources)
                    
    max_depth: maximum depth of each tree in the forest; (default = None, all leaves are expanded as much as possible)
                    generally, the deeper the tree, the more splits, which can lead to higher performance, as it
                    allows the model to better fit the data (lower bias), however this can also lead to higher
                    variance as the model can start to overfit the training data
    
    min_samples_split: minimum number of samples required to split an internal node; (default = 2)
                    this can reduce the number of splits, which can reduce overfitting, however, if too large, it
                    can also lead to underfitting the training set
    
    min_samples_leaf: mininum number of samples that must be present in a leaf/terminal node (end of the tree); (default = 1)
                    as before, a higher value can reduce overfitting but a too high value can lead to underfitting.
                    Specially in regression, a higher value can also smooth the model, by avoiding leaves with only one value
                    
    max_features: number of features to consider for best split (deafault = 'auto', total number of features)
                    a higher number will lead to better performance, but a too high number will also lead to overfitting.
                    Moreover, the higher the number, the more computationally expensive the model becomes.
    
    max_samples: number (or %) of samples to draw from X to train each base estimator (default = None, total num of samples)
                     The lower the value, the more randomness is introduced to the data.
                     --Only possible if Bootstrap = True.
    '''

param_grid = {'n_estimators':[ 500], #200, 250,
              'max_depth':[28, 29, 30],
              'min_samples_split':[2, 3, 4],
              'min_samples_leaf':[1, 2],
              'max_features':[4, 5, 6, 7],
              'max_samples':[0.9, 1.0]
             }

et_clf = GridSearchCV(estimator = ExtraTreesClassifier(random_state = 42, n_jobs = -1, bootstrap  = True),
                      param_grid = param_grid, 
                      cv = 10, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
et_clf.fit(X_train, y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   3.2s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.9s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.9s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.8s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.8s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.0s
[CV] END max_depth=28, max_features=4, max_samples=0.9, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.

GridSearchCV(cv=10, error_score='raise',
             estimator=ExtraTreesClassifier(bootstrap=True, n_jobs=-1,
                                            random_state=42),
             param_grid={'max_depth': [28, 29, 30],
                         'max_features': [4, 5, 6, 7],
                         'max_samples': [0.9, 1.0], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [500]},
             refit='recall', scoring=['accuracy', 'f1', 'precision', 'recall'],
             verbose=2)

In [15]:
fine_tuned_models.append(et_clf.best_estimator_)
et_clf.best_params_

{'max_depth': 28,
 'max_features': 4,
 'max_samples': 0.9,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}

### XGBClassifier

In [16]:
%%time
# https://xgboost.readthedocs.io/en/stable/python/python_api.html?highlight=xgboost%20xgbregressor#xgboost.XGBRegressor
'''XGBClassifier
parameters: 
    n_estimators: number of gradient boosted trees;
                    
    max_depth: maximum depth of each tree; (default = 6)
                    a higher value leads to a more complex model, more likely to overfit.
    
    learning_rate: boosting learning rate; (default = 0.3)
                    it shrinks the feature weights to make the boosting process more conservative. It prevents overfitting.
    
    subsample: subsample ratio if the training instances; (default = 1)
                    a lower value can help prevent overfitting.
   
    colsample_bytree: subsample ratio of columns (features) when constructing each tree; (default = 1)
                    similar to max_features in RandomForest/ExtraTrees
                    
    reg_alpha: L1 regularization term on weights; (default = 1)
                    increasing this value can prevent overfitting
                    
    reg_lambda: L2 regularization term on weights; (default = 0)
                    increasing this value can prevent overfitting
    '''

param_grid = {'n_estimators':[500],
              'max_depth':[6, 14, 18],
              'learning_rate':[0.05, 0.1, 0.15],
              'subsample':[0.9, 1.0],
              'colsample_bytree':[0.8, 0.9, 1.],
              'reg_lambda':[1.0, 1.2, 1.4],
              'n_jobs':[-1],
              'random_state':[42]}

xgb_clf = GridSearchCV(estimator = XGBClassifier(use_label_encoder=False),
                      param_grid = param_grid, 
                      cv = 10, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
xgb_clf.fit(X_train, y_train, eval_metric = 'logloss')

Fitting 10 folds for each of 162 candidates, totalling 1620 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample=0.9; total time=   1.6s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample=0.9; total time=   1.7s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample=0.9; total time=   1.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample=0.9; total time=   1.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample=0.9; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=500, n_jobs=-1, random_state=42, reg_lambda=1.0, subsample

GridSearchCV(cv=10, error_score='raise',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monot...
                                     subsample=None, tree_method=None,
                                     use_label_encoder=False,
                                     validate_parameters=None, verbosity=None),
             param_grid={'colsample_bytree': [0.8, 0.9, 1.0],
        

In [17]:
fine_tuned_models.append(xgb_clf.best_estimator_)
xgb_clf.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 14,
 'n_estimators': 500,
 'n_jobs': -1,
 'random_state': 42,
 'reg_lambda': 1.0,
 'subsample': 1.0}

## Comparing models

In [18]:
%%time
for model in fine_tuned_models:
    cross_val = cross_validate(model, X_train, y_train, cv= 10, scoring = metrics)

    new_models.append(f'{model.__class__.__name__}')
    new_scores.append(list(cross_val.values())[2:])

data = []
for model in range(len(fine_tuned_models)):
    for score in range(len(metrics)):
        data.append(new_scores[model][score].mean())

reshaped_data = np.reshape(data, (len(fine_tuned_models), len(metrics)))
        
df_models = pd.DataFrame(reshaped_data, index = new_models, columns = metrics)
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Wall time: 1min 33s


Unnamed: 0,accuracy,f1,precision,recall
KNeighborsClassifier,0.972736,0.97346,0.948801,0.999465
SVC,0.947075,0.948835,0.918661,0.981296
ExtraTreesClassifier,0.989578,0.989592,0.985473,0.993857
XGBClassifier,0.982768,0.982462,0.983641,0.981837


## Combining models

### VotingClassifier

In [19]:
%%time
'''VotingClassifier
parameters: 
    weights: sequence of weights (float or int) to weight the occurrences of predicted class labels (hard voting) 
            or class probabilities before averaging (soft voting).

    '''

param_grid = {'weights':[[0.25, 0.25, 0.25, 0.25],
                         [0.15, 0.15, 0.4, 0.3],
                         [0.2, 0.1, 0.4, 0.3],
                         [0.2, 0.1, 0.45, 0.25],
                         [0.25, 0.05, 0.4, 0.3],
                         [0.25, 0.05, 0.45, 0.25],
                         [0.3, 0.05, 0.45, 0.2],
                         [0.15, 0.15, 0.45, 0.25],
                         [0.2, 0.1, 0.3, 0.4]]}

vot_clf = GridSearchCV(estimator = VotingClassifier(estimators=[('knn', knn_clf.best_estimator_),
                                                                     ('svc', svc_clf.best_estimator_),
                                                                     ('extra_trees', et_clf.best_estimator_), 
                                                                     ('xgboosting', xgb_clf.best_estimator_)],
                                                      voting = 'soft',
                                                      n_jobs = -1),
                      param_grid = param_grid, 
                      cv = 10, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
vot_clf.fit(X_train, y_train)
vot_clf.best_params_

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   8.8s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   8.3s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   8.1s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.6s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.4s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.8s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.6s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.5s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.6s
[CV] END ...................weights=[0.25, 0.25, 0.25, 0.25]; total time=   7.7s
[CV] END .....................weights=[0.15, 0.15, 0.4, 0.3]; total time=   7.6s
[CV] END .....................weights=[0.15, 0.1

{'weights': [0.3, 0.05, 0.45, 0.2]}

In [20]:
fine_tuned_models.append(vot_clf.best_estimator_)

In [21]:
%%time
cross_val = cross_validate(vot_clf.best_estimator_, X_train, y_train, cv= 10, scoring = metrics)
new_models.append(f'{vot_clf.best_estimator_.__class__.__name__}')
new_scores.append(list(cross_val.values())[2:])
        

for score in range(len(metrics)):
    data.append(new_scores[-1][score].mean())

reshaped_data = np.reshape(data, (len(fine_tuned_models), len(metrics)))
        
df_models = pd.DataFrame(reshaped_data, index = new_models, columns = metrics)
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Wall time: 1min 15s


Unnamed: 0,accuracy,f1,precision,recall
KNeighborsClassifier,0.972736,0.97346,0.948801,0.999465
SVC,0.947075,0.948835,0.918661,0.981296
ExtraTreesClassifier,0.989578,0.989592,0.985473,0.993857
XGBClassifier,0.982768,0.982462,0.983641,0.981837
VotingClassifier,0.990378,0.990445,0.983167,0.997862


## Stacking Classifier

In [22]:
%%time
'''StackingClassifier
parameters: 
    weights: sequence of weights (float or int) to weight the occurrences of predicted class labels (hard voting) 
            or class probabilities before averaging (soft voting).

    '''

param_grid = {'final_estimator':exp_models,
              'passthrough':[True, False]}

stk_clf = GridSearchCV(estimator = StackingClassifier(estimators=[('knn', knn_clf.best_estimator_),
                                                                     ('svc', svc_clf.best_estimator_),
                                                                     ('extra_trees', et_clf.best_estimator_), 
                                                                     ('xgboosting', xgb_clf.best_estimator_)],
                                                      n_jobs = -1),
                      param_grid = param_grid, 
                      cv = 10, 
                      scoring = metrics,
                      refit = 'recall',
                      error_score = 'raise', 
                      verbose = 2)
                     
stk_clf.fit(X_train, y_train)
stk_clf.best_params_

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  19.5s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  21.3s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  21.0s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  21.8s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  20.6s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  20.0s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  20.1s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  19.7s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  24.3s
[CV] END final_estimator=LogisticRegression(), passthrough=True; total time=  21.1s
[CV] END final_estimator=LogisticRegression(), passthrough=False; total time=  24.8s
[CV] END fin

{'final_estimator': KNeighborsClassifier(), 'passthrough': True}

In [23]:
fine_tuned_models.append(stk_clf.best_estimator_)

In [24]:
%%time
cross_val = cross_validate(stk_clf.best_estimator_, X_train, y_train, cv= 10, scoring = metrics)
new_models.append(f'{stk_clf.best_estimator_.__class__.__name__}')
new_scores.append(list(cross_val.values())[2:])
        

for score in range(len(metrics)):
    data.append(new_scores[-1][score].mean())

reshaped_data = np.reshape(data, (len(fine_tuned_models), len(metrics)))
        
df_models = pd.DataFrame(reshaped_data, index = new_models, columns = metrics)
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Wall time: 3min 49s


Unnamed: 0,accuracy,f1,precision,recall
KNeighborsClassifier,0.972736,0.97346,0.948801,0.999465
SVC,0.947075,0.948835,0.918661,0.981296
ExtraTreesClassifier,0.989578,0.989592,0.985473,0.993857
XGBClassifier,0.982768,0.982462,0.983641,0.981837
VotingClassifier,0.990378,0.990445,0.983167,0.997862
StackingClassifier,0.97875,0.97917,0.960699,0.998396


# Exporting best model

In [25]:
filename = 'vot_clf.pkl'
pickle.dump(vot_clf.best_estimator_, open(filename, 'wb'))

<!-- # saving best estimator
filename = 'vot_clf.pkl'
pickle.dump(vot_clf_gd.best_estimator_, open(filename, 'wb')) -->