### Getting Dataset Ready for Model Building

In [1]:
# Importing the needed modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime 
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', FutureWarning)

# To display plots inside the iPython Notebook itself
%matplotlib inline

In [2]:
# To verify how data is orgainzed in file(to find the delimiter) and then
# use corresponding function to open the file. eg
# data could be in .csv. .tsv, excel format etc.
pathOfDataFile = "data/bank-full.csv"
firstFewLines = list()
noOfLinesToView = 5

with open(pathOfDataFile) as dataFile:
    firstFewLines = [next(dataFile) for i in range(noOfLinesToView)]
    for line in firstFewLines:
        print(line)

# Import the semi-colon delimited data file into pandas dataFrame
bankPromo_df = pd.read_csv(pathOfDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromo_df = bankPromo_df.rename(columns={"y":"Subscribed"})

bankPromo_df.head(7)

"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"

58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"

44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"

33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"

47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"



Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no


In [3]:
# Make a copy of original data frame
bankPromoModel_hsng_Df = bankPromo_df.copy()
bankPromoModel_hsng_Df['Target'] = bankPromoModel_hsng_Df['housing'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoModel_hsng_Df['Target'] = bankPromoModel_hsng_Df['Target'].astype(np.int)

# Delete the original 'housing' column
del bankPromoModel_hsng_Df['housing']

# List final variables of the new dataset
bankPromoModel_hsng_Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age           45211 non-null int64
job           45211 non-null object
marital       45211 non-null object
education     45211 non-null object
default       45211 non-null object
balance       45211 non-null int64
loan          45211 non-null object
contact       45211 non-null object
day           45211 non-null int64
month         45211 non-null object
duration      45211 non-null int64
campaign      45211 non-null int64
pdays         45211 non-null int64
previous      45211 non-null int64
poutcome      45211 non-null object
Subscribed    45211 non-null object
Target        45211 non-null int64
dtypes: int64(8), object(9)
memory usage: 5.9+ MB


#### Get list of categorical variables , keeping 'housing' as target/response variable

In [4]:
# Get the unique values(Levels) for categorical variables.
# List to hold names of categorical variables
categoricalVars = list()
# List to hold names of numerical variables
numericalVars = list()

for colName in bankPromo_df.columns:
    if bankPromo_df[colName].dtype == np.int64:
        numericalVars.append(colName)
    elif bankPromo_df[colName].dtype == np.object:
        categoricalVars.append(colName)
    else:
        pass
    
# Remove Target column from final categorical Var list
categoricalVars.remove('housing')

print(numericalVars)
print(categoricalVars)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['job', 'marital', 'education', 'default', 'loan', 'contact', 'month', 'poutcome', 'Subscribed']


#### Load Test Dataset

In [5]:
pathOfAdditionalDataFile = "data/bank.csv"

# Import the semi-colon delimited data file into pandas dataFrame
bankPromoAdditional_h_df = pd.read_csv(pathOfAdditionalDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromoAdditional_h_df = bankPromoAdditional_h_df.rename(columns={"y":"Subscribed"})

bankPromoAdditional_h_df['Target'] = bankPromoAdditional_h_df['housing'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoAdditional_h_df['Target'] = bankPromoAdditional_h_df['Target'].astype(np.int)
# Delete the original 'housing' column
del bankPromoAdditional_h_df['housing']

bankPromoAdditional_h_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
age           4521 non-null int64
job           4521 non-null object
marital       4521 non-null object
education     4521 non-null object
default       4521 non-null object
balance       4521 non-null int64
loan          4521 non-null object
contact       4521 non-null object
day           4521 non-null int64
month         4521 non-null object
duration      4521 non-null int64
campaign      4521 non-null int64
pdays         4521 non-null int64
previous      4521 non-null int64
poutcome      4521 non-null object
Subscribed    4521 non-null object
Target        4521 non-null int64
dtypes: int64(8), object(9)
memory usage: 600.5+ KB


#####  Delete any features that do not relate to the response variable in the business sense

BankPromo dataset contains 

i) bank client data like age, balance, education, job , marital status , any loans - housing/personal , if they have defaulted  

ii) information regarding last contact  in current campaign - contact type, day/month when last contacted, duration of last call 

iii) other attributes like number of times contacted during current campaign ,number of days since last contact, number of contacts made in last campaign, outcome from previous campaign , outcome from current campagin i.e. Subscribed or not  


From business use case perspective, any information regarding Term deposit subscription campaign may have no effect on the client data but vice-versa may not be true. It will be interesting to find what features from the campaign information should be considered to predict if a customer has a housing loan or not

#### Perform One Hot Encoding for categorical variables in dataset

In [6]:
## Training Dataset
###################
# Convert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoModel_hsng_Df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoModel_hsng_Df = pd.concat((bankPromoModel_hsng_Df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoModel_hsng_Df.drop(categoricalVars, inplace=True, axis=1)
print("Training dataset info")
print("=====================")
bankPromoModel_hsng_Df.info()


## Test Dataset
###################
# Covert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoAdditional_h_df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoAdditional_h_df = pd.concat((bankPromoAdditional_h_df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoAdditional_h_df.drop(categoricalVars, inplace=True, axis=1)

if 'Target' in bankPromoAdditional_h_df:
    y_Final = bankPromoAdditional_h_df['Target'].values # get the labels we want
    del bankPromoAdditional_h_df['Target']        # get rid of the class label
    X_Final = bankPromoAdditional_h_df.values

print("Test dataset info")
print("=====================")
bankPromoAdditional_h_df.info()

Training dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 43 columns):
age                    45211 non-null int64
balance                45211 non-null int64
day                    45211 non-null int64
duration               45211 non-null int64
campaign               45211 non-null int64
pdays                  45211 non-null int64
previous               45211 non-null int64
Target                 45211 non-null int64
job_blue-collar        45211 non-null uint8
job_entrepreneur       45211 non-null uint8
job_housemaid          45211 non-null uint8
job_management         45211 non-null uint8
job_retired            45211 non-null uint8
job_self-employed      45211 non-null uint8
job_services           45211 non-null uint8
job_student            45211 non-null uint8
job_technician         45211 non-null uint8
job_unemployed         45211 non-null uint8
job_unknown            45211 non-null uint8
marital_married        45211 non-

#### Create 10 Splits  Cross Validation Object

In [7]:
# Training and Test Split
# Since housing is a balanced dataset ( with 56% yes and 44% No , we will use simple KFold and ShuffleSplit cv objects)
from sklearn.model_selection import KFold
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

if 'Target' in bankPromoModel_hsng_Df:
    y = bankPromoModel_hsng_Df['Target'].values # get the labels we want
    del bankPromoModel_hsng_Df['Target']        # get rid of the class label
    X = bankPromoModel_hsng_Df.values           # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 10
kfold_cv_object = KFold(n_splits=num_cv_iterations , shuffle = False , random_state=999)
#n_splits=num_cv_iterations,test_size  = 0.2, random_state=999
                         
print(kfold_cv_object)


KFold(n_splits=10, random_state=999, shuffle=False)


In [8]:
from sklearn.model_selection import ShuffleSplit
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 10
shuffle_cv_object = ShuffleSplit(n_splits=num_cv_iterations, test_size = 0.1, random_state=999)
                         
print(shuffle_cv_object)


ShuffleSplit(n_splits=10, random_state=999, test_size=0.1, train_size=None)


In [9]:
########## Random Forest ############################

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

baseRfModel = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1))
scores = cross_validate(baseRfModel, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

display(pd.DataFrame(scores))

Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,1.176114,0.534747,0.896149,0.987041,0.571459,0.999279,0.815126,0.986679,0.902201,0.994939,0.890178,0.979267
1,1.164129,0.532699,0.612521,0.985981,0.591019,0.999197,0.479761,0.985598,0.920297,0.994546,0.459012,0.977562
2,1.13443,0.536497,0.468588,0.989164,0.615618,0.999279,0.592126,0.987442,0.398139,0.994372,0.569328,0.98401
3,1.159558,0.532365,0.325622,0.987655,0.555389,0.999319,0.484185,0.986409,0.62905,0.995186,0.219664,0.980237
4,1.105849,0.531151,0.257084,0.989353,0.636202,0.999351,0.762221,0.987442,0.42369,0.994513,0.184524,0.984247
5,1.078768,0.534301,0.593172,0.98858,0.721869,0.999325,0.654722,0.987122,0.635045,0.99478,0.556479,0.982456
6,1.113295,0.535136,0.509861,0.988034,0.675282,0.999286,0.626189,0.986532,0.643956,0.994505,0.421988,0.981646
7,1.032596,0.53056,0.741019,0.986357,0.765246,0.999155,0.649193,0.985697,0.909419,0.994329,0.625241,0.978513
8,1.059316,0.537769,0.866192,0.985505,0.813121,0.999079,0.804689,0.984591,0.871873,0.993845,0.860584,0.977303
9,1.035465,0.537426,0.354954,0.988036,0.635451,0.999086,0.641451,0.985869,0.285531,0.99418,0.46898,0.981968


In [26]:
#################################
# Create randomized grid
#################################

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'log2', 8, 9, 10]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [x for x in np.arange(2, 101, 10)]

# Minimum number of samples required at each leaf node
min_samples_leaf = [x for x in np.arange(1, 101, 4)]

#Class weights
class_weight = [None, 'balanced', 'balanced_subsample']

# Method of selecting samples for training each tree
bootstrap = [True, False]

criterion = ['gini', 'entropy']

# Create the random grid
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
              'randomforestclassifier__max_features': max_features,
              'randomforestclassifier__max_depth': max_depth,
              'randomforestclassifier__min_samples_split': min_samples_split,
              'randomforestclassifier__min_samples_leaf': min_samples_leaf,
              'randomforestclassifier__class_weight': class_weight,
              'randomforestclassifier__bootstrap': bootstrap,
              'randomforestclassifier__criterion': criterion}

print(random_grid)

{'randomforestclassifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'randomforestclassifier__max_features': ['auto', 'log2', 8, 9, 10], 'randomforestclassifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'randomforestclassifier__min_samples_split': [2, 12, 22, 32, 42, 52, 62, 72, 82, 92], 'randomforestclassifier__min_samples_leaf': [1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65, 69, 73, 77, 81, 85, 89, 93, 97], 'randomforestclassifier__class_weight': [None, 'balanced', 'balanced_subsample'], 'randomforestclassifier__bootstrap': [True, False], 'randomforestclassifier__criterion': ['gini', 'entropy']}


In [27]:
from sklearn.model_selection import RandomizedSearchCV
#################################
# Random Search Training
#################################

# Use the random grid to search for best hyperparameters
# First create the base model to tune
#rf = RandomForestClassifier() #Originally was this
rf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_randomgrid = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                                   n_iter = 200, 
                                   cv = kfold_cv_object,
                                   verbose=2, 
                                   random_state=999, 
                                   n_jobs = -1,
                                   scoring=scoring,
                                   refit='Accuracy', \
                                   return_train_score=True)


# Fit the random search model
rf_randomgrid.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rf_randomgrid.best_params_, rf_randomgrid.best_score_))
#rf_random.best_params_

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 237 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 520 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 885 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 1330 tasks      | elapsed: 65.0min
[Parallel(n_jobs=-1)]: Done 1857 tasks      | elapsed: 94.4min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 99.4min finished


The best parameters are {'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__min_samples_split': 82, 'randomforestclassifier__min_samples_leaf': 25, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__max_depth': 30, 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__bootstrap': False} with a score of 0.69


In [10]:

rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 200, min_samples_split = 82, min_samples_leaf = 25, \
                                                                         max_features = 'log2', max_depth = 30, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

display(pd.DataFrame(scores))

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)

Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,8.173242,1.130251,0.945183,0.79456,0.632266,0.874154,0.896064,0.795448,0.896064,0.828145,1.0,0.763592
1,8.118589,1.212926,0.854148,0.793773,0.59723,0.876642,0.753816,0.796854,0.910081,0.837175,0.804691,0.754649
2,7.784884,1.416365,0.499487,0.821021,0.641772,0.885518,0.46096,0.803219,0.353386,0.873069,0.851541,0.774829
3,8.335069,0.9588,0.130249,0.821248,0.59903,0.892708,0.459412,0.814008,0.740891,0.879318,0.071401,0.770373
4,8.360055,0.961714,0.099815,0.815014,0.68309,0.881721,0.784561,0.795675,0.72973,0.879599,0.053571,0.759265
5,8.212106,1.101879,0.603139,0.817914,0.776542,0.887913,0.686795,0.806955,0.7065,0.879725,0.526161,0.764219
6,8.162438,1.108456,0.544082,0.824281,0.731959,0.888828,0.666003,0.811403,0.733116,0.872679,0.432549,0.780969
7,8.209686,1.110232,0.795808,0.797653,0.788606,0.878692,0.706923,0.79828,0.902797,0.84866,0.711491,0.75243
8,8.22348,1.038435,0.898419,0.802842,0.856661,0.876029,0.842292,0.79882,0.852623,0.845568,0.949413,0.764226
9,8.181271,1.123418,0.371585,0.817368,0.643284,0.880685,0.643884,0.796977,0.295469,0.878028,0.500526,0.764548



 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.574191,0.695044,0.690071,0.712066,0.590134



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.810567,0.882289,0.801764,0.862197,0.76491


In [14]:
########################################################
# Create Smaller grid 1 based upon Random Grid CV results
#######################################################

# Number of trees in random forest
n_estimators = [200, 202, 204, 206]

# Number of features to consider at every split
max_features = ['log2']

# Maximum number of levels in tree
max_depth = [27,28]

# Minimum number of samples required to split a node
min_samples_split = [81,83,85]

min_samples_leaf = [18,20,22]

criterion = ['entropy']

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
subGrid = {'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_features': max_features,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__min_samples_split': min_samples_split,
        'randomforestclassifier__min_samples_leaf': min_samples_leaf,
        'randomforestclassifier__class_weight': class_weight,
        'randomforestclassifier__criterion' : criterion,
        'randomforestclassifier__bootstrap': bootstrap}

print(subGrid)


{'randomforestclassifier__n_estimators': [200, 202, 204, 206], 'randomforestclassifier__max_features': ['log2'], 'randomforestclassifier__max_depth': [27, 28], 'randomforestclassifier__min_samples_split': [81, 83, 85], 'randomforestclassifier__min_samples_leaf': [18, 20, 22], 'randomforestclassifier__class_weight': ['balanced'], 'randomforestclassifier__criterion': ['entropy'], 'randomforestclassifier__bootstrap': [False]}


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#################################
# Sub Grid Search
#################################

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

rfSubGridEstimator = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

rfSubGridModel = GridSearchCV(estimator = rfSubGridEstimator, 
                              param_grid= subGrid,  
                              cv = kfold_cv_object,
                              verbose=2, 
                              n_jobs = -1,
                              scoring=scoring,
                              refit='Accuracy', 
                              return_train_score=True)


# Fit the random search model
rfSubGridModel.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rfSubGridModel.best_params_, rfSubGridModel.best_score_))


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.8min finished


The best parameters are {'randomforestclassifier__bootstrap': False, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 28, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__min_samples_leaf': 22, 'randomforestclassifier__min_samples_split': 83, 'randomforestclassifier__n_estimators': 200} with a score of 0.69


In [29]:
########################################################
# Create Smaller grid 1 based upon Random Grid CV results
#######################################################

# Number of trees in random forest
n_estimators = [201, 203, 205, 207]

# Number of features to consider at every split
max_features = ['log2']

# Maximum number of levels in tree
max_depth = [25,26]

# Minimum number of samples required to split a node
min_samples_split = [80,82,84]

min_samples_leaf = [24,25,26]

criterion = ['entropy']

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
subGrid = {'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_features': max_features,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__min_samples_split': min_samples_split,
        'randomforestclassifier__min_samples_leaf': min_samples_leaf,
        'randomforestclassifier__class_weight': class_weight,
        'randomforestclassifier__criterion' : criterion,
        'randomforestclassifier__bootstrap': bootstrap}

print(subGrid)

{'randomforestclassifier__n_estimators': [201, 203, 205, 207], 'randomforestclassifier__max_features': ['log2'], 'randomforestclassifier__max_depth': [25, 26], 'randomforestclassifier__min_samples_split': [80, 82, 84], 'randomforestclassifier__min_samples_leaf': [24, 25, 26], 'randomforestclassifier__class_weight': ['balanced'], 'randomforestclassifier__criterion': ['entropy'], 'randomforestclassifier__bootstrap': [False]}


In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#################################
# Sub Grid Search
#################################

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

rfSubGridEstimator = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

rfSubGridModel = GridSearchCV(estimator = rfSubGridEstimator, 
                              param_grid= subGrid,  
                              cv = kfold_cv_object,
                              verbose=2, 
                              n_jobs = -1,
                              scoring=scoring,
                              refit='Accuracy', 
                              return_train_score=True)


# Fit the random search model
rfSubGridModel.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rfSubGridModel.best_params_, rfSubGridModel.best_score_))


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.6min finished


The best parameters are {'randomforestclassifier__bootstrap': False, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 25, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__min_samples_leaf': 25, 'randomforestclassifier__min_samples_split': 80, 'randomforestclassifier__n_estimators': 207} with a score of 0.69


In [18]:
########################################################
# Create Smaller grid 1 based upon Random Grid CV results
#######################################################

# Number of trees in random forest
n_estimators = [191, 192, 193, 194]

# Number of features to consider at every split
max_features = ['log2']

# Maximum number of levels in tree
max_depth = [29,30]

# Minimum number of samples required to split a node
min_samples_split = [76,78,86]

min_samples_leaf = [19,21,23]

criterion = ['entropy']

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
subGrid = {'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_features': max_features,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__min_samples_split': min_samples_split,
        'randomforestclassifier__min_samples_leaf': min_samples_leaf,
        'randomforestclassifier__class_weight': class_weight,
        'randomforestclassifier__criterion' : criterion,
        'randomforestclassifier__bootstrap': bootstrap}

print(subGrid)

{'randomforestclassifier__n_estimators': [191, 192, 193, 194], 'randomforestclassifier__max_features': ['log2'], 'randomforestclassifier__max_depth': [29, 30], 'randomforestclassifier__min_samples_split': [76, 78, 86], 'randomforestclassifier__min_samples_leaf': [19, 21, 23], 'randomforestclassifier__class_weight': ['balanced'], 'randomforestclassifier__criterion': ['entropy'], 'randomforestclassifier__bootstrap': [False]}


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#################################
# Sub Grid Search
#################################

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

rfSubGridEstimator = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

rfSubGridModel = GridSearchCV(estimator = rfSubGridEstimator, 
                              param_grid= subGrid,  
                              cv = kfold_cv_object,
                              verbose=2, 
                              n_jobs = -1,
                              scoring=scoring,
                              refit='Accuracy', 
                              return_train_score=True)


# Fit the random search model
rfSubGridModel.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rfSubGridModel.best_params_, rfSubGridModel.best_score_))


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.1min finished


The best parameters are {'randomforestclassifier__bootstrap': False, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 29, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__min_samples_leaf': 19, 'randomforestclassifier__min_samples_split': 76, 'randomforestclassifier__n_estimators': 194} with a score of 0.69


In [20]:
########################################################
# Create Smaller grid 1 based upon Random Grid CV results
#######################################################

# Number of trees in random forest
n_estimators = [195, 196, 197, 198]

# Number of features to consider at every split
max_features = ['log2']

# Maximum number of levels in tree
max_depth = [31,32]

# Minimum number of samples required to split a node
min_samples_split = [77,79,87]

min_samples_leaf = [27,28,29]

criterion = ['entropy']

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
subGrid = {'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_features': max_features,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__min_samples_split': min_samples_split,
        'randomforestclassifier__min_samples_leaf': min_samples_leaf,
        'randomforestclassifier__class_weight': class_weight,
        'randomforestclassifier__criterion' : criterion,
        'randomforestclassifier__bootstrap': bootstrap}

print(subGrid)

{'randomforestclassifier__n_estimators': [195, 196, 197, 198], 'randomforestclassifier__max_features': ['log2'], 'randomforestclassifier__max_depth': [31, 32], 'randomforestclassifier__min_samples_split': [77, 79, 87], 'randomforestclassifier__min_samples_leaf': [27, 28, 29], 'randomforestclassifier__class_weight': ['balanced'], 'randomforestclassifier__criterion': ['entropy'], 'randomforestclassifier__bootstrap': [False]}


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#################################
# Sub Grid Search
#################################

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

rfSubGridEstimator = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

rfSubGridModel = GridSearchCV(estimator = rfSubGridEstimator, 
                              param_grid= subGrid,  
                              cv = kfold_cv_object,
                              verbose=2, 
                              n_jobs = -1,
                              scoring=scoring,
                              refit='Accuracy', 
                              return_train_score=True)


# Fit the random search model
rfSubGridModel.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rfSubGridModel.best_params_, rfSubGridModel.best_score_))


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 13.9min finished


The best parameters are {'randomforestclassifier__bootstrap': False, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 32, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__min_samples_leaf': 27, 'randomforestclassifier__min_samples_split': 79, 'randomforestclassifier__n_estimators': 197} with a score of 0.69


In [22]:
########################################################
# Create Smaller grid 1 based upon Random Grid CV results
#######################################################

# Number of trees in random forest
n_estimators = [191,199,208,209]

# Number of features to consider at every split
max_features = ['log2']

# Maximum number of levels in tree
max_depth = [33,34]

# Minimum number of samples required to split a node
min_samples_split = [88,89,90]

min_samples_leaf = [30,31,32]

criterion = ['entropy']

class_weight = ['balanced']

# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
subGrid = {'randomforestclassifier__n_estimators': n_estimators,
        'randomforestclassifier__max_features': max_features,
        'randomforestclassifier__max_depth': max_depth,
        'randomforestclassifier__min_samples_split': min_samples_split,
        'randomforestclassifier__min_samples_leaf': min_samples_leaf,
        'randomforestclassifier__class_weight': class_weight,
        'randomforestclassifier__criterion' : criterion,
        'randomforestclassifier__bootstrap': bootstrap}

print(subGrid)

{'randomforestclassifier__n_estimators': [191, 199, 208, 209], 'randomforestclassifier__max_features': ['log2'], 'randomforestclassifier__max_depth': [33, 34], 'randomforestclassifier__min_samples_split': [88, 89, 90], 'randomforestclassifier__min_samples_leaf': [30, 31, 32], 'randomforestclassifier__class_weight': ['balanced'], 'randomforestclassifier__criterion': ['entropy'], 'randomforestclassifier__bootstrap': [False]}


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#################################
# Sub Grid Search
#################################

scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

rfSubGridEstimator = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=999, n_jobs=-1)) 

rfSubGridModel = GridSearchCV(estimator = rfSubGridEstimator, 
                              param_grid= subGrid,  
                              cv = kfold_cv_object,
                              verbose=2, 
                              n_jobs = -1,
                              scoring=scoring,
                              refit='Accuracy', 
                              return_train_score=True)


# Fit the random search model
rfSubGridModel.fit(X, y=y)

print("The best parameters are %s with a score of %0.2f"
      % (rfSubGridModel.best_params_, rfSubGridModel.best_score_))


Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.0min finished


The best parameters are {'randomforestclassifier__bootstrap': False, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 34, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__min_samples_leaf': 30, 'randomforestclassifier__min_samples_split': 88, 'randomforestclassifier__n_estimators': 191} with a score of 0.69


### Selecting the best parameters from Five GridSearches

In this section, we are comparing the performance metrics using the five GridSearch parameters and choosing the best one to fit on the whole data set. 

In [26]:
# Fitting the model using parameter from first Grid Search
rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 191, min_samples_split = 88, min_samples_leaf = 30, \
                                                                         max_features = 'log2', max_depth = 34, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

#display(pd.DataFrame(scores).mean())

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)


 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.571489,0.691526,0.688301,0.717623,0.587227



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.80734,0.879171,0.798636,0.860085,0.76085


In [27]:
# Fitting the model using parameter from second Grid Search
rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 197, min_samples_split = 79, min_samples_leaf = 27, \
                                                                         max_features = 'log2', max_depth = 32, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

#display(pd.DataFrame(scores).mean())

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)


 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.577149,0.694355,0.690513,0.714612,0.593358



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.809876,0.881481,0.801027,0.861577,0.764184


In [28]:
# Fitting the model using parameter from third Grid Search
rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 194, min_samples_split = 76, min_samples_leaf = 19, \
                                                                         max_features = 'log2', max_depth = 29, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

#display(pd.DataFrame(scores).mean())

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)


 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.577023,0.695651,0.694627,0.717628,0.584089



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.814803,0.887103,0.805969,0.865543,0.769817


In [31]:
# Fitting the model using parameter from fourth Grid Search
rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 207, min_samples_split = 80, min_samples_leaf = 25, \
                                                                         max_features = 'log2', max_depth = 25, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

#display(pd.DataFrame(scores).mean())

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)


 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.578606,0.695207,0.691995,0.715741,0.58809



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.810158,0.882294,0.801361,0.861934,0.764393


In [32]:
# Fitting the model using parameter from fourth Grid Search
rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 200, min_samples_split = 83, min_samples_leaf = 22, \
                                                                         max_features = 'log2', max_depth = 28, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

scores = cross_validate(rfRandomGridEst, X, y=y, cv=kfold_cv_object, n_jobs=-1, scoring=scoring)

#display(pd.DataFrame(scores).mean())

print("\n Mean values for Performance Metrices on Test Set are .......")
testCol = ['test_F1_Score', 'test_AUC', 'test_Accuracy', 'test_Precision', 'test_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[testCol].mean()).T)

print("\n Mean values for Performance Metrices on Training Set are .......")
trainCol = ['train_F1_Score', 'train_AUC', 'train_Accuracy', 'train_Precision', 'train_Recall' ]
display(pd.DataFrame(pd.DataFrame(scores)[trainCol].mean()).T)


 Mean values for Performance Metrices on Test Set are .......


Unnamed: 0,test_F1_Score,test_AUC,test_Accuracy,test_Precision,test_Recall
0,0.580179,0.696036,0.693322,0.71494,0.590794



 Mean values for Performance Metrices on Training Set are .......


Unnamed: 0,train_F1_Score,train_AUC,train_Accuracy,train_Precision,train_Recall
0,0.811959,0.883871,0.803089,0.863058,0.766726


The parameter set that gives the highes accuracy is as below
n_estimators = 194
min_samples_split = 76
min_samples_leaf = 19
max_features = 'log2'
max_depth = 29
class_weight = "balanced"
criterion = 'entropy' 
bootstrap = False
random_state=999, n_jobs=-1

The metrics are 
	test_F1_Score	test_AUC	test_Accuracy	test_Precision	test_Recall
0	0.577023	0.695651	0.694627	0.717628	0.584089

These parameters are used to predict.

### Testing the model against the test data set

In [44]:
from sklearn import metrics as mt
# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

rfRandomGridEst = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 194, min_samples_split = 76, min_samples_leaf = 19, \
                                                                         max_features = 'log2', max_depth = 29, class_weight = "balanced", \
                                                                         criterion = 'entropy', bootstrap = False,random_state=999, n_jobs=-1))

#clf=RandomForestClassifier(rfRandomGridEst)

#Fit the model on the data set
rfRandomGridEst.fit(X,y)

y_hat = rfRandomGridEst.predict(X_Final)

tmpDict = dict()
tmpDict['Accuracy'] = mt.accuracy_score(y_Final,y_hat)
tmpDict['AuC'] = mt.roc_auc_score(y_Final,y_hat)
tmpDict['F1 Score'] = mt.f1_score(y_Final,y_hat)
tmpDict['Precision'] = mt.precision_score(y_Final,y_hat)
tmpDict['Recall'] = mt.recall_score(y_Final,y_hat)
    
modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)
   
print("\n Model Performnace Metrices")
display(modelPerformanceMetrices)



 Model Performnace Metrices


Unnamed: 0,Accuracy,AuC,F1 Score,Precision,Recall
0,0.795842,0.800809,0.808863,0.860352,0.763189
