In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_context("paper")   

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

import statistics
import math
from random import random
from IPython.display import display
from sklearn.preprocessing import LabelEncoder

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb # from xgboost import XGBClassifier
import lightgbm as lgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

d_train = pd.read_pickle('dev_rsc/d_train.pk1')
Target = ['Survived']
data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']

-------------------------


In [35]:
def prtSep(length=20, marker="-"):
    print( marker * length)

In [33]:
#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    
    # XGB 
    #     conda install -c anaconda py-xgboost
    xgb.XGBClassifier(),
     
    # LGB
    lgb.LGBMClassifier()

    ]
#split dataset in cross-validation with this splitter class: 
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
MLA_predict = d_train[Target]

#index through MLA and save performance to table
row_index = 0
for alg in MLA:
    
    #set name and parameters
    MLA_name = alg.__class__.__name__

    print("Processing:MLA_name)
    
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score model with cross validation
    cv_results = model_selection.cross_validate(alg, d_train[data1_x_bin], d_train[Target], cv  = cv_split)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()   
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    

    #save MLA predictions - see section 6 for usage
    alg.fit(d_train[data1_x_bin], d_train[Target])
    MLA_predict[MLA_name] = alg.predict(d_train[data1_x_bin])
    
    row_index+=1


#print and sort table
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
#MLA_predict

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
22,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",,0.829478,0.0623358,0.0431961
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",,0.829478,0.0594549,0.105601
21,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",,0.829478,0.0527546,0.0346958
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",,0.827612,0.0409157,0.0236037
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...",,0.826119,0.0456629,0.028099
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",,0.822388,0.0627665,0.0803951
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",,0.822015,0.0529916,0.0551973
17,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",,0.821269,0.0518925,0.00199993
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",,0.81903,0.0710182,0.0131007
13,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",,0.813806,0.0690863,0.00280523
