# Part III: Ensembles and Final Result

## AdaBoost

Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.

In [8]:
import pandas as pd
import numpy as np

# load traning set
# note: it can be done by simply loading the data sets. 

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [9]:
import proj2_lib.util as utils

utils.file_config

{'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl',
 'objstore_path': 'objects',
 'processed_data_path': 'processed_data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'raw_data_path': 'data',
 'test_csv': 'test_set.csv',
 'train_csv': 'train_set.csv'}

In [10]:
file_config = utils.file_config
import proj2_lib.preprocess as preprocess
train_X, train_y = preprocess.load_train_data(config=file_config)
print(train_X.shape)
print(train_y.shape)

(90526, 101)
(90526,)


In [12]:
# AdaBoost code goes here
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth =1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf_fitted = ada_clf.fit(train_X,train_y)

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def measure_performance(X,y,clf, show_accuracy=True, show_AUC=True, show_classification_report=True, show_confusion_matrix=True):

    accuracy_scores = cross_val_score(clf, X, y, 
                        scoring="accuracy", cv=10)
    
    
    AUC_scores = cross_val_score(clf, X, y, 
                        scoring="roc_auc", cv=10)
    
    y_pred=clf.predict(X)   
    
    print ("Fitted model:")
    print (clf,"\n")
    if show_accuracy:
        print ("Accuracy:")
        print (accuracy_scores,"\n")
        print ("Mean Accuracy")
        print (accuracy_scores.mean(), "\n")
        
    if show_AUC:
        print ("AUC:")
        print (AUC_scores,"\n")        
        print ("Mean AUC")
        print (AUC_scores.mean(), "\n")        
        
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y,y_pred),"\n")
        
    if show_confusion_matrix:
        print ("Confusion matrix")
        print (metrics.confusion_matrix(y,y_pred),"\n")

In [8]:
# AdaBoost   cv=10
measure_performance(train_X,train_y,ada_clf_fitted, show_classification_report=True, show_confusion_matrix=True)

Fitted model:
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None) 

Accuracy:
[ 0.79863029  0.79642108  0.79763614  0.79851983  0.79818845  0.79807799
  0.79739284  0.79606717  0.79783473  0.79772426] 

Mean Accuracy
0.797649276693 

AUC:
[ 0.72792921  0.72593463  0.72843011  0.73287909  0.7216347   0.73067697
  0.72817761  0.72339046  0.73631401  0.72340417] 

Mean AUC
0.727877096994 

Classification report
             precision    recall  f1-score   support

         -1       0.80      1.00      0.89     72246
          1       0.48      0.01      0.02     18280

avg / total       0.73      0.80      

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def measure_performance_1fold(X,y,clf, show_accuracy=True, show_AUC=True, show_classification_report=True, show_confusion_matrix=True):
# k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more
    accuracy_scores = cross_val_score(clf, X, y, 
                        scoring="accuracy", cv=2) # cv=10
    
    
    AUC_scores = cross_val_score(clf, X, y, 
                        scoring="roc_auc", cv=2) #  cv=10
    
    y_pred=clf.predict(X)   
    
    print ("Fitted model:")
    print (clf,"\n")
    if show_accuracy:
        print ("Accuracy:")
        print (accuracy_scores,"\n")
        print ("Mean Accuracy")
        print (accuracy_scores.mean(), "\n")
        
    if show_AUC:
        print ("AUC:")
        print (AUC_scores,"\n")        
        print ("Mean AUC")
        print (AUC_scores.mean(), "\n")        
        
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y,y_pred),"\n")
        
    if show_confusion_matrix:
        print ("Confusion matrix")
        print (metrics.confusion_matrix(y,y_pred),"\n")

In [13]:
# AdaBoost cv=2
measure_performance_1fold(train_X,train_y,ada_clf_fitted, show_classification_report=True, show_confusion_matrix=True)

Fitted model:
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None) 

Accuracy:
[ 0.79771557  0.7972958 ] 

Mean Accuracy
0.797505688973 

AUC:
[ 0.72558335  0.72770482] 

Mean AUC
0.726644083359 

Classification report
             precision    recall  f1-score   support

         -1       0.80      1.00      0.89     72246
          1       0.48      0.01      0.02     18280

avg / total       0.73      0.80      0.71     90526
 

Confusion matrix
[[72060   186]
 [18111   169]] 



In part two we get mean AUC of 0.581839502662, 0.693280593178 , 0.667313327537 for decision tree, random forest and liner SVM using 10 fold CV, respectivly. 
Here we get an mean AUC of 0.727877096994 for CV=10 and AUC of 0.726644083359 for CV=2 (just 1 train and 1 validation set).


## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [15]:
import pandas as pd
import numpy as np

PROCESSED_DATA_DIR = 'processed_data'
clean_df = pd.read_csv(PROCESSED_DATA_DIR + "/train_set.csv", parse_dates=['ScheduledDay','AppointmentDay'],
                      dtype={'Age': np.float64}, encoding='latin-1')
clean_df.head()

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1234)
for train_part3_index, validation_index in split.split(clean_df, clean_df['No-show']):
    train_PIII_set = clean_df.iloc[train_part3_index]
    validation_set = clean_df.iloc[validation_index]

# check class proportions on train and test sets to make sure 
# properly stratified
print("Train part3 set:")
print(train_PIII_set['No-show'].value_counts() / len(train_PIII_set))

print("validation set:")
print(validation_set['No-show'].value_counts() / len(validation_set))
print(len(clean_df))
print(len(train_PIII_set))
print(len(validation_set))

# save train and test sets as csvs
#train_PIII_set.to_csv(PROCESSED_DATA_DIR + '/train_PIII_set.csv', index=False)
#validation_set.to_csv(PROCESSED_DATA_DIR + '/validation_set.csv', index=False)

train_PIII_set_labels = train_PIII_set['No-show'].copy()
train_PIII_set = train_PIII_set.drop('No-show', axis=1)
train_PIII_set.head()


Train part3 set:
No     0.798068
Yes    0.201932
Name: No-show, dtype: float64
validation set:
No     0.798071
Yes    0.201929
Name: No-show, dtype: float64
90526
63368
27158


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received
15544,346975800000.0,5624436,F,2016-04-26 16:02:13,2016-05-12,70.0,SANTO ANTÃ?NIO,0,0,0,0,0,1
4267,2957690000000.0,5767809,F,2016-06-03 07:06:41,2016-06-07,73.0,ROMÃ?O,0,1,1,0,0,0
85469,356143100000.0,5703318,F,2016-05-16 13:46:33,2016-05-17,53.0,SANTA MARTHA,1,0,0,0,1,0
81643,82433900000000.0,5692846,F,2016-05-12 16:28:47,2016-05-13,34.0,SANTO ANDRÃ?,1,0,0,0,0,0
226,4476215000000.0,5539468,F,2016-04-01 15:01:22,2016-04-29,15.0,CONSOLAÃ?Ã?O,0,0,0,0,0,1


In [57]:
# Edit the config file to split the traing set into new training (train_p3_set) set 
# and new test set (validation set)
new_config = utils.file_config.copy()
new_config['test_csv']= 'validation_set.csv'
new_config['train_csv']= 'train_p3_set.csv'
new_config['raw_data_csv']= 'train_set.csv'
new_config['raw_data_path']= 'processed_data'
new_config

{'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl',
 'objstore_path': 'objects',
 'processed_data_path': 'processed_data',
 'raw_data_csv': 'train_set.csv',
 'raw_data_path': 'processed_data',
 'test_csv': 'validation_set.csv',
 'train_csv': 'train_p3_set.csv'}

In [53]:
# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=new_config)

In [61]:
import proj2_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE
RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=new_config)

In [63]:
# Note : the test set here is validation set
train_X, train_y = preprocess.load_train_data(config=new_config)
print(train_X.shape)
print(train_y.shape)
#test_X, test_y = preprocess.load_test_data(config=new_config)
#print(test_X.shape)
#print(test_y.shape)

(70526, 101)
(70526,)


In [25]:
def build_stack_ensemble(X, y):
    import pandas as pd
    import numpy as np

    # create train/validation sets
    # using StratifiedShuffleSplit
    from sklearn.model_selection import StratifiedShuffleSplit
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2 , random_state=1234)
    data_set=pd.DataFrame(X.copy())
    data_set['label']=y.copy().ravel() 
    
    for train_index, val_index in split.split(data_set, data_set["label"]):
        train_p3_set = data_set.iloc[train_index]
        val_p3_set = data_set.iloc[val_index]

    train_p3_y = np.array(pd.DataFrame(train_p3_set['label'].copy(), columns=["label"])).ravel()
    train_p3_X = np.array(train_p3_set.drop('label', axis=1))

    val_p3_y = np.array(pd.DataFrame(val_p3_set['label'].copy(), columns=["label"])).ravel()
    val_p3_X = np.array(val_p3_set.drop('label', axis=1))    
    
    
    
    # train classifiers in ensemble using train set
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier    
    from sklearn.svm import LinearSVC
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier 
    from sklearn.linear_model import LogisticRegression
    
    # "Decision Tree", "Random Forest", "Linear SVM", "AdaBoost", "Nearest Neighbors" 
    # "Gaussian Process","Neural Net", "Naive Bayes", "QDA"

    classifiers = [DecisionTreeClassifier(), RandomForestClassifier(),LinearSVC(), AdaBoostClassifier(), KNeighborsClassifier(n_neighbors=7)]

    
    clf1 = DecisionTreeClassifier()
    clf1_fitted = clf1.fit(train_p3_X, train_p3_y)

    clf2 = RandomForestClassifier()
    clf2_fitted = clf2.fit(train_p3_X, train_p3_y)
    
    clf3 = LinearSVC()
    clf3_fitted = clf3.fit(train_p3_X, train_p3_y)

    clf4 = AdaBoostClassifier()
    clf4_fitted = clf4.fit(train_p3_X, train_p3_y)    
    
    clf5 = KNeighborsClassifier(n_neighbors=7)
    clf5_fitted = clf5.fit(train_p3_X, train_p3_y)        

    
    level2_feature_matrix=np.full((val_p3_X.shape[0],5), 0.0)
    count = 0
    # iterate over classifiers
    for clf in (clf1_fitted, clf2_fitted, clf3_fitted, clf4_fitted, clf4_fitted):
        
    # create new feature matrix for validation
    # set by getting predictions from the ensemble
    # classifiers            
        level2_feature_matrix[:,count] = clf.predict(val_p3_X)
        count+=1
      
    # train logistic regression classifier on
    # new feature matrix
    LR = LogisticRegression()
    LR_fitted = LR.fit(level2_feature_matrix, val_p3_y)
    
    
    # return all trained classifiers
    return (clf1_fitted,clf2_fitted,clf3_fitted,clf4_fitted,clf5_fitted,LR_fitted)


In [26]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class StackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        
        return None 
        
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        self.clf1_, self.clf2_ , self.clf3_, self.clf4_, self.clf5_, self.LR_ = build_stack_ensemble(X, y)        

        return self
        # Fit
        #super(Stack_Ensemble_Classifier, self).fit(X, y)  
        #return self      
    
    def decision_function(self, X):
        check_is_fitted(self, ['clf1_', 'clf2_', 'clf3_', 'clf4_', 'clf5_', 'LR_'])
        X = check_array(X)
        
        level2_feature_matrix=np.full((X.shape[0],5), 0.0)
        count=0
        for clf in (self.clf1_, self.clf2_ , self.clf3_, self.clf4_,self.clf5_):
            level2_feature_matrix[:,count]=clf.predict(X)
            count+=1
        
        return self.LR_.predict(level2_feature_matrix)
    
    def predict(self, X):
        
        f = self.decision_function(X)
        return f

In [34]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

stack_clf = StackingClassifier()
stack_clf_fitted = stack_clf.fit(train_X,train_y)
#roc_scores = cross_val_score(stack_clf, train_X,train_y, scoring="roc_auc", cv=10)
measure_performance_1fold(train_X,train_y,stack_clf_fitted, show_classification_report=True, show_confusion_matrix=True)

Fitted model:
StackingClassifier() 

Accuracy:
[ 0.7979365   0.79696441] 

Mean Accuracy
0.797450456223 

AUC:
[ 0.51894196  0.51860314] 

Mean AUC
0.518772549006 

Classification report
             precision    recall  f1-score   support

         -1       0.83      1.00      0.91     72246
          1       0.93      0.22      0.35     18280

avg / total       0.85      0.84      0.79     90526
 

Confusion matrix
[[71961   285]
 [14349  3931]] 



Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [35]:
# final result goes here

# AdaBoost trained on all training dataset
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth =1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf_fitted = ada_clf.fit(train_X,train_y)
#measure_performance(train_X,train_y,ada_clf_fitted, show_classification_report=True, show_confusion_matrix=True)

In [36]:
file_config = utils.file_config
import proj2_lib.preprocess as preprocess
test_X, test_y = preprocess.load_test_data(config=file_config)
print(test_X.shape)
print(test_y.shape)

(20000, 101)
(20000,)


In [37]:
# test using test dataset
ada_clf_fitted = ada_clf.fit(test_X,test_y)
measure_performance(test_X,test_y,ada_clf_fitted, show_classification_report=True, show_confusion_matrix=True)

Fitted model:
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None) 

Accuracy:
[ 0.79610195  0.799       0.797       0.7975      0.796       0.7945
  0.7975      0.7985      0.796       0.79789895] 

Mean Accuracy
0.79700008985 

AUC:
[ 0.71969333  0.71786908  0.71090319  0.72948848  0.72423168  0.72437902
  0.71775664  0.72437204  0.74670898  0.7442785 ] 

Mean AUC
0.72596809356 

Classification report
             precision    recall  f1-score   support

         -1       0.80      0.99      0.89     15961
          1       0.49      0.02      0.04      4039

avg / total       0.74      0.80      0.72  