# Creation of pickled pipelines
Note that `feature_importance.ipynb` was used to generate the file `feature_importance.csv`, which ranks the features by importance according to several metrics.

Our goal is to implement the following pipelines using the top10, top20, top50, top100, top150 and all features:
* Pipeline 1: Logistic regression (`o`)
* Pipeline 2: Naive bayes
* Pipeline 3: Random forest
* Pipeline 4: XGBoost
* Pipeline 5: Neural Network [To do]
* Pipeline 6: SVM [To do]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.svm import LinearSVC

from pipeline_utilities import create_base_pipeline
from pipeline_utilities import pickle_pipeline
from load_data import load_data

In [2]:
# Unpack the ten objects returned by the project-local loader in a single call.
(
    X, X_ensemble, X_dropout,
    y, y_ensemble, y_dropout,
    X_train, y_train,
    train_csv, test_csv,
) = load_data()

In [7]:
# Dimensions of X, the first object returned by load_data().
X.shape

(80000, 200)

In [6]:
# Dimensions of X_ensemble, the second object returned by load_data().
X_ensemble.shape

(60000, 200)

In [8]:
# Dimensions of X_dropout, the third object returned by load_data().
X_dropout.shape

(60000, 200)

In [4]:
# Dimensions of X_train. The recorded row count (140000) equals the rows of
# X (80000) plus X_ensemble (60000) -- presumably X_train is their union;
# TODO confirm in load_data.
X_train.shape

(140000, 200)

In [11]:
# Peek at the last rows of X_train (rows are indexed by ID_code).
X_train.tail()

Unnamed: 0_level_0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_40229,15.1432,-8.3687,12.2701,5.2384,11.4195,-10.7507,5.2407,12.481,-0.9351,9.272,...,-3.6074,12.6048,3.7759,1.0008,16.2003,0.1415,3.0516,8.2448,17.163,13.4558
train_163637,8.7181,0.7561,6.7905,7.025,11.7998,-8.2966,6.0874,16.2614,-4.1693,5.9257,...,2.4634,14.3199,2.6097,1.5432,20.3437,1.3267,-1.3356,9.5717,18.8114,-5.4102
train_95573,10.2345,7.8213,10.9351,4.2713,13.1699,2.2795,4.2074,19.5125,-0.7537,8.4267,...,6.3495,1.9259,3.9075,9.2249,15.8319,0.148,1.2781,10.2785,13.8203,16.0938
train_104667,12.1894,-1.2565,12.1256,6.4777,12.0089,2.0835,5.2671,14.71,4.3011,7.9739,...,4.8735,9.2275,2.3494,9.6457,20.5058,-0.4216,-0.5202,7.439,17.392,-8.583
train_133952,16.3986,2.4849,13.4346,8.3635,12.5364,8.9614,4.9465,17.7668,1.0017,7.6885,...,-1.8904,6.8405,3.3239,0.2475,24.0276,-1.0515,5.6184,9.572,13.152,0.0644


In [13]:
# Peek at the last rows of X_ensemble. The recorded output is identical to
# that of X_train.tail(), which suggests X_ensemble forms the tail of
# X_train -- TODO confirm in load_data.
X_ensemble.tail()

Unnamed: 0_level_0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_40229,15.1432,-8.3687,12.2701,5.2384,11.4195,-10.7507,5.2407,12.481,-0.9351,9.272,...,-3.6074,12.6048,3.7759,1.0008,16.2003,0.1415,3.0516,8.2448,17.163,13.4558
train_163637,8.7181,0.7561,6.7905,7.025,11.7998,-8.2966,6.0874,16.2614,-4.1693,5.9257,...,2.4634,14.3199,2.6097,1.5432,20.3437,1.3267,-1.3356,9.5717,18.8114,-5.4102
train_95573,10.2345,7.8213,10.9351,4.2713,13.1699,2.2795,4.2074,19.5125,-0.7537,8.4267,...,6.3495,1.9259,3.9075,9.2249,15.8319,0.148,1.2781,10.2785,13.8203,16.0938
train_104667,12.1894,-1.2565,12.1256,6.4777,12.0089,2.0835,5.2671,14.71,4.3011,7.9739,...,4.8735,9.2275,2.3494,9.6457,20.5058,-0.4216,-0.5202,7.439,17.392,-8.583
train_133952,16.3986,2.4849,13.4346,8.3635,12.5364,8.9614,4.9465,17.7668,1.0017,7.6885,...,-1.8904,6.8405,3.3239,0.2475,24.0276,-1.0515,5.6184,9.572,13.152,0.0644


## Get 'most relevant' features

In [None]:
# Load the ranking written by feature_importance.ipynb, indexed by feature
# name and ordered by ascending 'average_rank'.
feature_importance = (
    pd.read_csv('feature_importance.csv', index_col='feature')
    .sort_values(by='average_rank')
)

# Feature names in ranked order; every top-N list is a prefix of this ranking.
ranked_features = feature_importance.index
top10_features = list(ranked_features[:10])
top20_features = list(ranked_features[:20])
top50_features = list(ranked_features[:50])
top100_features = list(ranked_features[:100])
top150_features = list(ranked_features[:150])
all_features = list(ranked_features)

feature_importance.head()

## Description of the pipelines we will use

In [None]:
# Each entry is a (classifier, parameters, features, filename) tuple. The
# training cell unpacks descriptors in that order and hands the whole tuple to
# create_base_pipeline.
# `parameters` is presumably the hyper-parameter grid searched by
# create_base_pipeline -- TODO confirm in pipeline_utilities.
base_pipeline_descriptors = []

# Feature subsets keyed by the suffix used in the pickle filenames.
feature_sets = {
    'top10': top10_features,
    'top20': top20_features,
    'top50': top50_features,
    'top100': top100_features,
    'top150': top150_features,
    'all': all_features,
}


def _add_descriptors(classifier, parameters, prefix, subsets):
    """Append one descriptor per requested feature subset.

    Parameters
    ----------
    classifier : estimator instance stored as-is in every descriptor.
    parameters : dict stored as-is in every descriptor.
    prefix : str, filename stem; the file is named '<prefix>_<subset>.pkl'.
    subsets : iterable of keys of `feature_sets`, in the order to append.

    Raises
    ------
    KeyError if a subset name is not a key of `feature_sets`.
    """
    for subset in subsets:
        filename = '{}_{}.pkl'.format(prefix, subset)
        base_pipeline_descriptors.append(
            (classifier, parameters, feature_sets[subset], filename)
        )


# TODO: linear support vector classifier (not enabled yet)
# _add_descriptors(LinearSVC(random_state=0, tol=1e-5), {}, 'linearsvm',
#                  ['top10', 'top20', 'top50', 'top100', 'top150', 'all'])

# TODO: support vector machines (not enabled yet)
# _add_descriptors(svm.SVC(), {'gamma': ['scale']}, 'svm',
#                  ['top10', 'top20', 'top50', 'top100', 'top150', 'all'])

# linear classifier
# The classifier and grid are defined here explicitly: previously they were
# commented out while the append below was still active, so the cell raised
# NameError on a fresh kernel (or silently reused a stale classifier).
# solver='liblinear' is spelled out because the l1 penalty is not supported by
# every LogisticRegression solver.
classifier = LogisticRegression(penalty='l1', solver='liblinear')
parameters = {'C': 10**np.linspace(-4, -2, 20)}
# Only top10 is enabled; add 'top20', 'top50', 'top100', 'top150', 'all' to extend.
_add_descriptors(classifier, parameters, 'linear_classifier', ['top10'])

# naive bayes
# Defined explicitly for the same reason as the linear classifier above.
classifier = GaussianNB()
parameters = {}
_add_descriptors(classifier, parameters, 'naive_classifier',
                 ['top10', 'top20', 'top50', 'top100', 'top150', 'all'])

# random forest
# NOTE(review): no random_state is set, so refits are not reproducible.
classifier = RandomForestClassifier()
parameters = {
    'max_depth': [5, 10, 20],
    'min_samples_leaf': [10, 100, 1000],
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt'],
}
# Only the full feature set is enabled for the forest.
_add_descriptors(classifier, parameters, 'forest_classifier', ['all'])

# xgboost
classifier = XGBClassifier()
parameters = {
    'max_depth': [5, 10, 20],
    'min_child_weight': [10, 100, 1000],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 100],
}
# Only the full feature set is enabled for xgboost. This must stay the last
# descriptor appended: the training cell processes base_pipeline_descriptors[-1:].
_add_descriptors(classifier, parameters, 'xgboost_classifier', ['all'])

## Creation, training and pickling

In [None]:
## main code
# Only the last descriptor in the list is trained and pickled by this cell.
descriptors_to_run = base_pipeline_descriptors[-1:]
for descriptor in descriptors_to_run:
    # Descriptor layout: (classifier, parameters, features, filename).
    classifier, parameters, features, filename = descriptor
    # The whole descriptor is handed to the project helper, together with cv=2.
    base_pipeline = create_base_pipeline(descriptor, cv=2)
    base_pipeline.fit(X, y)
    pickle_pipeline(base_pipeline, filename)