# Creation of pickled pipelines
Note that `feature_importance.ipynb` was used to generate the file `feature_importance.csv`, which ranks the features by importance according to several metrics.

Our goal is to implement the following pipelines using the top 10, top 20, top 50, top 100, top 150 and all features:
* Pipeline 1: Logistic regression (`o`)
* Pipeline 2: Naive bayes
* Pipeline 3: Random forest
* Pipeline 4: XGBoost
* Pipeline 5: Neural Network [To do]
* Pipeline 6: SVM [To do]

* https://scikit-learn.org/stable/modules/naive_bayes.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from pipeline_utilities import create_base_pipeline
from pipeline_utilities import pickle_pipeline


In [2]:
# Load the training set once, indexed by the row identifier column.
train_csv = pd.read_csv('train.csv').set_index('ID_code')
# The test set is not used in this notebook:
#test_csv = pd.read_csv('test.csv').set_index('ID_code')

# Separate the predictor columns from the label column.
Features = train_csv.drop(columns='target')
target = train_csv.target

# Hold out 25% of the rows (X_dropout / y_dropout); the fixed random_state
# makes the split reproducible across runs.
X, X_dropout, y, y_dropout = train_test_split(
    Features, target, test_size=0.25, random_state=42
)

## Get 'most relevant' features

In [3]:
# Read the feature ranking and order it from best (lowest) to worst average rank.
feature_importance = pd.read_csv('feature_importance.csv', index_col='feature')
feature_importance = feature_importance.sort_values(by='average_rank')

# Feature names in ranking order; every top-N list is a prefix of this list.
all_features = list(feature_importance.index)
top10_features = all_features[:10]
top20_features = all_features[:20]
top50_features = all_features[:50]
top100_features = all_features[:100]
top150_features = all_features[:150]

feature_importance.head()

Unnamed: 0_level_0,tree_importance,linear_importance,linear_rank,tree_rank,average_rank
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
var_81,0.015771,0.221647,1.0,1.0,1.0
var_139,0.011978,0.202066,2.0,3.0,2.5
var_12,0.012252,0.187078,4.0,2.0,3.0
var_53,0.010661,0.177735,5.0,4.0,4.5
var_110,0.010287,0.172094,8.0,5.0,6.5


## Description of the pipelines we will use

In [4]:
# Each descriptor is a tuple:
#   (classifier, hyper-parameter grid, feature list, pickle filename)
# The order of this list matters: the training cell selects descriptors by position.
base_pipeline_descriptors = []

# Feature subsets, in the order in which descriptors are generated for every model.
feature_sets = [
    ('top10', top10_features),
    ('top20', top20_features),
    ('top50', top50_features),
    ('top100', top100_features),
    ('top150', top150_features),
    ('all', all_features),
]

# (filename prefix, estimator, hyper-parameter grid) for every model family.
model_specs = [
    # linear classifier
    # The solver is set explicitly: the L1 penalty is not supported by 'lbfgs',
    # the default solver in scikit-learn >= 0.22, whereas 'liblinear' (the earlier
    # default) supports it.
    (
        'linear',
        LogisticRegression(penalty='l1', solver='liblinear'),
        {'C': 10**np.linspace(-4, -2, 20)},
    ),
    # naive bayes (empty grid: nothing is tuned)
    (
        'naive',
        GaussianNB(),
        {},
    ),
    # random forest
    (
        'forest',
        RandomForestClassifier(),
        {
            'max_depth': [5, 10, 20],
            'min_samples_leaf': [10, 100, 1000],
            'n_estimators': [50, 100, 200],
            'max_features': ['sqrt'],
        },
    ),
    # xgboost
    (
        'xgboost',
        XGBClassifier(),
        {
            'max_depth': [5, 10, 20],
            'min_child_weight': [10, 100, 1000],
            'n_estimators': [50, 100, 200],
            'gamma': [0, 100],
        },
    ),
]

# One descriptor per (model, feature subset) pair: all subsets of the first model,
# then all subsets of the second, and so on. The estimator and grid objects are
# shared by the six descriptors of a model family.
for prefix, classifier, parameters in model_specs:
    for label, features in feature_sets:
        filename = '{}_classifier_{}.pkl'.format(prefix, label)
        base_pipeline_descriptors.append((classifier, parameters, features, filename))

## Creation, training and pickling

In [None]:
## main code
# Only part of the descriptor list is trained in this run. Positions 13-17 hold the
# random-forest descriptors for the top20, top50, top100, top150 and all-feature sets.
# Set this to slice(None) to train every descriptor.
DESCRIPTORS_TO_RUN = slice(13, 18)

for descriptor in base_pipeline_descriptors[DESCRIPTORS_TO_RUN]:
    # Descriptors are (classifier, parameters, features, filename); only the
    # filename is needed here, the helper receives the whole descriptor.
    filename = descriptor[3]
    # cv=2 is forwarded to the helper (presumably the number of cross-validation
    # folds -- confirm in pipeline_utilities).
    base_pipeline = create_base_pipeline(descriptor, cv=2)
    base_pipeline.fit(X, y)
    pickle_pipeline(base_pipeline, filename)

forest_classifier_top20.pkl was created
forest_classifier_top50.pkl was created


In [7]:
# base_pipeline is the pipeline fitted last by the training loop; the
# probabilities are computed on the training rows X.
train_proba = base_pipeline.predict_proba(X)
# Column 1 is the second class in the fitted model's class ordering
# (presumably target == 1 -- confirm against classes_).
train_proba[:, 1]

array([0.12646402, 0.08474043, 0.09292218, ..., 0.16774449, 0.13382349,
       0.06882277])