# Modeling Pipeline
Creating a Pipeline to test out each model

### Importing Libraries

In [13]:
import _pickle as pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

### Loading the Data
Option to use either Dataset by just commenting out the undesired one.

In [4]:
# Top 10 features Dataset
with open("Top-10-Features-Models/top10_df.pkl",'rb') as fp:
    df = pickle.load(fp)
    
# Top 10 Correlated Dataset
# with open("Top-10-Correlation-Models/top10_corr_df.pkl",'rb') as fp:
#     df = pickle.load(fp)

### Scaling the Data

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

features_df = df.drop(["Decision"], 1)

scaled_df = pd.DataFrame(scaler.fit_transform(features_df), 
                               index=features_df.index, 
                               columns=features_df.columns)

df = scaled_df.join(df.Decision)

### Splitting the Data

In [6]:
X = df.drop(["Decision"], 1)
y = df.Decision

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Creating a Pipeline 
Using 10 Different Classification Models

In [7]:
# Importing the 10 models
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Preventing error from occuring: XGBoost causes kernel to die.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

### Creating pipelines for each model

In [10]:
# Adaboost
pipe_ada = Pipeline([('clf', AdaBoostClassifier())])

# Gradient Boost
pipe_gb  = Pipeline([('clf', GradientBoostingClassifier())])

# Random Forest
pipe_rf  = Pipeline([('clf', RandomForestClassifier())])

# Decision Tree
pipe_dt  = Pipeline([('clf', DecisionTreeClassifier())])

# Dummy (Baseline)
pipe_dum = Pipeline([('clf', DummyClassifier())])

# K Nearest Neighbors
pipe_knn = Pipeline([('clf', KNeighborsClassifier())])

# Logistic Regression
pipe_lr  = Pipeline([('clf', LogisticRegression())])

# Naive Bayes
pipe_nb  = Pipeline([('clf', GaussianNB())])

# Support Vector Machine
pipe_svm = Pipeline([('clf', SVC())])

# XGBoost
pipe_xgb = Pipeline([('clf', XGBClassifier())])

### Creating a List of Model Names and Pipelines

In [31]:
pipelines = [pipe_ada, pipe_gb, pipe_rf, pipe_dt, pipe_dum, pipe_knn, pipe_lr, pipe_nb, pipe_svm, pipe_xgb]

models = ['Adaboost', 
          'GradientBoost', 
          'RandomForest', 
          'DecisionTree', 
          'Dummy(Baseline)', 
          'KNN', 
          'LogisticRegression',
          'NaiveBayes',
          'SupportVectorMachine',
          'XGBoost']

# Zipping the the strings and pipelines together and creating a dictionary
model_pipelines = dict(zip(models, pipelines))

### Fitting and Training each Pipeline
(Using default parameters initially)

In [36]:
# Looping through each Pipeline to fit and train each model
for name, pipe in model_pipelines.items():
    print(pipe)
    pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('clf', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])
Pipeline(memory=None,
     steps=[('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])
Pipeline(memory=None,
     steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_scor



Pipeline(memory=None,
     steps=[('clf', GaussianNB(priors=None, var_smoothing=1e-09))])
Pipeline(memory=None,
     steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])




Pipeline(memory=None,
     steps=[('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])


### Classification Report for each Pipeline

In [37]:
# Looping through each model's predictions and getting their classification reports
for name, pipe in model_pipelines.items():
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    report = classification_report(y_test, pipe.predict(X_test), target_names=['Sell', 'Buy', 'Hold'], output_dict=True)
    print(report['macro avg']['f1-score'])


Adaboost (Macro Avg - F1 Score):
0.34822250432853535

GradientBoost (Macro Avg - F1 Score):
0.3431787749786248

RandomForest (Macro Avg - F1 Score):
0.3756400865302727

DecisionTree (Macro Avg - F1 Score):
0.36039506834044205

Dummy(Baseline) (Macro Avg - F1 Score):
0.3336222492246383

KNN (Macro Avg - F1 Score):
0.36300443032863144

LogisticRegression (Macro Avg - F1 Score):
0.2305696283747186

NaiveBayes (Macro Avg - F1 Score):
0.20873732913818646

SupportVectorMachine (Macro Avg - F1 Score):
0.2319655621129266

XGBoost (Macro Avg - F1 Score):
0.34300840024402146


Choosing the best performing model based on the __Macro Average for the F1 Score__.  This is due to the fact that we want to optimize the amount of classifications (_Recall_) and reduce the amount of misclassifications (_Precision_).  _Macro Average_ is used because of the class imbalance and the desire to classify more 'Buys' and 'Sells' than 'Holds'.

However, in the end, we will be evaluating on _Precision_ Score for Macro Average.

## Top 3 Classifiers for GridSearch
1. Random Forest
2. KNN
3. Decision Tree

_(Based on Macro Avg F1-Score)_

In [38]:
# Importing Grid Search
from sklearn.model_selection import GridSearchCV

In [39]:
top3 = 

KeyError: ('RandomForest', 'KNN')