# Modeling Pipeline
Creating a Pipeline to test out each model

### Importing Libraries

In [13]:
import _pickle as pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

### Loading the Data
Either dataset can be used: keep the one you want active and comment out the other.

In [4]:
# Pick the dataset by leaving exactly one `dataset_path` assignment active.

# Top 10 features Dataset
dataset_path = "Top-10-Features-Models/top10_df.pkl"

# Top 10 Correlated Dataset
# dataset_path = "Top-10-Correlation-Models/top10_corr_df.pkl"

# Unpickle the chosen DataFrame.
# NOTE(review): pickle can execute arbitrary code on load; only open files you produced yourself.
with open(dataset_path, 'rb') as pkl_file:
    df = pickle.load(pkl_file)

### Scaling the Data

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Separate the predictors from the "Decision" target so that only the features are scaled.
# `columns=` is used because the positional axis argument (`df.drop(labels, 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
features_df = df.drop(columns=["Decision"])

# Standardise each feature column, keeping the original index and column labels.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split,
# so test-set statistics leak into the training features. Consider fitting the scaler
# on the training split only (e.g. as the first step of each Pipeline).
scaled_df = pd.DataFrame(
    scaler.fit_transform(features_df),
    index=features_df.index,
    columns=features_df.columns,
)

# Re-attach the unscaled target column (joined on the index).
df = scaled_df.join(df["Decision"])

### Splitting the Data

In [6]:
# Predictors are every column except the "Decision" target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns=["Decision"])
y = df["Decision"]

# Train, test, split (scikit-learn's default 75% / 25% proportions).
# A fixed seed keeps the split, and therefore every metric reported below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Creating a Pipeline 
Using 10 Different Classification Models

In [7]:
# Importing the 10 models
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Workaround for the kernel dying when XGBoost is used: KMP_DUPLICATE_LIB_OK is set
# in the process environment before xgboost is imported.
# NOTE(review): presumably the crash comes from more than one OpenMP runtime being
# loaded into the process; confirm, since this flag suppresses the check rather than
# removing the duplicate library.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

### Creating pipelines for each model

In [10]:
def _single_step_pipeline(estimator):
    """Wrap `estimator` in a Pipeline whose only step is named 'clf'."""
    return Pipeline(steps=[('clf', estimator)])


# Every classifier is created with its library-default hyperparameters.
pipe_ada = _single_step_pipeline(AdaBoostClassifier())           # Adaboost
pipe_gb  = _single_step_pipeline(GradientBoostingClassifier())   # Gradient Boost
pipe_rf  = _single_step_pipeline(RandomForestClassifier())       # Random Forest
pipe_dt  = _single_step_pipeline(DecisionTreeClassifier())       # Decision Tree
pipe_dum = _single_step_pipeline(DummyClassifier())              # Dummy (Baseline)
pipe_knn = _single_step_pipeline(KNeighborsClassifier())         # K Nearest Neighbors
pipe_lr  = _single_step_pipeline(LogisticRegression())           # Logistic Regression
pipe_nb  = _single_step_pipeline(GaussianNB())                   # Naive Bayes
pipe_svm = _single_step_pipeline(SVC())                          # Support Vector Machine
pipe_xgb = _single_step_pipeline(XGBClassifier())                # XGBoost

### Creating a List of Model Names and Pipelines

In [11]:
# Display name -> pipeline, listed in the order the models are fitted and reported.
named_pipelines = {
    'Adaboost': pipe_ada,
    'GradientBoost': pipe_gb,
    'RandomForest': pipe_rf,
    'DecisionTree': pipe_dt,
    'Dummy(Baseline)': pipe_dum,
    'KNN': pipe_knn,
    'LogisticRegression': pipe_lr,
    'NaiveBayes': pipe_nb,
    'SupportVectorMachine': pipe_svm,
    'XGBoost': pipe_xgb,
}

# Parallel lists used by the later cells; dicts preserve insertion order,
# so models[i] is always the name of pipelines[i].
pipelines = list(named_pipelines.values())
models = list(named_pipelines)

### Fitting and Training each Pipeline

In [12]:
# Train every pipeline on the training split, echoing its configuration beforehand
for candidate in pipelines:
    print(candidate)
    candidate.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('clf', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])
Pipeline(memory=None,
     steps=[('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])
Pipeline(memory=None,
     steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_



Pipeline(memory=None,
     steps=[('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
Pipeline(memory=None,
     steps=[('clf', DummyClassifier(constant=None, random_state=None, strategy='stratified'))])
Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
Pipeline(memory=None,
     steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001



Pipeline(memory=None,
     steps=[('clf', GaussianNB(priors=None, var_smoothing=1e-09))])
Pipeline(memory=None,
     steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])




Pipeline(memory=None,
     steps=[('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])


### Classification Report for each Pipeline

In [17]:
# Print a per-class precision/recall/F1 report on the test split for each fitted model.
# NOTE(review): target_names are applied in sorted-label order; confirm that the
# "Decision" encoding really sorts as Sell, Buy, Hold.
class_names = ['Sell', 'Buy', 'Hold']
separator = ' -'*30

for model_name, fitted_pipe in zip(models, pipelines):
    print('\n'+ model_name + separator)

    predictions = fitted_pipe.predict(X_test)
    report = classification_report(y_test, predictions, target_names=class_names)
    print(report)


Adaboost - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

        Sell       0.35      0.04      0.08      2898
         Buy       0.47      0.35      0.40      5509
        Hold       0.46      0.74      0.57      6636

   micro avg       0.46      0.46      0.46     15043
   macro avg       0.43      0.38      0.35     15043
weighted avg       0.44      0.46      0.41     15043


GradientBoost - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

        Sell       0.35      0.03      0.06      2898
         Buy       0.48      0.34      0.39      5509
        Hold       0.46      0.76      0.57      6636

   micro avg       0.46      0.46      0.46     15043
   macro avg       0.43      0.38      0.34     15043
weighted avg       0.45      0.46      0.41     15043


RandomForest - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
              preci