In [1]:
import pandas as pd
import numpy as np
import sklearn.datasets

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

## 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please use the show_eda to control if EDA stuff is shown. I don't really need to see all the EDA stuff (nor do you after you've done it), so we can make it configurable with a variable to speed up time. Please set this FALSE when you submit, so I can run all and see the outcome without histograms etc...
<li>Please ensure that whatever model you end up with is in a variable named best at the end.
<li>Please use some pipeline in prepping the data. The test data is in an identical format to the training data, so whatever pipeline you've created for your training will work for the testing. 
<li>The accuracy scoring will be an average of accuracy and roc_auc. 
</ul>

### Grading Metrics
<ul>
<li><b>Pipeline Used - 10pts</b> The data loading needs to be in a pipeline. See the test part for illustration. When testing I'll call your pipe with the new data (format is identical to training), so any prep stuff should be in the pipeline. 
<li><b>Tree Based Model Used - 5pts</b> The model used for classification needs to be some variety of tree, beyond that it is up to you. 
<li><b>Accuracy - 5pts</b> The final accuracy acheived. This will be a rough ranking, I'm assuming most people will get a similar level of accuracy, marks will only be deducted if yours is far wosrse, as that's an indication that you probably didn't take any/many steps to improve things. 
<li><b>Clarity and Formatting - 5pts</b> Is it organized and can I read it?
    <ul>
    <li> <b>Note:</b> for this assignment, and in general, please get rid of my comments and replace them with your own. I'm going to read this, so all of these instructions aren't really required. Think of this as a template, get rid of the stuff that isn't needed, and leave only the things you need to explain your code. 
    </ul>
</ul>

For submission, please drop the URL for your repository in the dropbox.

In [2]:

name = "Michael Harris"


#### Data Setup

In [3]:
#Load data
df = pd.read_csv("training.csv")
df = df.drop(columns={"id"})
df.sample(5)

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
214,0,0.078,0.03,0.126,0.917,0.488,0.382,0.569,0.34,0.223,...,0.783,0.009,0.773,0.598,0.465,0.771,0.313,0.748,0.953,0.744
156,0,0.89,0.17,0.975,0.565,0.961,0.712,0.331,0.793,0.64,...,0.23,0.317,0.908,0.02,0.717,0.462,0.909,0.958,0.326,0.619
50,1,0.848,0.319,0.46,0.313,0.112,0.078,0.776,0.834,0.394,...,0.424,0.767,0.467,0.614,0.118,0.71,0.807,0.335,0.987,0.259
242,0,0.04,0.211,0.379,0.776,0.068,0.18,0.563,0.191,0.168,...,0.849,0.21,0.98,0.545,0.919,0.525,0.708,0.601,0.282,0.092
141,1,0.628,0.865,0.663,0.681,0.581,0.457,0.784,0.87,0.6,...,0.154,0.464,0.204,0.414,0.185,0.61,0.305,0.751,0.51,0.876


In [4]:
# Create testing and training data
df2 = pd.get_dummies(df, drop_first=True)
y = np.array(df2["target"]).reshape(-1,1)
X = np.array(df2.drop(columns={"target"}))
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Modelling

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Create model using scalar and a forest
scaler = StandardScaler()
estimator = RandomForestClassifier(n_jobs=-1)
pipe = Pipeline(steps=[("scaler", scaler), ("forest", estimator)])

params = {'forest__max_depth':[5,6,7,8,9],
          'forest__min_samples_split':[3,4,5,6,7],
          'forest__min_samples_leaf' :[2,3,4,5,6],
          'forest__n_estimators' :[20,50,80,100]}
 
best = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=-1) 
best.fit(X_train, y_train.ravel())
best1 = best.best_estimator_
print(best1.score(X_test, y_test))
print(best1)

0.6349206349206349
Pipeline(steps=[('scaler', StandardScaler()),
                ('forest',
                 RandomForestClassifier(max_depth=7, min_samples_leaf=5,
                                        min_samples_split=3, n_estimators=80,
                                        n_jobs=-1))])


### Finishing

In [6]:
print(best.score(X_test, y_test))
print(best)

0.5079365079365079
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('forest',
                                        RandomForestClassifier(n_jobs=-1))]),
             n_jobs=-1,
             param_grid={'forest__max_depth': [5, 6, 7, 8, 9],
                         'forest__min_samples_leaf': [2, 3, 4, 5, 6],
                         'forest__min_samples_split': [3, 4, 5, 6, 7],
                         'forest__n_estimators': [20, 50, 80]})


### Testing


In [11]:
#Load Test Data
test_df = pd.read_csv("testing.csv")
test_df = test_df.drop(columns={"id"})
#Create tests and score
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))

preds = best.predict(test_X)

roc_score = roc_auc_score(test_y, preds)
acc_score = accuracy_score(test_y, preds)

print(roc_score)
print(acc_score)
print(name, np.mean([roc_score, acc_score]))


0.6206286525541148
0.6201012658227848
Michael Harris 0.6203649591884498


### What Accuracy Changes Were Used

Please list here what you did to try to increase accuracy and/or limit overfitting:
<ul>
<li> For the model I used a forest instead of a single tree, in combination with a standard scalar.
<li> For that forest used a grid search with a combination of four parameters: max depth, minimum sample split, minimum sample leaf and n estimators.
</ul>