In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
#Locate the folder holding the dataset. The original hard-coded location is kept as the default so the
#existing setup keeps working; set the TEMPUS_DATA_DIR environment variable to run this notebook elsewhere.
#The file is read through an explicit path instead of os.chdir(), so the kernel's working directory is
#not changed as a side effect of loading the data.
data_dir=os.environ.get('TEMPUS_DATA_DIR', 'C:/Users/nts21/Documents/tempus_dataset')
data=pd.read_table(os.path.join(data_dir, 'takehome1.txt'))
data.head()

Unnamed: 0,response,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V16553,V16554,V16555,V16556,V16557,V16558,V16559,V16560,V16561,V16562
0,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Show the dimensions of the loaded table as (number of rows, number of columns).
data.shape

(530, 16563)

In [4]:
#now that we know what kind of values we are working with, let's try to predict the responses column.
#first, let's get rid of any columns that are all ones or zeros, as they are not helpful.
#The check is done on the whole frame at once and compares the values themselves, so it does not depend
#on a hard-coded row count and cannot be fooled by a column whose values merely sum to 0 or to the row count.
#'response' is the prediction target, so it is excluded from the candidate features up front.
feature_frame=data.drop('response', axis=1)
is_constant=(feature_frame==0).all(axis=0) | (feature_frame==1).all(axis=0)
#Keep the surviving column names in their original order.
col_to_use=[name for name, constant in is_constant.items() if not constant]

In [5]:
#OK, now we're ready to start trying to predict!
#We'll start with simple logistic regression
#NOTE: the model is scored on the same rows it was fitted on, so these numbers show how well it can fit
#the data, not how well it generalises.
print('Predicting on training data with a logistic regression model:')
model=LogisticRegression()
model.fit(data[col_to_use], data['response'])
lr_prediction=model.predict(data[col_to_use])
#With labels=[0, 1] the flattened confusion matrix is (true neg, false pos, false neg, true pos).
#This assumes 'response' is coded 0/1, as the original difference/sum arithmetic also did.
_, false_pos, false_neg, true_pos=confusion_matrix(data['response'], lr_prediction, labels=[0, 1]).ravel()
#f1_score is 2*precision*recall/(precision+recall); it returns 0 (with a warning) rather than NaN when
#there are no predicted positives.
f_score=f1_score(data['response'], lr_prediction)
print('total f-score: %.2f' % (f_score))
print('total false positives: %.2f' % (false_pos))
print('total false negatives: %.2f' % (false_neg))
print('total true positives: %.2f' % (true_pos))

Predicting on training data with a logistic regression model:
total f-score: 1.00
total false positives: 0.00
total false negatives: 0.00
total true positives: 123.00


In [6]:
#A perfect fit to the training set. Let's see how it holds up if we break up the test and training sets. We'll cross
#validate the data with five separate groups.
print('Predicting on 5x cross validated data with a logistic regression model:')
for i in range(5):
    #Fold i is every 5th row starting at row i; the other four strided slices form the training set.
    val=data[i::5]
    train=pd.concat([data[k::5] for k in range(5) if k!=i])
    model=LogisticRegression()
    model.fit(train[col_to_use], train['response'])
    lr_prediction=model.predict(val[col_to_use])
    #With labels=[0, 1] the flattened confusion matrix is (true neg, false pos, false neg, true pos).
    _, false_pos, false_neg, true_pos=confusion_matrix(val['response'], lr_prediction, labels=[0, 1]).ravel()
    #f1_score returns 0 (with a warning) instead of NaN for a fold with no predicted positives.
    f_score=f1_score(val['response'], lr_prediction)
    print('total f-score: %.2f' % (f_score))
    print('total false positives: %.2f' % (false_pos))
    print('total false negatives: %.2f' % (false_neg))
    print('total true positives: %.2f' % (true_pos))

Predicting on 5x cross validated data with a logistic regression model:
total f-score: 0.62
total false positives: 2.00
total false negatives: 13.00
total true positives: 12.00
total f-score: 0.67
total false positives: 1.00
total false negatives: 12.00
total true positives: 13.00
total f-score: 0.62
total false positives: 4.00
total false negatives: 12.00
total true positives: 13.00
total f-score: 0.63
total false positives: 0.00
total false negatives: 13.00
total true positives: 11.00
total f-score: 0.60
total false positives: 4.00
total false negatives: 12.00
total true positives: 12.00


In [7]:
#The values for Logistic Regression look reasonable. Let's try Gradient Boosting now.
#NOTE: as with the logistic regression above, this scores the model on the rows it was fitted on.
print('Predicting on training data with a gradient boosting model:')
#random_state is fixed so that re-running the notebook builds the same ensemble and prints the same numbers.
model=GradientBoostingClassifier(random_state=0)
model.fit(data[col_to_use], data['response'])
gb_prediction=model.predict(data[col_to_use])
#With labels=[0, 1] the flattened confusion matrix is (true neg, false pos, false neg, true pos).
_, false_pos, false_neg, true_pos=confusion_matrix(data['response'], gb_prediction, labels=[0, 1]).ravel()
#f1_score returns 0 (with a warning) instead of NaN when there are no predicted positives.
f_score=f1_score(data['response'], gb_prediction)
print('total f-score: %.2f' % (f_score))
print('total false positives: %.2f' % (false_pos))
print('total false negatives: %.2f' % (false_neg))
print('total true positives: %.2f' % (true_pos))

Predicting on training data with a gradient boosting model:
total f-score: 0.96
total false positives: 0.00
total false negatives: 9.00
total true positives: 114.00


In [8]:
#Pretty close, but not quite perfect on the training set. Let's try cross-validating it as well.
print('Predicting on 5x cross validated data with a gradient boosting model:')
for i in range(5):
    #Fold i is every 5th row starting at row i; the other four strided slices form the training set.
    val=data[i::5]
    train=pd.concat([data[k::5] for k in range(5) if k!=i])
    #random_state is fixed so every run of this cell reports the same per-fold scores.
    model=GradientBoostingClassifier(random_state=0)
    model.fit(train[col_to_use], train['response'])
    gb_prediction=model.predict(val[col_to_use])
    #With labels=[0, 1] the flattened confusion matrix is (true neg, false pos, false neg, true pos).
    _, false_pos, false_neg, true_pos=confusion_matrix(val['response'], gb_prediction, labels=[0, 1]).ravel()
    #f1_score returns 0 (with a warning) instead of NaN for a fold with no predicted positives.
    f_score=f1_score(val['response'], gb_prediction)
    print('total f-score: %.2f' % (f_score))
    print('total false positives: %.2f' % (false_pos))
    print('total false negatives: %.2f' % (false_neg))
    print('total true positives: %.2f' % (true_pos))

Predicting on 5x cross validated data with a gradient boosting model:
total f-score: 0.68
total false positives: 2.00
total false negatives: 11.00
total true positives: 14.00
total f-score: 0.70
total false positives: 1.00
total false negatives: 11.00
total true positives: 14.00
total f-score: 0.73
total false positives: 1.00
total false negatives: 10.00
total true positives: 15.00
total f-score: 0.68
total false positives: 1.00
total false negatives: 11.00
total true positives: 13.00
total f-score: 0.62
total false positives: 5.00
total false negatives: 11.00
total true positives: 13.00


In [9]:
#The gradient boosting did a pretty good job. Now let's try with a random forest. Let's see how many trees
#are necessary to fit the training set.
#The value being varied is n_estimators, i.e. the number of trees in the forest; the printed text calls
#these "nodes", and that wording is kept so the output matches earlier runs.
print('Predicting on training data with a random forest model:')
for j in [10, 25, 50, 75, 100, 200]:
    #random_state is fixed so each forest size gives the same result on every run.
    model=RandomForestClassifier(n_estimators=j, n_jobs=-1, random_state=0)
    model.fit(data[col_to_use], data['response'])
    rf_prediction=model.predict(data[col_to_use])
    #With labels=[0, 1] the flattened confusion matrix is (true neg, false pos, false neg, true pos).
    _, false_pos, false_neg, true_pos=confusion_matrix(data['response'], rf_prediction, labels=[0, 1]).ravel()
    #f1_score returns 0 (with a warning) instead of NaN when there are no predicted positives.
    f_score=f1_score(data['response'], rf_prediction)
    print('total f-score with %s nodes: %.2f' % (j, f_score))
    print('total false positives with %s nodes: %.2f' % (j, false_pos))
    print('total false negatives with %s nodes: %.2f' % (j, false_neg))
    print('total true positives with %s nodes: %.2f' % (j, true_pos))

Predicting on training data with a random forest model:
total f-score with 10 nodes: 0.96
total false positives with 10 nodes: 1.00
total false negatives with 10 nodes: 9.00
total true positives with 10 nodes: 114.00
total f-score with 25 nodes: 0.98
total false positives with 25 nodes: 0.00
total false negatives with 25 nodes: 4.00
total true positives with 25 nodes: 119.00
total f-score with 50 nodes: 1.00
total false positives with 50 nodes: 0.00
total false negatives with 50 nodes: 0.00
total true positives with 50 nodes: 123.00
total f-score with 75 nodes: 1.00
total false positives with 75 nodes: 0.00
total false negatives with 75 nodes: 0.00
total true positives with 75 nodes: 123.00
total f-score with 100 nodes: 1.00
total false positives with 100 nodes: 0.00
total false negatives with 100 nodes: 0.00
total true positives with 100 nodes: 123.00
total f-score with 200 nodes: 1.00
total false positives with 200 nodes: 0.00
total false negatives with 200 nodes: 0.00
total true pos

In [10]:
#About 50 trees are enough to fit the training data perfectly. Let's compare performance on the cross-validated test data now.
#The value being varied is n_estimators (the number of trees); the printed text calls these "nodes".
print('Predicting on 5x cross validated data with a random forest model:')
for j in [10, 25, 50, 75, 100, 200]:
    f_score_vals=[]
    for i in range(5):
        #Fold i is every 5th row starting at row i; the other four strided slices form the training set.
        val=data[i::5]
        train=pd.concat([data[k::5] for k in range(5) if k!=i])
        #random_state is fixed so the averaged score for each forest size is reproducible.
        model=RandomForestClassifier(n_estimators=j, n_jobs=-1, random_state=0)
        model.fit(train[col_to_use], train['response'])
        rf_prediction=model.predict(val[col_to_use])
        #f1_score returns 0 (with a warning) for a fold with no predicted positives, so a single such fold
        #no longer turns the whole average into NaN.
        f_score_vals.append(f1_score(val['response'], rf_prediction))
    print('with %s nodes, total f-score of %.2f' % (j, np.mean(f_score_vals)))

Predicting on 5x cross validated data with a random forest model:
with 10 nodes, total f-score of 0.47
with 25 nodes, total f-score of 0.57
with 50 nodes, total f-score of 0.61
with 75 nodes, total f-score of 0.61
with 100 nodes, total f-score of 0.61
with 200 nodes, total f-score of 0.62


In [None]:
#Our best results were probably with the gradient boosting model, though all three models performed comparably.
#Some future directions include optimizing the classification cutoff in the forest and logistic regression models; this is a
#limitation of sklearn's predict(), which applies a fixed cutoff, so a tuned threshold would have to be applied to the
#predict_proba() output instead. Also, modeling should be attempted with a neural net predictor in keras, though my guess
#is that performance would be comparable.