In [1]:
import pandas as pd
import numpy as np
import utils
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

### Load Data and Scaler

In [2]:
from sklearn.externals import joblib
scale = joblib.load("models/scaler.sav")

In [3]:
train = pd.read_csv("data/final_data/clean_train.csv")
X,y = utils.setup_data(train,scale,'Team2_win')
train.head()

Unnamed: 0,Team1_first_downs,Team1_passing_first_downs,Team1_rushing_first_downs,Team1_rushing_yds,Team1_rushing_attempts,Team1_passing_attempts,Team1_passing_completions,Team1_passing_interceptions,Team1_avg_per_pass,Team1_pass_yds,...,Team2_third_down_suc,Team2_fourth_down_attempts,Team2_fourth_down_suc,Team1_third_down_conv_pct,Team2_third_down_conv_pct,Team1_fourth_down_conv_pct,Team2_fourth_down_conv_pct,Team1_turnover_margin,Team2_turnover_margin,Team2_win
0,16,12,2,192,49,12,6,2,3.5,42,...,11,2,2,0.285714,0.578947,0.0,1.0,2,-2,1
1,23,15,6,304,63,10,7,0,13.7,137,...,4,4,1,0.461538,0.266667,0.0,0.25,0,0,0
2,35,18,14,277,38,49,27,1,8.1,395,...,5,1,0,0.5,0.277778,1.0,0.0,1,-1,0
3,19,13,6,326,49,13,6,0,6.5,85,...,7,3,3,0.416667,0.583333,0.666667,1.0,-1,1,1
4,15,6,6,67,31,35,19,1,3.8,132,...,8,2,1,0.133333,0.5,0.5,0.5,0,0,1


In [4]:
test = pd.read_csv("data/final_data/clean_test.csv")
X_test,y_test = utils.setup_data(test,scale,target_col='Team2_win')
test.head()

Unnamed: 0,Team1_first_downs,Team1_passing_first_downs,Team1_rushing_first_downs,Team1_rushing_yds,Team1_rushing_attempts,Team1_passing_attempts,Team1_passing_completions,Team1_passing_interceptions,Team1_avg_per_pass,Team1_pass_yds,...,Team2_third_down_suc,Team2_fourth_down_attempts,Team2_fourth_down_suc,Team1_third_down_conv_pct,Team2_third_down_conv_pct,Team1_fourth_down_conv_pct,Team2_fourth_down_conv_pct,Team1_turnover_margin,Team2_turnover_margin,Team2_win
0,24,11,9,197,51,24,12,1,7.0,167,...,2,2,0,0.214286,0.166667,0.2,0.0,1,-1,0
1,17,6,10,109,25,47,16,1,5.8,270,...,5,1,1,0.294118,0.294118,0.333333,1.0,2,-2,1
2,20,15,4,365,58,17,10,0,3.5,60,...,2,1,0,0.266667,0.181818,0.5,0.0,-2,2,0
3,0,0,0,164,50,23,12,0,7.0,160,...,0,0,0,0.0,0.0,0.0,0.0,-1,1,1
4,25,15,10,265,47,29,20,0,11.1,323,...,6,4,3,0.571429,0.352941,1.0,0.75,-1,1,0


In [5]:
log=LogisticRegression()
params = {
    'penalty' : ['l1','l2'],
    'tol' : np.arange(0.0, 0.1, 0.0001),
    'C' : np.arange(0.0, 0.5, 0.001),
}

In [6]:
random_serach = RandomizedSearchCV(log,params,cv=10,random_state=10,scoring='accuracy',refit = True)

In [7]:
start = time()
random_serach.fit(X,y)
print("RandomizedSearchCV took %.2f seconds." % ((time() - start)))

RandomizedSearchCV took 0.89 seconds.


In [8]:
utils.report(random_serach.cv_results_,n_top=10)

Model with rank: 1
Mean validation score: 0.938 (std: 0.015)
Parameters: {'tol': 0.07050000000000001, 'penalty': 'l2', 'C': 0.463}

Model with rank: 2
Mean validation score: 0.936 (std: 0.016)
Parameters: {'tol': 0.066, 'penalty': 'l2', 'C': 0.266}

Model with rank: 3
Mean validation score: 0.936 (std: 0.014)
Parameters: {'tol': 0.0167, 'penalty': 'l2', 'C': 0.44}

Model with rank: 3
Mean validation score: 0.936 (std: 0.016)
Parameters: {'tol': 0.08410000000000001, 'penalty': 'l2', 'C': 0.308}

Model with rank: 5
Mean validation score: 0.933 (std: 0.015)
Parameters: {'tol': 0.0712, 'penalty': 'l2', 'C': 0.221}

Model with rank: 6
Mean validation score: 0.932 (std: 0.015)
Parameters: {'tol': 0.035300000000000005, 'penalty': 'l2', 'C': 0.17200000000000001}

Model with rank: 6
Mean validation score: 0.932 (std: 0.017)
Parameters: {'tol': 0.059500000000000004, 'penalty': 'l2', 'C': 0.052000000000000005}

Model with rank: 8
Mean validation score: 0.932 (std: 0.016)
Parameters: {'tol': 0.074

In [9]:
log_final=LogisticRegression(**random_serach.best_params_)
log_final.fit(X,y)

LogisticRegression(C=0.463, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear',
          tol=0.07050000000000001, verbose=0, warm_start=False)

In [10]:
y_pred = log_final.predict(X_test)
y_pred_prob = log_final.predict_proba(X_test)[:, 1]

In [11]:
print("Logistic Acc Score: {0:.3%}".format(accuracy_score(y_test,y_pred)))

Logistic Acc Score: 93.503%


In [12]:
pickle.dump(log_final,open("models/Log_model.sav",'wb'))