# Simple Model: Version 4
Uses logistic regression to test out these models along with v2 variables

In [1]:
#Import packages
import pandas as pd
import numpy as np
import os, sys

#Import other files
# Data directories, expressed relative to the notebook's working directory
raw_path = os.path.join('..', 'data', 'raw')
proc_path = os.path.join('..', 'data', 'processed')
# Put ../src on the import path so the project-local `model` package can be imported
sys.path.append(os.path.join('..', 'src'))
# NOTE(review): star imports hide where names come from; Log_Model and Scorer used in the
# cells below presumably come from these two modules -- confirm and import them explicitly
from model.Log_Model import *
from model.Scoring import *
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest

%load_ext autoreload
%autoreload 2




In [2]:
#Scoring sets: one CSV per feature-set version, read from the processed-data folder
ss_v1 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v1.csv'))
ss_v2 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v2.csv'))
ss_v3 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v3.csv'))

#Team features come from the processed folder and are used unfiltered
features = pd.read_csv(os.path.join(proc_path, 'team_features_v2.csv'))

#Raw tournament tables, each restricted to rows whose Season is later than 2003
seeds = pd.read_csv(os.path.join(raw_path, 'TourneySeeds.csv'))
seeds = seeds.loc[seeds['Season'] > 2003]

slots = pd.read_csv(os.path.join(raw_path, 'TourneySlots.csv'))
slots = slots.loc[slots['Season'] > 2003]

results = pd.read_csv(os.path.join(raw_path, 'TourneyCompactResults.csv'))
results = results.loc[results['Season'] > 2003]

In [3]:
#Preps file for model
# Feature matrix: ss_v1 without the Outcome, Team_A and Team_B columns.
# axis=1 is passed by keyword; the positional form is deprecated and rejected by pandas >= 2.0.
x = ss_v1.drop(['Outcome', 'Team_A', 'Team_B'], axis=1)
# Binary target: 1 when Outcome is positive, otherwise 0
y = [1 if a > 0 else 0 for a in ss_v1['Outcome']]

In [4]:
#Baseline model: Log_Model on x, y with no extra pipeline steps
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model = Log_Model()
game_model.set_training(x, y)
game_model.calc_model()
print(game_model.get_model_type())
print(game_model.get_acc())

#Simulated-tournament score for the baseline model
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
print(' '.join(('Avg Pts', str(scorer.score_model()))))

#Scaled variant: a RobustScaler step, second set_pipeline argument left as None
game_model_s = Log_Model()
game_model_s.set_training(x, y)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
print(game_model_s.get_model_type())
print(game_model_s.get_acc())

#Simulated-tournament score for the scaled model
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
print(' '.join(('Avg Pts', str(scorer_s.score_model()))))

#Feature-selection variant: a SelectKBest step with candidate k values 3, 5 and 10
# (presumably searched over inside Log_Model -- confirm against set_pipeline)
game_model_f = Log_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
params = {'feature_selection__k': [3, 5, 10]}
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
print(game_model_f.get_model_type())
print(game_model_f.get_acc())

#Simulated-tournament score for the feature-selection model
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
print(' '.join(('Avg Pts', str(scorer_f.score_model()))))

lr
0.7
Avg Pts 81.8461538462
lr
0.688235294118
Avg Pts 81.8461538462
lr
0.708235294118
Avg Pts 75.4615384615


In [4]:
#Preps file for model
# Feature matrix: ss_v2 without the Team_A, Team_B and Outcome columns.
# axis=1 is passed by keyword; the positional form is deprecated and rejected by pandas >= 2.0.
x = ss_v2.drop(['Team_A', 'Team_B', 'Outcome'], axis=1)

#drops variables that were used in cluster
x = x.drop(['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B'], axis=1)
# Binary target: 1 when Outcome is positive, otherwise 0
y = [1 if a > 0 else 0 for a in ss_v2['Outcome']]

In [5]:
#Baseline model: Log_Model on x, y with no extra pipeline steps
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model = Log_Model()
game_model.set_training(x, y)
game_model.calc_model()
print(game_model.get_model_type())
print(game_model.get_acc())

#Simulated-tournament score for the baseline model
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
print(' '.join(('Avg Pts', str(scorer.score_model()))))

lr
0.702352941176
Avg Pts 82.4615384615


In [6]:
#Scaled variant: a RobustScaler step, second set_pipeline argument left as None
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model_s = Log_Model()
game_model_s.set_training(x, y)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
print(game_model_s.get_model_type())
print(game_model_s.get_acc())

#Simulated-tournament score for the scaled model
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
print(' '.join(('Avg Pts', str(scorer_s.score_model()))))

lr
0.701176470588
Avg Pts 82.5384615385


In [7]:
#Feature-selection variant: a SelectKBest step with candidate k values 3, 5 and 10
# (presumably searched over inside Log_Model -- confirm against set_pipeline)
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model_f = Log_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
params = {'feature_selection__k': [3, 5, 10]}
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
print(game_model_f.get_model_type())
print(game_model_f.get_acc())

#Simulated-tournament score for the feature-selection model
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
print(' '.join(('Avg Pts', str(scorer_f.score_model()))))

lr
0.712941176471
Avg Pts 72.4615384615


In [8]:
#Combined variant: RobustScaler first, then SelectKBest with candidate k values 3, 5 and 10
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model_fs = Log_Model()
game_model_fs.set_training(x, y)
steps = [
    ('scaler', RobustScaler()),
    ('feature_selection', SelectKBest()),
]
params = {'feature_selection__k': [3, 5, 10]}
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
print(game_model_fs.get_model_type())
print(game_model_fs.get_acc())


#Simulated-tournament score for the combined model
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
print(' '.join(('Avg Pts', str(scorer_fs.score_model()))))

lr
0.703529411765
Avg Pts 72.7692307692


In [13]:
#Makes interactions
# Every (clstr_i_A, clstr_j_B) column-name pair for i, j in 0..2 -- nine pairs in total.
# The list is handed to Scorer.score_model() in the modelling cell.
interactions = [
    ('clstr_' + str(idx1) + '_A', 'clstr_' + str(idx2) + '_B')
    for idx1 in range(3)
    for idx2 in range(3)
]

#Preps file for model
# axis=1 is passed by keyword; the positional form is deprecated and rejected by pandas >= 2.0.
x = ss_v3.drop(['Team_A', 'Team_B', 'Outcome'], axis=1)

#drops variables that were used in cluster
x = x.drop(['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B'], axis=1)
# This variant also removes the six clstr_* columns from the training features
x = x.drop(['clstr_0_A', 'clstr_0_B', 'clstr_1_A', 'clstr_1_B', 'clstr_2_A', 'clstr_2_B'], axis=1)
# Binary target: 1 when Outcome is positive, otherwise 0
y = [1 if a > 0 else 0 for a in ss_v3['Outcome']]

In [14]:
#Baseline model: Log_Model on x, y with no extra pipeline steps
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model = Log_Model()
game_model.set_training(x, y)
game_model.calc_model()
print(game_model.get_model_type())
print(game_model.get_acc())

#Simulated-tournament score for the baseline model; the interaction pairs go to score_model
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
print(' '.join(('Avg Pts', str(scorer.score_model(interactions)))))

#Scaled variant: a RobustScaler step, second set_pipeline argument left as None
game_model_s = Log_Model()
game_model_s.set_training(x, y)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
print(game_model_s.get_model_type())
print(game_model_s.get_acc())

#Simulated-tournament score for the scaled model
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
print(' '.join(('Avg Pts', str(scorer_s.score_model(interactions)))))

#Feature-selection variant: a SelectKBest step with candidate k values 3, 5 and 10
game_model_f = Log_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
params = {'feature_selection__k': [3, 5, 10]}
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
print(game_model_f.get_model_type())
print(game_model_f.get_acc())

#Simulated-tournament score for the feature-selection model
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
print(' '.join(('Avg Pts', str(scorer_f.score_model(interactions)))))

lr
0.702352941176
Avg Pts 81.2307692308
lr
0.709411764706
Avg Pts 85.0769230769
lr
0.697647058824
Avg Pts 79.7692307692


In [15]:
#Pickles game_model_s, the Log_Model configured with the RobustScaler step
# NOTE(review): an earlier comment described the saved model as "ridge with feature
# selection", which does not match how game_model_s is built in this notebook --
# confirm which model is meant to be saved.
import pickle
fn = os.path.join(proc_path, 'Models', 'model_v4.p')
# The context manager closes the file handle even if pickling raises;
# a bare open(...) passed straight to pickle.dump is never closed explicitly.
with open(fn, 'wb') as model_file:
    pickle.dump(game_model_s, model_file)

In [11]:
#Makes interactions
# Every (clstr_i_A, clstr_j_B) column-name pair for i, j in 0..2 -- nine pairs in total.
# The list is handed to Scorer.score_model() in the modelling cell.
interactions = [
    ('clstr_' + str(idx1) + '_A', 'clstr_' + str(idx2) + '_B')
    for idx1 in range(3)
    for idx2 in range(3)
]

#Preps file for model
# axis=1 is passed by keyword; the positional form is deprecated and rejected by pandas >= 2.0.
x = ss_v3.drop(['Team_A', 'Team_B', 'Outcome'], axis=1)

#drops variables that were used in cluster
x = x.drop(['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B'], axis=1)
# In this variant the clstr_0/1/2 _A and _B columns are deliberately kept in the features
# Binary target: 1 when Outcome is positive, otherwise 0
y = [1 if a > 0 else 0 for a in ss_v3['Outcome']]

In [12]:
#Baseline model: Log_Model on x, y with no extra pipeline steps
# (single-argument print(...) prints the same text under Python 2 and also parses under Python 3)
game_model = Log_Model()
game_model.set_training(x, y)
game_model.calc_model()
print(game_model.get_model_type())
print(game_model.get_acc())

#Simulated-tournament score for the baseline model; the interaction pairs go to score_model
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
print(' '.join(('Avg Pts', str(scorer.score_model(interactions)))))

#Scaled variant: a RobustScaler step, second set_pipeline argument left as None
game_model_s = Log_Model()
game_model_s.set_training(x, y)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
print(game_model_s.get_model_type())
print(game_model_s.get_acc())

#Simulated-tournament score for the scaled model
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
print(' '.join(('Avg Pts', str(scorer_s.score_model(interactions)))))

#Feature-selection variant: a SelectKBest step with candidate k values 3, 5 and 10
game_model_f = Log_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
params = {'feature_selection__k': [3, 5, 10]}
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
print(game_model_f.get_model_type())
print(game_model_f.get_acc())

#Simulated-tournament score for the feature-selection model
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
print(' '.join(('Avg Pts', str(scorer_f.score_model(interactions)))))

lr
0.702352941176
Avg Pts 84.8461538462
lr
0.705882352941
Avg Pts 84.8461538462
svc_other
0.701176470588
Avg Pts 80.6153846154


In [None]:
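#So in conclusion, it looks like the highest accuracy is .708, which belongs to the V1 variables with everything
#The highest points is 86.2, which is V2 variables with nothing
#NOTE(review): these figures do not match the outputs saved in this notebook, where the highest accuracy is
#.713 (ss_v2 variables with feature selection) and the highest Avg Pts is 85.1 (ss_v3 variables without the
#cluster columns, with scaling) -- re-run the notebook and update this conclusion.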