# Simple Model: Version 3
Creates a model with some combined features

In [1]:
#Import packages
import pandas as pd
import numpy as np
import os, sys

# Import other files
# Both data folders live under ../data relative to this notebook
data_dir = os.path.join('..', 'data')
raw_path = os.path.join(data_dir, 'raw')
proc_path = os.path.join(data_dir, 'processed')
# Puts ../src on the module search path so the project's `model` package can be imported
sys.path.append(os.path.join('..', 'src'))
from model.Reg_Model import *
from model.Scoring import *

%load_ext autoreload
%autoreload 2




In [2]:
# Processed scoring set that the models below are trained on
ss_v3 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v3.csv'))

# Processed per-team features handed to the Scorer
features = pd.read_csv(os.path.join(proc_path, 'team_features_v2.csv'))

# Raw tournament tables; only rows with Season strictly greater than the
# cutoff are kept
season_cutoff = 2003

seeds = pd.read_csv(os.path.join(raw_path, 'TourneySeeds.csv'))
seeds = seeds.loc[seeds['Season'] > season_cutoff]

slots = pd.read_csv(os.path.join(raw_path, 'TourneySlots.csv'))
slots = slots.loc[slots['Season'] > season_cutoff]

results = pd.read_csv(os.path.join(raw_path, 'TourneyCompactResults.csv'))
results = results.loc[results['Season'] > season_cutoff]

# Starting model building process

In [3]:
# Builds every ordered pairing of an '_A' cluster column name with a '_B'
# cluster column name (3 x 3 = 9 pairs, '_A' index varying slowest)
cluster_ids = range(3)
interactions = [
    ('clstr_{}_A'.format(a_id), 'clstr_{}_B'.format(b_id))
    for a_id in cluster_ids
    for b_id in cluster_ids
]
interactions

[('clstr_0_A', 'clstr_0_B'),
 ('clstr_0_A', 'clstr_1_B'),
 ('clstr_0_A', 'clstr_2_B'),
 ('clstr_1_A', 'clstr_0_B'),
 ('clstr_1_A', 'clstr_1_B'),
 ('clstr_1_A', 'clstr_2_B'),
 ('clstr_2_A', 'clstr_0_B'),
 ('clstr_2_A', 'clstr_1_B'),
 ('clstr_2_A', 'clstr_2_B')]

In [4]:
# Preps file for model
# Columns removed before training: the two team columns and the target
non_feature_cols = ['Team_A', 'Team_B', 'Outcome']
# Variables that were used to build the clusters
cluster_input_cols = ['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B']
# The cluster columns themselves (this first set of runs leaves them out)
cluster_cols = ['clstr_0_A', 'clstr_0_B', 'clstr_1_A', 'clstr_1_B', 'clstr_2_A', 'clstr_2_B']

# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated in newer pandas releases and rejected by pandas 2.x, while
# axis=1 behaves the same on every version.
x = ss_v3.drop(non_feature_cols, axis=1)
x = x.drop(cluster_input_cols, axis=1)
x = x.drop(cluster_cols, axis=1)

# Target passed to Reg_Model.set_training alongside x
y = ss_v3['Outcome']

In [5]:
# Creates model: Reg_Model trained on x/y with no extra pipeline steps set
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Scores in simulated tournament
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer.score_model(interactions)))

ridge
128.685979663
Avg Pts 79.6153846154


In [6]:
# Creates model with scaling
from sklearn.preprocessing import RobustScaler

game_model_s = Reg_Model()
game_model_s.set_training(x, y)
# One pipeline step (RobustScaler); None is passed in place of a parameter dict
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_s.get_model_type())
print(game_model_s.get_mse())

# Scores in simulated tournament
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_s.score_model(interactions)))

ridge
128.80605095
Avg Pts 79.6153846154


In [7]:
# Creates model with feature selection
from sklearn.feature_selection import SelectKBest

game_model_f = Reg_Model()
game_model_f.set_training(x, y)
# One pipeline step (SelectKBest) plus candidate k values for that step;
# how set_pipeline uses the candidates is defined in Reg_Model
# (presumably a parameter search - TODO confirm).
steps = [('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Scores in simulated tournament
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_f.score_model(interactions)))

ridge
126.969225453
Avg Pts 78.1538461538


In [8]:
# Creates model with scaler and feature selection
# (RobustScaler and SelectKBest are imported by the two preceding cells)

game_model_fs = Reg_Model()
game_model_fs.set_training(x, y)
# Two pipeline steps in order: scaling first, then SelectKBest with
# candidate k values (presumably searched over by Reg_Model - TODO confirm).
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_fs.get_model_type())
print(game_model_fs.get_mse())

# Scores in simulated tournament
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_fs.score_model(interactions)))

ridge
127.097937199
Avg Pts 78.4615384615


In [9]:
# Starts process again, but keeps original clusters
# Preps file for model
# NOTE: this rebinds x and y, so every model fit after this cell uses the
# feature set that still contains the clstr_* columns.
non_feature_cols = ['Team_A', 'Team_B', 'Outcome']
# Variables that were used to build the clusters
cluster_input_cols = ['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B']

# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated in newer pandas releases and rejected by pandas 2.x, while
# axis=1 behaves the same on every version.
x = ss_v3.drop(non_feature_cols, axis=1)
x = x.drop(cluster_input_cols, axis=1)
# The clstr_0..2 _A/_B columns are intentionally NOT dropped in this pass.

# Target passed to Reg_Model.set_training alongside x
y = ss_v3['Outcome']

In [10]:
# Creates model: Reg_Model trained on the current x/y (cluster columns kept)
# with no extra pipeline steps set
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Scores in simulated tournament
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer.score_model(interactions)))

ridge
130.616130703
Avg Pts 79.6153846154


In [11]:
# Creates model with scaling (current x/y, cluster columns kept)
from sklearn.preprocessing import RobustScaler

game_model_s = Reg_Model()
game_model_s.set_training(x, y)
# One pipeline step (RobustScaler); None is passed in place of a parameter dict
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_s.get_model_type())
print(game_model_s.get_mse())

# Scores in simulated tournament
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_s.score_model(interactions)))

ridge
127.951624197
Avg Pts 79.6153846154


In [12]:
# Creates model with feature selection (current x/y, cluster columns kept)
from sklearn.feature_selection import SelectKBest

game_model_f = Reg_Model()
game_model_f.set_training(x, y)
# One pipeline step (SelectKBest) plus candidate k values for that step;
# how set_pipeline uses the candidates is defined in Reg_Model
# (presumably a parameter search - TODO confirm).
steps = [('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Scores in simulated tournament
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_f.score_model(interactions)))

ridge
126.909423185
Avg Pts 77.0769230769


In [13]:
# Creates model with scaler and feature selection (current x/y, cluster
# columns kept). This rebinds game_model_fs, which the final cell pickles.

game_model_fs = Reg_Model()
game_model_fs.set_training(x, y)
# Two pipeline steps in order: scaling first, then SelectKBest with
# candidate k values (presumably searched over by Reg_Model - TODO confirm).
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
# Single-argument print(...) emits the same text as the Python 2 print
# statement and is also valid syntax on Python 3.
print(game_model_fs.get_model_type())
print(game_model_fs.get_mse())

# Scores in simulated tournament
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
# str.format with an empty field renders the value via str(), matching what
# `print 'Avg Pts', value` displayed.
print('Avg Pts {}'.format(scorer_fs.score_model(interactions)))

ridge
126.358305107
Avg Pts 78.4615384615


In [14]:
# Pickles the model chosen as best. The object saved is game_model_fs, i.e.
# the RobustScaler + SelectKBest pipeline as most recently fit (on the
# feature set that keeps the cluster columns) - not the feature-selection-only
# model game_model_f. Its printed MSE is the lowest of the runs above.
import pickle
fn = os.path.join(proc_path, 'Models', 'model_v3.p')
# The with-block closes the file handle even if pickling raises, instead of
# leaving an unclosed handle from a bare open(...) call.
with open(fn, 'wb') as model_file:
    pickle.dump(game_model_fs, model_file)