# Simple Model: Version 2
Creates a model with some combined features

In [1]:
#Import packages
import pandas as pd
import numpy as np
import os, sys

# Data locations, resolved relative to the notebook's working directory.
data_root = os.path.join('..', 'data')
raw_path = os.path.join(data_root, 'raw')
proc_path = os.path.join(data_root, 'processed')

# Put the project's `src` folder on the import path so the `model` package
# can be imported below.
src_path = os.path.join('..', 'src')
sys.path.append(src_path)

# NOTE(review): the wildcard imports hide where names come from. Later cells
# use `Reg_Model` and `Scorer`, presumably one from each module -- confirm and
# consider importing them explicitly.
from model.Reg_Model import *
from model.Scoring import *

In [2]:
# Load the processed scoring set, the raw tournament tables and the team features.
# Only rows whose 'Season' is greater than SEASON_CUTOFF are kept, both for the
# scoring set and for the tournament slots. The threshold lives in one constant
# so the two filters cannot drift apart.
SEASON_CUTOFF = 2003

# Scoring set: filter by season, then sort the columns by label.
ss_v2 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v2.csv'))
ss_v2 = ss_v2[ss_v2['Season'] > SEASON_CUTOFF].sort_index(axis=1)

# Raw tournament tables and processed per-team features.
seeds = pd.read_csv(os.path.join(raw_path, 'TourneySeeds.csv'))
slots = pd.read_csv(os.path.join(raw_path, 'TourneySlots.csv'))
results = pd.read_csv(os.path.join(raw_path, 'TourneyCompactResults.csv'))
features = pd.read_csv(os.path.join(proc_path, 'team_features_v2.csv'))

# Slots are restricted to the same seasons as the scoring set.
slots = slots[slots['Season'] > SEASON_CUTOFF]

# Starting model building process

In [3]:
# Build the predictor frame `x` and the target `y` for the game model.
# The season, team-id, score and outcome columns are removed from the scoring
# set; every remaining column is passed on as a predictor.
# `axis=1` is given by keyword: newer pandas releases no longer accept the
# axis as a second positional argument to drop().
non_feature_cols = ['Season', 'Wscore', 'Lscore', 'Wteam', 'Lteam', 'Outcome']
x = ss_v2.drop(non_feature_cols, axis=1)

# Optional experiment (disabled): also drop the variables that were used in the cluster.
# x = x.drop(['total_poss_W', 'total_poss_L', 'oeff_W', 'oeff_L', 'deff_W', 'deff_L'], axis=1)

# Target is the 'Outcome' column of the scoring set.
y = ss_v2['Outcome']

In [4]:
# Baseline model: train on x/y with no extra pipeline steps.
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Score the model in the simulated tournament.
scorer = Scorer(slots, seeds, results, game_model, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer.score_model(),))

ridge
128.132715587
Avg Pts 85.6923076923


In [5]:
# Model with a scaling step: a RobustScaler is added as the only pipeline step.
from sklearn.preprocessing import RobustScaler

game_model_s = Reg_Model()
game_model_s.set_training(x, y)
# Second argument is None here; the feature-selection cells pass a dict of
# pipeline parameters in that position.
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model_s.get_model_type())
print(game_model_s.get_mse())

# Score the model in the simulated tournament.
scorer_s = Scorer(slots, seeds, results, game_model_s, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer_s.score_model(),))

ridge
129.085412434
Avg Pts 85.6923076923


In [6]:
# Model with feature selection: a SelectKBest step whose `k` is offered the
# candidate values 3, 5 and 10.
from sklearn.feature_selection import SelectKBest

game_model_f = Reg_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
# '<step name>__<parameter>' addresses the `k` parameter of the step above.
params = dict(feature_selection__k=[3, 5, 10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Score the model in the simulated tournament.
scorer_f = Scorer(slots, seeds, results, game_model_f, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer_f.score_model(),))

ridge
125.769128885
Avg Pts 76.4615384615


In [7]:
# Model with both steps: RobustScaler first, then SelectKBest with candidate
# values 3, 5 and 10 for `k`.
# Relies on RobustScaler and SelectKBest having been imported by earlier cells.
game_model_fs = Reg_Model()
game_model_fs.set_training(x, y)
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
# '<step name>__<parameter>' addresses the `k` parameter of the selection step.
params = dict(feature_selection__k=[3, 5, 10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model_fs.get_model_type())
print(game_model_fs.get_mse())

# Score the model in the simulated tournament.
scorer_fs = Scorer(slots, seeds, results, game_model_fs, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer_fs.score_model(),))

ridge
126.502826381
Avg Pts 79.4615384615


In [8]:
# Pickles the model chosen as best: ridge with feature selection (game_model_f).
# In the recorded run it has the lowest MSE of the four variants, although its
# simulated-tournament 'Avg Pts' is lower than the baseline's -- "best" here
# means best by MSE.
import pickle
fn = os.path.join(proc_path, 'Models', 'model_v2.p')
# The with-block closes the file even if dump() raises; the previous bare
# open() call left the handle to be closed by garbage collection.
with open(fn, 'wb') as model_file:
    pickle.dump(game_model_f, model_file)

In [4]:
# Model with interaction effects between cluster columns.
# NOTE: this cell rebinds `game_model` and `scorer`, replacing the objects
# created by the baseline cell.
game_model = Reg_Model()

# Pair every 'clstr_<w>_W' column name with every 'clstr_<l>_L' column name
# for w, l in 0..3 (16 pairs, ordered by w and then by l).
interactions = [('clstr_%d_W' % w, 'clstr_%d_L' % l)
                for w in range(4)
                for l in range(4)]

game_model.set_training(x, y, interactions)
game_model.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Score the model in the simulated tournament.
scorer = Scorer(slots, seeds, results, game_model, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer.score_model(),))

ridge
128.806129274
Avg Pts 83.9230769231


In [5]:
# Interaction model with a scaling step (RobustScaler as the only pipeline step).
from sklearn.preprocessing import RobustScaler

game_model_s = Reg_Model()
game_model_s.set_training(x, y, interactions)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model_s.get_model_type())
print(game_model_s.get_mse())

# Score the model in the simulated tournament.
# FIXME: in the recorded run score_model() raises
#   KeyError: "['clstr_3_Wclstr_0_L' ...] not in index"
# i.e. the combined interaction column names are missing from the index being
# looked up during scoring. It is not known why the interaction model without
# a pipeline scores without this error.
# TODO: the interaction effect should probably be moved into pre-processing.
scorer_s = Scorer(slots, seeds, results, game_model_s, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer_s.score_model(),))

gbm
131.431253384
Avg Pts

KeyError: "['clstr_3_Wclstr_0_L' 'clstr_0_Wclstr_2_L' 'clstr_3_Wclstr_2_L'\n 'clstr_0_Wclstr_3_L' 'clstr_1_Wclstr_1_L' 'clstr_2_Wclstr_3_L'\n 'clstr_1_Wclstr_2_L' 'clstr_1_Wclstr_0_L' 'clstr_2_Wclstr_0_L'\n 'clstr_0_Wclstr_0_L' 'clstr_2_Wclstr_1_L' 'clstr_0_Wclstr_1_L'\n 'clstr_3_Wclstr_3_L' 'clstr_1_Wclstr_3_L' 'clstr_3_Wclstr_1_L'\n 'clstr_2_Wclstr_2_L'] not in index"

In [12]:
# Interaction model with feature selection: a SelectKBest step whose `k` is
# offered the candidate values 3, 5 and 10.
from sklearn.feature_selection import SelectKBest

game_model_f = Reg_Model()
game_model_f.set_training(x, y, interactions)
steps = [('feature_selection', SelectKBest())]
# '<step name>__<parameter>' addresses the `k` parameter of the step above.
params = dict(feature_selection__k=[3, 5, 10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()

# print(...) with a single argument gives the same text on Python 2 as the old
# print statement and, unlike it, also parses on Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Score the model in the simulated tournament.
# FIXME: in the recorded run score_model() raises
#   KeyError: "['clstr_3_Wclstr_0_L' ...] not in index"
# i.e. the combined interaction column names are missing from the index being
# looked up during scoring.
# NOTE: this cell rebinds `game_model_f` and `scorer_f` from the earlier
# feature-selection cell.
scorer_f = Scorer(slots, seeds, results, game_model_f, features)
# The 1-tuple keeps %-formatting correct whatever type score_model() returns.
print('Avg Pts %s' % (scorer_f.score_model(),))

 ridge
125.941795921
Avg Pts

KeyError: "['clstr_3_Wclstr_0_L' 'clstr_0_Wclstr_2_L' 'clstr_3_Wclstr_2_L'\n 'clstr_0_Wclstr_3_L' 'clstr_1_Wclstr_1_L' 'clstr_2_Wclstr_3_L'\n 'clstr_1_Wclstr_2_L' 'clstr_1_Wclstr_0_L' 'clstr_2_Wclstr_0_L'\n 'clstr_0_Wclstr_0_L' 'clstr_2_Wclstr_1_L' 'clstr_0_Wclstr_1_L'\n 'clstr_3_Wclstr_3_L' 'clstr_1_Wclstr_3_L' 'clstr_3_Wclstr_1_L'\n 'clstr_2_Wclstr_2_L'] not in index"