# Simple Model: Version 2
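Creates a game model from the version 2 scoring set, which adds some combined features, and compares variants with scaling and feature selection before pickling the best one.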

In [16]:
# Standard-library and third-party packages
import pandas as pd
import numpy as np
import os, sys

# Data locations (relative to this notebook) and project-local modules
raw_path = os.path.join('..', 'data', 'raw')
proc_path = os.path.join('..', 'data', 'processed')
# Put ../src on the import path; this has to run before the `model` imports below.
sys.path.append(os.path.join('..', 'src'))
# NOTE(review): the star imports hide where names come from. The cells below use
# Reg_Model and Scorer, presumably defined in these two modules -- confirm and
# consider importing them explicitly.
from model.Reg_Model import *
from model.Scoring import *
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest

# Reload imported modules automatically before each cell executes, so edits to
# the files under ../src are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Processed scoring set (v2) that the models below are trained on
ss_v2 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v2.csv'))

# Raw tournament tables, read in the order seeds -> slots -> results
seeds, slots, results = (
    pd.read_csv(os.path.join(raw_path, csv_name))
    for csv_name in ('TourneySeeds.csv', 'TourneySlots.csv', 'TourneyCompactResults.csv')
)
# Processed per-team features handed to the Scorer
features = pd.read_csv(os.path.join(proc_path, 'team_features_v2.csv'))

# Keep only the rows whose Season is later than 2003 in each tournament table
seeds, slots, results = (
    table[table['Season'] > 2003]
    for table in (seeds, slots, results)
)

# Starting model building process

In [18]:
# Build the model inputs: remove the two team identifier columns and the target.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally (drop(labels, 1) raises a TypeError there).
x = ss_v2.drop(['Team_A', 'Team_B', 'Outcome'], axis=1)

# Also drop the possession/efficiency columns that were used in the cluster
# (presumably they already feed a cluster-derived feature -- confirm).
x = x.drop(['total_poss_A', 'total_poss_B', 'oeff_A', 'oeff_B', 'deff_A', 'deff_B'], axis=1)

# Target column
y = ss_v2['Outcome']

In [4]:
# Baseline: fit a Reg_Model on x/y with no extra pipeline steps.
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()
# A parenthesised single-argument print behaves the same on Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Score the fitted model in the simulated tournament.
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
# Build one string so the line reads "Avg Pts <score>" on both Python versions
# (print('Avg Pts', score) would print a tuple on Python 2).
print('Avg Pts %s' % (scorer.score_model(),))

ridge
127.528385217
Avg Pts 76.6923076923


In [5]:
# Same model as the baseline, with a RobustScaler step placed in the pipeline.
game_model_s = Reg_Model()
game_model_s.set_training(x, y)
# Second argument is None here; the feature-selection cells pass a parameter
# dict in that position (presumably a search grid -- confirm in Reg_Model).
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model_s.get_model_type())
print(game_model_s.get_mse())

# Score the scaled model in the simulated tournament.
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer_s.score_model(),))

ridge
126.384523372
Avg Pts 76.6923076923


In [6]:
# Model with univariate feature selection; k is varied over 3, 5 and 10.
# NOTE(review): SelectKBest() defaults to score_func=f_classif, a
# classification test. The model reports an MSE, so f_regression may be the
# better fit -- confirm what Outcome holds before changing it.
game_model_f = Reg_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
# Key follows the "<step name>__<parameter>" convention used by sklearn pipelines.
params = dict(feature_selection__k=[3, 5, 10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Score the feature-selected model in the simulated tournament.
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer_f.score_model(),))

ridge
126.860846505
Avg Pts 81.2307692308


In [19]:
# Model with both a RobustScaler step and univariate feature selection
# (scaling runs first, then SelectKBest with k varied over 3, 5 and 10).
# NOTE(review): SelectKBest() defaults to score_func=f_classif, a
# classification test. The model reports an MSE, so f_regression may be the
# better fit -- confirm what Outcome holds before changing it.
game_model_fs = Reg_Model()
game_model_fs.set_training(x, y)
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3, 5, 10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model_fs.get_model_type())
print(game_model_fs.get_mse())

# Score the scaled + feature-selected model in the simulated tournament.
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer_fs.score_model(),))

ridge
125.544911594
Avg Pts 82.4615384615


In [8]:
# Redo the comparison, this time keeping all variables.
# Only the two team identifier columns and the target are removed. `axis` is
# passed by keyword because pandas 2.0 no longer accepts it positionally.
x = ss_v2.drop(['Team_A', 'Team_B', 'Outcome'], axis=1)

# Unlike the first pass, the columns that were used in the cluster
# (total_poss_*, oeff_*, deff_*) are deliberately left in x.
y = ss_v2['Outcome']

In [9]:
# Baseline on whatever x currently holds (the all-variables frame when the
# notebook is run top to bottom): Reg_Model with no extra pipeline steps.
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3; the bare print
# statement is a SyntaxError on Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Score the fitted model in the simulated tournament.
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
# Single formatted string keeps the output "Avg Pts <score>" on both versions
# (print('Avg Pts', score) would print a tuple on Python 2).
print('Avg Pts %s' % (scorer.score_model(),))

ridge
126.549957546
Avg Pts 75.0769230769


In [10]:
# Repeat fit of the plain Reg_Model on the current x/y.
# NOTE(review): this cell repeats the preceding model cell statement for
# statement, yet the recorded MSE differs between the two runs. That suggests
# calc_model() is not deterministic (e.g. unseeded cross-validation splits) --
# confirm, and consider fixing a seed or removing the duplicate cell.
game_model = Reg_Model()
game_model.set_training(x, y)
game_model.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model.get_model_type())
print(game_model.get_mse())

# Score the refitted model in the simulated tournament.
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer.score_model(),))

ridge
125.924489696
Avg Pts 75.0769230769


In [11]:
# Model with univariate feature selection on the current x/y; k is varied over
# 3, 5 and 10. SelectKBest comes from the import cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): SelectKBest() defaults to score_func=f_classif, a
# classification test. The model reports an MSE, so f_regression may be the
# better fit -- confirm what Outcome holds before changing it.
game_model_f = Reg_Model()
game_model_f.set_training(x, y)
steps = [('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3, 5, 10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model_f.get_model_type())
print(game_model_f.get_mse())

# Score the feature-selected model in the simulated tournament.
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer_f.score_model(),))

ridge
127.004062045
Avg Pts 81.2307692308


In [12]:
# Model with both a RobustScaler step and univariate feature selection on the
# current x/y (scaling first, then SelectKBest with k varied over 3, 5 and 10).
# NOTE(review): SelectKBest() defaults to score_func=f_classif, a
# classification test. The model reports an MSE, so f_regression may be the
# better fit -- confirm what Outcome holds before changing it.
game_model_fs = Reg_Model()
game_model_fs.set_training(x, y)
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3, 5, 10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
# Parenthesised print runs unchanged on Python 2 and Python 3.
print(game_model_fs.get_model_type())
print(game_model_fs.get_mse())

# Score the scaled + feature-selected model in the simulated tournament.
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
# Single formatted string keeps the output "Avg Pts <score>" on both versions.
print('Avg Pts %s' % (scorer_fs.score_model(),))

ridge
126.173636032
Avg Pts 82.4615384615


In [20]:
# Pickles the best model: game_model_fs, the ridge model with scaling and
# feature selection.
# NOTE(review): game_model_fs is assigned in two cells (with and without the
# cluster columns in x). The recorded execution counts (In [19], then In [20]
# here) suggest the reduced-feature model was the one pickled, whereas a
# top-to-bottom run pickles the all-variables one -- confirm which is intended.
import pickle

fn = os.path.join(proc_path, 'Models', 'model_v2.p')
# The context manager closes the file handle even if dump() raises; the bare
# open() call left it to the garbage collector.
with open(fn, 'wb') as model_file:
    pickle.dump(game_model_fs, model_file)