# Simple Model: Version 1
Creates a model with variables we have so far

In [1]:
#Import packages
import pandas as pd
import numpy as np
import os, sys

#Import other files
raw_path = os.path.join('..', 'data', 'raw')
proc_path = os.path.join('..', 'data', 'processed')
sys.path.append(os.path.join('..', 'src'))
from model.Reg_Model import *
from model.Scoring import *
%load_ext autoreload
%autoreload 2




In [2]:
#Read in data
ss_v1 = pd.read_csv(os.path.join(proc_path, 'scoring_set_v1.csv'))

# Reads in data
seeds = pd.read_csv(os.path.join(raw_path, 'TourneySeeds.csv'))
slots = pd.read_csv(os.path.join(raw_path, 'TourneySlots.csv'))
results = pd.read_csv(os.path.join(raw_path, 'TourneyCompactResults.csv'))

slots = slots[slots['Season']>2003]
features = pd.read_csv(os.path.join(proc_path, 'team_features.csv'))

In [3]:
#Preps file for model
x = ss_v1.drop(['Outcome', 'Team_A', 'Team_B'], 1)
y = ss_v1['Outcome']

In [4]:
#Creates model
game_model = Reg_Model()
game_model.set_training(x,y)
game_model.calc_model()
print game_model.get_model_type()
print 'MSE', game_model.get_mse()

#Scores in simulated tournament
scorer = Scorer(features)
scorer.set_variables(slots, seeds, results, game_model)
print 'Avg Pts', scorer.score_model()

ridge
MSE 125.439718043
Avg Pts 77.2307692308


In [5]:
#Creates model with scaling
from sklearn.preprocessing import RobustScaler

game_model_s = Reg_Model()
game_model_s.set_training(x,y)
game_model_s.set_pipeline([('scaler', RobustScaler())], None)
game_model_s.calc_model()
print game_model_s.get_model_type()
print game_model_s.get_mse()


#Scores in simulated tournament
scorer_s = Scorer(features)
scorer_s.set_variables(slots, seeds, results, game_model_s)

print 'Avg Pts', scorer_s.score_model()

ridge
125.003619922
Avg Pts 77.2307692308


In [6]:
#Creates model with feature selection
from sklearn.feature_selection import SelectKBest

game_model_f = Reg_Model()
game_model_f.set_training(x,y)
steps = [('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_f.set_pipeline(steps, params)
game_model_f.calc_model()
print game_model_f.get_model_type()
print game_model_f.get_mse()


#Scores in simulated tournament
scorer_f = Scorer(features)
scorer_f.set_variables(slots, seeds, results, game_model_f)
print 'Avg Pts', scorer_f.score_model()

ridge
125.785204267
Avg Pts 75.1538461538


In [7]:
#Creates model with scaler and feature selection
from sklearn.feature_selection import SelectKBest

game_model_fs = Reg_Model()
game_model_fs.set_training(x,y)
steps = [('scaler', RobustScaler()), ('feature_selection', SelectKBest())]
params = dict(feature_selection__k=[3,5,10])
game_model_fs.set_pipeline(steps, params)
game_model_fs.calc_model()
print game_model_fs.get_model_type()
print game_model_fs.get_mse()


#Scores in simulated tournament
scorer_fs = Scorer(features)
scorer_fs.set_variables(slots, seeds, results, game_model_fs)
print 'Avg Pts', scorer_fs.score_model()

ridge
125.611593672
Avg Pts 73.9230769231


In [8]:
#Pickles best model, which was ridge with featre selection
import pickle
fn = os.path.join(proc_path, 'Models', 'model_v1.p')
pickle.dump(game_model_f, open(fn, 'wb'))