# Submissions

In [1]:
from sklearn import datasets
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import VotingClassifier as VC
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
import matplotlib.pyplot as plt
from __future__ import division
import pandas as pd
import numpy as np
import math
import pickle
%matplotlib inline

# Training data. Later cells read the feature columns listed below plus the
# WinLoss, Wteam, Lteam and Year columns from this frame.
df = pd.read_csv("Matchup_KP.csv")

# Full candidate feature list.
features = ['FieldGoalAvg', 'ThreePointAvg', 'FreeThrowAvg', 'ReboundAvg', 'AssistAvg', 'TurnOverAvg', 
            'StealAvg', 'BlockAvg', 'PersonalFoulAvg', 'Pyth', 'Rank', 'AdjustO', 'AdjustO Rank', 'AdjustD', 
            'AdjustD Rank', 'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank']
# Subset of `features` starting at 'Pyth'; this is the list the modelling cells use.
features2 = ['Pyth', 'Rank', 'AdjustO', 'AdjustO Rank', 'AdjustD', 
            'AdjustD Rank', 'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank']

# Load Elo ratings (dict); get_elo indexes it as team_elos[season][team].
# The file is opened in a `with` block so the handle is closed once loaded.
# NOTE: pickle.load can execute arbitrary code - only load a file you trust.
with open("../elo/data/team_elos.p", "rb") as elo_file:
    team_elos = pickle.load(elo_file)

# Extremely Random Forest features
features3 = ['Pyth', 'AdjustO', 'AdjustD', 'Luck','FieldGoalAvg']
# Random Forest features
features4 = ['Pyth', 'AdjustO', 'AdjustD', 'Luck','BlockAvg']
# Gradient Boosting features
features5 = ['Pyth', 'AdjustO', 'Luck','BlockAvg','AssistAvg']
# AdaBoost features
features6 = ['Pyth','Luck','AdjustO','AdjustD','FieldGoalAvg']

In [2]:
# Rows to score for the submission file, and the features2 view of them.
Submission = pd.read_csv("2016Submission.csv")
submission_test = Submission[features2]

# Training target and design matrix, both taken from the training frame.
y = df['WinLoss']
X = df[features2]

In [3]:
def fit_best_model(model, parameters):
    """Grid-search ``parameters`` for a freshly built ``model``.

    ``model`` is called with no arguments to create the estimator, which is
    wrapped in GridSearchCV and fitted on the notebook-level ``X`` and ``y``.

    Returns:
        A ``(best_estimator_, best_params_)`` tuple taken from the fitted search.
    """
    search = GridSearchCV(model(), parameters)
    search.fit(X, y)
    return search.best_estimator_, search.best_params_

In [4]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print ``cm`` and draw it as a colour-mapped image.

    Both axes get two ticks labelled 'Loss' and 'Win'; the y axis is labelled
    as the true label and the x axis as the predicted label.

    Args:
        cm: matrix to print and display.
        title: figure title.
        cmap: matplotlib colour map passed to ``imshow``.
    """
    class_names = ['Loss', 'Win']
    positions = np.arange(2)

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, class_names, rotation=45)
    plt.yticks(positions, class_names)
    # Call order is kept as in the original: layout is tightened before the
    # axis labels are attached.
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


def accuracies(cm):
    """Print per-class and average accuracy for a 2x2 confusion matrix.

    Row 0 is reported as "Loss" and row 1 as "Win": each accuracy is the
    diagonal entry divided by its row total. The average is the unweighted
    mean of the two. Nothing is returned.
    """
    row_totals = np.sum(cm, axis=1)
    loss_accuracy = cm[0, 0] / row_totals[0]
    win_accuracy = cm[1, 1] / row_totals[1]
    mean_accuracy = (loss_accuracy + win_accuracy) / 2

    print("Loss Acc: {0}".format(loss_accuracy))
    print("Win Acc: {0}".format(win_accuracy))
    print("Avg Acc: {0}".format(mean_accuracy))
    

def run_model(model, train_X=None, train_y=None, test_X=None, test_y=None):
    """Fit ``model`` on a training split and report results on a test split.

    The fitted model's test predictions are turned into a confusion matrix,
    which is plotted with plot_confusion_matrix and summarised with accuracies.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_X, train_y: training features and labels.
        test_X, test_y: held-out features and labels.

    Any data argument left as None falls back to the notebook-level variable
    of the matching name (X_train, y_train, X_test, y_test), which is what the
    function always used before. NOTE(review): none of those four globals is
    assigned in the cells visible here, so either create them first (e.g. with
    train_test_split) or pass the splits explicitly.
    """
    if train_X is None:
        train_X = X_train
    if train_y is None:
        train_y = y_train
    if test_X is None:
        test_X = X_test
    if test_y is None:
        test_y = y_test

    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    cm = confusion_matrix(test_y, predictions)
    plot_confusion_matrix(cm)
    accuracies(cm)

In [5]:
def get_elo(season, team):
    """Return ``team``'s Elo rating for ``season``, filling it in if absent.

    Lookup order:
      1. the rating already stored under ``team_elos[season][team]``;
      2. the rating stored for the same team in ``season - 1``;
      3. the notebook-level ``base_elo`` starter value.

    A rating found through step 2 or 3 is written back into
    ``team_elos[season][team]`` so later lookups hit step 1.

    NOTE(review): ``base_elo`` is not assigned in the cells visible here; it
    must exist before a team with no rating in either season is looked up.
    """
    # setdefault creates the season's dict when the season has no entries yet;
    # previously that case made the fallback assignments raise KeyError.
    season_elos = team_elos.setdefault(season, {})
    try:
        return season_elos[team]
    except KeyError:
        pass

    try:
        # Carry over the previous season's ending value.
        rating = team_elos[season - 1][team]
    except KeyError:
        # Neither season knows the team: use the starter Elo.
        rating = base_elo

    season_elos[team] = rating
    return rating
        
def get_elo_diff(team_1, team_2, season):
    """Return ``team_1``'s Elo minus ``team_2``'s Elo for ``season``.

    Both ratings come from get_elo, with team_1 looked up first.
    """
    return get_elo(season, team_1) - get_elo(season, team_2)

In [6]:
def append_elos():
    """Prepend an Elo-difference column to the training and submission features.

    For every row of the notebook-level ``df`` and ``Submission`` frames the
    value ``get_elo_diff(Wteam, Lteam, Year)`` is computed and stored in a
    column named 'Elo Rank' (the name is historical - it holds the Wteam-minus-
    Lteam Elo difference). That column is placed in front of ``X`` and
    ``submission_test`` respectively.

    Returns:
        ``(X_elos, submission_elos)`` - the two widened feature frames.
    """
    def _with_elo_column(matchups, feature_frame):
        # One Elo difference per matchup row, in row order.
        elo_diffs = [
            get_elo_diff(w_team, l_team, year)
            for w_team, l_team, year in zip(
                matchups['Wteam'], matchups['Lteam'], matchups['Year'])
        ]
        # Reuse the matchup frame's index so concat lines rows up by label;
        # a fresh 0..n-1 index would misalign if the frame's index is not
        # the default one (e.g. after filtering rows).
        elos = pd.DataFrame({'Elo Rank': elo_diffs}, index=matchups.index)
        return pd.concat([elos, feature_frame], axis=1)

    X_elos = _with_elo_column(df, X)
    submission_elos = _with_elo_column(Submission, submission_test)
    return X_elos, submission_elos

In [7]:
class EstimatorSelectionHelper(object):
    """Run one GridSearchCV per named estimator and keep the winners.

    Attributes:
        models: dict mapping a name to an unfitted estimator.
        params: dict mapping the same names to GridSearchCV parameter grids.
        keys: list of the names in ``models``.
        grid_searches: dict mapping each name to the ``best_estimator_`` of
            its fitted search (filled by ``fit``). Other cells use these
            directly for ``predict_proba`` / ``feature_importances_``.
        fitted_searches: dict mapping each name to the fitted GridSearchCV
            object itself (filled by ``fit``); ``score_summary`` reads the
            per-candidate scores from here.
    """

    def __init__(self, models, params):
        """Store the estimators and grids.

        Raises:
            ValueError: if any name in ``models`` has no entry in ``params``.
        """
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = list(models.keys())
        self.grid_searches = {}
        self.fitted_searches = {}

    def fit(self, X, y):
        """Grid-search every estimator on ``(X, y)`` using all cores."""
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], n_jobs=-1)
            gs.fit(X, y)
            # Keep the search object as well as the winner: the per-candidate
            # scores live on the search, not on the best estimator.
            self.fitted_searches[key] = gs
            self.grid_searches[key] = gs.best_estimator_

    def score_summary(self, sort_by='mean_score'):
        """Tabulate the score of every parameter combination that was tried.

        Args:
            sort_by: column to sort by, descending.

        Returns:
            A DataFrame with one row per (estimator, parameter combination):
            the estimator name, min/mean/max/std of its ``cv_validation_scores``
            and one column per parameter.

        Raises:
            ValueError: if ``fit`` has not been run yet.
        """
        if not self.fitted_searches:
            raise ValueError("score_summary() requires fit() to have been run first")

        def row(key, scores, params):
            record = dict(params)
            # Summary fields win over any parameter with the same name, as in
            # the original dict merge.
            record.update({
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            })
            return pd.Series(record)

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
                for k in self.keys
                for gsc in self.fitted_searches[k].grid_scores_]
        summary = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in summary.columns if c not in columns]
        return summary[columns]

In [8]:
# Unfitted estimators to tune. The key doubles as the lookup name in the
# parameter grids and as the prefix of the output file names.
models = {}
models['ExtraTreesClassifier'] = ETC()
models['RandomForestClassifier'] = RFC()
models['AdaBoostClassifier'] = ABC()
models['GradientBoostingClassifier'] = GBC()
models['LogisticRegression'] = LR()

# Hyper-parameter grid for each estimator above (same keys).
# An SVC entry used to sit here but is disabled; SVC is also not imported in
# the import cell shown in this notebook.
params = {}
params['ExtraTreesClassifier'] = {
    'n_estimators': [10, 16, 32, 50, 100],
    'max_features': ['sqrt'],
}
params['RandomForestClassifier'] = {
    'n_estimators': [10, 16, 32, 50, 100, 300, 500, 1000],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}
params['AdaBoostClassifier'] = {
    'n_estimators': [10, 16, 32, 50, 100, 500, 1000],
}
params['GradientBoostingClassifier'] = {
    'n_estimators': [10, 16, 32, 50, 100, 500, 1000],
    'learning_rate': [0.01, 0.8, 1],
}
params['LogisticRegression'] = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 8),
}

In [9]:
# Tune every candidate model on the features2-only design matrix X.
helper = EstimatorSelectionHelper(models=models, params=params)
helper.fit(X=X, y=y)

Running GridSearchCV for LogisticRegression.
Running GridSearchCV for AdaBoostClassifier.
Running GridSearchCV for GradientBoostingClassifier.
Running GridSearchCV for ExtraTreesClassifier.
Running GridSearchCV for RandomForestClassifier.


In [10]:
# Feature frames with the Elo-difference column prepended.
X_elos, submission_elos = append_elos()

# Column order used for both frames: the first seven columns stay where they
# are and the remaining ones are taken in reverse order.
all_columns = list(X_elos.columns)
cols = all_columns[:7] + list(reversed(all_columns[7:]))

# Apply the ordering, then leave 'AdjustT' out of both frames.
X_elos = X_elos[cols].drop('AdjustT', axis=1)
submission_elos = submission_elos[cols].drop('AdjustT', axis=1)

# Tune the same models and grids again, this time on the Elo-augmented features.
helper2 = EstimatorSelectionHelper(models=models, params=params)
helper2.fit(X=X_elos, y=y)

Running GridSearchCV for LogisticRegression.
Running GridSearchCV for AdaBoostClassifier.
Running GridSearchCV for GradientBoostingClassifier.
Running GridSearchCV for ExtraTreesClassifier.
Running GridSearchCV for RandomForestClassifier.


In [11]:
# Write one submission CSV per tuned model trained on the Elo-augmented features.
for model, clf in helper2.grid_searches.items():
    predictions = clf.predict_proba(submission_elos)
    # Column 1 is the probability of the second class in clf.classes_;
    # assumes WinLoss is coded so that this class means "win" - TODO confirm.
    # Bracket assignment always writes a real column; `Submission.Pred = ...`
    # only does so when a 'Pred' column already exists and otherwise just sets
    # a Python attribute on the frame.
    Submission['Pred'] = predictions[:, 1]
    model_DF = Submission[['Id', 'Pred']].sort_values(by='Id')
    model_DF.to_csv("results3/" + model + "submissionELO.csv", index=False)

In [12]:
# Show the feature importances of the tree-based models tuned on the
# Elo-augmented features, separated by blank lines.
importance_models = [
    'RandomForestClassifier',
    'ExtraTreesClassifier',
    'GradientBoostingClassifier',
]
for position, name in enumerate(importance_models):
    if position > 0:
        print("\n")
    print(helper2.grid_searches[name].feature_importances_)

[ 0.24958628  0.14385444  0.12218565  0.0830409   0.07379973  0.06188813
  0.0690053   0.06429626  0.063806    0.06853731]


[ 0.14867886  0.14081998  0.13522297  0.10397777  0.10023914  0.08291633
  0.09042629  0.06762896  0.06441525  0.06567447]


[  9.09965064e-01   4.77051131e-02   0.00000000e+00   1.99648178e-02
   9.18177752e-03   0.00000000e+00   0.00000000e+00   0.00000000e+00
   4.72051876e-04   1.27111753e-02]


In [13]:
# Write one submission CSV per tuned model trained on the features2-only data.
for model, clf in helper.grid_searches.items():
    predictions = clf.predict_proba(submission_test)
    # Column 1 is the probability of the second class in clf.classes_;
    # assumes WinLoss is coded so that this class means "win" - TODO confirm.
    # Bracket assignment always writes a real column; `Submission.Pred = ...`
    # only does so when a 'Pred' column already exists and otherwise just sets
    # a Python attribute on the frame.
    Submission['Pred'] = predictions[:, 1]
    model_DF = Submission[['Id', 'Pred']].sort_values(by='Id')
    model_DF.to_csv("results3/" + model + "submission.csv", index=False)

# Voting Ensemble Classifier

In [16]:
# (short alias, model name) pairs for the members of the voting ensemble; the
# tuned estimators come from the search fitted on the features2-only data.
voting_members = [
    ('lr', 'LogisticRegression'),
    ('rfc', 'RandomForestClassifier'),
    ('etc', 'ExtraTreesClassifier'),
    ('gbc', 'GradientBoostingClassifier'),
]
estims = [(alias, helper.grid_searches[model_name])
          for alias, model_name in voting_members]

In [17]:
# Grid over soft-voting weight vectors. Each vector has one weight per entry
# of `estims`, in that order (lr, rfc, etc, gbc).
# NOTE: this rebinds the notebook-level name `params`, which earlier held the
# per-model grids; re-run the cell that defines those grids before re-running
# either EstimatorSelectionHelper cell.
params = {
    'voting': ['soft'],
    'weights': [
        [1, 10, 10, 0.8],
        [1, 5, 5, 0.8],
        [1, 6, 5, 0.8],
        [1, 5, 6, 0.8],
        [1, 7, 4, 0.8],
        [1, 4, 7, 0.8],
        [1, 8, 3, 0.8],
        [1, 3, 8, 0.8]
    ]
}
vc = VC(estimators=estims)

# Two independent searches over the same ensemble definition: grid1 on the
# features2-only data, grid2 on the Elo-augmented data.
grid1 = GridSearchCV(vc, params, n_jobs=-1)
grid2 = GridSearchCV(vc, params, n_jobs=-1)
grid1.fit(X, y)
print(grid1.best_params_)
grid2.fit(X_elos, y)
print(grid2.best_params_)
vc1 = grid1.best_estimator_
vc2 = grid2.best_estimator_
preds1 = vc1.predict_proba(submission_test)
preds2 = vc2.predict_proba(submission_elos)

# Bracket assignment always writes a real 'Pred' column; attribute assignment
# only does so when that column already exists.
# Column 1 of predict_proba is the second class in classes_; assumed to be
# the "win" class - TODO confirm against the WinLoss coding.
Submission['Pred'] = preds1[:, 1]
EnsembleSubmission1 = Submission[['Id', 'Pred']].sort_values(by='Id')
EnsembleSubmission1.to_csv("results3/votingSubmission.csv", index=False)

Submission['Pred'] = preds2[:, 1]
EnsembleSubmission2 = Submission[['Id', 'Pred']].sort_values(by='Id')
EnsembleSubmission2.to_csv("results3/votingSubmissionELO.csv", index=False)

{'voting': 'soft', 'weights': [1, 4, 7, 0.8]}
{'voting': 'soft', 'weights': [1, 5, 6, 0.8]}
