In [1]:
import pandas as pd
import csv
import math
import numpy as np
from sklearn import cross_validation as CV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.grid_search import GridSearchCV
import random
import pickle

In [3]:
def get_data(path='data/elo_ratings.csv', label_col='W/L'):
    """Load the training table and split it into features and labels.

    Args:
        path: CSV location, or any file-like object ``pd.read_csv``
            accepts. Defaults to the file this notebook has always read,
            so a bare ``get_data()`` call behaves as before.
        label_col: Name of the column that holds the outcome label.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays. ``X`` contains every column
        of the CSV except ``label_col``; ``y`` contains ``label_col``.

    Raises:
        KeyError: If ``label_col`` is not a column of the CSV.
    """
    table = pd.read_csv(path)
    # pop() removes the label column and hands it back in one step, so the
    # label can never be left behind in the feature matrix.
    labels = table.pop(label_col)
    return table.values, labels.values

In [4]:
def get_elo(season, team):
    """Return a team's Elo rating for a season, seeding it when absent.

    Lookup order:
      1. ``team_elos[season][team]`` if it is already recorded.
      2. The value recorded for the same team under ``season - 1``.
      3. ``base_elo``.

    A value found in step 2 or 3 is written back to
    ``team_elos[season][team]`` so the next lookup succeeds at step 1.

    Args:
        season: Key into the module-level ``team_elos`` mapping.
        team: Key into that season's mapping.

    Returns:
        The rating stored (or just seeded) for ``team`` in ``season``.
    """
    # setdefault creates the season's table when it does not exist yet.
    # Without it, a brand-new season made the fallback assignments raise
    # KeyError from inside the handlers, which nothing caught.
    season_elos = team_elos.setdefault(season, {})
    try:
        return season_elos[team]
    except LookupError:
        pass

    try:
        # Get the previous season's ending value.
        rating = team_elos[season - 1][team]
    except (LookupError, TypeError):
        # Get the starter elo. TypeError covers a season key that does not
        # support ``- 1``; the original bare except also fell through here.
        rating = base_elo

    season_elos[team] = rating
    return rating

In [5]:
def get_stat(season, team, field):
    """Return the mean of the values recorded for one team statistic.

    Args:
        season: Key into the module-level ``team_stats`` mapping.
        team: Key into that season's mapping.
        field: Name of the statistic to average.

    Returns:
        The arithmetic mean of ``team_stats[season][team][field]`` as a
        float, or ``0`` when the entry is missing, empty, or cannot be
        summed.
    """
    try:
        samples = team_stats[season][team][field]
        return float(sum(samples)) / float(len(samples))
    # Only "no usable data" maps to 0. A bare except also hid
    # KeyboardInterrupt and a NameError from running this before
    # team_stats is loaded.
    except (LookupError, TypeError, ZeroDivisionError):
        return 0

In [6]:
def get_elo_diff(team_1, team_2, season):
    """Return team_1's Elo rating minus team_2's for the given season.

    Both ratings are fetched through ``get_elo`` (team_1 first), so the
    result is negative when team_2 holds the higher rating.
    """
    ratings = [get_elo(season, side) for side in (team_1, team_2)]
    return ratings[0] - ratings[1]

In [7]:
def predict_winner(team_1, team_2, model, season, stat_fields):
    """Ask ``model`` for class probabilities for a single matchup.

    The feature row is team_1's Elo followed by its averaged stats (one
    per entry of ``stat_fields``, in order), then the same for team_2.

    Returns:
        Whatever ``model.predict_proba`` returns for that one-row input.
    """
    def team_vector(team):
        # Elo first, then each requested stat in the order given.
        vector = [get_elo(season, team)]
        vector.extend(get_stat(season, team, stat) for stat in stat_fields)
        return vector

    row = team_vector(team_1) + team_vector(team_2)
    return model.predict_proba([row])

In [8]:
def build_team_dict(path='../../../data/Teams.csv'):
    """Build a lookup from team id to team name.

    Args:
        path: CSV location, or any file-like object ``pd.read_csv``
            accepts. Must provide ``Team_Id`` and ``Team_Name`` columns.
            Defaults to the file this notebook has always read.

    Returns:
        A dict mapping each ``Team_Id`` value to its ``Team_Name``. If an
        id appears more than once, the last row wins.

    Raises:
        KeyError: If either expected column is missing.
    """
    team_ids = pd.read_csv(path)
    # Pair the two columns directly rather than materialising a Series
    # per row with iterrows().
    return dict(zip(team_ids['Team_Id'], team_ids['Team_Name']))

In [9]:
def get_modelsGS(X, y):
    """Tune both classifiers with a grid search and return the winners.

    The gradient-boosting grid is searched first, then the
    logistic-regression grid; each search is built with ``cv=10`` and
    ``n_jobs=-1`` and fitted on ``(X, y)``.

    Returns:
        ``(gbc, lr)``: the ``best_estimator_`` of each search, in that
        order.
    """
    logistic = LR()
    boosted = GBC()

    boosted_grid = {
        'n_estimators': [10, 16, 32, 50, 100, 500, 1000],
        'learning_rate': [0.01, 0.8, 1]
    }
    logistic_grid = {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 8)
    }

    winners = []
    for estimator, grid in ((boosted, boosted_grid),
                            (logistic, logistic_grid)):
        search = GridSearchCV(estimator, grid, n_jobs=-1, cv=10)
        search.fit(X, y)
        winners.append(search.best_estimator_)

    return winners[0], winners[1]

In [10]:
def get_models(X, y):
    """Report cross-validated accuracy for both models, then fit them.

    For the logistic regression and then the gradient-boosting
    classifier, prints a banner followed by the mean of
    ``CV.cross_val_score`` (``cv=10``, ``scoring='accuracy'``,
    ``n_jobs=-1``). Both default-configured estimators are then fitted
    on all of ``(X, y)``.

    Returns:
        ``(gbc, lr)``: the fitted estimators, gradient boosting first.
    """
    lr = LR()
    gbc = GBC()

    # Check accuracy.
    banners = (
        ("Doing cross-validation on LG model.", lr),
        ("Doing cross-validation on GBC model.", gbc),
    )
    for banner, estimator in banners:
        print(banner)
        fold_scores = CV.cross_val_score(
            estimator, X, y, cv=10, scoring='accuracy', n_jobs=-1)
        print(fold_scores.mean())

    lr.fit(X, y)
    gbc.fit(X, y)

    return gbc, lr

In [14]:
# Rating get_elo assigns to a team that has no rating recorded for the
# requested season or the one before it.
base_elo = 1600

# Load the precomputed lookup tables. The files are opened in `with`
# blocks so the handles are closed as soon as loading finishes.
# NOTE(review): pickle.load can execute code embedded in the file; only
# load pickles produced by this project's own pipeline.
with open("data/team_elos.p", "rb") as elo_file:
    team_elos = pickle.load(elo_file)
with open("data/team_stats.p", "rb") as stats_file:
    team_stats = pickle.load(stats_file)

prediction_year = 2016
seeds = pd.read_csv('../../../data/TourneySeeds.csv')
# Per-team statistics that get_stat averages; predict_winner appends one
# feature per entry, in this order, after each team's Elo.
stat_fields = ['score', 'fga', 'fgp', 'fga3', '3pp', 'ftp', 'or', 'dr', 'ast', 'to', 'stl', 'blk', 'pf']

# Model name -> list of [matchup label, probability] rows.
submissions = {
    'LR': [],
    'GBC': []
}

In [11]:
# Get data
# get_data() reads the training CSV and returns (features, labels).
# get_models() prints a cross-validated accuracy for each classifier,
# fits both on the full data, and returns them in (GBC, LR) order.
X, y = get_data()
gbc, lr = get_models(X, y)

Doing cross-validation on LG model.
0.727632936003
Doing cross-validation on GBC model.
0.725753013018


In [15]:
# Now predict tournament matchups:
prediction_year = 2016

# Start from empty lists so that re-running this cell replaces the
# predictions instead of appending a second copy of every matchup.
submissions = {
    'LR': [],
    'GBC': []
}

# Teams seeded in the prediction season, ascending, so the lower-valued
# team is always the first one named in a matchup label.
tourney_teams = sorted(
    seeds.loc[seeds['Season'] == prediction_year, 'Team'].tolist())

# Build our prediction of every matchup.
for position, team_1 in enumerate(tourney_teams):
    # Only teams later in the sorted list can satisfy team_1 < team_2.
    for team_2 in tourney_teams[position + 1:]:
        if not team_1 < team_2:
            # Same team listed twice in the seeds; not a matchup.
            continue
        predictionLR = predict_winner(team_1, team_2, lr, prediction_year, stat_fields)
        predictionGBC = predict_winner(team_1, team_2, gbc, prediction_year, stat_fields)
        label = str(prediction_year) + '_' + str(team_1) + '_' + \
            str(team_2)
        # NOTE(review): [0][0] is the probability of the model's first
        # class (classes_[0]) for the single input row. Confirm that this
        # class means "team_1 wins" in the training labels.
        submissions['LR'].append([label, predictionLR[0][0]])
        submissions['GBC'].append([label, predictionGBC[0][0]])

In [None]:
# Write the results.
# One CSV per model: a header row, then that model's [id, pred] rows.
for out_path, model_name in (
        ('../results/submissionLR.csv', 'LR'),
        ('../results/submissionGBC.csv', 'GBC')):
    with open(out_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'pred'])
        writer.writerows(submissions[model_name])

In [None]:
# Now so that we can use this to fill out a bracket, create a readable
# version.
print("Formatting/writing readable results.")
# The id -> name table is the same for every model, so read it once
# instead of re-parsing Teams.csv on each pass through the loop.
team_map = build_team_dict()
for model in submissions.keys():
    readable = []
    # A version that's easy to look up.
    # NOTE(review): this list is filled in but never written to disk in
    # this cell; confirm whether it is used elsewhere.
    less_readable = []
    for pred in submissions[model]:
        # Labels look like "<year>_<team_1>_<team_2>".
        parts = pred[0].split('_')
        first = int(parts[1])
        second = int(parts[2])
        less_readable.append(
            [team_map[first], team_map[second], pred[1]])
        # Order them properly: name the favoured team first and report the
        # probability from its point of view.
        if pred[1] > 0.5:
            winning = first
            losing = second
            proba = pred[1]
        else:
            winning = second
            losing = first
            proba = 1 - pred[1]
        readable.append(
            [
                '%s beats %s: %f' %
                (team_map[winning], team_map[losing], proba)
            ]
        )
    with open('../results/'+model+'readable-predictions.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(readable)