In [1]:
import pandas as pd
import csv
import math
import numpy as np
from sklearn import cross_validation as CV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.ensemble import RandomForestClassifier as RFC
import random
import pickle

In [2]:
# --- Configuration and shared mutable state --------------------------------

# Rating handed to a team the first time it is seen with no earlier season
# to inherit from (see get_elo).
base_elo = 1600

# season -> {team: current Elo rating}
team_elos = {}
# season -> {team: {stat name: list of recent per-game values}}
team_stats = {}

# Training matrix and labels, filled in by build_season_data.
X = []
y = []

# One result list per model; the keys match the classifier import aliases.
submissions = {
    "LR": [],
    "GBC": [],
    "NB": [],
    "RFC": []
}

# NOTE(review): not referenced in the cells shown here; presumably the season
# whose tournament is predicted further down -- confirm.
prediction_year = 2016

# build_season_data randomly decides which team goes first in each training
# row.  Seed the generator so that Restart & Run All reproduces the same
# feature table (and the same saved CSV) every time.
random_seed = 42
random.seed(random_seed)

In [3]:
def calc_elo(win_team, lose_team, season):
    """Return the post-game Elo ratings as ``(winner, loser)``.

    Both current ratings are looked up with ``get_elo``.  The winner gains
    ``k * (1 - expected_score)`` points, rounded, and the loser gives up
    exactly the same number of points.

    Credits for Python Elo Rating Function:
    http://zurb.com/forrst/posts/An_Elo_Rating_function_in_Python_written_for_foo-hQl
    """
    before_winner = get_elo(season, win_team)
    before_loser = get_elo(season, lose_team)

    # Expected score of the winner given the rating gap.
    exponent = float((before_winner - before_loser) * -1) / float(400)
    expected = float(1) / float(1 + math.pow(10, exponent))

    # The K-factor shrinks as the winner's rating grows.
    if winner_is_below(before_winner, 2100):
        k_factor = 32
    elif winner_is_below(before_winner, 2400):
        k_factor = 24
    else:
        k_factor = 16

    after_winner = round(before_winner + (k_factor * (1 - expected)))
    points_moved = after_winner - before_winner
    return after_winner, before_loser - points_moved


def winner_is_below(rating, threshold):
    """Tell whether ``rating`` is strictly under ``threshold``."""
    return rating < threshold

In [4]:
def initialize_data():
    """Reset the per-season containers for seasons 1985 through 2016.

    Every season gets its own fresh, empty dict in both ``team_elos`` and
    ``team_stats``; anything previously stored for those seasons is dropped.
    """
    seasons = range(1985, 2017)
    team_elos.update((season, {}) for season in seasons)
    team_stats.update((season, {}) for season in seasons)

In [5]:
def get_elo(season, team):
    """Return ``team``'s current Elo rating for ``season``.

    Lookup order:
      1. the rating already stored for this season;
      2. otherwise the rating the team ended the previous season with;
      3. otherwise ``base_elo``.

    In cases 2 and 3 the value is also stored under ``team_elos[season]`` so
    later lookups hit case 1.

    Raises:
        KeyError: if ``season`` itself has no entry in ``team_elos``
            (i.e. it was not set up by ``initialize_data``).
    """
    try:
        return team_elos[season][team]
    except KeyError:
        # Only "not stored yet" is expected here; any other exception is a
        # real error and must not be silently turned into a default rating.
        pass

    try:
        # Get the previous season's ending value.
        rating = team_elos[season - 1][team]
    except KeyError:
        # Get the starter elo.
        rating = base_elo

    team_elos[season][team] = rating
    return rating

In [6]:
def predict_winner(team_1, team_2, model, season, stat_fields):
    """Ask ``model`` for the outcome probabilities of ``team_1`` vs ``team_2``.

    The single feature row is laid out as
    ``[elo_1, stats_1..., elo_2, stats_2...]`` where the stats are the
    ``get_stat`` averages for each name in ``stat_fields``, in that order.

    Returns whatever ``model.predict_proba`` returns for that one row.
    """
    def side_features(team):
        # Elo first, then one averaged value per tracked stat.
        return [get_elo(season, team)] + [
            get_stat(season, team, stat_name) for stat_name in stat_fields
        ]

    matchup_row = side_features(team_1) + side_features(team_2)
    return model.predict_proba([matchup_row])

In [7]:
def update_stats(season, team, fields, window=9):
    """Record one game's stats for ``team`` in a rolling window.

    For every ``name -> value`` pair in ``fields`` the value is appended to
    ``team_stats[season][team][name]``.  When that list already holds
    ``window`` values, the OLDEST one is dropped first, so the list never
    grows beyond ``window`` entries and always reflects the most recent
    games.  ``get_stat`` later averages these lists.

    Args:
        season: key into ``team_stats``; must already exist there.
        team: team identifier; its entry is created on first use.
        fields: dict mapping stat name to this game's value.
        window: maximum number of games kept per stat (default 9).
    """
    team_entry = team_stats[season].setdefault(team, {})

    for key, value in fields.items():
        history = team_entry.setdefault(key, [])

        if len(history) >= window:
            # pop(0) discards the oldest game.  A bare pop() would discard
            # the newest one and leave the window stuck on early games.
            history.pop(0)
        history.append(value)

In [8]:
def get_stat(season, team, field):
    """Return the average of the values stored for ``team``'s ``field``.

    Returns the integer ``0`` when nothing is stored for that
    season/team/field combination (or the stored list is empty).  A real
    average is always a float, which lets ``build_season_data`` tell
    "no data yet" apart from an average that happens to be zero.
    """
    try:
        values = team_stats[season][team][field]
        return float(sum(values)) / float(len(values))
    except (KeyError, ZeroDivisionError):
        # Missing entry or empty history: signal "no data".  Other
        # exceptions indicate corrupt data and are allowed to propagate.
        return 0

In [9]:
# Box-score columns (without the W/L side prefix) used as denominators when
# computing shooting percentages.
_ATTEMPT_COLUMNS = ('fga', 'fga3', 'fta')


def _box_score_stats(row, side):
    """Build the stat dict for one side ('W' or 'L') of a game row."""
    def col(name):
        return row[side + name]

    return {
        'score': col('score'),
        'fgp': float(col('fgm')) / float(col('fga')) * 100,
        'fga': col('fga'),
        'fga3': col('fga3'),
        '3pp': float(col('fgm3')) / float(col('fga3')) * 100,
        'ftp': float(col('ftm')) / float(col('fta')) * 100,
        'or': col('or'),
        'dr': col('dr'),
        'ast': col('ast'),
        'to': col('to'),
        'stl': col('stl'),
        'blk': col('blk'),
        'pf': col('pf')
    }


def _rolling_features(season, team, elo):
    """Return ``[elo, avg stat, ...]`` or None if any stat has no history."""
    history = team_stats.get(season, {}).get(team, {})
    features = [elo]
    for field in stat_fields:
        # Check for stored games explicitly instead of comparing the average
        # against 0, so a genuine 0.0 average is not mistaken for "no data".
        if not history.get(field):
            return None
        features.append(get_stat(season, team, field))
    return features


def build_season_data(all_data):
    """Walk every game in order, emitting training rows and updating state.

    For each row of ``all_data`` (one game, winner columns prefixed ``W``,
    loser columns prefixed ``L``):

      1. Look up both teams' Elo and add 100 to whichever side played at
         home (``Wloc`` of 'H' means the winner, 'A' means the loser).
      2. If both teams already have history for every name in
         ``stat_fields``, append one feature row to the global ``X``.  The
         winner is randomly placed first (label 0) or second (label 1) so
         both classes occur.
      3. Only afterwards fold this game's box score into ``team_stats`` and
         its result into ``team_elos``, so a game never contributes to its
         own features.

    Games where either side has zero field-goal, three-point or free-throw
    attempts are left out of the rolling stats because their percentages are
    undefined; their Elo update still happens.

    Returns:
        The global ``(X, y)`` lists, which this function appends to.
    """
    print("Building season data.")
    for _, row in all_data.iterrows():
        season = row['Season']
        winner, loser = row['Wteam'], row['Lteam']

        # Get starter or previous elos.
        winner_elo = get_elo(season, winner)
        loser_elo = get_elo(season, loser)

        # Add 100 to the home team (# taken from Nate Silver analysis.)
        if row['Wloc'] == 'H':
            winner_elo += 100
        elif row['Wloc'] == 'A':
            loser_elo += 100

        winner_features = _rolling_features(season, winner, winner_elo)
        loser_features = _rolling_features(season, loser, loser_elo)

        if winner_features is not None and loser_features is not None:
            # Randomly select left and right and 0 or 1 so we can train
            # for multiple classes.
            if random.random() > 0.5:
                X.append(winner_features + loser_features)
                y.append(0)
            else:
                X.append(loser_features + winner_features)
                y.append(1)

        # AFTER we add the current stuff to the prediction, update for
        # next time. Order here is key so we don't fit on data from the
        # same game we're trying to predict.
        has_all_attempts = all(
            row[side + column] != 0
            for side in ('W', 'L')
            for column in _ATTEMPT_COLUMNS
        )
        if has_all_attempts:
            update_stats(season, winner, _box_score_stats(row, 'W'))
            update_stats(season, loser, _box_score_stats(row, 'L'))

        # Now that we've added them, calc the new elo.
        new_winner_rank, new_loser_rank = calc_elo(winner, loser, season)
        team_elos[season][winner] = new_winner_rank
        team_elos[season][loser] = new_loser_rank
    return X, y

In [10]:
# Names of the rolling per-game statistics tracked for every team.
# build_season_data emits one averaged feature per name, in this order.
stat_fields = ['score', 'fga', 'fgp', 'fga3', '3pp', 'ftp', 'or', 'dr', 'ast', 'to', 'stl', 'blk', 'pf']
# Create the empty per-season rating/stat containers before any game is read.
initialize_data()

# Detailed box-score results, one row per game.
tourney_data = pd.read_csv('../../../data/TourneyDetailedResults.csv')
season_data = pd.read_csv('../../../data/RegularSeasonDetailedResults.csv')

# All regular-season rows first, followed by all tournament rows.
# NOTE(review): build_season_data walks the rows in exactly this order and
# get_elo seeds a new season from the previous season's stored rating.  If
# the files cover several seasons, a season's tournament games are processed
# after later regular seasons have already inherited ratings -- confirm
# whether the frame should be sorted chronologically first.
frames = [season_data, tourney_data]
all_data = pd.concat(frames)

In [11]:
# Build the working data.
X, y = build_season_data(all_data)
X = np.array(X)

# Reformat ndarray to DataFrame.
# Each training row is [elo, <stat_fields...>] for the first team followed by
# the same layout for the second team, so the column names are derived from
# stat_fields instead of being typed out by hand.  The hand-written list had
# 'Wpf' where 'Apf' belongs; that column is now named 'Apf'.
feature_names = ['elo'] + stat_fields
headers = (
    ['A' + feature_name for feature_name in feature_names] +
    ['B' + feature_name for feature_name in feature_names]
)
df = pd.DataFrame(X, columns=headers)
# Label column: 0 when the winner is team A, 1 when the winner is team B.
target = pd.DataFrame(y, columns=['W/L'])
data = pd.concat([df, target], axis=1)

Building season data.


In [13]:
# Save the training table, then the final team_elos and team_stats.
data.to_csv("data/elo_ratings.csv", index=False)
# Use context managers so each pickle file is flushed and closed right away
# instead of relying on garbage collection of an anonymous file object.
with open("data/team_elos.p", "wb") as elo_file:
    pickle.dump(team_elos, elo_file)
with open("data/team_stats.p", "wb") as stats_file:
    pickle.dump(team_stats, stats_file)