In [129]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss

pd.set_option('display.max_columns', None)

np.random.seed(2131)

import warnings
warnings.filterwarnings('ignore')

%run ../src/elo.py
%run ../src/elo_game_by_game.py
%run ../src/utility.py

In [2]:
# load files ( DATA )

teams               = pd.read_csv('../data/Teams.csv')
seasons             = pd.read_csv('../data/Seasons.csv')
reg_season_compact  = pd.read_csv('../data/RegularSeasonCompactResults.csv')
reg_season_detailed = pd.read_csv('../data/RegularSeasonDetailedResults.csv')
tourney_compact     = pd.read_csv('../data/TourneyCompactResults.csv')
tourney_detailed    = pd.read_csv('../data/TourneyDetailedResults.csv')
tourney_seeds       = pd.read_csv('../data/TourneySeeds.csv')
tourney_slots       = pd.read_csv('../data/TourneySlots.csv')

In [4]:
reg_season_compact.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


** Competition asks to predict given two teams, the probability of winning of team with lower team id. **

In [6]:
# lets create a target variable depending that
def create_target_variable(df):
    if df['Wteam'] > df['Lteam']:
        return 0
    else:
        return 1
    
reg_season_compact['target'] = reg_season_compact[['Wteam', 'Lteam']].apply(create_target_variable, axis=1)

** Split data into training and test set. **

In [10]:
# public leaderboard tests based on the matches played between 2013-2016
mask = reg_season_compact.Season < 2013
data = reg_season_compact.loc[mask]

In [11]:
train      = data.loc[data.Season < 2010]
validation = data.loc[data.Season >= 2010] 

** Use validation set to find optimum value of the k-factor used in elo ratings. **

In [12]:
def rate_game_by_game(season_list, ratings):
    for season in season_list:
        ratings = rate_team_for_season(reg_season_compact.loc[reg_season_compact.Season == season], ratings)
        
    return ratings

In [130]:
all_teams   = get_all_teams(reg_season_compact)
ratings     = {team: 1300 for team in all_teams}

** Method 1 **

In [131]:
elo_ratings = rate_game_by_game(np.arange(1985, 2011, 1), ratings.copy())

** Method 2 **

In [46]:
def calculate_elo_ratings(reg_season_compact, season_list, ratings):
    for season in season_list:
        season_perf   = reg_season_compact.loc[reg_season_compact.Season == season]
        season        = Season(season_perf)
        teams         = season.get_teams_in_season()
        opponents     = season.get_opponents_list()
        actual_perf   = season.get_actual_performance(opponents)
        expected_perf = season.get_expected(ratings, opponents)
        ratings       = season.update_ratings(ratings)
    
    return ratings

In [84]:
elo_ratings = calculate_elo_ratings(reg_season_compact, np.arange(1985, 2011, 1), ratings.copy())

Location advantage  15
Location advantage  10
Location advantage  30
Location advantage  10
Location advantage  55
Location advantage  0
Location advantage  15
Location advantage  25
Location advantage  35
Location advantage  20
Location advantage  45
Location advantage  55
Location advantage  20
Location advantage  30
Location advantage  50
Location advantage  10
Location advantage  15
Location advantage  20
Location advantage  5
Location advantage  20
Location advantage  20
Location advantage  55
Location advantage  50
Location advantage  15
Location advantage  35
Location advantage  15
Location advantage  20
Location advantage  50
Location advantage  30
Location advantage  30
Location advantage  10
Location advantage  15
Location advantage  5
Location advantage  40
Location advantage  10
Location advantage  30
Location advantage  5
Location advantage  40
Location advantage  35
Location advantage  25
Location advantage  25
Location advantage  20
Location advantage  35
Location advant

** Prediction Phase **

In [20]:
def predict_proba(df, ratings):
    predictions = []
    for team_1, team_2 in zip(df['Wteam'], df['Lteam']):
        if team_1 < team_2:
            predictions.append(expected(ratings[team_1], ratings[team_2]))
        else:
            predictions.append(expected(ratings[team_2], ratings[team_2]))
    
    return np.array(predictions)

In [132]:
y_val = validation.target
y_hat = predict_proba(validation, elo_ratings)
print('Log loss on validation set: {}'.format(log_loss(y_val, y_hat)))

Log loss on validation set: 0.6628114174387922


In [133]:
y_test = reg_season_compact.loc[~mask, 'target']
y_hat  = predict_proba(reg_season_compact.loc[~mask], elo_ratings)
print('Log loss on test set: {}'.format(log_loss(y_test, y_hat)))

Log loss on test set: 0.6842335201996937
