In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss

pd.set_option('display.max_columns', None)

np.random.seed(2131)

import warnings
warnings.filterwarnings('ignore')

%run ../src/utility.py

In [2]:
# load files ( DATA )

teams               = pd.read_csv('../data/Teams.csv')
seasons             = pd.read_csv('../data/Seasons.csv')
reg_season_compact  = pd.read_csv('../data/RegularSeasonCompactResults.csv')
reg_season_detailed = pd.read_csv('../data/RegularSeasonDetailedResults.csv')
tourney_compact     = pd.read_csv('../data/TourneyCompactResults.csv')
tourney_detailed    = pd.read_csv('../data/TourneyDetailedResults.csv')
tourney_seeds       = pd.read_csv('../data/TourneySeeds.csv')
tourney_slots       = pd.read_csv('../data/TourneySlots.csv')

** Helper Methods **

In [3]:
def get_q():
    return np.log(10) / 400

def get_g(rd):
    q_squared = get_q() ** 2
    return 1 / np.sqrt(1 + (3 * q_squared * (rd ** 2)) / (np.pi ** 2) )

def expected_result(your_rating, opponent_rating, opponent_rd):
    return 1 / (1 + 10 ** (-get_g(opponent_rd) * (your_rating - opponent_rating) / 400))

def get_d_squared(your_rating, opponent_ratings, opponent_rds):
    q_squared = get_q() ** 2
    
    sum_ = 0
    for i in range(len(opponent_ratings)):
        e_res = expected_result(your_rating, opponent_ratings[i], opponent_rds[i])
        sum_ = sum_ + ((get_g(opponent_rds[i]) ** 2) * e_res * (1 - e_res))
        
    return 1 / (q_squared * sum_)

def glicko_update(your_rating, your_rd, opponent_ratings, opponent_rds, outcomes):
    q = get_q()

    d_squared = get_d_squared(your_rating, opponent_ratings, opponent_rds)
    
    sum_ = 0
    for i in range(len(opponent_ratings)):
        sum_ = sum_ + get_g(opponent_rds[i]) * (outcomes[i] - expected_result(your_rating, 
                                                                    opponent_ratings[i],
                                                                    opponent_rds[i]
                                                                   ))
    
    r_new  = your_rating + ((q / ((1 / your_rd ** 2) + (1 / d_squared)))) * sum_
    rd_new = np.sqrt(1 / ((1 / your_rd ** 2) + (1 / d_squared)))
    
    return r_new, rd_new

def update_deviation_after_season(old_rd, time_since_last_game):
    return min(np.sqrt(old_rd ** 2 + (15 ** 2) * time_since_last_game), 350) # this needs to be modified

In [4]:
def get_time_periods_since_last_played(all_seasons, team_id, current_season):
    earlier_seasons = all_seasons.loc[all_seasons.Season < current_season]
    mask            = (earlier_seasons.Wteam == team_id) | (earlier_seasons.Lteam == team_id)
    matched_entry   = earlier_seasons.loc[mask, :]\
                      .sort_values(by='Daynum', ascending=False).iloc[0]
    
    current_day     = pd.to_datetime(seasons.loc[seasons.Season == current_season, 'Dayzero']\
                                     .iloc[0])
    
    matched_day     = pd.to_datetime(seasons.loc[seasons.Season == matched_entry.Season, \
                                                 'Dayzero'].iloc[0]) + \
                      pd.DateOffset(matched_entry.Daynum)
    
    return (current_day - matched_day).days // 60

In [5]:
class Season:
    def __init__(self, season_perf):
        self.season_perf = season_perf
    
    def total_games(self):
        return len(self.season_perf)
    
    def get_teams(self):
        return self.season_perf['Wteam'].unique().tolist() + \
               self.season_perf['Lteam'].unique().tolist()
    
    def get_games(self, team_id):
        games_that_won  = self.season_perf.loc[self.season_perf.Wteam == team_id, :]
        games_that_lost = self.season_perf.loc[self.season_perf.Lteam == team_id, :]
        
        return pd.concat((games_that_won, games_that_lost)).sort_values(by='Daynum')
        
    def get_outcomes(self, team_id, games):
        outcomes = []
        for i in range(len(games)):
            if games.iloc[i]['Wteam'] == team_id:
                outcomes.append(1)
            else:
                outcomes.append(0)
        
        return outcomes
    
    def get_opponents(self, team_id, games):
        opponents = []
        
        for i in range(len(games)):
            if games.iloc[i]['Wteam'] != team_id:
                opponents.append(games.iloc[i]['Wteam'])
            else:
                opponents.append(games.iloc[i]['Lteam'])
        
        return opponents

** Split data into training and test set ** 

In [6]:
# lets create a target variable depending that
def create_target_variable(df):
    if df['Wteam'] > df['Lteam']:
        return 0
    else:
        return 1
    
reg_season_compact['target'] = reg_season_compact[['Wteam', 'Lteam']].apply(create_target_variable, axis=1)

In [19]:
# public leaderboard tests based on the matches played between 2013-2016
mask = reg_season_compact.Season < 1991
data = reg_season_compact.loc[mask]

train      = data.loc[data.Season < 1988]
validation = data.loc[(data.Season >= 1988) & (data.Season < 1991)] 

** Predictions **

In [7]:
def predict_proba(df, ratings):
    predictions = []
    for team_1, team_2 in zip(df['Wteam'], df['Lteam']):
        if team_1 in ratings:
            team1_rating, team1_rd = ratings[team_1]
        else:
            team1_rating = 1500
            team1_rd     = 350
        
        if team_2 in ratings:
            team2_rating, team2_rd = ratings[team_2]
        else:
            team2_rating = 1500
            team2_rd     = 350
            
        if team_1 < team_2:    
            predictions.append(expected_win_prob(team1_rating, team1_rd,
                                                 team2_rating, team2_rd))
        else:
            predictions.append(expected_win_prob(team2_rating, team2_rd,
                                                 team1_rating, team1_rd))
    
    return np.array(predictions)

### Glicko

- Initialize ratings 
- For every season calculate new ratings for each of the teams involved.
- After every season barring the first one we will update the rating deviation based on time period difference between last games for each of the teams involved in this season.
- Repeat

In [8]:
def update_rd(all_season, current_season, teams_in_season, ratings):
    
    if len(ratings.keys()) > 0:
        for team in teams_in_season:
            if team in ratings:
                old_rating, old_rd = ratings[team]
                time_period_since_last_game = get_time_periods_since_last_played(all_season,
                                                                                 team,
                                                                                 current_season
                                                                                )
                new_rd = update_deviation_after_season(old_rd, time_period_since_last_game)
                ratings[team] = (old_rating, new_rd)
    
    return ratings

In [9]:
def expected_win_prob(team1_rating, team1_rd, team2_rating, team2_rd):
    g = get_g(np.sqrt(team1_rd ** 2 + team2_rd ** 2))
    return 1 / ( 1 + 10 ** (-g * (team1_rating - team2_rating) / 400))

In [10]:
def calculate_glicko_ratings(reg_season_compact, season_list):
    ratings = {} # ratings will keep track of all ratings through all seasons
    
    for curr_season in season_list:
        season_perf      = reg_season_compact.loc[reg_season_compact.Season == curr_season, :]
        season           = Season(season_perf) 
        teams_in_season  = season.get_teams()
        
        ratings  = update_rd(reg_season_compact, curr_season, teams_in_season, ratings)
        
#         new_ratings = {} # initalize an empty dict to store ratings for this season
        
        for your_team_id in teams_in_season:
            games     = season.get_games(your_team_id)
            outcomes  = season.get_outcomes(your_team_id, games)
            opponents = season.get_opponents(your_team_id, games)

            opponent_ratings = []
            opponent_rds     = []

            for opponent in opponents:
                if opponent in ratings:
                    rating, rd = ratings[opponent]
                    opponent_ratings.append(rating)
                    opponent_rds.append(rd)
                else:
                    ratings[opponent] = (1500, 350) # default rating
                    opponent_ratings.append(1500)
                    opponent_rds.append(350)

            if your_team_id not in ratings:
                ratings[your_team_id] = (1500, 350)
                your_rating        = 1500 # default rating 
                your_rd             = 350 # default rd
            else:
                your_rating, your_rd = ratings[your_team_id]

            new_rating, new_rd = glicko_update(your_rating, your_rd, 
                                               opponent_ratings, opponent_rds, 
                                               outcomes)

            ratings[your_team_id] = (new_rating, new_rd)
        
        # update ratings after each season
#         for k, v in new_ratings.items():
#             ratings[k] = v
    
    return ratings

In [15]:
ratings = calculate_glicko_ratings(reg_season_compact, np.arange(2007, 2016, 1))

** Validation Set **

In [31]:
y_val = validation.target
y_hat = predict_proba(validation, ratings)
print('Log loss on validation set: {}'.format(log_loss(y_val, y_hat)))

Log loss on validation set: 0.6658786399284474


** Predictions **

In [12]:
def predict(sub, ratings):
    predictions = []
    
    for id_, pred in zip(sub['id'], sub['pred']):
        season, team_1, team_2 = id_.split('_')
        team_1 = int(team_1)
        team_2 = int(team_2)
        
        if team_1 < team_2:
            team1_rating, team1_rd = ratings[team_1]
            team2_rating, team2_rd = ratings[team_2]
            
            predictions.append(expected_win_prob(team1_rating, team1_rd,
                                                 team2_rating, team2_rd))
        else:
            predictions.append(expected_win_prob(team2_rating, team2_rd,
                                                 team1_rating, team2_rd))
    
    return predictions

In [13]:
# Create Submission
sub = pd.read_csv('../data/sample_submission.csv')

In [14]:
predictions = predict(sub, ratings)
sub['pred'] = predictions
sub.to_csv('../submissions/glicko_15_after_2003.csv', index=False)