In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss

import warnings

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Seed NumPy's global random state so any random draws are repeatable.
np.random.seed(2131)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from pandas.
warnings.filterwarnings('ignore')

In [2]:
# load files ( DATA )
# Each CSV under ../data is read into its own DataFrame.
# NOTE(review): among the cells visible here only `reg_season_compact` is used
# afterwards; the other tables are presumably kept for later exploration -- confirm.

teams               = pd.read_csv('../data/Teams.csv')
seasons             = pd.read_csv('../data/Seasons.csv')
reg_season_compact  = pd.read_csv('../data/RegularSeasonCompactResults.csv')
reg_season_detailed = pd.read_csv('../data/RegularSeasonDetailedResults.csv')
tourney_compact     = pd.read_csv('../data/TourneyCompactResults.csv')
tourney_detailed    = pd.read_csv('../data/TourneyDetailedResults.csv')
tourney_seeds       = pd.read_csv('../data/TourneySeeds.csv')
tourney_slots       = pd.read_csv('../data/TourneySlots.csv')

**Create target variable**

In [3]:
def create_target_variable(df):
    """Label a single game from the ids of its winner and loser.

    Parameters
    ----------
    df : mapping with 'Wteam' and 'Lteam' entries (e.g. one DataFrame row)
        The winning and losing team ids of one game.

    Returns
    -------
    int
        0 when the winner's id is greater than the loser's id, otherwise 1
        (i.e. 1 means the team with the lower id won).
    """
    winner_has_higher_id = df['Wteam'] > df['Lteam']
    return 0 if winner_has_higher_id else 1
    
# Label every regular-season game row by row from its winner/loser team ids.
reg_season_compact['target'] = reg_season_compact.loc[:, ['Wteam', 'Lteam']].apply(
    create_target_variable, axis=1
)

**Split the dataset into training and validation sets** (seasons before 2010 for training, seasons 2010–2012 for validation)

In [4]:
# Only seasons before 2013 take part in model development.
data = reg_season_compact[reg_season_compact['Season'] < 2013]

# Seasons before 2010 form the training set.
train = data[data['Season'] < 2010]

# The remaining seasons (2010 up to, but not including, 2013) form the validation set;
# the upper bound is already guaranteed by `data`.
validation = data[data['Season'] >= 2010]

In [5]:
def num_wins(agg_data, team_1, team_2, reference_season=2013, damping_factor=0.90):
    """Time-decayed count of games in which ``team_1`` beat ``team_2``.

    Each season's win count is weighted by
    ``damping_factor ** (reference_season - season)`` so that older results
    contribute less, and a constant 0.5 is added so the score is never zero.

    Parameters
    ----------
    agg_data : pandas.Series
        Game counts indexed by a (winner id, loser id, season) MultiIndex,
        as produced by ``groupby([...]).size()``.
    team_1 : int
        Id of the team treated as the winner.
    team_2 : int
        Id of the team treated as the loser.
    reference_season : int, default 2013
        Season from which the age of each result is measured.
    damping_factor : float, default 0.90
        Per-season decay applied to the win counts.

    Returns
    -------
    float
        ``0.5`` when the pairing never occurred in ``agg_data``; otherwise
        ``0.5`` plus the decayed sum of wins.
    """
    # No recorded win of team_1 over team_2: fall back to the bare prior.
    if (team_1, team_2) not in agg_data:
        return 0.5

    # Partial indexing on the first two levels leaves a Series keyed by season.
    # `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
    wins_by_season = agg_data.loc[(team_1, team_2)]

    seasons_ago = reference_season - wins_by_season.index.values
    weights = damping_factor ** seasons_ago

    return 0.5 + (wins_by_season.values * weights).sum()

In [6]:
def predict_proba(agg_data, df):
    """Score every game in ``df`` with the head-to-head win model.

    Parameters
    ----------
    agg_data : pandas.Series
        Aggregated win counts, passed through to ``num_wins``.
    df : pandas.DataFrame
        Games to score; the 'Wteam' and 'Lteam' columns hold the two team ids.

    Returns
    -------
    numpy.ndarray
        One value per row: the share of the combined ``num_wins`` scores that
        belongs to the team with the lower id.
    """
    def lower_id_share(team_a, team_b):
        # Order the pair by id so the score is always for the lower-id team.
        low, high = min(team_a, team_b), max(team_a, team_b)
        low_score = num_wins(agg_data, low, high)
        high_score = num_wins(agg_data, high, low)
        return low_score / (low_score + high_score)

    return np.array([
        lower_id_share(winner, loser)
        for winner, loser in zip(df['Wteam'], df['Lteam'])
    ])

In [77]:
# Count, per season, how often each (winner, loser) pairing occurs in the training games.
pairing_keys = ['Wteam', 'Lteam', 'Season']
agg_data = train.groupby(pairing_keys).size()

# Score the validation games using only those training-set counts.
y_hat = predict_proba(agg_data, validation)

In [78]:
# Compare the predicted probabilities with the validation labels.
y_val = validation['target']
validation_log_loss = log_loss(y_val, y_hat)
print('Log loss on validation set: {}'.format(validation_log_loss))

Log loss on validation set: 0.6772118765344068


**Full training**

In [7]:
def predict(agg_data, sub):
    """Produce one prediction per row of a submission frame.

    Parameters
    ----------
    agg_data : pandas.Series
        Aggregated win counts, passed through to ``num_wins``.
    sub : pandas.DataFrame
        Submission frame whose 'id' column holds strings made of three
        underscore-separated parts; the second and third parts are parsed as
        integer team ids. No other column is read.

    Returns
    -------
    list of float
        For each id, the share of the combined ``num_wins`` scores that
        belongs to the team with the lower id.
    """
    predictions = []

    for id_ in sub['id']:
        # The first part of the id (the season) does not influence the score.
        _, team_1, team_2 = id_.split('_')
        team_1 = int(team_1)
        team_2 = int(team_2)

        wins_1 = num_wins(agg_data, team_1, team_2)
        wins_2 = num_wins(agg_data, team_2, team_1)

        # Always report the share of the lower-id team, whichever position it is in.
        if team_1 < team_2:
            predictions.append(wins_1 / (wins_1 + wins_2))
        else:
            predictions.append(wins_2 / (wins_1 + wins_2))

    return predictions

In [8]:
# Create Submission
# Start from the sample submission, which supplies the ids to score.
sub = pd.read_csv('../data/sample_submission.csv')

# Re-aggregate over every regular-season game that was loaded (no season filter this time).
agg_data = reg_season_compact.groupby(by=['Wteam', 'Lteam', 'Season']).size()

predictions = predict(agg_data, sub)

In [84]:
# Replace the placeholder predictions and write the submission file without the index.
submission_path = '../submissions/historical_performance_including_2013.csv'
sub['pred'] = predictions
sub.to_csv(submission_path, index=False)