In [143]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

pd.set_option('display.max_columns', None)

np.random.seed(2131)

import warnings
warnings.filterwarnings('ignore')

%run ../src/season.py

**Load Data**

In [2]:
# Read each raw CSV under ../data into its own DataFrame.
# Only reg_season_detailed is used by the cells visible below.
teams               = pd.read_csv('../data/Teams.csv')
seasons             = pd.read_csv('../data/Seasons.csv')
reg_season_compact  = pd.read_csv('../data/RegularSeasonCompactResults.csv')
reg_season_detailed = pd.read_csv('../data/RegularSeasonDetailedResults.csv')
tourney_compact     = pd.read_csv('../data/TourneyCompactResults.csv')
tourney_detailed    = pd.read_csv('../data/TourneyDetailedResults.csv')
tourney_seeds       = pd.read_csv('../data/TourneySeeds.csv')
tourney_slots       = pd.read_csv('../data/TourneySlots.csv')

In [9]:
def total_teams(df):
    """
    Collect every team id that appears in `df` as a winner (`Wteam`) or a loser (`Lteam`)
    """
    winners = df.Wteam.unique()
    losers  = df.Lteam.unique()

    return set(winners).union(losers)

In [149]:
def get_total_wins_in_season(perf, team_id, curr_year):
    """
    Total wins of `team_id`, as reported by `Season(perf, curr_year).get_total_wins`
    """
    season_stats = Season(perf, curr_year)
    wins         = season_stats.get_total_wins(team_id)

    # same arithmetic as accumulating into a zero-initialised counter
    return 0 + wins

def get_total_losses_in_season(perf, team_id, curr_year):
    """
    Total losses of `team_id`, as reported by `Season(perf, curr_year).get_total_losses`
    """
    season_stats = Season(perf, curr_year)
    losses       = season_stats.get_total_losses(team_id)

    # same arithmetic as accumulating into a zero-initialised counter
    return 0 + losses

def get_total_points_in_season(perf, team_id, curr_year):
    """
    Total points scored by `team_id` over every game in `perf`.

    Adds the team's `Wscore` in the games it won to its `Lscore` in the games
    it lost. `curr_year` is not used here; the callers in this notebook pass a
    `perf` frame that is already restricted to a single season.
    """
    won_mask  = perf.Wteam == team_id
    lost_mask = perf.Lteam == team_id

    points_in_wins   = perf.loc[won_mask, 'Wscore'].sum()
    points_in_losses = perf.loc[lost_mask, 'Lscore'].sum()

    return points_in_wins + points_in_losses

In [147]:
# Peek at the first rows to see which per-game columns the detailed results provide.
reg_season_detailed.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,Wfgm3,Wfga3,Wftm,Wfta,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,Lfgm,Lfga,Lfgm3,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,3,14,11,18,14,24,13,23,7,1,22,22,53,2,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,8,20,10,19,15,28,16,13,4,4,18,24,67,6,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,8,18,17,29,17,26,15,10,5,2,25,22,73,3,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,3,9,17,31,6,19,11,12,14,2,18,18,49,6,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,6,14,11,13,17,22,12,14,4,4,20,24,62,6,16,17,27,21,15,12,10,7,1,14


In [156]:
def prepare_training_set(df):
    """
    Given detailed data for the regular season, prepare training data for pairs of teams.

    Every game (row of `df`) produces two mirrored rows: winner-vs-loser with
    target 1 and loser-vs-winner with target 0. The features are differences
    (team_1 minus team_2) of the season totals for wins, losses and points.

    Season totals depend only on (season, team), so each one is computed once
    and cached instead of being recomputed for every game of that team.

    NOTE(review): the totals cover the whole season, including the game the row
    describes, so the features already contain the outcome being predicted --
    confirm this is intended.

    Parameters
    ----------
    df : DataFrame
        Columns read here are `Season`, `Wteam` and `Lteam`; the helper
        functions additionally read `Wscore` / `Lscore` and whatever `Season` needs.

    Returns
    -------
    DataFrame
        Columns team_1, team_2, delta_total_wins, delta_total_losses,
        delta_points, target; two rows per game, in game order.
    """
    season_frames = {}  # season -> rows of df belonging to that season
    team_totals   = {}  # (season, team) -> (wins, losses, points)

    def season_totals(team_id, curr_year):
        # Look up (or compute once) the season totals of a single team.
        key = (curr_year, team_id)

        if key not in team_totals:
            if curr_year not in season_frames:
                season_frames[curr_year] = df.loc[df.Season == curr_year, :]

            perf = season_frames[curr_year]
            team_totals[key] = (
                get_total_wins_in_season(perf, team_id, curr_year),
                get_total_losses_in_season(perf, team_id, curr_year),
                get_total_points_in_season(perf, team_id, curr_year),
            )

        return team_totals[key]

    rows = []

    for wteam, lteam, curr_year in zip(df['Wteam'], df['Lteam'], df['Season']):
        wteam_wins, wteam_losses, wteam_points = season_totals(wteam, curr_year)
        lteam_wins, lteam_losses, lteam_points = season_totals(lteam, curr_year)

        # winner listed first -> target 1
        rows.append([wteam, lteam,
                     wteam_wins - lteam_wins,
                     wteam_losses - lteam_losses,
                     wteam_points - lteam_points,
                     1])

        # mirrored row, loser listed first -> target 0
        rows.append([lteam, wteam,
                     lteam_wins - wteam_wins,
                     lteam_losses - wteam_losses,
                     lteam_points - wteam_points,
                     0])

    training_df = pd.DataFrame(rows, columns=['team_1', 'team_2', 'delta_total_wins',
                                              'delta_total_losses',
                                              'delta_points',
                                              'target'])
    return training_df

In [157]:
# Build the pairwise training frame (two mirrored rows per game) from the detailed regular-season results.
training_df = prepare_training_set(reg_season_detailed)

In [62]:
# Show only the first rows; a bare `training_df` would try to render the whole frame.
training_df.head(10)

Unnamed: 0,team_1,team_2,delta_total_wins,delta_total_losses,target
0,1104,1328,-7,5,1
1,1328,1104,7,-5,0
2,1272,1393,-1,1,1
3,1393,1272,1,-1,0
4,1266,1437,8,-10,1
5,1437,1266,-8,10,0
6,1296,1457,-1,4,1
7,1457,1296,1,-4,0
8,1400,1208,3,-2,1
9,1208,1400,-3,2,0


**Load submission**

In [63]:
# The sample submission supplies the `id` values that prepare_test_set parses into (season, team, team).
sub = pd.read_csv('../data/sample_submission.csv')

In [152]:
def prepare_test_set(df, sub):
    """
    Build the feature rows for every matchup listed in the submission file.

    Each `sub.id` is split on '_' into three integers: season, first team and
    second team. For every id two mirrored rows are produced (first-vs-second,
    then second-vs-first) holding the differences of the teams' season totals
    for wins, losses and points, taken from the games of that season in `df`.

    Season totals depend only on (season, team), so each one is computed once
    and cached instead of being recomputed for every matchup.

    Parameters
    ----------
    df : DataFrame
        Per-game results; filtered on `Season` here and passed to the helper functions.
    sub : DataFrame
        Must have an `id` column of 'season_team_team' strings.

    Returns
    -------
    DataFrame
        Columns team_1, team_2, delta_total_wins, delta_total_losses,
        delta_points; two rows per submission id, in submission order.
    """
    season_frames = {}  # season -> rows of df belonging to that season
    team_totals   = {}  # (season, team) -> (wins, losses, points)

    def season_totals(team_id, curr_year):
        # Look up (or compute once) the season totals of a single team.
        key = (curr_year, team_id)

        if key not in team_totals:
            if curr_year not in season_frames:
                season_frames[curr_year] = df.loc[df.Season == curr_year, :]

            perf = season_frames[curr_year]
            team_totals[key] = (
                get_total_wins_in_season(perf, team_id, curr_year),
                get_total_losses_in_season(perf, team_id, curr_year),
                get_total_points_in_season(perf, team_id, curr_year),
            )

        return team_totals[key]

    rows = []

    for ids in sub.id.values:
        curr_year, first_team, second_team = map(int, ids.split('_'))

        first_wins, first_losses, first_points    = season_totals(first_team, curr_year)
        second_wins, second_losses, second_points = season_totals(second_team, curr_year)

        rows.append([first_team, second_team,
                     first_wins - second_wins,
                     first_losses - second_losses,
                     first_points - second_points])

        # mirrored row with the teams swapped
        rows.append([second_team, first_team,
                     second_wins - first_wins,
                     second_losses - first_losses,
                     second_points - first_points])

    # Five values per row need five column names; 'delta_points' was missing,
    # which made the DataFrame constructor fail on the column-count mismatch.
    test_df = pd.DataFrame(rows, columns=['team_1', 'team_2', 'delta_total_wins',
                                          'delta_total_losses',
                                          'delta_points'])
    return test_df

In [107]:
# Build two mirrored feature rows for every id in the sample submission.
test_df = prepare_test_set(reg_season_detailed, sub)

In [140]:
# Positional hold-out split: the first N_TRAIN_ROWS rows train the model, the rest validate it.
# The cut-off is even, so the two mirrored rows built for one game (positions 2k and 2k+1)
# always land on the same side of the split.
# NOTE(review): why 99672 was chosen is not visible here -- presumably a season boundary; confirm.
N_TRAIN_ROWS = 99672
feature_cols = training_df.columns[:-1]  # every column except the trailing 'target'

Xtr = training_df.iloc[:N_TRAIN_ROWS][feature_cols]
ytr = training_df.iloc[:N_TRAIN_ROWS]['target']

Xvalid = training_df.iloc[N_TRAIN_ROWS:][feature_cols]
yvalid = training_df.iloc[N_TRAIN_ROWS:]['target']

In [145]:
# Fit a default gradient-boosting classifier on the training part of the split.
# NOTE(review): the recorded output of the later `model.fit(X, y)` cell shows a
# RandomForestClassifier(n_estimators=500, max_depth=5), i.e. the commented-out
# line below -- confirm which estimator is meant to produce the submission.
# model = RandomForestClassifier(n_estimators=500, max_depth=5, n_jobs=-1)
model = GradientBoostingClassifier()
model.fit(Xtr, ytr)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [146]:
# Score the held-out rows: probability of target == 1 against the true labels.
yhat = model.predict_proba(Xvalid)[:, 1]
print('Log loss on the validation set ', log_loss(yvalid, yhat))

Log loss on the training set  0.514059606183


In [92]:
# All rows of the training frame: features are every column but the last, labels are 'target'.
X = training_df.iloc[:, :-1]
y = training_df['target']

In [127]:
# Refit the same estimator object on every training row (train + hold-out) before predicting the test set.
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [128]:
# Probability of target == 1 (i.e. team_1 wins) for every test row; test_df must carry
# the same feature columns, in the same order, as the X the model was fitted on.
predictions = model.predict_proba(test_df)[:, 1]

In [121]:
def post_process_predictions(team_1, team_2, predictions):
    """
    Collapse consecutive prediction pairs into one normalised value per pair.

    Positions 2k and 2k+1 are treated as one pair. If `team_1` is smaller than
    `team_2` at position 2k, the first prediction of the pair is kept, otherwise
    the second one; the kept value is divided by the sum of both predictions.
    A trailing unpaired prediction is ignored. Returns a list.
    """
    def normalised_share(first):
        second     = first + 1
        pair_total = predictions[first] + predictions[second]
        kept       = first if team_1[first] < team_2[first] else second

        return predictions[kept] / pair_total

    pair_starts = range(0, len(predictions) - 1, 2)

    return [normalised_share(start) for start in pair_starts]

In [129]:
# Reduce the two mirrored predictions of each matchup to a single normalised probability.
processed_preds = post_process_predictions(test_df.team_1.values,
                                           test_df.team_2.values,
                                           predictions
                                          )

In [131]:
# Sanity check: the first 10 raw predictions, i.e. the first 5 mirrored pairs.
predictions[:10]

array([ 0.60797961,  0.39044567,  0.50489502,  0.49682022,  0.56678113,
        0.43760294,  0.75336884,  0.24787136,  0.39886232,  0.59642617])

In [132]:
# Sanity check: one normalised value for each of the 5 pairs shown in the raw predictions.
processed_preds[:5]

[0.6089385159402807,
 0.50403048278267837,
 0.56430716824884752,
 0.75243567314133497,
 0.40075046342089166]

In [133]:
# Reload the sample submission, attach one processed prediction per row, and write the CSV.
# NOTE(review): the file name says 'rf', but the model cell in this notebook currently
# constructs a GradientBoostingClassifier -- confirm the name matches the model actually used.
sub = pd.read_csv('../data/sample_submission.csv')
sub['pred'] = processed_preds
sub.to_csv('../submissions/rf_adjusted_prob.csv', index=False)