In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display settings: cap the number of DataFrame rows printed
# and select the seaborn theme used by every figure below.
pd.set_option('display.max_rows', 10)
sns.set(context='talk', style='ticks')

In [49]:
# Parse every HTML table on the NBA_1986 schedule page (needs network access);
# the first table returned is the list of games previewed below.
tables = pd.read_html(
    io="http://www.basketball-reference.com/leagues/NBA_1986_games.html"
)
games = tables[0]
games.head()

Unnamed: 0,Date,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,.1,Notes
0,"Fri, Oct 25, 1985",Washington Bullets,100,Atlanta Hawks,91,Box Score,,
1,"Fri, Oct 25, 1985",Cleveland Cavaliers,115,Chicago Bulls,116,Box Score,OT,
2,"Fri, Oct 25, 1985",Milwaukee Bucks,116,Detroit Pistons,118,Box Score,,
3,"Fri, Oct 25, 1985",Denver Nuggets,119,Golden State Warriors,105,Box Score,,
4,"Fri, Oct 25, 1985",Boston Celtics,109,New Jersey Nets,113,Box Score,OT,


In [50]:
# Map the scraped column headers to short snake_case names.  `rename` ignores
# keys that are not present, so the entries for 'Start (ET)' and the
# auto-named 'Unnamed: N' columns only take effect on page layouts that
# actually contain those headers (they are not among the columns of the
# table previewed above).  The auto-generated names are spelled 'Unnamed'.
column_names = {'Date': 'date', 'Start (ET)': 'start',
                'Unnamed: 2': 'box', 'Visitor/Neutral': 'away_team',
                'PTS': 'away_points', 'Home/Neutral': 'home_team',
                'PTS.1': 'home_points', 'Unnamed: 7': 'n_ot'}

# Build the tidy games table:
#   * drop rows with fewer than 4 non-null cells,
#   * keep only the teams and scores,
#   * parse dates such as "Fri, Oct 25, 1985",
#   * index by (game_id, date), where game_id is the original row label.
# NOTE: this rebinds `games`, so the cell cannot be re-run without first
# re-running the cell that loads the raw table.
games = (games.rename(columns=column_names)
    .dropna(thresh=4)
    [['date', 'away_team', 'away_points', 'home_team', 'home_points']]
    .assign(date=lambda x: pd.to_datetime(x['date'], format='%a, %b %d, %Y'))
    .set_index('date', append=True)
    .rename_axis(["game_id", "date"])
    .sort_index())
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1985-10-25,Washington Bullets,100,Atlanta Hawks,91
1,1985-10-25,Cleveland Cavaliers,115,Chicago Bulls,116
2,1985-10-25,Milwaukee Bucks,116,Detroit Pistons,118
3,1985-10-25,Denver Nuggets,119,Golden State Warriors,105
4,1985-10-25,Boston Celtics,109,New Jersey Nets,113


In [51]:
# Reshape to one row per (game, team): the away_team / home_team columns are
# stacked into a single `team` column, and `variable` records which of the
# two columns each row came from.
tidy = pd.melt(
    frame=games.reset_index(),
    id_vars=['game_id', 'date'],
    value_vars=['away_team', 'home_team'],
    value_name='team',
)
tidy.head()

Unnamed: 0,game_id,date,variable,team
0,0,1985-10-25,away_team,Washington Bullets
1,1,1985-10-25,away_team,Cleveland Cavaliers
2,2,1985-10-25,away_team,Milwaukee Bucks
3,3,1985-10-25,away_team,Denver Nuggets
4,4,1985-10-25,away_team,Boston Celtics


In [52]:
# Per team: whole days since that team's previous row, minus one (so games on
# consecutive calendar days give 0).  Rows are diffed in tidy's current order.
tidy.groupby('team').date.diff().dt.days.sub(1)

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
     ... 
71    1.0
72    1.0
73    1.0
74    5.0
75    1.0
Name: date, dtype: float64

In [53]:
# Rest days per team, diffed in chronological order.  The assignment aligns
# the result back onto tidy's rows by index label, so tidy keeps its order.
tidy['rest'] = (
    tidy.sort_values(by='date')
        .groupby('team')['date']
        .diff()
        .dt.days
        .sub(1)
)
tidy.dropna().head()

Unnamed: 0,game_id,date,variable,team,rest
8,8,1985-10-26,away_team,Detroit Pistons,0.0
9,9,1985-10-26,away_team,Boston Celtics,0.0
11,11,1985-10-26,away_team,Sacramento Kings,0.0
12,12,1985-10-26,away_team,New Jersey Nets,0.0
13,13,1985-10-26,away_team,Houston Rockets,0.0


In [54]:
# Spread `rest` back out to one row per game with away_rest / home_rest
# columns, then attach those columns to the games table on (game_id, date).
by_game = (
    tidy.pivot_table(values='rest',
                     index=['game_id', 'date'],
                     columns='variable')
        .rename(columns={'away_team': 'away_rest',
                         'home_team': 'home_rest'})
)
df = pd.concat([games, by_game], axis=1)
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,1985-10-26,Detroit Pistons,118,Chicago Bulls,121,0.0,0.0
9,1985-10-26,Boston Celtics,105,Cleveland Cavaliers,100,0.0,0.0
11,1985-10-26,Sacramento Kings,112,Denver Nuggets,123,0.0,0.0
13,1985-10-26,Houston Rockets,129,Los Angeles Clippers,130,0.0,0.0
14,1985-10-26,Atlanta Hawks,91,Milwaukee Bucks,117,0.0,0.0


In [55]:
# One small bar chart per team comparing mean rest as the away side vs. the
# home side.
# `order` is passed explicitly: FacetGrid.map draws each facet from its own
# subset of the data, so without a fixed category order a facet's bars can end
# up under the wrong shared tick label (seaborn warns about exactly this when a
# categorical plot is mapped without `order`).
g = sns.FacetGrid(tidy, col='team', col_wrap=6, hue='team')
g.map(sns.barplot, 'variable', 'rest', order=['away_team', 'home_team'])


<seaborn.axisgrid.FacetGrid at 0x11df34910>

In [56]:
# Outcome flag for each side, plus the home-minus-away difference in rest days.
df['home_win'] = df['home_points'].gt(df['away_points'])
df['away_win'] = df['away_points'].gt(df['home_points'])
df['rest_spread'] = df['home_rest'].sub(df['away_rest'])
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,away_win,rest_spread
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,1985-10-26,Detroit Pistons,118,Chicago Bulls,121,0.0,0.0,True,False,0.0
9,1985-10-26,Boston Celtics,105,Cleveland Cavaliers,100,0.0,0.0,False,True,0.0
11,1985-10-26,Sacramento Kings,112,Denver Nuggets,123,0.0,0.0,True,False,0.0
13,1985-10-26,Houston Rockets,129,Los Angeles Clippers,130,0.0,0.0,True,False,0.0
14,1985-10-26,Atlanta Hawks,91,Milwaukee Bucks,117,0.0,0.0,True,False,0.0


In [57]:
# Distribution of the rest difference (home minus away) over games where both
# sides have a rest value.  Reindexing over the full integer range makes
# differences that never occur show up as zero-height bars.
delta = by_game['home_rest'].sub(by_game['away_rest']).dropna().astype(int)
ax = (
    delta.value_counts()
         .reindex(np.arange(delta.min(), delta.max() + 1), fill_value=0)
         .sort_index()
         .plot.bar(color='k', width=.9, rot=0, figsize=(12, 6))
)
sns.despine()
ax.set(xlabel='Difference in Rest (Home - Away)', ylabel='Games')

[<matplotlib.text.Text at 0x11e926390>, <matplotlib.text.Text at 0x11e913450>]

In [58]:
# Home win rate for each rest difference, restricted to spreads in [-3, 3]
# (both ends inclusive; games with a missing spread are excluded).
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=df[df['rest_spread'].between(-3, 3)],
    x='rest_spread',
    y='home_win',
    color='#4c72b0',
    ax=ax,
)
sns.despine()

In [59]:
# Average rest per calendar date, separately for the away and home sides;
# date/side combinations whose mean is missing are dropped.
rest = (
    tidy.groupby(['date', 'variable'])['rest']
        .mean()
        .dropna()
)
rest.head()

date        variable 
1985-10-26  away_team    0.0
            home_team    0.0
1985-10-27  away_team    1.0
            home_team    1.0
1985-10-29  away_team    2.1
Name: rest, dtype: float64

In [60]:
# Pivot the inner `variable` index level into columns: one row per date with
# away_team / home_team columns.
rest.unstack().head()

variable,away_team,home_team
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-10-26,0.0,0.0
1985-10-27,1.0,1.0
1985-10-29,2.1,2.0
1985-10-30,0.75,0.75
1985-10-31,1.0,1.5


In [61]:
# Round trip: stacking the unstacked frame moves the columns back into an
# inner index level, giving a Series indexed by (date, variable) again.
rest.unstack().stack()

date        variable 
1985-10-26  away_team    0.00
            home_team    0.00
1985-10-27  away_team    1.00
            home_team    1.00
1985-10-29  away_team    2.10
            home_team    2.00
1985-10-30  away_team    0.75
            home_team    0.75
1985-10-31  away_team    1.00
            home_team    1.50
dtype: float64

In [62]:
# 7-row rolling mean of average rest for each side, keeping only dates where
# the away-side average is below 7 days.  The two lines are labelled by hand
# at the last date present in `rest`.
ax = (
    rest.unstack()
        .query('away_team < 7')
        .rolling(window=7)
        .mean()
        .plot(figsize=(12, 6), linewidth=3, legend=False)
)
ax.annotate("Home", xy=(rest.index[-1][0], 1.02), color='g', size=14)
ax.annotate("Away", xy=(rest.index[-1][0], 0.82), color='b', size=14)
sns.despine()

In [63]:
# Boolean flag: True where the home side scored more points than the visitors.
df['home_win'] = df['home_points'].gt(df['away_points'])

In [64]:

# Win record per team, split by venue (rows labelled home_team / away_team).
wins = (
    pd.melt(df.reset_index(),
            id_vars=['game_id', 'date', 'home_win'],
            value_name='team', var_name='is_home',
            value_vars=['home_team', 'away_team'])
   # A row counts as a win when the home side won and this row is the home
   # team, or the home side lost and this row is the away team.
   .assign(win=lambda x: x.home_win == (x.is_home == 'home_team'))
   .groupby(['team', 'is_home'])
   .win
   # Passing {new_name: func} to SeriesGroupBy.agg was deprecated in pandas
   # 0.20 and raises SpecificationError ("nested renamer is not supported")
   # from pandas 1.0 on.  Aggregating with a list and renaming the resulting
   # columns gives the same table on both old and new pandas.
   .agg(['count', 'sum', 'mean'])
   .rename(columns={'count': 'n_games', 'sum': 'n_wins', 'mean': 'win_pct'})
)
wins.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_games,n_wins,win_pct
team,is_home,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Atlanta Hawks,away_team,1,0.0,0.0
Atlanta Hawks,home_team,2,1.0,0.5
Boston Celtics,away_team,2,1.0,0.5
Boston Celtics,home_team,1,1.0,1.0
Chicago Bulls,away_team,2,1.0,0.5


In [65]:
win_percent = (
    # Overall win rate = total wins / total games across both venues.  The
    # totals are summed before dividing so the result does not assume a team
    # has played as many home games as away games.
    wins.groupby(level='team')[['n_wins', 'n_games']]
        .sum()
        .pipe(lambda totals: totals['n_wins'] / totals['n_games'])
)
win_percent.head()

team
Atlanta Hawks          0.333333
Boston Celtics         0.666667
Chicago Bulls          0.750000
Cleveland Cavaliers    0.250000
Dallas Mavericks       0.500000
dtype: float64

In [66]:
# Add model features: each side's overall win rate (looked up by team name),
# plus the score margin and the rest difference, both as home minus away.
df = df.assign(
    away_strength=df['away_team'].map(win_percent),
    home_strength=df['home_team'].map(win_percent),
    point_diff=df['home_points'].sub(df['away_points']),
    rest_diff=df['home_rest'].sub(df['away_rest']),
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,away_win,rest_spread,away_strength,home_strength,point_diff,rest_diff
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1985-10-25,Washington Bullets,100,Atlanta Hawks,91,,,False,True,,0.666667,0.333333,-9,
1,1985-10-25,Cleveland Cavaliers,115,Chicago Bulls,116,,,True,False,,0.25,0.75,1,
2,1985-10-25,Milwaukee Bucks,116,Detroit Pistons,118,,,True,False,,0.5,0.5,2,
3,1985-10-25,Denver Nuggets,119,Golden State Warriors,105,,,False,True,,1.0,0.25,-14,
4,1985-10-25,Boston Celtics,109,New Jersey Nets,113,,,True,False,,0.666667,0.5,4,


In [67]:
import statsmodels.formula.api as sm

# Recode the boolean outcome as 0/1 integers for use as the statsmodels response.
df['home_win'] = df['home_win'].astype(int)

In [68]:
# Logistic regression of the home-win indicator on both teams' overall win
# rate and their days of rest.
mod = sm.logit(
    formula='home_win ~ home_strength + away_strength + home_rest + away_rest',
    data=df,
)
res = mod.fit()
res.summary()

Optimization terminated successfully.
         Current function value: 0.270574
         Iterations 8


0,1,2,3
Dep. Variable:,home_win,No. Observations:,26.0
Model:,Logit,Df Residuals:,21.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 26 Mar 2017",Pseudo R-squ.:,0.5355
Time:,15:21:04,Log-Likelihood:,-7.0349
converged:,True,LL-Null:,-15.145
,,LLR p-value:,0.002738

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,1.4230,2.728,0.522,0.602,-3.924 6.770
home_strength,6.0148,3.729,1.613,0.107,-1.294 13.324
away_strength,-6.7771,3.560,-1.904,0.057,-13.754 0.200
home_rest,0.4343,0.647,0.671,0.502,-0.834 1.703
away_rest,-0.0473,0.660,-0.072,0.943,-1.340 1.246
