In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display settings: truncate long frames/series to 10 rows and
# apply seaborn's "ticks" style with the larger "talk" context.
pd.set_option('display.max_rows', 10)
sns.set(context='talk', style='ticks')

In [6]:
# read_html returns one DataFrame per <table> found on the page; the first
# one is the schedule/results table used throughout this notebook.
tables = pd.read_html(
    io="http://www.basketball-reference.com/leagues/NBA_1997_games.html"
)
games = tables[0]
games.head()

Unnamed: 0,Date,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,.1,Notes
0,"Fri, Nov 1, 1996",Chicago Bulls,107,Boston Celtics,98,Box Score,,
1,"Fri, Nov 1, 1996",Dallas Mavericks,92,Denver Nuggets,91,Box Score,,
2,"Fri, Nov 1, 1996",Indiana Pacers,89,Detroit Pistons,95,Box Score,,
3,"Fri, Nov 1, 1996",Los Angeles Clippers,97,Golden State Warriors,85,Box Score,,
4,"Fri, Nov 1, 1996",Sacramento Kings,85,Houston Rockets,96,Box Score,,


In [7]:
# Map the scraped column headers onto short snake_case names.  ``rename``
# silently ignores keys that are not present in the table, so entries for
# columns this particular page lacks are harmless.
# pandas labels header-less columns 'Unnamed: N' (two n's); the previous
# 'Unamed: N' spelling could never match a real column.
column_names = {'Date': 'date', 'Start (ET)': 'start',
                'Unnamed: 2': 'box', 'Visitor/Neutral': 'away_team',
                'PTS': 'away_points', 'Home/Neutral': 'home_team',
                'PTS.1': 'home_points', 'Unnamed: 7': 'n_ot'}

games = (
    games.rename(columns=column_names)
    # keep only rows with at least 4 non-null cells
    # (presumably drops spacer/header rows embedded in the HTML table -- confirm)
    .dropna(thresh=4)
    [['date', 'away_team', 'away_points', 'home_team', 'home_points']]
    # parse strings such as "Fri, Nov 1, 1996" into timestamps
    .assign(date=lambda x: pd.to_datetime(x['date'], format='%a, %b %d, %Y'))
    # append the date to the existing row index -> two-level (game_id, date) index
    .set_index('date', append=True)
    .rename_axis(["game_id", "date"])
    .sort_index()
)
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1996-11-01,Chicago Bulls,107,Boston Celtics,98
1,1996-11-01,Dallas Mavericks,92,Denver Nuggets,91
2,1996-11-01,Indiana Pacers,89,Detroit Pistons,95
3,1996-11-01,Los Angeles Clippers,97,Golden State Warriors,85
4,1996-11-01,Sacramento Kings,85,Houston Rockets,96


In [8]:
# Reshape to long form: one row per (game, team).  The 'variable' column
# records whether the team came from the away_team or home_team column.
tidy = pd.melt(
    games.reset_index(),
    id_vars=['game_id', 'date'],
    value_vars=['away_team', 'home_team'],
    value_name='team',
)
tidy.head()

Unnamed: 0,game_id,date,variable,team
0,0,1996-11-01,away_team,Chicago Bulls
1,1,1996-11-01,away_team,Dallas Mavericks
2,2,1996-11-01,away_team,Indiana Pacers
3,3,1996-11-01,away_team,Los Angeles Clippers
4,4,1996-11-01,away_team,Sacramento Kings


In [9]:
# For each team...  get number of days between games
# ``tidy`` comes out of ``melt`` with every away row ahead of every home row,
# so the rows must be put in date order before differencing; otherwise the
# diff is not taken between a team's consecutive games.  ``sort_index``
# restores the original row order for display.
(tidy.sort_values('date')
     .groupby('team')['date']
     .diff()
     .dt.days
     .sub(1)
     .sort_index())

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
427    2.0
428    5.0
429    0.0
430    2.0
431    4.0
Name: date, dtype: float64

In [10]:
# Rest = days since the same team's previous game, minus one.  Rows are put
# in date order before differencing; the assignment aligns the result back
# onto ``tidy`` by index.  A team's first game has no predecessor -> NaN.
tidy['rest'] = (
    tidy.sort_values('date')
        .groupby('team')['date']
        .diff()
        .dt.days
        .sub(1)
)
tidy.dropna().head()

Unnamed: 0,game_id,date,variable,team,rest
14,14,1996-11-02,away_team,Detroit Pistons,0.0
15,15,1996-11-02,away_team,Toronto Raptors,0.0
16,16,1996-11-02,away_team,Philadelphia 76ers,0.0
17,17,1996-11-02,away_team,Sacramento Kings,0.0
18,18,1996-11-02,away_team,Miami Heat,0.0


In [11]:
# Back to one row per game: spread each team's rest into separate
# away/home columns, then attach them to the original game table.
by_game = (
    tidy.pivot_table(values='rest',
                     index=['game_id', 'date'],
                     columns='variable')
        .rename(columns={'away_team': 'away_rest',
                         'home_team': 'home_rest'})
)
df = pd.concat([games, by_game], axis='columns')
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14,1996-11-02,Detroit Pistons,90,Atlanta Hawks,78,0.0,0.0
16,1996-11-02,Philadelphia 76ers,86,Chicago Bulls,115,0.0,0.0
17,1996-11-02,Sacramento Kings,107,Dallas Mavericks,94,0.0,0.0
18,1996-11-02,Miami Heat,97,Indiana Pacers,95,0.0,0.0
19,1996-11-02,Utah Jazz,95,Los Angeles Clippers,90,0.0,0.0


In [12]:
# One small panel per team (six per row), each showing rest by home/away.
g = sns.FacetGrid(data=tidy, col='team', hue='team', col_wrap=6)
g.map(sns.barplot, 'variable', 'rest')


<seaborn.axisgrid.FacetGrid at 0x116e5da90>

In [13]:
# Outcome flags for each side, plus the home-minus-away rest differential.
df['home_win'] = df['home_points'].gt(df['away_points'])
df['away_win'] = df['away_points'].gt(df['home_points'])
df['rest_spread'] = df['home_rest'].sub(df['away_rest'])
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,away_win,rest_spread
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
14,1996-11-02,Detroit Pistons,90,Atlanta Hawks,78,0.0,0.0,False,True,0.0
16,1996-11-02,Philadelphia 76ers,86,Chicago Bulls,115,0.0,0.0,True,False,0.0
17,1996-11-02,Sacramento Kings,107,Dallas Mavericks,94,0.0,0.0,False,True,0.0
18,1996-11-02,Miami Heat,97,Indiana Pacers,95,0.0,0.0,False,True,0.0
19,1996-11-02,Utah Jazz,95,Los Angeles Clippers,90,0.0,0.0,False,True,0.0


In [14]:
# Distribution of the rest differential (home minus away) across games.
delta = by_game['home_rest'].sub(by_game['away_rest']).dropna().astype(int)
ax = (
    delta.value_counts()
         # include every integer in the observed range so gaps show as empty bars
         .reindex(np.arange(delta.min(), delta.max() + 1), fill_value=0)
         .sort_index()
         .plot.bar(color='k', width=.9, rot=0, figsize=(12, 6))
)
sns.despine()
ax.set(xlabel='Difference in Rest (Home - Away)', ylabel='Games')

[<matplotlib.text.Text at 0x10434f890>, <matplotlib.text.Text at 0x10434f090>]

In [15]:
# Home-team win rate for each rest differential, limited to spreads in [-3, 3].
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=df.query('-3 <= rest_spread <= 3'),
    x='rest_spread',
    y='home_win',
    color='#4c72b0',
    ax=ax,
)
sns.despine()

In [16]:
# Mean rest per calendar date, split by away/home; dates with no rest
# value at all are dropped.
rest = (
    tidy.groupby(['date', 'variable'])['rest']
        .mean()
        .dropna()
)
rest.head()

date        variable 
1996-11-02  away_team    0.00
            home_team    0.00
1996-11-03  away_team    0.75
            home_team    1.00
1996-11-04  away_team    1.00
Name: rest, dtype: float64

In [17]:
# Pivot the innermost index level ('variable') out into columns.
rest.unstack(level='variable').head()

variable,away_team,home_team
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1996-11-02,0.0,0.0
1996-11-03,0.75,1.0
1996-11-04,1.0,1.0
1996-11-05,1.0,1.7
1996-11-06,0.5,2.333333


In [18]:
# Round trip: stacking the unstacked frame goes back to the long Series.
rest.unstack(level='variable').stack()

date        variable 
1996-11-02  away_team    0.000000
            home_team    0.000000
1996-11-03  away_team    0.750000
            home_team    1.000000
1996-11-04  away_team    1.000000
                           ...   
1996-11-28  home_team    2.500000
1996-11-29  away_team    1.400000
            home_team    1.100000
1996-11-30  away_team    0.666667
            home_team    1.444444
dtype: float64

In [19]:
# 7-row rolling mean of daily average rest for away vs. home teams,
# keeping only dates where the away average is below 7 days.
ax = (
    rest.unstack()
        .query('away_team < 7')
        .rolling(7)
        .mean()
        .plot(figsize=(12, 6), linewidth=3, legend=False)
)
# Label the two lines by hand at the last date present in ``rest``.
ax.annotate("Home", xy=(rest.index.get_level_values(0)[-1], 1.02), color='g', size=14)
ax.annotate("Away", xy=(rest.index.get_level_values(0)[-1], 0.82), color='b', size=14)
sns.despine()

In [20]:
# Boolean flag: True when the home side outscored the visitors.
df['home_win'] = df['home_points'].gt(df['away_points'])

In [21]:

# Per-team, per-venue record: number of wins, games played and win rate.
wins = (
    pd.melt(df.reset_index(),
            id_vars=['game_id', 'date', 'home_win'],
            value_name='team', var_name='is_home',
            value_vars=['home_team', 'away_team'])
    # a row is a win when the home side won and this row is the home team,
    # or the home side lost and this row is the away team
    .assign(win=lambda x: x.home_win == (x.is_home == 'home_team'))
    .groupby(['team', 'is_home'])
    ['win']
    # Passing a {new_name: func} dict to SeriesGroupBy.agg was deprecated in
    # pandas 0.20 and raises SpecificationError from 1.0 on; aggregate with a
    # list and rename the resulting columns instead.
    .agg(['sum', 'count', 'mean'])
    .rename(columns={'sum': 'n_wins', 'count': 'n_games', 'mean': 'win_pct'})
)
wins.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_games,n_wins,win_pct
team,is_home,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Atlanta Hawks,away_team,11,5.0,0.454545
Atlanta Hawks,home_team,5,4.0,0.8
Boston Celtics,away_team,4,0.0,0.0
Boston Celtics,home_team,10,4.0,0.4
Charlotte Hornets,away_team,7,3.0,0.428571


In [22]:
# Overall win rate per team.  Pool home and away as total wins / total
# games rather than averaging the two win_pct values, because a team may
# not have played the same number of home and away games.
win_percent = (
    wins.groupby(level='team')[['n_wins', 'n_games']]
        .sum()
        .pipe(lambda totals: totals['n_wins'] / totals['n_games'])
)
win_percent.head()

team
Atlanta Hawks          0.562500
Boston Celtics         0.285714
Charlotte Hornets      0.571429
Chicago Bulls          0.937500
Cleveland Cavaliers    0.642857
dtype: float64

In [23]:
# Model features: each side's overall win rate as a strength proxy, the
# final margin, and the rest differential (both home minus away).
df = df.assign(
    away_strength=lambda d: d['away_team'].map(win_percent),
    home_strength=lambda d: d['home_team'].map(win_percent),
    point_diff=lambda d: d['home_points'].sub(d['away_points']),
    rest_diff=lambda d: d['home_rest'].sub(d['away_rest']),
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,away_win,rest_spread,away_strength,home_strength,point_diff,rest_diff
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1996-11-01,Chicago Bulls,107,Boston Celtics,98,,,False,True,,0.9375,0.285714,-9,
1,1996-11-01,Dallas Mavericks,92,Denver Nuggets,91,,,False,True,,0.285714,0.3125,-1,
2,1996-11-01,Indiana Pacers,89,Detroit Pistons,95,,,True,False,,0.384615,0.785714,6,
3,1996-11-01,Los Angeles Clippers,97,Golden State Warriors,85,,,False,True,,0.375,0.266667,-12,
4,1996-11-01,Sacramento Kings,85,Houston Rockets,96,,,True,False,,0.3125,0.9375,11,


In [24]:
import statsmodels.formula.api as sm

# Cast the win flag to 0/1 integers for statsmodels.
df['home_win'] = df['home_win'].astype(int)

In [25]:
# Logistic regression of a home win on both teams' strength and rest.
mod = sm.logit(
    formula='home_win ~ home_strength + away_strength + home_rest + away_rest',
    data=df,
)
res = mod.fit()
res.summary()

Optimization terminated successfully.
         Current function value: 0.433942
         Iterations 7


0,1,2,3
Dep. Variable:,home_win,No. Observations:,201.0
Model:,Logit,Df Residuals:,196.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 26 Mar 2017",Pseudo R-squ.:,0.3714
Time:,15:22:01,Log-Likelihood:,-87.222
converged:,True,LL-Null:,-138.76
,,LLR p-value:,2.172e-21

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,1.1549,0.638,1.810,0.070,-0.096 2.405
home_strength,4.7881,0.935,5.120,0.000,2.955 6.621
away_strength,-6.4981,1.007,-6.455,0.000,-8.471 -4.525
home_rest,0.1510,0.199,0.759,0.448,-0.239 0.541
away_rest,-0.0507,0.216,-0.235,0.814,-0.473 0.372
