In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
sns.set(style='ticks', context='talk')

In [2]:
tables = pd.read_html("http://www.basketball-reference.com/leagues/NBA_2017_games.html")
games = tables[0]
games.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 7,.1,Notes
0,"Tue, Oct 25, 2016",7:30 pm,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,
1,"Tue, Oct 25, 2016",10:30 pm,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,
2,"Tue, Oct 25, 2016",10:00 pm,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,
3,"Wed, Oct 26, 2016",7:30 pm,Brooklyn Nets,117,Boston Celtics,122,Box Score,,
4,"Wed, Oct 26, 2016",7:00 pm,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,


In [3]:
column_names = {'Date': 'date', 'Start (ET)': 'start',
                'Unamed: 2': 'box', 'Visitor/Neutral': 'away_team', 
                'PTS': 'away_points', 'Home/Neutral': 'home_team',
                'PTS.1': 'home_points', 'Unamed: 7': 'n_ot'}

games = (games.rename(columns=column_names)
    .dropna(thresh=4)
    [['date', 'away_team', 'away_points', 'home_team', 'home_points']]
    .assign(date=lambda x: pd.to_datetime(x['date'], format='%a, %b %d, %Y'))
    .set_index('date', append=True)
    .rename_axis(["game_id", "date"])
    .sort_index())
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130


In [4]:
tidy = pd.melt(games.reset_index(),
               id_vars=['game_id', 'date'], value_vars=['away_team', 'home_team'],
               value_name='team')
tidy.head()

Unnamed: 0,game_id,date,variable,team
0,0,2016-10-25,away_team,New York Knicks
1,1,2016-10-25,away_team,San Antonio Spurs
2,2,2016-10-25,away_team,Utah Jazz
3,3,2016-10-26,away_team,Brooklyn Nets
4,4,2016-10-26,away_team,Dallas Mavericks


In [5]:
# For each team...  get number of days between games
tidy.groupby('team')['date'].diff().dt.days - 1

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
     ... 
85    3.0
86    3.0
87    2.0
88    0.0
89    2.0
Name: date, dtype: float64

In [6]:
tidy['rest'] = tidy.sort_values('date').groupby('team').date.diff().dt.days - 1
tidy.dropna().head()

Unnamed: 0,game_id,date,variable,team,rest
14,14,2016-10-27,away_team,Boston Celtics,0.0
16,16,2016-10-27,away_team,San Antonio Spurs,1.0
17,17,2016-10-28,away_team,Indiana Pacers,1.0
18,18,2016-10-28,away_team,Houston Rockets,1.0
19,19,2016-10-28,away_team,Orlando Magic,1.0


In [7]:
by_game = (pd.pivot_table(tidy, values='rest',
                          index=['game_id', 'date'],
                          columns='variable')
             .rename(columns={'away_team': 'away_rest',
                              'home_team': 'home_rest'}))
df = pd.concat([games, by_game], axis=1)
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16,2016-10-27,San Antonio Spurs,102,Sacramento Kings,94,1.0,0.0
17,2016-10-28,Indiana Pacers,94,Brooklyn Nets,103,1.0,1.0
18,2016-10-28,Houston Rockets,106,Dallas Mavericks,98,1.0,1.0
19,2016-10-28,Orlando Magic,82,Detroit Pistons,108,1.0,1.0
20,2016-10-28,Charlotte Hornets,97,Miami Heat,91,1.0,1.0


In [13]:
g = sns.FacetGrid(tidy, col='team', col_wrap=6, hue='team')
g.map(sns.barplot, 'variable', 'rest')
plot.g.map

NameError: name 'plot' is not defined

In [26]:
df['home_win'] = df['home_points'] > df['away_points'] 
df['away_win'] = df['away_points'] > df['home_points']
df['rest_spread'] = df['home_rest'] - df['away_rest']
df.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,rest_spread,away_strength,home_strength,point_diff,rest_diff,away_win
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
16,2016-10-27,San Antonio Spurs,102,Sacramento Kings,94,1.0,0.0,False,-1.0,1.0,0.5,-8,-1.0,True
17,2016-10-28,Indiana Pacers,94,Brooklyn Nets,103,1.0,1.0,True,0.0,0.333333,0.25,9,0.0,False
18,2016-10-28,Houston Rockets,106,Dallas Mavericks,98,1.0,1.0,False,0.0,0.666667,0.0,-8,0.0,True
19,2016-10-28,Orlando Magic,82,Detroit Pistons,108,1.0,1.0,True,0.0,0.0,0.666667,26,0.0,False
20,2016-10-28,Charlotte Hornets,97,Miami Heat,91,1.0,1.0,False,0.0,0.666667,0.333333,-6,0.0,True


In [11]:
delta = (by_game.home_rest - by_game.away_rest).dropna().astype(int)
ax = (delta.value_counts()
    .reindex(np.arange(delta.min(), delta.max() + 1), fill_value=0)
    .sort_index()
    .plot(kind='bar', color='k', width=.9, rot=0, figsize=(12, 6))
)
sns.despine()
ax.set(xlabel='Difference in Rest (Home - Away)', ylabel='Games')

[<matplotlib.text.Text at 0x11b2c3850>, <matplotlib.text.Text at 0x11b2b8d90>]

In [14]:
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='rest_spread', y='home_win', data=df.query('-3 <= rest_spread <= 3'),
            color='#4c72b0', ax=ax)
sns.despine()

In [15]:
rest = (tidy.groupby(['date', 'variable'])
            .rest.mean()
            .dropna())
rest.head()

date        variable 
2016-10-27  away_team    0.500000
            home_team    0.500000
2016-10-28  away_team    1.250000
            home_team    1.125000
2016-10-29  away_team    0.777778
Name: rest, dtype: float64

In [16]:
rest.unstack().head()

variable,away_team,home_team
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-10-27,0.5,0.5
2016-10-28,1.25,1.125
2016-10-29,0.777778,1.333333
2016-10-30,0.857143,1.0
2016-10-31,0.75,1.0


In [17]:
rest.unstack().stack()

date        variable 
2016-10-27  away_team    0.500000
            home_team    0.500000
2016-10-28  away_team    1.250000
            home_team    1.125000
2016-10-29  away_team    0.777778
            home_team    1.333333
2016-10-30  away_team    0.857143
            home_team    1.000000
2016-10-31  away_team    0.750000
            home_team    1.000000
dtype: float64

In [18]:
ax=(rest.unstack()
        .query('away_team < 7')
        .rolling(7)
        .mean().plot(figsize=(12, 6), linewidth=3, legend=False))
ax.annotate("Home", (rest.index[-1][0], 1.02), color='g', size=14)
ax.annotate("Away", (rest.index[-1][0], 0.82), color='b', size=14)
sns.despine()

In [19]:
df['home_win'] = df.home_points > df.away_points

In [20]:
wins = (
    pd.melt(df.reset_index(),
            id_vars=['game_id', 'date', 'home_win'],
            value_name='team', var_name='is_home',
            value_vars=['home_team', 'away_team'])
   .assign(win=lambda x: x.home_win == (x.is_home == 'home_team'))
   .groupby(['team', 'is_home'])
   .win
   .agg({'n_wins': 'sum', 'n_games': 'count', 'win_pct': 'mean'})
)
wins.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_games,n_wins,win_pct
team,is_home,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Atlanta Hawks,away_team,1,1.0,1.0
Atlanta Hawks,home_team,2,2.0,1.0
Boston Celtics,away_team,2,1.0,0.5
Boston Celtics,home_team,1,1.0,1.0
Brooklyn Nets,away_team,2,0.0,0.0


In [21]:
win_percent = (
    # Use sum(wins) / sum(games) since I don't
    # know if teams play the same number of games at
    # home as away
    wins.groupby(level='team', as_index=True)
        .apply(lambda x: x.n_wins.sum() / x.n_games.sum())
)
win_percent.head()

team
Atlanta Hawks        1.000000
Boston Celtics       0.666667
Brooklyn Nets        0.250000
Charlotte Hornets    0.666667
Chicago Bulls        1.000000
dtype: float64

In [22]:
df = df.assign(away_strength=df['away_team'].map(win_percent),
               home_strength=df['home_team'].map(win_percent),
               point_diff=df['home_points'] - df['away_points'],
               rest_diff=df['home_rest'] - df['away_rest'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,away_team,away_points,home_team,home_points,away_rest,home_rest,home_win,rest_spread,away_strength,home_strength,point_diff,rest_diff
game_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,,,True,,0.5,1.0,29,
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,,,False,,1.0,0.666667,-29,
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,,,True,,0.333333,0.666667,9,
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,,,True,,0.25,0.666667,5,
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,,,True,,0.0,0.333333,9,


In [23]:
import statsmodels.formula.api as sm

df['home_win'] = df.home_win.astype(int)  # for statsmodels

In [24]:
mod = sm.logit('home_win ~ home_strength + away_strength + home_rest + away_rest', df)
res = mod.fit()
res.summary()

         Current function value: 0.000000
         Iterations: 35




0,1,2,3
Dep. Variable:,home_win,No. Observations:,29.0
Model:,Logit,Df Residuals:,24.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 26 Mar 2017",Pseudo R-squ.:,1.0
Time:,14:13:43,Log-Likelihood:,-2.1714e-07
converged:,False,LL-Null:,-19.248
,,LLR p-value:,8.852e-08

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,38.1177,3.06e+04,0.001,0.999,-5.99e+04 6e+04
home_strength,37.6986,3.24e+04,0.001,0.999,-6.35e+04 6.35e+04
away_strength,-158.7919,6.89e+04,-0.002,0.998,-1.35e+05 1.35e+05
home_rest,6.9403,8013.336,0.001,0.999,-1.57e+04 1.57e+04
away_rest,22.4721,1.4e+04,0.002,0.999,-2.73e+04 2.74e+04
