In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

In [369]:
def get_box_stats(url):
    html = urlopen(url)
    soup = BeautifulSoup(html)
    title = soup.find('title').get_text().split(',')
    date = title[1]+', '+title[2].split('|')[0]
    tables = [a.get('id') for a in soup.find_all('table')]
    frames = []
    team_frames = []
    teams = list(set([t.split('_')[1].upper() for t in tables]))
    for k, table_name in enumerate(tables):
        is_home_team = k >= 2
        team = table_name.split('_')[1].upper()
        table = soup.find('table', attrs= {'id': table_name})
        data_stats = [data_stat.get('data-stat') for data_stat in table.find_all(attrs={'scope': 'col'})]
        cols = [col.get_text() for col in table.find_all(attrs={'scope': 'col'})]
        d = {cols[i]: [stat.getText() for stat in table.find('tbody').find_all(attrs={'data-stat': data_stats[i]}
                                                                              )] for i in range(len(cols))}
        df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in d.items() ]))
        df['Date'] = pd.to_datetime(date)
        df['is_home_team'] = is_home_team
        df['own_team'] = team
        df['opp_team'] = [opp_team for opp_team in teams if opp_team!=team][0]
        df = df.loc[df['Starters']!='Reserves']
        df.loc[0:5, 'is_starter'] = 1
        df.loc[5:, 'is_starter'] = 0
        df.loc[:, 'is_starter'] = df.loc[:, 'is_starter'].astype(int)
        df.rename(columns={'Starters': 'Player'}, inplace=True)
        frames.append(df)
        
        df_team = (
            pd.DataFrame({cols[i]: [table.find('tfoot').find(attrs={'data-stat': data_stats[i]}).get_text()]
                          for i in range(1, len(cols))})
                  )
        
        df_team['Date'] = pd.to_datetime(date)
        df_team['is_home_team'] = is_home_team
        df_team['own_team'] = team
        df_team['opp_team'] = [opp_team for opp_team in teams if opp_team!=team][0]
        team_frames.append(df_team)
    merge_frames = [frames[0:2], frames[2:4]]
    merge_team_frames = [team_frames[0:2], team_frames[2:4]]
    full_frames = []
    full_team_frames = []
    for team_frames in merge_frames:
        cols = list(set(team_frames[1].columns)-set(team_frames[0].columns))+['Player']
        df = team_frames[0].merge(team_frames[1][cols], on='Player')
        full_frames.append(df)
    for team_frame in merge_team_frames:
        cols = list(set(team_frame[1].columns)-set(team_frame[0].columns))+['own_team']
        df = team_frame[0].merge(team_frame[1][cols], on='own_team')
        full_team_frames.append(df)
    df = pd.concat(full_frames)
    df_team = pd.concat(full_team_frames)
    return df, df_team

In [353]:
{col: [table.find('tfoot').find(attrs={'data-stat': data_stats[i]}).get_text()]
                          for i in range(len(cols))}


16

In [349]:
df_team = (
            pd.DataFrame({col: [table.find('tfoot').find(attrs={'data-stat': data_stats[i]}).get_text()]
                          for i in range(len(cols))})
                  )
print(df_team)

    TS%
0  83.4


In [245]:
teams = list(set([t.split('_')[1].upper() for t in tables]))
team = 'PHI'
[opp_team for opp_team in teams if opp_team!=team]

['BOS']

In [256]:
months = [a.get('href') for a in soup.find_all('div', attrs={'class': 'filter'})[0].find_all('a')]
months = ['https://www.basketball-reference.com'+month for month in months]
links = []
for month in months:
    html = urlopen(month)
    soup = BeautifulSoup(html)
    table = soup.find('table', attrs={'id': 'schedule'})
    for game in table.find('tbody').find_all(attrs={'data-stat': 'box_score_text'}):
        if len(game.find_all('a')) > 0:
            link = game.find_all('a')[0].get('href')
            links.append(link)
links = ['https://www.basketball-reference.com'+link for link in links]

In [370]:
frames = []
team_frames = []
for link in links:
    df, df_team = get_box_stats(link)
    frames.append(df)
    team_frames.append(df_team)
df = pd.concat(frames)
df_team = pd.concat(team_frames)

In [375]:
df_team[df_team['own_team']=='DAL']

(82, 38)

## Data Cleansing

In [262]:
df.reset_index(inplace=True, drop=True)

In [263]:
df['MP'] = df['MP'].str.split(':').apply(lambda x: np.float(x[0])+np.float(x[1])/60 if isinstance(x, list) else np.nan)

In [265]:
non_numeric_cols = ['Player', 'Date', 'own_team', 'opp_team']
numeric_cols = [col for col in df.columns if col not in non_numeric_cols]
for col in numeric_cols:
    df.loc[df[col].astype(str) == '', col] = np.nan
    df.loc[:, col] = df.loc[:, col].astype(np.float)

In [285]:
team_stats = df.groupby(['Date', 'own_team', 'opp_team'])[['MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
                                  'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'is_home_team']].sum().reset_index()
team_stats.loc[:, 'is_home_team'] = team_stats.loc[:, 'is_home_team'].astype(bool)
for stat in ['FG', '3P', 'FT']:
    indx = team_stats[stat+'A'] > 0
    team_stats.loc[indx, stat+'%'] = team_stats.loc[indx, stat]/team_stats.loc[indx, stat+'A']

In [376]:
team_stats.groupby(['own_team'])['PTS'].mean()

own_team
ATL    113.341463
BOS    111.373626
BRK    112.195402
CHI    104.939024
CHO    110.743902
CLE    104.475610
DAL    108.865854
DEN    110.447917
DET    106.627907
GSW    117.440000
HOU    113.279570
IND    107.255814
LAC    115.113636
LAL    111.768293
MEM    103.536585
MIA    105.707317
MIL    117.298969
MIN    112.475610
NOP    115.439024
NYK    104.573171
OKC    113.942529
ORL    106.436782
PHI    114.351064
PHO    107.500000
POR    113.846939
SAC    114.182927
SAS    111.000000
TOR    112.696078
UTA    110.919540
WAS    114.024390
Name: PTS, dtype: float64

## Exploration

In [270]:
df[(df['Player'] == 'Stephen Curry') & (df['PTS'].astype(float)/df['MP'].astype(float) < 0.5)].style

Unnamed: 0,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date,is_home_team,own_team,opp_team,is_starter,eFG%,DRB%,DRtg,ORtg,AST%,FTr,STL%,TRB%,TOV%,3PAr,BLK%,ORB%,USG%,TS%
4127,Stephen Curry,26.1,5,14,0.357,0,4,0.0,0,0,,0,1,1,6,0,0,2,4,10,-26,2018-11-08 00:00:00,1,GSW,MIL,1,0.357,4.3,132,81,33.6,0.0,0.0,2.2,12.5,0.286,0,0.0,25.7,0.357
10241,Stephen Curry,32.5333,3,12,0.25,2,8,0.25,2,2,1.0,1,2,3,3,1,0,4,1,10,-16,2018-12-12 00:00:00,1,GSW,TOR,1,0.333,6.6,118,68,12.8,0.167,1.5,5.0,23.7,0.667,0,3.4,23.3,0.388
12478,Stephen Curry,36.8,5,17,0.294,2,8,0.25,3,3,1.0,0,2,2,5,1,0,1,5,15,-9,2018-12-25 00:00:00,1,GSW,LAL,1,0.353,7.2,124,87,20.7,0.176,1.3,3.0,5.2,0.471,0,0.0,21.9,0.409
15035,Stephen Curry,31.9667,5,19,0.263,3,12,0.25,1,1,1.0,1,6,7,14,1,0,1,3,14,36,2019-01-08 00:00:00,1,GSW,NYK,1,0.342,21.0,98,115,50.7,0.053,1.5,11.9,4.9,0.632,0,3.3,26.4,0.36
17388,Stephen Curry,30.1333,3,12,0.25,2,10,0.2,3,3,1.0,1,4,5,12,1,0,0,3,11,27,2019-01-21 00:00:00,0,GSW,LAL,1,0.333,14.5,108,132,42.3,0.25,1.5,8.7,0.0,0.833,0,3.3,18.2,0.413
19472,Stephen Curry,32.85,5,15,0.333,2,9,0.222,2,3,0.667,0,4,4,2,0,0,2,2,14,13,2019-02-02 00:00:00,1,GSW,LAL,1,0.4,11.7,104,80,8.2,0.2,0.0,6.3,10.9,0.6,0,0.0,24.5,0.429
22453,Stephen Curry,35.05,5,18,0.278,4,14,0.286,2,3,0.667,0,5,5,6,4,0,4,1,16,-4,2019-02-25 00:00:00,0,GSW,CHO,1,0.389,15.2,103,87,20.0,0.167,5.4,7.9,17.2,0.778,0,0.0,27.6,0.414
24469,Stephen Curry,34.35,6,16,0.375,4,10,0.4,1,2,0.5,0,7,7,4,0,0,7,4,17,20,2019-03-08 00:00:00,1,GSW,DEN,1,0.5,19.6,108,75,15.3,0.125,0.0,11.0,29.3,0.625,0,0.0,30.3,0.504
29421,Stephen Curry,29.4833,3,14,0.214,1,9,0.111,0,0,,1,9,10,7,1,0,2,1,7,32,2019-04-04 00:00:00,0,GSW,LAL,1,0.25,24.4,83,72,29.9,0.0,1.5,14.4,12.5,0.643,0,3.1,22.8,0.25
31470,Stephen Curry,34.7333,3,14,0.214,1,9,0.111,5,5,1.0,0,10,10,7,3,0,3,4,12,-2,2019-04-21 00:00:00,0,GSW,LAC,1,0.25,32.9,100,88,25.6,0.357,4.3,16.9,15.6,0.643,0,0.0,24.0,0.37


In [275]:
df[df['Player']=='Stephen Curry']['MP'].mean()

34.56877394636013

In [279]:
df[(df['own_team']=='GSW') & (df['Date']==pd.to_datetime('2018-11-08'))]['MP'].sum().round()

240.0

In [354]:
d = dict()
for i in range(1, len(cols)):
    d[cols[i]] = [table.find('tfoot').find(attrs={'data-stat': data_stats[i]}).get_text()]
df_team = pd.DataFrame(d)

In [310]:
table.find('tfoot').find(attrs={'data-stat': 'ts_pct'}).getText()

'.509'

In [355]:
{col: }

Unnamed: 0,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
0,240,0.509,0.49,0.381,0.144,22.6,87.8,53.9,50.0,6.7,8.2,11.9,100.0,100.7,83.4


In [297]:
table = soup.find('table', attrs= {'id': table_name})

In [299]:
data_stats = [data_stat.get('data-stat') for data_stat in table.find_all(attrs={'scope': 'col'})]
cols = [col.get_text() for col in table.find_all(attrs={'scope': 'col'})]
d = {cols[i]: [stat.getText() for stat in table.find('tbody').find_all(attrs={'data-stat': data_stats[i]}
                                                                      )] for i in range(len(cols))}

In [367]:
get_box_stats(url)[1].reset_index().style

    MP  FG FGA   FG% 3P 3PA   3P%  FT FTA   FT%   ...    STL BLK TOV  PF PTS  \
0  240  34  87  .391  5  26  .192  14  23  .609   ...      8   5  16  20  87   

  +/-       Date is_home_team own_team opp_team  
0     2018-10-16        False      PHI      BOS  

[1 rows x 24 columns]
    MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL% BLK%  TOV%   USG%  \
0  240  .448  .420  .299  .264  12.2  77.4  46.1  52.9  7.7  8.3  14.1  100.0   

   ORtg   DRtg       Date  is_home_team own_team opp_team  
0  83.4  100.7 2018-10-16         False      PHI      BOS  
    MP  FG FGA   FG%  3P 3PA   3P%  FT FTA   FT%   ...    STL BLK TOV  PF  \
0  240  42  97  .433  11  37  .297  10  14  .714   ...      7   5  14  20   

   PTS +/-       Date is_home_team own_team opp_team  
0  105     2018-10-16         True      BOS      PHI  

[1 rows x 24 columns]
    MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL% BLK%  TOV%   USG%  \
0  240  .509  .490  .381  .144  22.6  87.8  53.9  50.0  6.7  8.2

Unnamed: 0,index,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date,is_home_team,own_team,opp_team,eFG%,DRB%,DRtg,ORtg,AST%,FTr,STL%,TRB%,TOV%,3PAr,BLK%,ORB%,USG%,TS%
0,0,240,34,87,0.391,5,26,0.192,14,23,0.609,6,41,47,18,8,5,16,20,87,,2018-10-16 00:00:00,False,PHI,BOS,0.42,77.4,100.7,83.4,52.9,0.264,7.7,46.1,14.1,0.299,8.3,12.2,100.0,0.448
1,0,240,42,97,0.433,11,37,0.297,10,14,0.714,12,43,55,21,7,5,14,20,105,,2018-10-16 00:00:00,True,BOS,PHI,0.49,87.8,83.4,100.7,50.0,0.144,6.7,53.9,11.9,0.381,8.2,22.6,100.0,0.509


In [362]:
df.head()

Unnamed: 0,Player,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,AST%,FTr,STL%,TRB%,TOV%,3PAr,BLK%,ORB%,USG%,TS%
0,Ben Simmons,42.75,7.0,14.0,0.5,0.0,0.0,,5.0,10.0,...,34.4,0.714,4.3,16.5,14.0,0.0,3.7,6.9,21.2,0.516
1,Joel Embiid,36.816667,9.0,21.0,0.429,1.0,4.0,0.25,4.0,5.0,...,11.7,0.238,1.3,12.8,17.7,0.19,4.3,5.3,32.5,0.496
2,Robert Covington,34.2,3.0,10.0,0.3,2.0,7.0,0.286,0.0,0.0,...,0.0,0.0,2.7,8.3,16.7,0.7,2.3,2.9,14.9,0.4
3,Markelle Fultz,24.333333,2.0,7.0,0.286,0.0,0.0,,1.0,2.0,...,13.1,0.286,1.9,5.8,27.6,0.0,0.0,0.0,19.0,0.317
4,Dario Saric,22.916667,3.0,8.0,0.375,0.0,4.0,0.0,0.0,0.0,...,7.6,0.0,0.0,12.3,27.3,0.5,0.0,0.0,20.4,0.375
