In [1]:
!pip install nba_api
!pip install beautifulsoup4
!pip install lxml  



# Scraping

## Dependencies & Utils

In [7]:
import pandas as pd
import numpy as np
import pickle
from os import getcwd,makedirs,listdir
from os.path import dirname
from os.path import exists
from nba_api.stats.endpoints import playercareerstats,leaguedashptdefend,leaguedashptstats,leaguedashplayerbiostats,leaguehustlestatsplayer, playerdashptreb,leaguedashplayerstats

path = getcwd()
parent = dirname(path)
DIR_DATA = parent+'/data/'
DIR_RAW_DATA = DIR_DATA+'raw/'
DIR_CLEAN_DATA = DIR_DATA+'clean/'

date_start,date_end = '2010-10-10','2022-08-01',

### Notes about the NBA api
- If season not specified, then it assumes the latest season is the
- If player or team parameter not specified, assumes team is the
- If season_type_all_star not specified, assumes Regular Season
- Data received from the api has a little margin of difference comparing to what is on the official website

In [4]:
player_common_stats_df = leaguedashplayerstats.LeagueDashPlayerStats(
                        season = '2019-20',
                        per_mode_detailed='PerGame',
                        league_id_nullable = '00',
                        season_type_all_star= 'Regular Season').get_data_frames()[0]
player_common_stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 68 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   PLAYER_ID              529 non-null    int64  
 1   PLAYER_NAME            529 non-null    object 
 2   NICKNAME               529 non-null    object 
 3   TEAM_ID                529 non-null    int64  
 4   TEAM_ABBREVIATION      529 non-null    object 
 5   AGE                    529 non-null    float64
 6   GP                     529 non-null    int64  
 7   W                      529 non-null    int64  
 8   L                      529 non-null    int64  
 9   W_PCT                  529 non-null    float64
 10  MIN                    529 non-null    float64
 11  FGM                    529 non-null    float64
 12  FGA                    529 non-null    float64
 13  FG_PCT                 529 non-null    float64
 14  FG3M                   529 non-null    float64
 15  FG3A  

In [5]:
carrer_defend = leaguedashptdefend.LeagueDashPtDefend(
                season = '2020-21',
                per_mode_simple='PerGame',
                defense_category='Overall',
                league_id = '00',
                season_type_all_star = 'Regular Season')

carrer_defend.get_data_frames()[0]

Unnamed: 0,CLOSE_DEF_PERSON_ID,PLAYER_NAME,PLAYER_LAST_TEAM_ID,PLAYER_LAST_TEAM_ABBREVIATION,PLAYER_POSITION,AGE,GP,G,FREQ,D_FGM,D_FGA,D_FG_PCT,NORMAL_FG_PCT,PCT_PLUSMINUS
0,203497,Rudy Gobert,1610612762,UTA,C,29.0,71,71,1.0,9.21,22.30,0.413,0.493,-0.079
1,201572,Brook Lopez,1610612749,MIL,C,33.0,69,69,1.0,8.96,19.80,0.452,0.489,-0.036
2,1627734,Domantas Sabonis,1610612754,IND,F-C,25.0,61,61,1.0,8.54,18.13,0.471,0.488,-0.017
3,203999,Nikola Jokic,1610612743,DEN,C,26.0,71,71,1.0,8.82,18.00,0.490,0.492,-0.002
4,1626167,Myles Turner,1610612754,IND,C-F,25.0,46,46,1.0,8.07,17.85,0.452,0.493,-0.041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,1629033,Theo Pinson,1610612752,NYK,G-F,25.0,12,12,1.0,0.67,1.33,0.500,0.438,0.062
533,1629833,Keljin Blevins,1610612757,POR,G,25.0,14,14,1.0,0.43,1.29,0.333,0.417,-0.084
534,1630203,Grant Riller,1610612766,CHA,G,24.0,5,5,1.0,0.80,1.20,0.667,0.429,0.238
535,1630218,Robert Woodard II,1610612758,SAC,F,21.0,7,7,1.0,0.57,1.14,0.500,0.505,-0.005


In [6]:
df_bio = leaguedashplayerbiostats.LeagueDashPlayerBioStats(
                season = '2020-21',
                per_mode_simple='PerGame',
                league_id = '00',
                season_type_all_star= 'Regular Season').get_data_frames()[0]
df_bio.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,PLAYER_HEIGHT,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,COLLEGE,COUNTRY,...,GP,PTS,REB,AST,NET_RATING,OREB_PCT,DREB_PCT,USG_PCT,TS_PCT,AST_PCT
0,203932,Aaron Gordon,1610612743,DEN,25.0,6-8,80,235,Arizona,USA,...,50,12.4,5.7,3.2,2.1,0.055,0.15,0.204,0.547,0.165
1,1628988,Aaron Holiday,1610612754,IND,24.0,6-0,72,185,UCLA,USA,...,66,7.2,1.3,1.9,-0.2,0.012,0.06,0.189,0.503,0.139
2,1630174,Aaron Nesmith,1610612738,BOS,21.0,6-5,77,215,Vanderbilt,USA,...,46,4.7,2.8,0.5,-0.5,0.041,0.146,0.133,0.573,0.047
3,1627846,Abdel Nader,1610612756,PHX,27.0,6-5,77,225,Iowa State,Egypt,...,24,6.7,2.6,0.8,5.0,0.02,0.151,0.183,0.605,0.078
4,1629690,Adam Mokoka,1610612741,CHI,22.0,6-4,76,190,,France,...,14,1.1,0.4,0.4,-7.1,0.017,0.077,0.171,0.386,0.179


In [7]:
df_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PLAYER_ID             540 non-null    int64  
 1   PLAYER_NAME           540 non-null    object 
 2   TEAM_ID               540 non-null    int64  
 3   TEAM_ABBREVIATION     540 non-null    object 
 4   AGE                   540 non-null    float64
 5   PLAYER_HEIGHT         540 non-null    object 
 6   PLAYER_HEIGHT_INCHES  540 non-null    int64  
 7   PLAYER_WEIGHT         540 non-null    object 
 8   COLLEGE               540 non-null    object 
 9   COUNTRY               540 non-null    object 
 10  DRAFT_YEAR            540 non-null    object 
 11  DRAFT_ROUND           532 non-null    object 
 12  DRAFT_NUMBER          531 non-null    object 
 13  GP                    540 non-null    int64  
 14  PTS                   540 non-null    float64
 15  REB                   5

In [8]:
speed_distance_player = leaguedashptstats.LeagueDashPtStats(
                            season = '2019-20',
                            season_type_all_star = 'Regular Season',
                            per_mode_simple = 'PerGame',
                            player_or_team = 'Player',
                            # month = ,
                            ).get_data_frames()[0]
speed_distance_player.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GP,W,L,MIN,MIN1,DIST_FEET,DIST_MILES,DIST_MILES_OFF,DIST_MILES_DEF,AVG_SPEED,AVG_SPEED_OFF,AVG_SPEED_DEF
0,203932,Aaron Gordon,1610612753,ORL,62,30,32,32.53,32.53,12820.23,2.43,1.34,1.08,4.2,4.65,3.76
1,1628988,Aaron Holiday,1610612754,IND,66,42,24,24.5,24.5,9661.73,1.83,0.96,0.87,4.2,4.42,3.98
2,1627846,Abdel Nader,1610612760,OKC,55,37,18,15.76,15.76,6444.6,1.22,0.64,0.58,4.32,4.55,4.09
3,1629690,Adam Mokoka,1610612741,CHI,11,3,8,10.15,10.15,4173.82,0.79,0.41,0.38,4.24,4.44,4.06
4,1629678,Admiral Schofield,1610612764,WAS,33,9,24,11.16,11.16,4341.64,0.82,0.44,0.38,4.19,4.59,3.8


In [9]:
hustle_player = leaguehustlestatsplayer.LeagueHustleStatsPlayer(
                            season = '2019-20',
                            season_type_all_star = 'Regular Season',
                            per_mode_time = 'PerGame',
                            # month = ,
                            )

hustle_player.get_data_frames()[0].head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,G,MIN,CONTESTED_SHOTS,CONTESTED_SHOTS_2PT,CONTESTED_SHOTS_3PT,...,PCT_LOOSE_BALLS_RECOVERED_DEF,OFF_BOXOUTS,DEF_BOXOUTS,BOX_OUTS,BOX_OUT_PLAYER_TEAM_REBS,BOX_OUT_PLAYER_REBS,PCT_BOX_OUTS_OFF,PCT_BOX_OUTS_DEF,PCT_BOX_OUTS_TEAM_REB,PCT_BOX_OUTS_REB
0,203932,Aaron Gordon,1610612753,ORL,24.0,62,32.5,5.32,3.37,1.95,...,0.507,0.23,0.85,1.08,0.63,0.34,0.209,0.791,0.929,0.5
1,1628988,Aaron Holiday,1610612754,IND,23.0,66,24.5,6.24,2.95,3.29,...,0.522,0.03,1.0,1.03,0.62,0.08,0.029,0.971,0.719,0.088
2,1627846,Abdel Nader,1610612760,OKC,26.0,55,15.8,4.05,2.31,1.75,...,0.586,0.0,0.78,0.78,0.38,0.11,0.0,1.0,0.75,0.214
3,1629690,Adam Mokoka,1610612741,CHI,21.0,11,10.2,1.27,0.36,0.91,...,0.333,0.27,0.55,0.82,0.36,0.18,0.333,0.667,1.0,0.5
4,1629678,Admiral Schofield,1610612764,WAS,23.0,33,11.2,2.48,1.55,0.94,...,0.444,0.03,1.12,1.15,0.45,0.12,0.026,0.974,0.833,0.222


In [10]:
hustle_player.get_data_frames()[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   PLAYER_ID                      529 non-null    int64  
 1   PLAYER_NAME                    529 non-null    object 
 2   TEAM_ID                        529 non-null    int64  
 3   TEAM_ABBREVIATION              529 non-null    object 
 4   AGE                            529 non-null    float64
 5   G                              529 non-null    int64  
 6   MIN                            529 non-null    float64
 7   CONTESTED_SHOTS                529 non-null    float64
 8   CONTESTED_SHOTS_2PT            529 non-null    float64
 9   CONTESTED_SHOTS_3PT            529 non-null    float64
 10  DEFLECTIONS                    529 non-null    float64
 11  CHARGES_DRAWN                  529 non-null    float64
 12  SCREEN_ASSISTS                 529 non-null    flo

In [11]:
reb_player = playerdashptreb.PlayerDashPtReb(
                            season = '2019-20',
                            season_type_all_star = 'Regular Season',
                            per_mode_simple = 'PerGame',
                            player_id= '203932',
                            team_id = '1610612753'
                            )
reb_player.get_data_frames()[2]

Unnamed: 0,PLAYER_ID,PLAYER_NAME_LAST_FIRST,SORT_ORDER,G,REB_NUM_CONTESTING_RANGE,REB_FREQUENCY,OREB,DREB,REB,C_OREB,C_DREB,C_REB,C_REB_PCT,UC_OREB,UC_DREB,UC_REB,UC_REB_PCT
0,203932,"Gordon, Aaron",1,62,0 Contesting Rebounders,0.669,0.55,4.58,5.13,0.0,0.0,0.0,0.0,0.55,4.58,5.13,1.0
1,203932,"Gordon, Aaron",2,62,1 Contesting Rebounder,0.286,0.92,1.27,2.19,0.92,1.27,2.19,1.0,0.0,0.0,0.0,0.0
2,203932,"Gordon, Aaron",3,62,2+ Contesting Rebounders,0.04,0.24,0.06,0.31,0.24,0.06,0.31,1.0,0.0,0.0,0.0,0.0


In [12]:
reb_player.get_data_frames()[1]

Unnamed: 0,PLAYER_ID,PLAYER_NAME_LAST_FIRST,SORT_ORDER,G,SHOT_TYPE_RANGE,REB_FREQUENCY,OREB,DREB,REB,C_OREB,C_DREB,C_REB,C_REB_PCT,UC_OREB,UC_DREB,UC_REB,UC_REB_PCT
0,203932,"Gordon, Aaron",2,62,Miss 2FG,0.528,1.29,2.76,4.05,0.97,0.71,1.68,0.414,0.32,2.05,2.37,0.586
1,203932,"Gordon, Aaron",3,62,Miss 3FG,0.417,0.4,2.79,3.19,0.19,0.63,0.82,0.258,0.21,2.16,2.37,0.742


In [13]:
reb_player.get_data_frames()[0]

Unnamed: 0,PLAYER_ID,PLAYER_NAME_LAST_FIRST,G,OVERALL,REB_FREQUENCY,OREB,DREB,REB,C_OREB,C_DREB,C_REB,C_REB_PCT,UC_OREB,UC_DREB,UC_REB,UC_REB_PCT
0,203932,"Gordon, Aaron",62,Overall,0.996,1.71,5.92,7.63,1.16,1.34,2.5,0.328,0.55,4.58,5.13,0.672


## NBA API Data

**Observations**
- Some endpoints have **month** parameter others don't
- Existent duplicate record (e.g 201147) 
- Difference in height won't make a difference, most of the players maintain their height since it's a biological thing and its out of the control of athletes
- Difficulty in joining data in a interval of time beacuse new players come in every season as well some retire

### Scraping

Data scraped from the nba stats official site using the nba_api package:

- [LeagueDashPlayerBioStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashplayerbiostats.md)
    - Season - Year of the season
    - GP - Games played in segment
    - PLAYER_HEIGHT_INCHES- player height in inches
    - PLAYER_WEIGHT- player weight in pounds
    - AGE- age of player
- [LeagueDashPtDefend](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashptdefend.md)
    - D_FG_A: The number of opponents shots attempted when a player or team is defending the shot
- [LeagueDashPtStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashptstats.md)
    - DIST_MILES: Distance run by a player or team measured in miles
    - DIST_MILES_OFF: in offense
    - DIST_MILES_DEF: in defense
    - AVG_SPEED: speed of how many miles the player can run per second
    - AVG_SPEED_OFF: in offense
    - AVG_SPEED_DEF: in defense
- [LeagueHustleStatsPlayer](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguehustlestatsplayer.md)
    - CONTESTED_SHOTS:
    - CONTESTED_SHOTS_2
    - CONTESTED_SHOT_3
    - DEFLECTIONS: The number of times a defensive player or team gets their hand on the ball on a non-shot attempt
    - CHARGES_DRAWN: The number of times a defensive player or team draws a charge
- [LeagueDashPlayerStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashplayerstats.md)
    - MIN
    - REB
    - OREB
    - DREB
    - BLK: A block occurs when an offensive player attempts a shot, and the defense player tips the ball, blocking their chance to score
    - PF: The number of personal fouls a player or team committed
    - PFD: The number of personal fouls that are drawn by a player or team
- POST UP PLAYS NUMBER???

Problems:
- playerstatsreb can only done player by player threfore requesting multiple times the API will crash

NBA stats glossary source [here](https://www.nba.com/stats/help/glossary#dfgm)

In [17]:
cols_dist = ['DIST_MILES','DIST_MILES_OFF','DIST_MILES_DEF']
cols_speed = ['AVG_SPEED','AVG_SPEED_OFF','AVG_SPEED_DEF']
cols_contested  = ['CONTESTED_SHOTS','CONTESTED_SHOTS_2PT','CONTESTED_SHOTS_3PT']
cols_boxouts = ['BOX_OUTS','OFF_BOXOUTS','DEF_BOXOUTS']
cols_defend = ['D_FG_PCT','D_FGA']
cols_blocks = ['BLK']
cols_fouls = ['PF','PFD']
cols_rebound = ['REB','OREB','DREB']

def scrape_season(year,season_format,league_id,season_type,per_mode):

    if season_type == 'regular':
        season_type_format = 'Regular Season'
    elif season_type == 'post':
        season_type_format = 'Playoffs'
    elif season_type == 'all-star': 
        season_type_format = 'All Star'
    elif season_type == 'pre':
        season_type_format = 'Pre Season'
    
    if per_mode == 'total':
        per_mode_format = 'Totals'
    elif per_mode == 'game':
        per_mode_format = 'PerGame'

    
    player_stats_df = leaguedashplayerbiostats.LeagueDashPlayerBioStats(
                season = season_format,
                per_mode_simple=per_mode_format,
                league_id = league_id,
                season_type_all_star= season_type_format).get_data_frames()[0]

    player_stats_df = player_stats_df.drop(columns = ['PLAYER_HEIGHT','COLLEGE','COUNTRY','DRAFT_ROUND','DRAFT_NUMBER','DRAFT_YEAR','OREB_PCT',
                                                    'DREB_PCT','USG_PCT','TS_PCT','AST_PCT','PTS','REB','AST','NET_RATING'])

    player_common_stats_df = leaguedashplayerstats.LeagueDashPlayerStats(
                        season = season_format,
                        per_mode_detailed=per_mode_format,
                        league_id_nullable = league_id,
                        season_type_all_star= season_type_format).get_data_frames()[0]

    cols_to_merge = ['PLAYER_ID','MIN']+cols_rebound+cols_blocks + cols_fouls
    player_stats_df = player_stats_df.merge(player_common_stats_df[cols_to_merge],on=['PLAYER_ID'])

    # defend_df = leaguedashptdefend.LeagueDashPtDefend(
    #             season = season_format,
    #             per_mode_simple=per_mode_format,
    #             defense_category='Overall',
    #             league_id = league_id,
    #             season_type_all_star = season_type_format).get_data_frames()[0]

    # defend_df = defend_df.drop(columns = ['FREQ','G','PCT_PLUSMINUS','NORMAL_FG_PCT'])
    # defend_df = defend_df.rename(columns = {'CLOSE_DEF_PERSON_ID':'PLAYER_ID'})
    
    # cols_to_merge = ['PLAYER_ID'] + cols_defend

    # player_stats_df = player_stats_df.merge(defend_df[cols_to_merge],on=['PLAYER_ID'])

    speed_distance_data = leaguedashptstats.LeagueDashPtStats(
                            season = season_format,
                            season_type_all_star = season_type_format,
                            per_mode_simple = per_mode_format,
                            player_or_team = 'Player',
                            ).get_data_frames()[0]
    cols_to_merge = ['PLAYER_ID']+cols_dist+cols_speed

    player_stats_df = player_stats_df.merge(speed_distance_data[cols_to_merge],on=['PLAYER_ID'])
    
    touches_data = 0
    
    hustle_data = leaguehustlestatsplayer.LeagueHustleStatsPlayer(
                            season = season_format,
                            season_type_all_star = season_type_format,
                            league_id_nullable = league_id,
                            per_mode_time = per_mode_format,
                            ).get_data_frames()[0]
    

    cols_to_merge = ['PLAYER_ID'] + cols_contested+cols_boxouts+['DEFLECTIONS','CHARGES_DRAWN']

    player_stats_df = player_stats_df.merge(hustle_data[cols_to_merge],on=['PLAYER_ID'])


    #drop empty rows (empty rows exist due to table formatting, not missing data)
    player_stats_df.dropna(subset = ['PLAYER_NAME'], inplace = True)
    
    #there are cases of 2 repeated duplicates
    #optional function do apply and correct the repeated columns
    player_stats_df = player_stats_df.drop_duplicates(subset='PLAYER_ID', keep="first")
    #add a column to indicate if stats are for regular season or playoffs
    player_stats_df.insert(1, "Season", season_type)
    player_stats_df.insert(0, "Year", [year]*(len(player_stats_df.index)))
    return player_stats_df


def scrape_stats_history(start,end,league_id,season_segment):
    
    num_equals = 0
    num_duplicate_index = 0
    year_s,month_s,day_s = start.split("-")

    year_e,month_e,day_e = end.split("-")
    
    year_s = int(year_s)
    year_e = int(year_e)
    # month_s = int(month_s)
    # month_e = int(month_e)
    # day_s = int(day_s)
    # day_e = int(day_e)    
    year_gap = year_e - year_s
    year_list = list(range(year_s,year_e))

    season_format_dict = {i: f'{i}-{(i%2000)+1}' for i in year_list}

    all_player_stats_df = pd.DataFrame()
    
    for year in year_list:

        reg_season_df = scrape_season(year,season_format_dict[year], league_id,'regular','game')
        reg_season_df = reg_season_df.drop_duplicates(subset = ['PLAYER_ID'])
        all_player_stats_df=pd.concat([all_player_stats_df,reg_season_df], ignore_index=True)

        post_season_df = scrape_season(year,season_format_dict[year], league_id,'post','game')
        post_season_df = post_season_df.drop_duplicates(subset = ['PLAYER_ID'])
        all_player_stats_df=pd.concat([all_player_stats_df,post_season_df], ignore_index=True)    

        print('Scraped {} regular and post season player stats'.format(year))

    all_player_stats_df.to_csv(DIR_RAW_DATA+'players_stats.csv',index= False)
    print('Scraped data saved into csv file')


In [18]:
scrape_stats_history(date_start,date_end,'00','game')

Scraped 2010 regular and post season player stats
Scraped 2011 regular and post season player stats
Scraped 2012 regular and post season player stats
Scraped 2013 regular and post season player stats
Scraped 2014 regular and post season player stats
Scraped 2015 regular and post season player stats
Scraped 2016 regular and post season player stats
Scraped 2017 regular and post season player stats
Scraped 2018 regular and post season player stats
Scraped 2019 regular and post season player stats
Scraped 2020 regular and post season player stats
Scraped 2021 regular and post season player stats
Scraped data saved into csv file


In [145]:
players_stats_df = pd.read_csv(DIR_RAW_DATA+'players_stats.csv')

Unnamed: 0,Year,PLAYER_NAME,Season,TEAM_ID,TEAM_ABBREVIATION,AGE,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,GP,MIN,...,AVG_SPEED_DEF,PLAYER_ID,CONTESTED_SHOTS,CONTESTED_SHOTS_2PT,CONTESTED_SHOTS_3PT,BOX_OUTS,OFF_BOXOUTS,DEF_BOXOUTS,DEFLECTIONS,CHARGES_DRAWN
4270,2019.0,Aaron Gordon,regular,1610612753,ORL,24.0,80.0,235.0,62,32.5,...,3.76,203932,5.32,3.37,1.95,1.08,0.23,0.85,1.02,0.02
4271,2019.0,Aaron Holiday,regular,1610612754,IND,23.0,72.0,185.0,66,24.5,...,3.98,1628988,6.24,2.95,3.29,1.03,0.03,1.00,1.68,0.05
4272,2019.0,Abdel Nader,regular,1610612760,OKC,26.0,77.0,225.0,55,15.8,...,4.09,1627846,4.05,2.31,1.75,0.78,0.00,0.78,0.98,0.00
4273,2019.0,Adam Mokoka,regular,1610612741,CHI,21.0,77.0,190.0,11,10.2,...,4.06,1629690,1.27,0.36,0.91,0.82,0.27,0.55,0.91,0.00
4274,2019.0,Admiral Schofield,regular,1610612764,WAS,23.0,77.0,241.0,33,11.2,...,3.80,1629678,2.48,1.55,0.94,1.15,0.03,1.12,0.45,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,2019.0,Victor Oladipo,post,1610612754,IND,28.0,76.0,213.0,4,30.6,...,3.86,203506,6.50,2.00,4.50,0.50,0.00,0.50,2.00,0.25
5012,2019.0,Vincent Poirier,post,1610612738,BOS,26.0,84.0,235.0,1,1.8,...,0.00,1629738,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
5013,2019.0,Wenyen Gabriel,post,1610612757,POR,23.0,81.0,205.0,4,13.3,...,3.62,1629117,6.75,5.25,1.50,2.50,0.75,1.75,0.50,0.00
5014,2019.0,Wes Iwundu,post,1610612753,ORL,25.0,78.0,195.0,5,15.2,...,4.07,1628411,2.40,1.20,1.20,0.20,0.00,0.20,0.20,0.20


In [146]:
players_stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6617 entries, 0 to 6616
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  6617 non-null   float64
 1   PLAYER_NAME           6617 non-null   object 
 2   Season                6617 non-null   object 
 3   TEAM_ID               6617 non-null   int64  
 4   TEAM_ABBREVIATION     6617 non-null   object 
 5   AGE                   6617 non-null   float64
 6   PLAYER_HEIGHT_INCHES  6612 non-null   float64
 7   PLAYER_WEIGHT         6612 non-null   float64
 8   GP                    6617 non-null   int64  
 9   MIN                   6617 non-null   float64
 10  REB                   6617 non-null   float64
 11  OREB                  6617 non-null   float64
 12  DREB                  6617 non-null   float64
 13  BLK                   6617 non-null   float64
 14  PF                    6617 non-null   float64
 15  PFD                  

## Inactive List

In [3]:
import time
from bs4 import BeautifulSoup as bs
from requests import get

headers = ({
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) \
         AppleWebKit/537.36 (KHTML, like Gecko) \
         Chrome/41.0.2228.0 Safari/537.36'
})

maxPages = 817

def configScraper(pageNum):
    url = f"http://prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={date_start}&EndDate={date_end}&ILChkBx=yes&Submit=Search&start={pageNum}"
    response = get(url, headers=headers)
    soup = bs(response.text, 'html.parser')
    return soup

def scrapeInactiveList(numPages):
    inactive_list_scraped_df = pd.DataFrame(columns = ['Date','Team','Acquired','Relinquished','Notes'])

    for i in range(0,(maxPages-1)*25+1,25):
        soup = configScraper(i)
        table = soup.find_all('table',class_='datatable center')
        table_rows = table[0].find_all('tr')
        for k in range(1,len(table_rows)):
            data = table_rows[k].find_all('td')
            row_data = [data[0].text.strip(),data[1].text.strip(),data[2].text[2:].strip(),data[3].text[2:].strip(),data[4].text.strip()]
            inactive_list_scraped_df.loc[len(inactive_list_scraped_df.index)] = row_data
        print(f'Scraped page {int(i/25+1)}')
        
    inactive_list_scraped_df.to_csv(DIR_RAW_DATA+'inactive_list_scraped.csv',index = False)

In [None]:
scrapeInactiveList(maxPages)

## Team Schedule

**Observations**
- The season year referes to the year where that season finishes, therefore if a season year is 2018 then it's refering to the season of 2017-18

In [41]:
#seasons schedules to scrape
season_list = ['2014','2015','2016','2017','2018','2019','2020','2021','2022']

#NBA teams to scrape (this dictionary is valid (complete) for 2009-2019 seasons)
team_dict = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHA': 'Charlotte Bobcats',
    'CHI': 'Chicago Bulls',
    'CHO': 'Charlotte Hornets',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NJN': 'New Jersey Nets',
    'NOH': 'New Orleans Hornets',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trailblazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'   
}

#teams that moved or otherwise had a name change - need to handle these teams separately (this dictionary is valid for 2009-2019)
teams_relocate_rename_dict = {
    # 'BRK': ['2014', '2015', '2016', '2017', '2018', '2019', '2020','2021','2022'],
    'CHA': ['2014','2015'],
    'CHO': ['2016', '2017', '2018', '2019', '2020','2021','2022'],
    # 'NJN': ['2011', '2012', '2013'],
    'NOH': ['2014'],
    'NOP': ['2015', '2016', '2017', '2018', '2019', '2020','2021','2022'],
}

headers = ({
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) \
         AppleWebKit/537.36 (KHTML, like Gecko) \
         Chrome/41.0.2228.0 Safari/537.36'
})

def scrape_team_season_schedule(team_abrv,year,team_dict):
    
    """
    This function scrapes season schedules. It has three inputs: team_abrv (a string; i.e. 'POR'),a year (a string; i.e. '2017'),
    and a dictionary containing team names; Output is a dataframe with schedule information.
    """
    #website URL to scrape 
    url = "https://www.basketball-reference.com/teams/{}/{}_games.html". format(team_abrv,year)
    response = get(url, headers=headers)
    soup = bs(response.text, 'html.parser')
    
    # use findALL() to get the column headers
    soup.findAll('tr', limit=1)

    #find all rows in table
    rows = soup.findAll('tr')
    sched_data = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
    #create a panda frame 
    sched_df = pd.DataFrame(sched_data)

    #drop columns that aren't needed (keeping date, home/away, opponent, OT info)
    sched_df.drop(columns = [1,2,3,6,8,9,10,11,12,13], inplace = True)
    
    #add column headers
    sched_df.columns = ['Date','Away_flag','Opponent','OT_flag']

    #drop empty rows (empty rows exist due to table formatting, not missing data)
    sched_df.dropna(subset = ['Date'], inplace = True)

    #add a column indicating the game number for a given season
    sched_df.reset_index(inplace = True)
    sched_df['Game_num'] = sched_df.index + 1

    #add a column indicating the team
    sched_df['Team'] = team_dict[team_abrv]

    #add a column indicating the year in which the season begins
    sched_df['Year'] = int(year)-1
    
    #reorder columns
    sched_df = sched_df[['Team','Year', 'Game_num','Date','Away_flag','Opponent','OT_flag']]

    return sched_df
    
def scrape_schedule(season_list,team_dict,teams_relocate_rename_dict):
    all_teams_sched_df = pd.DataFrame()
    for team in team_dict:
        
        team_sched_df = pd.DataFrame(columns = ['Team','Year','Game_num','Date','Away_flag','Opponent','OT_flag']) #create empty dataframe with column headers
        
        if team not in teams_relocate_rename_dict: #for those teams that didn't (a) change cities, or (b) otherwise have a name change
            for year in season_list:
                single_season_df = scrape_team_season_schedule(team, year,team_dict)
                team_sched_df=pd.concat([team_sched_df,single_season_df], ignore_index=True)
                print('Scraped {} {} game schedule'.format(team,year))
                #Add a pause to keep web server happy
                time.sleep(1)
        
            print('Scraped multi-season schedule - {} game schedule {} - {}'.format(team, season_list[0], season_list[-1]))

        else:    
            for year in teams_relocate_rename_dict[team]: #for those team that either moved or otherwise had a name change
                single_season_df = scrape_team_season_schedule(team, year,team_dict)
                team_sched_df=pd.concat([team_sched_df,single_season_df], ignore_index=True)
                print('Scraped {} {} game schedule'.format(team,year))
                #Add a pause to keep web server happy
                time.sleep(1)
            
            print('Scraped multi-season schedule - {} game schedule {} - {}'.format(team, season_list[0], season_list[-1]))

    #append 'master schedule' data frame with team's schedule
        all_teams_sched_df = pd.concat([all_teams_sched_df, team_sched_df], ignore_index=True)   

    year_s = int(season_list[0])-1
    year_e = int(season_list[-1])-1

    all_teams_sched_df.to_csv(DIR_RAW_DATA+'all_teams_schedule_{}_{}.csv'.format(str(year_s), str(year_e)))

    return all_teams_sched_df

In [42]:
teams_sched_df = scrape_schedule(season_list,team_dict,teams_relocate_rename_dict)
teams_sched_df.head()

Scraped ATL 2014 game schedule
Scraped ATL 2015 game schedule
Scraped ATL 2016 game schedule
Scraped ATL 2017 game schedule
Scraped ATL 2018 game schedule
Scraped ATL 2019 game schedule
Scraped ATL 2020 game schedule
Scraped ATL 2021 game schedule
Scraped ATL 2022 game schedule
Scraped multi-season schedule - ATL game schedule 2014 - 2022
Scraped BOS 2014 game schedule
Scraped BOS 2015 game schedule
Scraped BOS 2016 game schedule
Scraped BOS 2017 game schedule
Scraped BOS 2018 game schedule
Scraped BOS 2019 game schedule
Scraped BOS 2020 game schedule
Scraped BOS 2021 game schedule
Scraped BOS 2022 game schedule
Scraped multi-season schedule - BOS game schedule 2014 - 2022
Scraped BRK 2014 game schedule
Scraped BRK 2015 game schedule
Scraped BRK 2016 game schedule
Scraped BRK 2017 game schedule
Scraped BRK 2018 game schedule
Scraped BRK 2019 game schedule
Scraped BRK 2020 game schedule
Scraped BRK 2021 game schedule
Scraped BRK 2022 game schedule
Scraped multi-season schedule - BRK gam

KeyError: '[1, 2, 3, 6, 8, 9, 10, 11, 12, 13] not found in axis'