In [1]:
# !pip install nba_api
# !pip install beautifulsoup4
# !pip install lxml  

# Scraping

## Dependencies & Utils

In [2]:
import pandas as pd
import numpy as np
import pickle
from os import getcwd,makedirs,listdir
from os.path import dirname
from os.path import exists
from nba_api.stats.endpoints import playercareerstats,leaguedashptdefend,leaguedashptstats,leaguedashplayerbiostats,leaguehustlestatsplayer, playerdashptreb,leaguedashplayerstats

# Data folders are resolved relative to the parent of the current working directory.
path = getcwd()
parent = dirname(path)
DIR_DATA = f'{parent}/data/'
DIR_RAW_DATA = f'{DIR_DATA}raw/'
DIR_CLEAN_DATA = f'{DIR_DATA}clean/'

# Date window (YYYY-MM-DD strings) used by the scrapers in this notebook.
date_start = '2010-10-10'
date_end = '2022-08-01'

### Notes about the NBA api
- If the season is not specified, the latest season is assumed
- If the player or team parameter is not specified, team is assumed
- If season_type_all_star is not specified, Regular Season is assumed
- Data received from the API differs by a small margin from what is shown on the official website

In [3]:
# Sample request: league-wide per-game player stats for the 2019-20 regular season.
common_stats_endpoint = leaguedashplayerstats.LeagueDashPlayerStats(
    season='2019-20',
    per_mode_detailed='PerGame',
    league_id_nullable='00',
    season_type_all_star='Regular Season',
)
player_common_stats_df = common_stats_endpoint.get_data_frames()[0]
player_common_stats_df.info()

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [None]:
# Sample request: overall defensive tracking numbers, 2020-21 regular season, per game.
defend_request_params = {
    'season': '2020-21',
    'per_mode_simple': 'PerGame',
    'defense_category': 'Overall',
    'league_id': '00',
    'season_type_all_star': 'Regular Season',
}
carrer_defend = leaguedashptdefend.LeagueDashPtDefend(**defend_request_params)

# Show the first result table returned by the endpoint.
carrer_defend.get_data_frames()[0]

In [None]:
# Sample request: player bio stats for the 2020-21 regular season, per game.
bio_endpoint = leaguedashplayerbiostats.LeagueDashPlayerBioStats(
    season='2020-21',
    per_mode_simple='PerGame',
    league_id='00',
    season_type_all_star='Regular Season',
)
df_bio = bio_endpoint.get_data_frames()[0]
df_bio.head()

In [None]:
# Summarise the bio stats frame (columns, dtypes, non-null counts).
df_bio.info()

In [None]:
# Sample request: player-level tracking stats for the 2019-20 regular season, per game.
# The endpoint's month filter is intentionally left unset here.
speed_distance_endpoint = leaguedashptstats.LeagueDashPtStats(
    season='2019-20',
    season_type_all_star='Regular Season',
    per_mode_simple='PerGame',
    player_or_team='Player',
)
speed_distance_player = speed_distance_endpoint.get_data_frames()[0]
speed_distance_player.head()

In [None]:
# Sample request: player hustle stats for the 2019-20 regular season, per game.
# The endpoint's month filter is intentionally left unset here.
hustle_request_params = {
    'season': '2019-20',
    'season_type_all_star': 'Regular Season',
    'per_mode_time': 'PerGame',
}
hustle_player = leaguehustlestatsplayer.LeagueHustleStatsPlayer(**hustle_request_params)

# Preview the first result table.
hustle_player.get_data_frames()[0].head()

In [None]:
# Summarise the first hustle result table (columns, dtypes, non-null counts).
hustle_player.get_data_frames()[0].info()

In [None]:
# Sample request: rebounding dashboard for a single player/team pair
# (2019-20 regular season, per game). This endpoint is queried one player at a time.
reb_request_params = {
    'season': '2019-20',
    'season_type_all_star': 'Regular Season',
    'per_mode_simple': 'PerGame',
    'player_id': '203932',
    'team_id': '1610612753',
}
reb_player = playerdashptreb.PlayerDashPtReb(**reb_request_params)

# Third result table returned by the endpoint.
reb_player.get_data_frames()[2]

In [None]:
# Second result table returned by the rebounding endpoint.
reb_player.get_data_frames()[1]

In [None]:
# First result table returned by the rebounding endpoint.
reb_player.get_data_frames()[0]

## NBA API Data

**Observations**
- Some endpoints have **month** parameter others don't
- Duplicate records exist (e.g. player 201147)
- Differences in height are not meaningful over time: most players keep the same height, since it is a biological trait outside the athlete's control
- It is difficult to join data over an interval of time because new players come in every season while others retire

### Scraping

Data scraped from the nba stats official site using the nba_api package:

- [LeagueDashPlayerBioStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashplayerbiostats.md)
    - Season - Year of the season
    - GP - Games played in segment
    - PLAYER_HEIGHT_INCHES- player height in inches
    - PLAYER_WEIGHT- player weight in pounds
    - AGE- age of player
- [LeagueDashPtDefend](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashptdefend.md)
    - D_FGA: The number of opponent shots attempted while a player or team is defending the shot
- [LeagueDashPtStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashptstats.md)
    - DIST_MILES: Distance run by a player or team measured in miles
    - DIST_MILES_OFF: in offense
    - DIST_MILES_DEF: in defense
    - AVG_SPEED: average speed of a player's movement while on the court, measured in miles per hour
    - AVG_SPEED_OFF: in offense
    - AVG_SPEED_DEF: in defense
- [LeagueHustleStatsPlayer](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguehustlestatsplayer.md)
    - CONTESTED_SHOTS: total contested shots
    - CONTESTED_SHOTS_2PT: contested 2-point shots
    - CONTESTED_SHOTS_3PT: contested 3-point shots
    - DEFLECTIONS: The number of times a defensive player or team gets their hand on the ball on a non-shot attempt
    - CHARGES_DRAWN: The number of times a defensive player or team draws a charge
- [LeagueDashPlayerStats](https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashplayerstats.md)
    - MIN
    - REB
    - OREB
    - DREB
    - BLK: A block occurs when an offensive player attempts a shot, and the defense player tips the ball, blocking their chance to score
    - PF: The number of personal fouls a player or team committed
    - PFD: The number of personal fouls that are drawn by a player or team
- POST UP PLAYS NUMBER???

Problems:
- The player rebounding endpoint (PlayerDashPtReb) can only be queried player by player; therefore, requesting it many times in a row makes the API fail

NBA stats glossary source [here](https://www.nba.com/stats/help/glossary#dfgm)

In [None]:
# Column groups pulled from each endpoint and merged into the per-player table.

# Speed & distance tracking columns
cols_dist = [
    'DIST_MILES',
    'DIST_MILES_OFF',
    'DIST_MILES_DEF',
]
cols_speed = [
    'AVG_SPEED',
    'AVG_SPEED_OFF',
    'AVG_SPEED_DEF',
]

# Hustle columns
cols_contested = [
    'CONTESTED_SHOTS',
    'CONTESTED_SHOTS_2PT',
    'CONTESTED_SHOTS_3PT',
]
cols_boxouts = [
    'BOX_OUTS',
    'OFF_BOXOUTS',
    'DEF_BOXOUTS',
]

# Defensive tracking columns
cols_defend = ['D_FG_PCT', 'D_FGA']

# General player stats columns
cols_blocks = ['BLK']
cols_fouls = ['PF', 'PFD']
cols_rebound = ['REB', 'OREB', 'DREB']

def scrape_season(year,season_format,league_id,season_type,per_mode):
    """
    Build one row per player for a single season and season type from the NBA stats API.

    Four nba_api endpoints are queried (player bio stats, general player stats,
    speed/distance tracking and hustle stats) and joined on PLAYER_ID. The joins
    use the pandas default (inner), so a player that is missing from any of the
    endpoints is not present in the result.

    Parameters
    ----------
    year : int
        Value written to the output 'Year' column.
    season_format : str
        Season string passed to the API, e.g. '2019-20'.
    league_id : str
        League id passed to the API, e.g. '00'.
    season_type : str
        One of 'regular', 'post', 'all-star', 'pre'.
    per_mode : str
        One of 'total', 'game'.

    Returns
    -------
    pandas.DataFrame
        One row per PLAYER_ID, with 'Year' as the first column and a 'Season'
        column holding the `season_type` argument.

    Raises
    ------
    ValueError
        If `season_type` or `per_mode` is not one of the supported values.
    """
    season_type_labels = {
        'regular': 'Regular Season',
        'post': 'Playoffs',
        'all-star': 'All Star',
        'pre': 'Pre Season',
    }
    per_mode_labels = {
        'total': 'Totals',
        'game': 'PerGame',
    }

    # Validate up front: previously an unknown value left the format variable
    # unassigned and only failed later with an UnboundLocalError.
    if season_type not in season_type_labels:
        raise ValueError(
            'Unknown season_type {!r}; expected one of {}'.format(season_type, sorted(season_type_labels)))
    if per_mode not in per_mode_labels:
        raise ValueError(
            'Unknown per_mode {!r}; expected one of {}'.format(per_mode, sorted(per_mode_labels)))

    season_type_format = season_type_labels[season_type]
    per_mode_format = per_mode_labels[per_mode]

    # Base table: player bio stats, minus the columns that are not used downstream.
    player_stats_df = leaguedashplayerbiostats.LeagueDashPlayerBioStats(
                season = season_format,
                per_mode_simple=per_mode_format,
                league_id = league_id,
                season_type_all_star= season_type_format).get_data_frames()[0]

    player_stats_df = player_stats_df.drop(columns = ['PLAYER_HEIGHT','COLLEGE','COUNTRY','DRAFT_ROUND','DRAFT_NUMBER','DRAFT_YEAR','OREB_PCT',
                                                    'DREB_PCT','USG_PCT','TS_PCT','AST_PCT','PTS','REB','AST','NET_RATING'])

    # General player stats: minutes, rebounds, blocks and fouls.
    player_common_stats_df = leaguedashplayerstats.LeagueDashPlayerStats(
                        season = season_format,
                        per_mode_detailed=per_mode_format,
                        league_id_nullable = league_id,
                        season_type_all_star= season_type_format).get_data_frames()[0]

    cols_to_merge = ['PLAYER_ID','MIN']+cols_rebound+cols_blocks + cols_fouls
    player_stats_df = player_stats_df.merge(player_common_stats_df[cols_to_merge],on=['PLAYER_ID'])

    # Defensive tracking merge is currently disabled; kept for reference.
    # defend_df = leaguedashptdefend.LeagueDashPtDefend(
    #             season = season_format,
    #             per_mode_simple=per_mode_format,
    #             defense_category='Overall',
    #             league_id = league_id,
    #             season_type_all_star = season_type_format).get_data_frames()[0]

    # defend_df = defend_df.drop(columns = ['FREQ','G','PCT_PLUSMINUS','NORMAL_FG_PCT'])
    # defend_df = defend_df.rename(columns = {'CLOSE_DEF_PERSON_ID':'PLAYER_ID'})

    # cols_to_merge = ['PLAYER_ID'] + cols_defend

    # player_stats_df = player_stats_df.merge(defend_df[cols_to_merge],on=['PLAYER_ID'])

    # Speed and distance tracking (no league_id is passed to this endpoint).
    speed_distance_data = leaguedashptstats.LeagueDashPtStats(
                            season = season_format,
                            season_type_all_star = season_type_format,
                            per_mode_simple = per_mode_format,
                            player_or_team = 'Player',
                            ).get_data_frames()[0]
    cols_to_merge = ['PLAYER_ID']+cols_dist+cols_speed
    player_stats_df = player_stats_df.merge(speed_distance_data[cols_to_merge],on=['PLAYER_ID'])

    # Hustle stats: contested shots, box outs, deflections and charges drawn.
    hustle_data = leaguehustlestatsplayer.LeagueHustleStatsPlayer(
                            season = season_format,
                            season_type_all_star = season_type_format,
                            league_id_nullable = league_id,
                            per_mode_time = per_mode_format,
                            ).get_data_frames()[0]

    cols_to_merge = ['PLAYER_ID'] + cols_contested+cols_boxouts+['DEFLECTIONS','CHARGES_DRAWN']
    player_stats_df = player_stats_df.merge(hustle_data[cols_to_merge],on=['PLAYER_ID'])

    # Drop rows that carry no player name.
    player_stats_df = player_stats_df.dropna(subset = ['PLAYER_NAME'])

    # The same PLAYER_ID can appear more than once; keep the first occurrence.
    player_stats_df = player_stats_df.drop_duplicates(subset='PLAYER_ID', keep="first")

    # Tag every row with the season type and the year it belongs to.
    player_stats_df.insert(1, "Season", season_type)
    player_stats_df.insert(0, "Year", [year]*(len(player_stats_df.index)))
    return player_stats_df


def scrape_stats_history(start,end,league_id,season_segment):
    """
    Scrape regular-season and playoff player stats for a range of seasons and save them as CSV.

    One season is scraped for every starting year in [start year, end year),
    i.e. the year of `end` itself is not used as a season start. Each season is
    fetched twice through `scrape_season` ('regular' and 'post', both per game),
    and all results are written to DIR_RAW_DATA + 'players_stats.csv'.

    Parameters
    ----------
    start : str
        Start date; only the leading year of 'YYYY-MM-DD' is used.
    end : str
        End date; only the leading year of 'YYYY-MM-DD' is used.
    league_id : str
        League id forwarded to `scrape_season`.
    season_segment : str
        Currently not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
    """
    year_s = int(start.split("-")[0])
    year_e = int(end.split("-")[0])

    season_frames = []
    for year in range(year_s,year_e):
        # API season label, e.g. 2019 -> '2019-20'. The two-digit suffix wraps and
        # is zero-padded, so 1999 -> '1999-00' and 2005 -> '2005-06'.
        season_format = f'{year}-{(year + 1) % 100:02d}'

        for season_type in ('regular', 'post'):
            season_df = scrape_season(year,season_format, league_id,season_type,'game')
            season_frames.append(season_df.drop_duplicates(subset = ['PLAYER_ID']))

        print('Scraped {} regular and post season player stats'.format(year))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if season_frames:
        all_player_stats_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_player_stats_df = pd.DataFrame()

    # Make sure the output folder exists before writing.
    makedirs(DIR_RAW_DATA, exist_ok=True)
    all_player_stats_df.to_csv(DIR_RAW_DATA+'players_stats.csv',index= False)
    print('Scraped data saved into csv file')


In [None]:
# Run the NBA API scrape for the configured date window (network-bound) and
# write players_stats.csv under DIR_RAW_DATA.
# NOTE: the last argument ('game') fills the season_segment parameter, which
# scrape_stats_history does not read.
scrape_stats_history(date_start,date_end,'00','game')

In [None]:
# Reload the scraped player stats from the raw data folder.
players_stats_df = pd.read_csv(DIR_RAW_DATA+'players_stats.csv')

In [None]:
# Summarise the scraped table (columns, dtypes, non-null counts).
players_stats_df.info()

## Inactive List

In [None]:
import time
from bs4 import BeautifulSoup as bs
from requests import get

# Browser-like User-Agent sent with every scraping request. The value is built
# from adjacent string literals so that no source indentation ends up inside it
# (a backslash continuation inside a string literal keeps the leading spaces).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/41.0.2228.0 Safari/537.36'
    )
}

# Number of result pages to walk; pages are requested through a start= offset
# that advances in steps of 25.
# NOTE(review): 817 presumably matched the search result size at scrape time - confirm before re-running.
maxPages = 817

def configScraper(pageNum):
    """
    Download one page of inactive-list search results and return it parsed.

    Parameters
    ----------
    pageNum : int
        Value sent as the `start` query parameter (a row offset; the caller
        advances it in steps of 25).

    Returns
    -------
    BeautifulSoup
        Parsed HTML of the results page for the date_start..date_end window.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    url = f"http://prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={date_start}&EndDate={date_end}&ILChkBx=yes&Submit=Search&start={pageNum}"
    # Without a timeout a stalled connection would block the whole scrape indefinitely.
    response = get(url, headers=headers, timeout=30)
    # Fail here with the HTTP status instead of later on a page that has no results table.
    response.raise_for_status()
    return bs(response.text, 'html.parser')

def scrapeInactiveList(numPages):
    """
    Scrape the inactive-list search results page by page and save them as CSV.

    Pages are fetched through `configScraper` with offsets 0, 25, 50, ... and the
    rows of the first 'datatable center' table on each page are collected
    (the first table row is treated as the header and skipped). The result is
    written to DIR_RAW_DATA + 'inactive_list_scraped.csv'.

    Parameters
    ----------
    numPages : int
        Number of result pages to scrape.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a page contains no results table.
    """
    column_names = ['Date','Team','Acquired','Relinquished','Notes']
    scraped_rows = []

    # Honour the argument: the page count used to be read from the global maxPages.
    for page_idx in range(numPages):
        offset = page_idx * 25
        soup = configScraper(offset)
        tables = soup.find_all('table',class_='datatable center')
        if not tables:
            raise ValueError(f'No results table found on page {page_idx + 1} (start={offset})')

        for table_row in tables[0].find_all('tr')[1:]:
            data = table_row.find_all('td')
            if len(data) < 5:
                # Not a data row (fewer cells than output columns).
                continue
            scraped_rows.append([
                data[0].text.strip(),
                data[1].text.strip(),
                data[2].text[2:].strip(),  # drop the two leading characters in front of the name
                data[3].text[2:].strip(),
                data[4].text.strip(),
            ])
        print(f'Scraped page {page_idx + 1}')

    # Build the frame once instead of appending row by row.
    inactive_list_scraped_df = pd.DataFrame(scraped_rows, columns = column_names)

    # Make sure the output folder exists before writing.
    makedirs(DIR_RAW_DATA, exist_ok=True)
    inactive_list_scraped_df.to_csv(DIR_RAW_DATA+'inactive_list_scraped.csv',index = False)

In [None]:
# Scrape the inactive-list search results (network-bound) and save them as
# inactive_list_scraped.csv under DIR_RAW_DATA.
scrapeInactiveList(maxPages)

## Team Schedule

**Observations**
- The season year refers to the year in which that season finishes; for example, season year 2018 refers to the 2017-18 season

In [None]:
# Seasons to scrape, given as the year in which the season ends
# (e.g. '2018' is the 2017-18 season).
season_list = ['2014','2015','2016','2017','2018','2019','2020','2021','2022']

# Team abbreviation (as used in the schedule page URL) -> team name written to the 'Team' column.
# Every team that is NOT a key of teams_relocate_rename_dict is scraped for every season in season_list.
# NOTE(review): this dictionary was described as complete for the 2009-2019 seasons, while
# season_list now runs to 2022 - confirm it is still complete.
# NOTE(review): 'NJN' is listed here but commented out in teams_relocate_rename_dict, so it is
# requested for every season in season_list - confirm those pages exist.
# NOTE(review): 'Portland Trailblazers' is presumably 'Portland Trail Blazers'; check what
# downstream joins expect before changing the spelling.
team_dict = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHA': 'Charlotte Bobcats',
    'CHI': 'Chicago Bulls',
    'CHO': 'Charlotte Hornets',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NJN': 'New Jersey Nets',
    'NOH': 'New Orleans Hornets',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trailblazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

# Teams that moved or were renamed: abbreviation -> the season years to scrape under that
# abbreviation (used instead of season_list for these teams).
# NOTE(review): verify the year boundaries against the source site - the Charlotte Hornets
# ('CHO') are believed to start with season year 2015 and the New Orleans Pelicans ('NOP')
# with season year 2014, which would make 'CHA' 2015 and 'NOH' 2014 invalid pages.
teams_relocate_rename_dict = {
    # 'BRK': ['2014', '2015', '2016', '2017', '2018', '2019', '2020','2021','2022'],
    'CHA': ['2014','2015'],
    'CHO': ['2016', '2017', '2018', '2019', '2020','2021','2022'],
    # 'NJN': ['2011', '2012', '2013'],
    'NOH': ['2014'],
    'NOP': ['2015', '2016', '2017', '2018', '2019', '2020','2021','2022'],
}

# Browser-like User-Agent sent with every schedule request. The value is built
# from adjacent string literals so that no source indentation ends up inside it
# (a backslash continuation inside a string literal keeps the leading spaces).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/41.0.2228.0 Safari/537.36'
    )
}

def scrape_team_season_schedule(team_abrv,year,team_dict):
    """
    Scrape one team's game schedule for one season from basketball-reference.com.

    Every table row (<tr>) on the page that contains data cells (<td>) is treated
    as a game. From each row the cells at positions 0, 4, 5 and 7 are kept as
    date, home/away flag, opponent and overtime flag.

    Parameters
    ----------
    team_abrv : str
        Team abbreviation used in the URL and as key into `team_dict`, e.g. 'POR'.
    year : str
        Season year used in the URL, e.g. '2017'.
    team_dict : dict
        Maps team abbreviations to the team name stored in the 'Team' column.

    Returns
    -------
    pandas.DataFrame
        Columns: Team, Year, Game_num, Date, Away_flag, Opponent, OT_flag.
        'Year' is `int(year) - 1` and 'Game_num' counts rows from 1 in page order.
        The frame is empty when the page does not exist (HTTP 404) or holds no
        usable rows.

    Raises
    ------
    KeyError
        If `team_abrv` is not a key of `team_dict`.
    requests.HTTPError
        If the server answers with an error status other than 404.
    """
    out_columns = ['Team','Year', 'Game_num','Date','Away_flag','Opponent','OT_flag']

    # Resolve the team name first so an unknown abbreviation fails before any request is made.
    team_name = team_dict[team_abrv]

    #website URL to scrape
    url = "https://www.basketball-reference.com/teams/{}/{}_games.html". format(team_abrv,year)
    response = get(url, headers=headers, timeout=30)
    if response.status_code == 404:
        # No page for this team/season combination: report it and return no games.
        print('No schedule page for {} {} (HTTP 404)'.format(team_abrv,year))
        return pd.DataFrame(columns = out_columns)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    # Collect the cell texts of every row that has data cells; header rows have none.
    game_rows = []
    for table_row in soup.find_all('tr'):
        cell_texts = [td.get_text() for td in table_row.find_all('td')]
        if cell_texts:
            game_rows.append(cell_texts)

    raw_df = pd.DataFrame(game_rows)
    if raw_df.shape[1] < 8:
        # Nothing usable on the page (no rows, or rows too short to hold the OT column).
        return pd.DataFrame(columns = out_columns)

    # Keep the needed cells by position rather than dropping a fixed list of the
    # others, so additional trailing columns on the page do not break the scrape.
    sched_df = raw_df.iloc[:, [0, 4, 5, 7]].copy()
    sched_df.columns = ['Date','Away_flag','Opponent','OT_flag']
    sched_df = sched_df.reset_index(drop = True)

    #game number within the season, in page order
    sched_df['Game_num'] = sched_df.index + 1

    sched_df['Team'] = team_name

    #the year in which the season begins
    sched_df['Year'] = int(year)-1

    return sched_df[out_columns]
    
def scrape_schedule(season_list,team_dict,teams_relocate_rename_dict):
    """
    Scrape the schedules of all teams over several seasons and save them as one CSV.

    For every key of `team_dict` the seasons to scrape are taken from
    `teams_relocate_rename_dict` when the team is listed there, otherwise from
    `season_list`. Each team/season is fetched with `scrape_team_season_schedule`,
    pausing one second after every request.

    Parameters
    ----------
    season_list : list of str
        Season years (year in which the season ends); the first and last entries
        also determine the output file name.
    team_dict : dict
        Team abbreviation -> team name.
    teams_relocate_rename_dict : dict
        Team abbreviation -> list of season years valid for that abbreviation.

    Returns
    -------
    pandas.DataFrame
        All schedules stacked in team order. The same frame is written to
        DIR_RAW_DATA + 'all_teams_schedule_<first year - 1>_<last year - 1>.csv'
        (with the index column, as before).
    """
    out_columns = ['Team','Year','Game_num','Date','Away_flag','Opponent','OT_flag']
    season_frames = []

    for team in team_dict:
        # Relocated/renamed teams carry their own list of valid seasons.
        team_years = teams_relocate_rename_dict.get(team, season_list)

        for year in team_years:
            single_season_df = scrape_team_season_schedule(team, year,team_dict)
            if not single_season_df.empty:
                season_frames.append(single_season_df)
            print('Scraped {} {} game schedule'.format(team,year))
            #Add a pause to keep web server happy
            time.sleep(1)

        if team_years:
            # Report the seasons actually scraped for this team.
            print('Scraped multi-season schedule - {} game schedule {} - {}'.format(team, team_years[0], team_years[-1]))

    # Concatenate once at the end instead of growing frames inside the loops.
    if season_frames:
        all_teams_sched_df = pd.concat(season_frames, ignore_index=True)
    else:
        all_teams_sched_df = pd.DataFrame(columns = out_columns)

    year_s = int(season_list[0])-1
    year_e = int(season_list[-1])-1

    # Make sure the output folder exists before writing.
    makedirs(DIR_RAW_DATA, exist_ok=True)
    all_teams_sched_df.to_csv(DIR_RAW_DATA+'all_teams_schedule_{}_{}.csv'.format(str(year_s), str(year_e)))

    return all_teams_sched_df

In [None]:
# Scrape every team's schedule for the configured seasons (network-bound; pauses between
# requests) and preview the combined table.
teams_sched_df = scrape_schedule(season_list,team_dict,teams_relocate_rename_dict)
teams_sched_df.head()