In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import requests
import os
import datetime
import warnings
import pickle
import re
from bs4 import BeautifulSoup
from time import sleep
warnings.filterwarnings('ignore')

# Specify Input Parameters

In [2]:
# Seasons to process, oldest first, written as 'YYYY-YY' (2015-16 through 2022-23).
season_list = [f'{start}-{(start + 1) % 100:02d}' for start in range(2015, 2023)]

# Alternative kept for reference: season_type = ['Regular Season', 'Playoffs']
season_type = 'Regular Season'

per_mode = 'Totals'

# Other values kept for reference: 'Base', 'Advanced'
measure_type = 'Four Factors'

# Selenium

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Start a (non-headless) Chrome session for the scraping cells below.
# The driver path returned by webdriver_manager is wrapped in a Service object:
# Selenium 4 deprecated, and later removed, passing it as a positional argument.
option = webdriver.ChromeOptions()
#option.add_argument("--headless")
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = option)

# Get Games Info Using NBA API

In [1]:
from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import LeagueGameFinder

In [2]:
# Request headers handed to the nba_api endpoint in get_games().
# Built from ordered pairs so the header order stays exactly as listed.
headers = dict([
    ('Connection', 'keep-alive'),
    ('Accept', 'application/json, text/plain, */*'),
    ('x-nba-stats-token', 'true'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'),
    ('x-nba-stats-origin', 'stats'),
    ('Sec-Fetch-Site', 'same-origin'),
    ('Sec-Fetch-Mode', 'cors'),
    ('Referer', 'https://stats.nba.com/'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'en-US,en;q=0.9'),
])

In [None]:
def get_games(season_id):
    """Fetch one season's game log and the unique game IDs it contains.

    Uses the notebook-level ``headers`` and ``season_type`` settings.

    Parameters
    ----------
    season_id : str
        Season forwarded to ``LeagueGameFinder`` as ``season_nullable``
        (the caller passes entries of ``season_list``).

    Returns
    -------
    tuple
        ``(games, games_list)``: the first data frame returned by the
        endpoint with the redundant basic-stat columns removed, and the
        list of unique ``GAME_ID`` values in it.
    """
    # Basic stats for which the advanced tables carry similar metrics.
    redundant_columns = ['TEAM_NAME', 'MIN', 'PTS', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'PF']

    # Query every game of the requested season.
    finder = LeagueGameFinder(
        headers = headers,
        season_nullable = season_id,
        league_id_nullable = '00',
        season_type_nullable = season_type,
        timeout = 120,
    )
    # Pause after each request (presumably to avoid hammering the API).
    sleep(2)

    games = finder.get_data_frames()[0].drop(columns = redundant_columns)
    unique_game_ids = games['GAME_ID'].unique().tolist()

    return games, unique_game_ids

# Download every season, then combine the per-season frames in one step.
# Collecting the frames and concatenating once avoids re-copying the growing
# table on every iteration (and avoids concatenating onto an empty DataFrame).
season_frames = []
games_list = []

#Iterate over each season
for season in season_list:
    season_games, season_games_list = get_games(season)
    season_frames.append(season_games)
    games_list.extend(season_games_list)

# An empty season_list still yields an empty frame, as before.
games_df = pd.concat(season_frames) if season_frames else pd.DataFrame()

games_df.to_csv('games_df.csv', index = False)


# Team Stats Scrape Block

In [5]:
# Scrape the advanced team box-score table from nba.com for every season,
# newest season first, and stack the results into `advanced_df`.

# Absolute XPath of the <select> the loop waits for and then sets to its first
# option (presumably the pagination dropdown, index 0 = "All" -- TODO confirm).
page_select_xpath = "/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

team_advanced = []
for season in reversed(season_list):

    url = 'https://www.nba.com/stats/teams/boxscores-advanced?Season='+season
    driver.get(url)

    try:
        # Wait up to 10 seconds for the dropdown to be present in the DOM.
        page_select = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, page_select_xpath))
        )
        print('Element Found For ' + season + ' Season')

    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still stops the scrape.
        # Without the dropdown the lookup below could only fail, so this
        # season is skipped and the loop moves on to the next one.
        print('Path Not Located')
        continue

    Select(page_select).select_by_index(0)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Not used below: the header/row slices are taken over the whole page.
    table = soup.find("table", attrs = {"class" : "Crom_table_p1iZz"})

    # Named `header_cells`, not `headers`, so the request-header dict that
    # get_games() relies on is not overwritten by this cell.
    # The first 14 <th> and 13 <tr> elements on the page are skipped
    # (hard-coded offsets; presumably cells outside the stats table).
    header_cells = soup.find_all('th')[14:]
    header_list = [th.text.strip() for th in header_cells]

    rows = soup.find_all('tr')[13:]
    rows_data = [[td.getText().strip() for td in row.find_all('td')]
                 for row in rows]

    this_season = pd.DataFrame(rows_data, columns = header_list)

    team_advanced.append(this_season)

# Combine once, after all seasons have been collected.
advanced_df = pd.concat(team_advanced, ignore_index = True)



In [6]:
# Persist the scraped team table; the row index is written as the first column.
advanced_df.to_csv('advanced_team_stats.csv')

# Read in Advanced Team Stats

In [7]:
# Reload the scraped team table. The scrape step saves it as
# 'advanced_team_stats.csv' (index in the first column), so read that same file
# back instead of an .xlsx copy that no cell in this notebook produces.
advanced_df = pd.read_csv('advanced_team_stats.csv', index_col = 0)

In [8]:
# Rename the columns positionally to identifier-friendly labels, then display.
game_fields = ['Team', 'Match_Up', 'Game_Date', 'W/L', 'MIN']
rating_fields = ['OffRtg', 'DefRtg', 'NetRtg']
assist_fields = ['AST%', 'AST/TO', 'ASTRatio']
rebound_fields = ['OREB%', 'DREB%', 'REB%']
other_fields = ['TOV%', 'eFG%', 'TS%', 'PACE', 'PIE']

cols = game_fields + rating_fields + assist_fields + rebound_fields + other_fields
advanced_df.columns = cols
advanced_df

Unnamed: 0,Team,Match_Up,Game_Date,W/L,MIN,OffRtg,DefRtg,NetRtg,AST%,AST/TO,ASTRatio,OREB%,DREB%,REB%,TOV%,eFG%,TS%,PACE,PIE
0,IND,IND @ SAC,11/30/2022,L,48,100.0,120.2,-20.2,70.0,1.87,17.8,23.1,72.9,44.2,13.2,46.0,50.4,114.0,40.0
1,MIL,MIL @ NYK,11/30/2022,W,48,114.7,107.3,7.4,59.0,3.83,17.3,28.3,62.9,47.0,6.3,48.9,53.7,95.5,57.9
2,BKN,BKN vs. WAS,11/30/2022,W,48,111.9,107.0,4.9,56.4,2.44,17.2,11.1,63.0,39.4,8.9,53.0,59.2,100.5,54.4
3,TOR,TOR @ NOP,11/30/2022,L,48,102.9,121.2,-18.3,69.0,1.81,20.4,18.4,72.7,44.1,15.2,54.0,56.0,104.5,41.3
4,ORL,ORL vs. ATL,11/30/2022,L,48,105.9,122.5,-16.7,66.7,1.44,19.4,16.3,66.7,42.0,17.6,54.4,60.0,102.0,41.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17221,GSW,GSW vs. NOP,10/27/2015,W,48,107.8,93.1,14.6,70.7,1.45,18.7,44.6,74.5,58.9,19.4,47.4,52.5,102.5,62.5
17222,CHI,CHI vs. CLE,10/27/2015,W,48,99.0,96.9,2.0,35.1,1.00,10.6,24.1,72.4,49.1,13.3,46.6,49.9,98.0,47.9
17223,DET,DET @ ATL,10/27/2015,W,48,107.1,96.9,10.2,62.2,1.53,15.5,44.4,84.8,61.5,15.2,44.8,49.3,98.0,53.8
17224,CLE,CLE @ CHI,10/27/2015,L,48,96.9,99.0,-2.0,68.4,2.36,18.7,27.6,75.9,50.9,11.2,45.2,46.8,98.0,52.1


# Read in Games Info (with Basic Stats)

In [9]:
# Reload the game log saved by the NBA API section (written without an index column).
games_df = pd.read_csv('games_df.csv')

In [67]:
# List the columns present in the reloaded game log.
games_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'GAME_DATE',
       'MATCHUP', 'WL', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'STL', 'BLK', 'PLUS_MINUS'],
      dtype='object')

In [68]:
# NOTE: from here on `datetime` names the class, not the module imported in the first cell.
from datetime import datetime

# Identifier, matchup and shooting columns carried into the merge.
info_columns = ['TEAM_ID', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                'FTM', 'FTA', 'FT_PCT']

# .assign() returns a new frame, so GAME_DATE is rewritten without assigning
# into a slice of games_df (which raised SettingWithCopyWarning, hidden by the
# warnings filter). Dates go from 'YYYY-MM-DD' to 'MM/DD/YYYY', the format of
# advanced_df's Game_Date column.
games_info = games_df[info_columns].assign(
    GAME_DATE = lambda frame: frame['GAME_DATE'].map(
        lambda iso_date: datetime.strptime(iso_date, '%Y-%m-%d').strftime('%m/%d/%Y')
    )
)

games_info

Unnamed: 0,TEAM_ID,GAME_ID,GAME_DATE,MATCHUP,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT
0,1610612744,21501227,04/13/2016,GSW vs. MEM,46,87,0.529,20,47,0.426,13,16,0.813
1,1610612756,21501229,04/13/2016,PHX vs. LAC,45,99,0.455,7,26,0.269,17,22,0.773
2,1610612741,21501222,04/13/2016,CHI vs. PHI,39,83,0.470,15,24,0.625,22,27,0.815
3,1610612765,21501220,04/13/2016,DET @ CLE,37,85,0.435,16,33,0.485,22,28,0.786
4,1610612761,21501218,04/13/2016,TOR @ BKN,41,93,0.441,13,37,0.351,8,15,0.533
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17221,1610612759,22200011,10/19/2022,SAS vs. CHA,40,99,0.404,7,34,0.206,15,21,0.714
17222,1610612744,22200002,10/18/2022,GSW vs. LAL,45,99,0.455,16,45,0.356,17,23,0.739
17223,1610612755,22200001,10/18/2022,PHI @ BOS,40,80,0.500,13,34,0.382,24,28,0.857
17224,1610612747,22200002,10/18/2022,LAL @ GSW,40,94,0.426,10,40,0.250,19,25,0.760


# Begin Merge 

- `advanced_df` needs a `GAME_ID` column (taken from `games_info` by matching on game date and matchup).
- For now, basic stats are ignored (to retain them, simply keep their columns in `games_df`).
- The final merge should include all home and away team stats at the game-by-game level.

In [69]:
# Attach GAME_ID (and the shooting columns) to each advanced row by matching
# on game date and matchup string.
advanced_stats = pd.merge(advanced_df, games_info, left_on = ['Game_Date', 'Match_Up'], right_on = ['GAME_DATE', 'MATCHUP'])

# Matchups read 'AAA @ BBB' for the away team and 'AAA vs. BBB' for the home team.
# regex = False makes the '.' in 'vs.' a literal dot instead of "any character";
# na = False treats a missing matchup as "no match" rather than failing the mask.
is_away = advanced_stats['Match_Up'].str.contains('@', regex = False, na = False)
is_home = advanced_stats['Match_Up'].str.contains('vs.', regex = False, na = False)

away_games = advanced_stats[is_away].reset_index(drop = True)
home_games = advanced_stats[is_home]

In [71]:
# Pair each home row with the away row sharing its GAME_ID and GAME_DATE.
# Every other column appears twice, suffixed '_Home' and '_Away'.
home_away_advanced = home_games.merge(
    away_games,
    on = ['GAME_ID', 'GAME_DATE'],
    suffixes = ['_Home', '_Away'],
)
home_away_advanced.columns

Index(['Team_Home', 'Match_Up_Home', 'Game_Date_Home', 'W/L_Home', 'MIN_Home',
       'OffRtg_Home', 'DefRtg_Home', 'NetRtg_Home', 'AST%_Home', 'AST/TO_Home',
       'ASTRatio_Home', 'OREB%_Home', 'DREB%_Home', 'REB%_Home', 'TOV%_Home',
       'eFG%_Home', 'TS%_Home', 'PACE_Home', 'PIE_Home', 'TEAM_ID_Home',
       'GAME_ID', 'GAME_DATE', 'MATCHUP_Home', 'FGM_Home', 'FGA_Home',
       'FG_PCT_Home', 'FG3M_Home', 'FG3A_Home', 'FG3_PCT_Home', 'FTM_Home',
       'FTA_Home', 'FT_PCT_Home', 'Team_Away', 'Match_Up_Away',
       'Game_Date_Away', 'W/L_Away', 'MIN_Away', 'OffRtg_Away', 'DefRtg_Away',
       'NetRtg_Away', 'AST%_Away', 'AST/TO_Away', 'ASTRatio_Away',
       'OREB%_Away', 'DREB%_Away', 'REB%_Away', 'TOV%_Away', 'eFG%_Away',
       'TS%_Away', 'PACE_Away', 'PIE_Away', 'TEAM_ID_Away', 'MATCHUP_Away',
       'FGM_Away', 'FGA_Away', 'FG_PCT_Away', 'FG3M_Away', 'FG3A_Away',
       'FG3_PCT_Away', 'FTM_Away', 'FTA_Away', 'FT_PCT_Away'],
      dtype='object')

In [72]:
# Final column order: game identifiers first, then the full home-team stat
# line, then the same stat line for the away team.
game_id_cols = ['GAME_ID', 'GAME_DATE', 'Team_Home', 'Team_Away', 'TEAM_ID_Home', 'TEAM_ID_Away']

per_team_stats = [
    'W/L',
    'OffRtg', 'DefRtg', 'NetRtg',
    'AST%', 'AST/TO', 'ASTRatio',
    'OREB%', 'DREB%', 'REB%',
    'TOV%', 'eFG%', 'TS%', 'PACE', 'PIE',
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
]

cols = (
    game_id_cols
    + [stat + '_Home' for stat in per_team_stats]
    + [stat + '_Away' for stat in per_team_stats]
)

In [73]:
# Keep only the columns listed in `cols`, in that order.
advanced_stats_df = home_away_advanced[cols]

# Validating for correct merge

In [74]:
# Manual spot checks for the merge (uncomment one line to inspect a game or date):
# advanced_stats_df[advanced_stats_df['GAME_ID'] == 21501220]
# advanced_stats_df[advanced_stats_df['GAME_ID'] == 22200002]
# advanced_df[advanced_df['Game_Date'] == '10/18/2022']

# Save the merged table. The next cell reads 'team_base_advanced.csv' back, so
# this write must run for the notebook to work from a fresh kernel.
advanced_stats_df.to_csv('team_base_advanced.csv')

In [79]:
# Display the saved home/away table, using the first CSV column as the index.
pd.read_csv('team_base_advanced.csv', index_col = 0)

Unnamed: 0,GAME_ID,GAME_DATE,Team_Home,Team_Away,TEAM_ID_Home,TEAM_ID_Away,W/L_Home,OffRtg_Home,DefRtg_Home,NetRtg_Home,...,PIE_Away,FGM_Away,FGA_Away,FG_PCT_Away,FG3M_Away,FG3A_Away,FG3_PCT_Away,FTM_Away,FTA_Away,FT_PCT_Away
0,22200315,11/30/2022,BKN,WAS,1610612751,1610612764,W,111.9,107.0,4.9,...,45.6,39,89,0.438,6,24,0.250,23,34,0.676
1,22200313,11/30/2022,ORL,ATL,1610612753,1610612737,L,105.9,122.5,-16.7,...,58.6,51,94,0.543,8,32,0.250,15,19,0.789
2,22200320,11/30/2022,DEN,HOU,1610612743,1610612745,W,127.7,106.4,21.3,...,37.0,37,88,0.420,12,41,0.293,14,18,0.778
3,22200321,11/30/2022,PHX,CHI,1610612756,1610612741,W,126.9,108.7,18.3,...,46.2,41,82,0.500,4,25,0.160,27,33,0.818
4,22200316,11/30/2022,NYK,MIL,1610612752,1610612749,L,107.3,114.7,-7.4,...,57.9,39,90,0.433,10,35,0.286,21,26,0.808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8608,21500014,10/28/2015,PHX,DAL,1610612756,1610612742,L,92.2,106.7,-14.5,...,67.7,40,85,0.471,10,21,0.476,21,31,0.677
8609,21500005,10/28/2015,BOS,PHI,1610612738,1610612755,W,109.8,92.2,17.6,...,37.0,34,83,0.410,7,22,0.318,20,23,0.870
8610,21500003,10/27/2015,GSW,NOP,1610612744,1610612740,W,107.8,93.1,14.6,...,37.5,35,83,0.422,6,18,0.333,19,27,0.704
8611,21500002,10/27/2015,CHI,CLE,1610612741,1610612739,W,99.0,96.9,2.0,...,52.1,38,94,0.404,9,29,0.310,10,17,0.588


# Player Base & Advanced Block

In [18]:
# Scrape the advanced player box-score table from nba.com for every season,
# newest season first, and stack the results into `player_df_1`.

# Absolute XPath of the <select> the loop waits for and then sets to its first
# option (presumably the pagination dropdown, index 0 = "All" -- TODO confirm).
page_select_xpath = "/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

player_advanced_1 = []

for season in reversed(season_list):

    url = 'https://www.nba.com/stats/players/boxscores-advanced?Season=' + season
    driver.get(url)

    try:
        # Wait up to 50 seconds for the dropdown to be present in the DOM.
        page_select = WebDriverWait(driver, 50).until(
            EC.presence_of_element_located((By.XPATH, page_select_xpath))
        )
        print('Element Found For ' + season + ' Season')

    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still stops the scrape.
        # Without the dropdown the lookup below could only fail, so this
        # season is skipped and the loop moves on to the next one.
        print('Path Not Located')
        continue

    Select(page_select).select_by_index(0)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Not used below: the header/row slices are taken over the whole page.
    table = soup.find("table", attrs = {"class" : "Crom_table_p1iZz"})

    # Named `header_cells`, not `headers`, so the request-header dict that
    # get_games() relies on is not overwritten by this cell.
    # The first 14 <th> and 13 <tr> elements on the page are skipped
    # (hard-coded offsets; presumably cells outside the stats table).
    header_cells = soup.find_all('th')[14:]
    header_list = [th.text.strip() for th in header_cells]

    rows = soup.find_all('tr')[13:]
    rows_data = [[td.getText().strip() for td in row.find_all('td')]
                 for row in rows]

    this_season_1 = pd.DataFrame(rows_data, columns = header_list)

    player_advanced_1.append(this_season_1)

# Combine once, after all seasons have been collected.
player_df_1 = pd.concat(player_advanced_1, ignore_index = True)



In [19]:
# NOTE(review): `player_df_2` is not defined in any cell visible here
# (presumably it came from a second scrape run whose cell was removed), so the
# original line raises NameError on a fresh kernel. It is included only when it
# exists; confirm whether a second batch is still needed.
player_frames = [player_df_1]
if 'player_df_2' in globals():
    player_frames.append(player_df_2)

player_advanced = pd.concat(player_frames)
player_advanced.to_csv('advanced_player_stats.csv')

In [20]:
# Scrape the traditional player box-score table from nba.com for every season,
# newest season first, and stack the results into `traditional_df`.

# Absolute XPath of the <select> the loop waits for and then sets to its first
# option (presumably the pagination dropdown, index 0 = "All" -- TODO confirm).
page_select_xpath = "/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

player_traditional = []
for season in reversed(season_list):

    url = 'https://www.nba.com/stats/players/boxscores-traditional?Season='+season
    driver.get(url)

    try:
        # Wait up to 30 seconds for the dropdown to be present in the DOM.
        page_select = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, page_select_xpath))
        )
        print('Element Found For ' + season + ' Season')

    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still stops the scrape.
        # Without the dropdown the lookup below could only fail, so this
        # season is skipped and the loop moves on to the next one.
        print('Path Not Located')
        continue

    Select(page_select).select_by_index(0)

    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Not used below: the header/row slices are taken over the whole page.
    table = soup.find("table", attrs = {"class" : "Crom_table_p1iZz"})

    # Named `header_cells`, not `headers`, so the request-header dict that
    # get_games() relies on is not overwritten by this cell.
    # The first 14 <th> and 13 <tr> elements on the page are skipped
    # (hard-coded offsets; presumably cells outside the stats table).
    header_cells = soup.find_all('th')[14:]
    header_list = [th.text.strip() for th in header_cells]

    rows = soup.find_all('tr')[13:]
    rows_data = [[td.getText().strip() for td in row.find_all('td')]
                 for row in rows]

    this_season = pd.DataFrame(rows_data, columns = header_list)

    player_traditional.append(this_season)

# Combine once, after all seasons have been collected.
traditional_df = pd.concat(player_traditional, ignore_index = True)



In [21]:
# Persist the scraped traditional player table; the row index is written as the first column.
traditional_df.to_csv('traditional_player_stats.csv')

# Merging Base & Advanced

In [45]:
# Reload both saved player tables, give the two space-separated headers
# underscore names, and drop MIN from each table.
header_renames = {'MATCH UP': 'MATCH_UP', 'GAME DATE': 'GAME_DATE'}

traditional = (
    pd.read_csv('traditional_player_stats.csv', index_col = 0)
    .rename(columns = header_renames)
    .drop(columns = 'MIN')
)

advanced = (
    pd.read_csv('advanced_player_stats.csv', index_col = 0)
    .rename(columns = header_renames)
    .drop(columns = 'MIN')
)

In [48]:
# Inner-join the traditional and advanced player rows on the fields that identify a player's game line.
player_merged = pd.merge(traditional, advanced, on = ['PLAYER', 'TEAM', 'MATCH_UP', 'GAME_DATE', 'W/L'])

In [59]:
# Persist the merged player table; the row index is written as the first column.
player_merged.to_csv('player_base_advanced.csv')

In [80]:
# Display the saved player table, using the first CSV column as the index.
pd.read_csv('player_base_advanced.csv', index_col = 0)

Unnamed: 0,PLAYER,TEAM,MATCH_UP,GAME_DATE,W/L,PTS,FGM,FGA,FG%,3PM,...,AST Ratio,OREB%,DREB%,REB%,TO Ratio,eFG%,TS%,USG%,PACE,PIE
0,Jevon Carter,MIL,MIL @ NYK,11/30/2022,W,3,1,2,50.0,1,...,50.0,2.7,5.1,3.9,0.0,75.0,75.0,2.6,100.86,5.0
1,Immanuel Quickley,NYK,NYK vs. MIL,11/30/2022,L,4,2,9,22.2,0,...,18.2,7.1,19.0,12.2,0.0,22.2,22.2,18.4,96.40,5.3
2,Obi Toppin,NYK,NYK vs. MIL,11/30/2022,L,7,2,5,40.0,2,...,0.0,0.0,18.8,8.6,14.3,60.0,59.5,20.0,99.90,5.8
3,Quentin Grimes,NYK,NYK vs. MIL,11/30/2022,L,7,3,7,42.9,1,...,0.0,2.4,10.8,6.3,0.0,50.0,50.0,8.5,94.38,2.7
4,Pat Connaughton,MIL,MIL @ NYK,11/30/2022,W,10,4,13,30.8,2,...,0.0,3.7,9.7,6.9,0.0,38.5,38.5,22.8,97.30,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182870,E'Twaun Moore,CHI,CHI vs. CLE,10/27/2015,W,11,5,8,62.5,1,...,11.1,0.0,13.0,7.3,0.0,68.8,68.8,23.5,101.90,32.7
182871,Marcus Morris Sr.,DET,DET @ ATL,10/27/2015,W,18,6,19,31.6,1,...,15.4,9.8,13.2,11.2,0.0,34.2,41.6,22.0,97.08,11.0
182872,Joakim Noah,CHI,CHI vs. CLE,10/27/2015,W,0,0,0,0.0,0,...,66.7,10.5,30.4,21.4,16.7,0.0,0.0,5.1,99.55,18.8
182873,Matthew Dellavedova,CLE,CLE @ CHI,10/27/2015,L,6,3,8,37.5,0,...,30.8,3.7,12.0,7.7,7.7,37.5,37.5,19.1,102.18,11.5
