In [1]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Initialize the Selenium driver
driver = webdriver.Chrome()

# List out the seasons and the different page types
seasons = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']
data_types = ['touches', 'drives', 'shooting-efficiency', 'offensive-rebounding', 'defensive-rebounding', 'defensive-impact']

# Base URL (kept for reference only; the loop below builds its URL from the season and page type)
base_url = 'https://www.nba.com/stats/players/touches?PlayerPosition=F&Season=2013-14&SeasonType=Regular+Season'

# Dictionary to hold all final DataFrames (filled per category by the single-season code further down)
final_dfs = {}

# Results of the multi-season scrape, keyed by (season, data_type).
# Kept separate from final_dfs so the per-category keys used later are not mixed with these.
season_dfs = {}

# Upper bound on pages per table; the loop normally stops earlier, as soon as a page repeats.
MAX_PAGES = 20
NEXT_BUTTON_XPATH = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]'
# Page types whose headers are upper-cased to normalize them with the other tables
UPPERCASE_TYPES = {'touches', 'defensive-impact'}

try:
    for season in seasons:
        for d in data_types:
            # List to store one DataFrame per page
            all_dataframes = []

            # Open the first page
            driver.get(f'https://www.nba.com/stats/players/{d}?PlayerPosition=F&Season={season}&SeasonType=Regular+Season')

            # The number of pages differs between seasons (the single-season cells scrape
            # five pages for 2013-14 but six for 2023-24), so it is not hard-coded here.
            for i in range(MAX_PAGES):
                time.sleep(10)  # Wait for the page to load
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Find the table
                table = soup.find('table', {'class': 'Crom_table__p1iZz'})
                if table is None:
                    raise RuntimeError(f'Stats table not found for {d} {season} (page {i + 1})')

                # Parse the table data; StringIO because passing literal HTML to read_html is deprecated
                df = pd.read_html(StringIO(str(table)))[0]

                # Stop once a page repeats.
                # NOTE(review): assumes clicking "Next" on the last page leaves the table unchanged - confirm.
                if all_dataframes and df.equals(all_dataframes[-1]):
                    break
                all_dataframes.append(df)

                # Click the "Next" button to go to the next page
                next_button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
                driver.execute_script("arguments[0].click();", next_button)

            # Combine into one dataframe
            df = pd.concat(all_dataframes, ignore_index=True)

            # Capitalize the column names of the freshly scraped table (previously this
            # indexed final_dfs, which is empty at this point, and raised KeyError)
            if d in UPPERCASE_TYPES:
                df.columns = df.columns.str.upper()

            # Keep the result instead of discarding it on the next iteration
            season_dfs[(season, d)] = df
except BaseException:
    # Close the browser on any failure or interrupt; on success it stays open for the code below
    driver.quit()
    raise

# URLs for the 2013-14 season, one per stat category. This dict was not defined anywhere
# in the notebook (NameError on a fresh kernel); it mirrors base_urls_2024 with the season swapped.
base_urls_2014 = {
    'touches': 'https://www.nba.com/stats/players/touches?PlayerPosition=F&Season=2013-14&SeasonType=Regular+Season',
    'drives': 'https://www.nba.com/stats/players/drives?Season=2013-14&PlayerPosition=F&SeasonType=Regular+Season',
    'shooting_eff': 'https://www.nba.com/stats/players/shooting-efficiency?Season=2013-14&PlayerPosition=F&SeasonType=Regular+Season',
    'o-reb': 'https://www.nba.com/stats/players/offensive-rebounding?Season=2013-14&SeasonType=Regular+Season&PlayerPosition=F',
    'd-reb': 'https://www.nba.com/stats/players/defensive-rebounding?Season=2013-14&SeasonType=Regular+Season&PlayerPosition=F',
    'defense': 'https://www.nba.com/stats/players/defensive-impact?Season=2013-14&SeasonType=Regular+Season&PlayerPosition=F',
}

# Number of pages scraped per category for this season
PAGES_2014 = 5

try:
    for category in base_urls_2014:

        # List to store one DataFrame per page
        all_dataframes = []

        # Open the first page
        driver.get(base_urls_2014[category])

        for i in range(PAGES_2014):
            time.sleep(10)  # Wait for the page to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find the table
            table = soup.find('table', {'class': 'Crom_table__p1iZz'})
            if table is None:
                raise RuntimeError(f'Stats table not found for {category} (page {i + 1})')

            # Parse the table data; StringIO because passing literal HTML to read_html is deprecated
            df = pd.read_html(StringIO(str(table)))[0]
            all_dataframes.append(df)

            # Click the "Next" button to go to the next page (nothing to advance to after the last one)
            if i < PAGES_2014 - 1:
                next_button = driver.find_element(By.XPATH, '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]')
                driver.execute_script("arguments[0].click();", next_button)

        # Combine all pages into one DataFrame
        final_dfs[category] = pd.concat(all_dataframes, ignore_index=True)
finally:
    # Always close the browser, even when a page fails to load or parse
    driver.quit()

# Upper-case the headers of the 'touches' and 'defense' tables so they line up with the other tables
for key in ('touches', 'defense'):
    final_dfs[key].columns = final_dfs[key].columns.str.upper()

# Join every other category onto the 'touches' table by PLAYER. Columns that already
# exist on the left keep their name; the clashing right-hand copy gets the category
# name appended as a suffix.
join_order = ('drives', 'shooting_eff', 'o-reb', 'd-reb', 'defense')
merged_df_2014 = final_dfs['touches']
for key in join_order:
    merged_df_2014 = pd.merge(merged_df_2014, final_dfs[key], on='PLAYER', suffixes=('', key))

# Drop every column whose name contains one of the category tags (i.e. the suffixed duplicates)
tagged_cols = [col for col in merged_df_2014.columns if any(tag in col for tag in join_order)]
nba_data_df_2014 = merged_df_2014.drop(columns=tagged_cols)
nba_data_df_2014

Unnamed: 0,PLAYER,TEAM,GP,W,L,MIN,PTS,TOUCHES,FRONT CT TOUCHES,TIME OF POSS,...,DREB Chances,DREB Chance%,Deferred DREB Chances,Adjusted DREB Chance%,AVG DREB Distance,STL,BLK,DFGM,DFGA,DFG%
0,Adonis Thomas,PHI,6,2,4,6.3,2.3,6.3,4.7,0.2,...,1.5,100.0,0.0,100.0,10.7,0.0,0.0,0.2,0.3,50.0
1,Al Harrington,WAS,33,20,13,15.0,6.6,24.9,15.1,0.6,...,4.0,54.8,0.3,60.0,7.3,0.4,0.0,1.1,1.4,74.5
2,Al Horford,ATL,29,16,13,33.0,18.6,63.0,40.6,1.8,...,10.4,59.1,0.8,64.3,6.1,0.9,1.5,2.1,4.3,49.6
3,Al-Farouq Aminu,NOP,80,33,47,25.6,7.2,31.5,21.4,1.1,...,7.5,62.5,0.5,67.1,6.1,1.0,0.5,1.8,2.7,66.0
4,Alan Anderson,BKN,77,42,35,22.7,7.1,26.8,16.9,1.3,...,4.3,48.2,0.4,53.8,8.7,0.6,0.1,1.0,1.4,67.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,Vince Carter,DAL,81,49,32,24.4,11.9,43.0,29.4,1.8,...,6.0,45.6,0.6,50.7,5.7,0.8,0.4,1.3,2.0,67.7
293,Wesley Johnson,LAL,79,27,52,28.4,9.1,36.0,22.9,0.9,...,7.1,51.4,0.6,55.8,6.6,1.1,1.0,2.2,3.4,66.5
294,Wilson Chandler,DEN,62,27,35,31.1,13.6,43.7,29.1,1.7,...,7.2,53.8,0.8,60.4,5.6,0.7,0.5,1.7,3.1,55.0
295,Xavier Henry,LAL,43,16,27,21.1,10.0,33.9,20.2,1.8,...,5.1,46.4,0.4,50.0,8.6,1.0,0.2,1.0,1.5,64.1


In [2]:
# Write the merged 2013-14 table to a CSV file in the local repo folder
csv_path_2014 = r'C:\Users\vaugh\Desktop\basketball-pf-research\nba_data_2014.csv'
nba_data_df_2014.to_csv(path_or_buf=csv_path_2014)

In [4]:
# Repeat the above process now for the 2023-24 data

# Initialize the Selenium driver
driver = webdriver.Chrome()

# URLs for the 2023-24 season, one per stat category
base_urls_2024 = {
    'touches': 'https://www.nba.com/stats/players/touches?PlayerPosition=F&Season=2023-24&SeasonType=Regular+Season',
    'drives': 'https://www.nba.com/stats/players/drives?Season=2023-24&PlayerPosition=F&SeasonType=Regular+Season',
    'shooting_eff': 'https://www.nba.com/stats/players/shooting-efficiency?Season=2023-24&PlayerPosition=F&SeasonType=Regular+Season',
    'o-reb': 'https://www.nba.com/stats/players/offensive-rebounding?Season=2023-24&SeasonType=Regular+Season&PlayerPosition=F',
    'd-reb': 'https://www.nba.com/stats/players/defensive-rebounding?Season=2023-24&SeasonType=Regular+Season&PlayerPosition=F',
    'defense': 'https://www.nba.com/stats/players/defensive-impact?Season=2023-24&SeasonType=Regular+Season&PlayerPosition=F',
}

# Number of pages scraped per category for this season
PAGES_2024 = 6

# Dictionary to hold all final DataFrames
final_dfs = {}

try:
    for category in base_urls_2024:

        # List to store one DataFrame per page
        all_dataframes = []

        # Open the first page
        driver.get(base_urls_2024[category])

        for i in range(PAGES_2024):
            time.sleep(10)  # Wait for the page to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find the table
            table = soup.find('table', {'class': 'Crom_table__p1iZz'})
            if table is None:
                raise RuntimeError(f'Stats table not found for {category} (page {i + 1})')

            # Parse the table data; StringIO because passing literal HTML to read_html is deprecated
            df = pd.read_html(StringIO(str(table)))[0]
            all_dataframes.append(df)

            # Click the "Next" button to go to the next page (nothing to advance to after the last one)
            if i < PAGES_2024 - 1:
                next_button = driver.find_element(By.XPATH, '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]')
                driver.execute_script("arguments[0].click();", next_button)

        # Combine all pages into one DataFrame
        final_dfs[category] = pd.concat(all_dataframes, ignore_index=True)
finally:
    # Always close the browser, even when a page fails to load or parse
    driver.quit()

# Normalize header case: 'touches' and 'defense' are upper-cased to match the other tables
for name in ('touches', 'defense'):
    frame = final_dfs[name]
    frame.columns = frame.columns.str.upper()

# Merge all the data into a singular dataframe: start from 'touches' and join each
# remaining category on PLAYER, tagging clashing right-hand columns with the category name
categories_to_join = ['drives', 'shooting_eff', 'o-reb', 'd-reb', 'defense']
merged_df_2024 = final_dfs['touches']
for name in categories_to_join:
    merged_df_2024 = pd.merge(
        merged_df_2024,
        final_dfs[name],
        on='PLAYER',
        suffixes=('', name),
    )


def _has_category_tag(col):
    """Return True when the column name contains any of the category tags."""
    return any(tag in col for tag in categories_to_join)


# Remove the tagged duplicate columns and display the result
nba_data_df_2024 = merged_df_2024.drop(columns=[col for col in merged_df_2024.columns if _has_category_tag(col)])
nba_data_df_2024

Unnamed: 0,PLAYER,TEAM,GP,W,L,MIN,PTS,TOUCHES,FRONT CT TOUCHES,TIME OF POSS,...,DREB Chances,DREB Chance%,Deferred DREB Chances,Adjusted DREB Chance%,AVG DREB Distance,STL,BLK,DFGM,DFGA,DFG%
0,AJ Griffin,ATL,20,8,12,8.5,2.4,8.5,5.4,0.3,...,2.2,80.0,0.1,84.2,11.1,0.1,0.1,0.6,0.9,70.6
1,Aaron Gordon,DEN,71,47,24,31.6,13.9,42.3,23.3,2.2,...,6.1,67.7,0.4,72.1,7.2,0.7,0.6,1.8,3.1,56.6
2,Aaron Nesmith,IND,72,41,31,27.7,12.2,39.3,22.6,1.1,...,5.7,52.0,0.7,59.1,5.7,0.9,0.7,2.1,3.8,55.8
3,Adama Sanogo,CHI,9,3,6,7.3,4.0,9.8,5.8,0.2,...,3.1,77.3,0.1,81.0,4.9,0.1,0.0,0.7,0.7,100.0
4,Admiral Schofield,ORL,23,15,8,3.7,1.1,7.0,3.7,0.2,...,1.3,81.3,0.0,81.3,5.2,0.0,0.0,0.1,0.1,66.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,Yuta Watanabe,MEM,34,18,16,13.7,3.4,14.9,8.6,0.4,...,3.6,50.0,0.3,54.4,6.6,0.4,0.2,0.9,1.5,61.5
287,Zach Collins,SAS,68,18,50,22.1,11.3,39.8,27.2,1.2,...,7.0,53.5,0.7,59.0,5.4,0.5,0.8,3.7,6.4,58.3
288,Zeke Nnaji,DEN,56,39,17,9.9,3.3,11.2,7.9,0.3,...,2.5,55.0,0.2,59.4,5.1,0.3,0.6,1.1,2.3,50.0
289,Ziaire Williams,MEM,51,20,31,20.3,8.2,27.4,16.4,1.3,...,4.3,73.8,0.4,80.4,7.1,0.7,0.2,1.2,1.9,64.3


In [5]:
# Write the merged 2023-24 table to a CSV file in the local repo folder
csv_path_2024 = r'C:\Users\vaugh\Desktop\basketball-pf-research\nba_data_2024.csv'
nba_data_df_2024.to_csv(path_or_buf=csv_path_2024)