In [1]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Scrape the 2013-14 tracking tables (touches, drives, shooting efficiency)
# for forwards from nba.com/stats and merge them into one DataFrame per player.

# Scraping settings
PAGE_COUNT = 3           # number of result pages read for each category
PAGE_LOAD_SECONDS = 10   # fixed pause so the page (or the next page) can render
TABLE_CLASS = 'Crom_table__p1iZz'
NEXT_BUTTON_XPATH = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]'

# Base URLs, one per stat category
base_urls_2014 = {
    'touches': 'https://www.nba.com/stats/players/touches?PlayerPosition=F&Season=2013-14',
    'drives': 'https://www.nba.com/stats/players/drives?Season=2013-14&PlayerPosition=F',
    'shooting_eff': 'https://www.nba.com/stats/players/shooting-efficiency?Season=2013-14&PlayerPosition=F',
}

# Dictionary to hold all final DataFrames (category -> all pages concatenated)
final_dfs = {}

# Initialize the Selenium driver
driver = webdriver.Chrome()
try:
    for category in base_urls_2014:

        # List to store one DataFrame per page
        all_dataframes = []

        # Open the first page
        driver.get(base_urls_2014[category])

        for i in range(PAGE_COUNT):
            time.sleep(PAGE_LOAD_SECONDS)  # Wait for the page to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find the table
            table = soup.find('table', {'class': TABLE_CLASS})
            if table is None:
                # Fail with a clear message instead of handing the text 'None' to the parser
                raise ValueError(
                    f"No table with class '{TABLE_CLASS}' found for category "
                    f"'{category}' on page {i + 1}"
                )

            # Parse the table data. The markup is wrapped in StringIO because
            # passing literal HTML to read_html is deprecated in recent pandas.
            df = pd.read_html(StringIO(str(table)))[0]
            all_dataframes.append(df)

            # Click the "Next" button, except after the last page we need
            if i < PAGE_COUNT - 1:
                next_button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
                # JavaScript click, as in the original approach
                driver.execute_script("arguments[0].click();", next_button)

        # Combine all pages into one DataFrame
        final_dfs[category] = pd.concat(all_dataframes, ignore_index=True)
finally:
    # Always close the browser, even if scraping fails part-way through
    driver.quit()

# Capitalize the column names in 'touches' to normalize them with the other dataframes
final_dfs['touches'].columns = final_dfs['touches'].columns.str.upper()

# Merge all the data into a singular dataframe. Columns that already exist on
# the left side get a 'drives' / 'shooting_eff' suffix on the right side ...
merged_df_2014 = pd.merge(final_dfs['touches'], final_dfs['drives'], on='PLAYER', suffixes=('', 'drives'))
merged_df_2014 = pd.merge(merged_df_2014, final_dfs['shooting_eff'], on='PLAYER', suffixes=('', 'shooting_eff'))

# ... and every column whose name contains one of those suffixes is dropped.
duplicate_columns_2014 = [col for col in merged_df_2014.columns if 'drives' in col or 'shooting_eff' in col]
nba_data_df_2014 = merged_df_2014.drop(columns=duplicate_columns_2014)
nba_data_df_2014

Unnamed: 0,PLAYER,TEAM,GP,W,L,MIN,PTS,TOUCHES,FRONT CT TOUCHES,TIME OF POSS,...,C&S FG%,Pull Up PTS,Pull Up FG%,Paint Touch PTS,Paint Touch FG%,Post Touch PTS,Post Touch FG%,Elbow Touch PTS,Elbow Touch FG%,eFG%
0,Al Harrington,WAS,7,3,4,8.5,2.4,11.0,5.1,0.3,...,0.0,0.3,50.0,0.7,100.0,0.1,0.0,0.0,0.0,40.0
1,Alan Anderson,BKN,12,5,7,21.8,5.9,24.5,15.4,0.9,...,33.3,0.8,35.7,0.0,0.0,0.0,0.0,0.0,0.0,47.6
2,Amir Johnson,TOR,7,3,4,27.3,11.0,37.6,23.7,1.0,...,0.0,0.6,66.7,5.4,63.0,0.9,75.0,1.6,100.0,65.4
3,Andray Blatche,BKN,12,5,7,14.3,6.4,22.2,15.0,0.7,...,0.0,0.0,0.0,4.8,54.3,0.8,45.5,0.1,0.0,44.8
4,Andre Iguodala,GSW,7,3,4,35.5,13.1,47.4,26.3,3.2,...,43.8,1.9,40.0,1.3,100.0,0.7,16.7,0.3,100.0,58.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Tyler Hansbrough,TOR,3,0,3,9.7,2.3,9.7,3.0,0.2,...,0.0,0.0,0.0,0.7,50.0,0.0,0.0,0.3,0.0,33.3
99,Udonis Haslem,MIA,16,10,6,10.5,2.5,11.7,5.6,0.3,...,33.3,0.0,0.0,1.3,57.1,0.0,0.0,0.4,66.7,45.9
100,Victor Claver,POR,2,0,2,3.6,0.0,9.5,4.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,Vince Carter,DAL,7,3,4,27.1,12.6,39.1,25.4,1.6,...,53.6,2.4,36.8,0.6,100.0,0.7,40.0,0.6,50.0,56.6


In [2]:
# Write the merged 2013-14 table to the local checkout of my GitHub repo.
# No `index` argument is passed, so pandas' default index handling applies.
csv_path_2014 = r'C:\Users\vaugh\Desktop\basketball-pf-research\nba_data_2014.csv'
nba_data_df_2014.to_csv(csv_path_2014)

In [3]:
# Repeat the scraping process for the 2023-24 data: read the tracking tables
# (touches, drives, shooting efficiency) for forwards from nba.com/stats and
# merge them into one DataFrame per player.

# Scraping settings
PAGE_COUNT = 3           # number of result pages read for each category
PAGE_LOAD_SECONDS = 10   # fixed pause so the page (or the next page) can render
TABLE_CLASS = 'Crom_table__p1iZz'
NEXT_BUTTON_XPATH = '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]'

# Base URLs, one per stat category
base_urls_2024 = {
    'touches': 'https://www.nba.com/stats/players/touches?PlayerPosition=F&Season=2023-24',
    'drives': 'https://www.nba.com/stats/players/drives?Season=2023-24&PlayerPosition=F',
    'shooting_eff': 'https://www.nba.com/stats/players/shooting-efficiency?Season=2023-24&PlayerPosition=F',
}

# Dictionary to hold all final DataFrames (category -> all pages concatenated)
final_dfs = {}

# Initialize the Selenium driver
driver = webdriver.Chrome()
try:
    for category in base_urls_2024:

        # List to store one DataFrame per page
        all_dataframes = []

        # Open the first page
        driver.get(base_urls_2024[category])

        for i in range(PAGE_COUNT):
            time.sleep(PAGE_LOAD_SECONDS)  # Wait for the page to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find the table
            table = soup.find('table', {'class': TABLE_CLASS})
            if table is None:
                # Fail with a clear message instead of handing the text 'None' to the parser
                raise ValueError(
                    f"No table with class '{TABLE_CLASS}' found for category "
                    f"'{category}' on page {i + 1}"
                )

            # Parse the table data. The markup is wrapped in StringIO because
            # passing literal HTML to read_html is deprecated in recent pandas.
            df = pd.read_html(StringIO(str(table)))[0]
            all_dataframes.append(df)

            # Click the "Next" button, except after the last page we need
            if i < PAGE_COUNT - 1:
                next_button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
                # JavaScript click, as in the original approach
                driver.execute_script("arguments[0].click();", next_button)

        # Combine all pages into one DataFrame
        final_dfs[category] = pd.concat(all_dataframes, ignore_index=True)
finally:
    # Always close the browser, even if scraping fails part-way through
    driver.quit()

# Capitalize the column names in 'touches' to normalize them with the other dataframes
final_dfs['touches'].columns = final_dfs['touches'].columns.str.upper()

# Merge all the data into a singular dataframe. Columns that already exist on
# the left side get a 'drives' / 'shooting_eff' suffix on the right side ...
merged_df_2024 = pd.merge(final_dfs['touches'], final_dfs['drives'], on='PLAYER', suffixes=('', 'drives'))
merged_df_2024 = pd.merge(merged_df_2024, final_dfs['shooting_eff'], on='PLAYER', suffixes=('', 'shooting_eff'))

# ... and every column whose name contains one of those suffixes is dropped.
duplicate_columns_2024 = [col for col in merged_df_2024.columns if 'drives' in col or 'shooting_eff' in col]
nba_data_df_2024 = merged_df_2024.drop(columns=duplicate_columns_2024)
nba_data_df_2024

Unnamed: 0,PLAYER,TEAM,GP,W,L,MIN,PTS,TOUCHES,FRONT CT TOUCHES,TIME OF POSS,...,C&S FG%,Pull Up PTS,Pull Up FG%,Paint Touch PTS,Paint Touch FG%,Post Touch PTS,Post Touch FG%,Elbow Touch PTS,Elbow Touch FG%,eFG%
0,Aaron Gordon,DEN,12,7,5,37.1,14.3,51.9,27.3,2.7,...,44.1,0.7,44.4,6.8,72.0,0.0,0.0,0.5,28.6,63.1
1,Aaron Nesmith,IND,17,8,9,32.9,10.5,49.3,28.0,1.3,...,29.6,1.4,44.0,0.6,80.0,0.0,0.0,0.0,0.0,51.1
2,Al Horford,BOS,19,16,3,30.3,9.2,39.2,24.2,0.9,...,38.8,0.0,0.0,1.8,65.2,0.4,66.7,0.6,83.3,60.5
3,Amir Coffey,LAC,6,2,4,18.7,2.8,17.7,9.7,0.6,...,30.8,0.7,66.7,0.7,50.0,0.0,0.0,0.0,0.0,38.6
4,Andrew Nembhard,IND,17,8,9,32.6,14.9,58.0,32.8,3.2,...,50.0,3.6,49.2,0.6,50.0,0.0,0.0,0.1,0.0,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,Trey Murphy III,NOP,4,0,4,42.0,11.5,55.5,33.0,2.0,...,42.1,3.0,31.3,1.5,75.0,0.0,0.0,1.0,100.0,47.9
112,Tristan Thompson,CLE,10,3,7,8.7,1.5,11.0,6.5,0.3,...,0.0,0.0,0.0,0.6,37.5,0.2,50.0,0.4,50.0,43.8
113,Wendell Carter Jr.,ORL,7,3,4,26.4,7.6,29.7,19.3,0.7,...,31.0,0.3,50.0,2.9,58.3,0.0,0.0,0.0,0.0,47.9
114,Xavier Tillman,BOS,8,6,2,8.6,1.5,10.8,4.6,0.3,...,100.0,0.0,0.0,0.3,33.3,0.0,0.0,0.0,0.0,68.8


In [4]:
# Write the merged 2023-24 table to the local checkout of my GitHub repo.
# No `index` argument is passed, so pandas' default index handling applies.
csv_path_2024 = r'C:\Users\vaugh\Desktop\basketball-pf-research\nba_data_2024.csv'
nba_data_df_2024.to_csv(csv_path_2024)