In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def get_pf_urls(season_url, timeout=30):
    """Collect the personal-page URL of every power forward on a season page.

    Parameters
    ----------
    season_url : str
        URL of a page containing a table with id ``advanced_stats``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of tuple
        ``(player_name, player_url)`` pairs, one per table row whose ``pos``
        cell contains ``'PF'``.  A player who occupies several rows is
        returned once per row; callers de-duplicate as needed.  An empty list
        is returned when the stats table is not present in the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. rate limiting).
    """
    response = requests.get(season_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    players_table = soup.find('table', {'id': 'advanced_stats'})
    if players_table is None:
        print(f"No advanced stats table found for {season_url}")
        return []

    players = []
    for row in players_table.tbody.find_all('tr'):
        pos_cell = row.find('td', {'data-stat': 'pos'})
        # Rows without a 'pos' cell carry no player data; skip them.
        if pos_cell is None or 'PF' not in pos_cell.text:
            continue

        player_cell = row.find('td', {'data-stat': 'player'})
        # A row without a linked player name cannot yield a URL.
        if player_cell is None or player_cell.a is None:
            continue

        players.append(
            (player_cell.text, 'https://www.basketball-reference.com' + player_cell.a['href'])
        )
    return players

# 2013-14 season: advanced-stats page and the power forwards listed on it.
season_2014_url = "https://www.basketball-reference.com/leagues/NBA_2014_advanced.html"
pf_players_2014 = get_pf_urls(season_2014_url)

# 2023-24 season: same lookup against the corresponding season page.
season_2024_url = "https://www.basketball-reference.com/leagues/NBA_2024_advanced.html"
pf_players_2024 = get_pf_urls(season_2024_url)

In [10]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options

def _render_page_source(player_url, wait_seconds):
    """Load ``player_url`` in headless Chrome and return the rendered HTML."""
    options = Options()
    # The ``options.headless = True`` setter is deprecated in recent Selenium 4
    # releases; passing the Chrome flag directly works regardless of version.
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # requires ChromeDriver on PATH
    try:
        driver.get(player_url)
        # Fixed delay to let the page finish rendering before reading it.
        time.sleep(wait_seconds)
        return driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()


def get_shooting_data(player_url, season, player_name, wait_seconds=10):
    """Scrape one season of shooting stats from a player's page.

    Parameters
    ----------
    player_url : str
        URL of the player's personal page.
    season : str
        Season label to look for in the table's ``season`` column,
        e.g. ``'2023-24'``.
    player_name : str
        Name stored in the ``Player`` column, which becomes the index.
    wait_seconds : float, optional
        How long to wait after loading the page before reading its source.

    Returns
    -------
    pandas.DataFrame
        A single-row frame indexed by ``Player`` for the first table row
        whose season matches.  An empty frame is returned when the page has
        no shooting table or no row for ``season``, so the result can always
        be passed to ``pd.concat``.
    """
    soup = BeautifulSoup(_render_page_source(player_url, wait_seconds), 'lxml')

    # The shooting table is looked up under several candidate ids; first hit wins.
    shooting_table = None
    for table_id in ('shooting_sh', 'div_shooting', 'shooting'):
        shooting_table = soup.find('table', {'id': table_id})
        if shooting_table is not None:
            break

    if shooting_table is None:
        print(f"No shooting data found for {player_url}")
        return pd.DataFrame()

    # The slice picks the per-column header cells out of all <th> elements in the
    # header -- assumes the layout the original offsets were written for (TODO confirm).
    headers = [th.text.strip() for th in shooting_table.find('thead').find_all('th')]
    # Build a new list rather than removing items from the list being iterated.
    headers = [header for header in headers[14:39] if header != '']

    # Several columns share the same short label; give them unambiguous names.
    distance_labels = (
        '% of FGA by Distance: 2P',
        '% of FGA by Distance: 0-3ft',
        '% of FGA by Distance: 3-10ft',
        '% of FGA by Distance: 10-16ft',
        '% of FGA by Distance: 16-3P',
        '% of FGA by Distance: 3P',
        'FG% by Distance: 2P',
        'FG% by Distance: 0-3ft',
        'FG% by Distance: 3-10ft',
        'FG% by Distance: 10-16ft',
        'FG% by Distance: 16-3P',
        'FG% by Distance: 3P',
        "% of FG Ast'd: 2P",
        "% of FG Ast'd: 3P",
    )
    for position, label in enumerate(distance_labels, start=8):
        headers[position] = label
    headers.append("Player")

    for row in shooting_table.tbody.find_all('tr'):
        season_cell = row.find('th', {'data-stat': 'season'})
        # Rows without a season cell cannot match; skip them instead of crashing.
        if season_cell is None or season_cell.text.strip() != season:
            continue

        data = [td.text.strip() for td in row.find_all('td')][:25]
        # Drop three cells so the row lines up with the filtered headers
        # (presumably the blank spacer columns).  Indices are applied
        # sequentially, i.e. each pop sees the already-shortened list.
        for spacer_index in (8, 14, 20):
            data.pop(spacer_index)
        data.append(player_name)
        return pd.DataFrame([data], columns=headers).set_index("Player")

    return pd.DataFrame()

In [None]:
# Build the 2013-14 shooting table: one scrape per distinct power-forward name.
shooting_data_dfs_2014 = {}
for player_name, player_url in pf_players_2014:
    # The player list can repeat a name; only the first occurrence is scraped.
    if player_name in shooting_data_dfs_2014:
        continue
    shooting_data_dfs_2014[player_name] = get_shooting_data(player_url, '2013-14', player_name)

# Stack the per-player frames into a single table and display it.
shooting_data_2014 = pd.concat(shooting_data_dfs_2014.values())
shooting_data_2014

In [6]:
# Display the scraped (player_name, player_url) pairs for the 2023-24 season.
# NOTE(review): some players appear more than once in this list; the scraping
# loop that consumes it skips repeated names.
pf_players_2024

[('Precious Achiuwa',
  'https://www.basketball-reference.com/players/a/achiupr01.html'),
 ('Precious Achiuwa',
  'https://www.basketball-reference.com/players/a/achiupr01.html'),
 ('Santi Aldama',
  'https://www.basketball-reference.com/players/a/aldamsa01.html'),
 ('Kyle Anderson',
  'https://www.basketball-reference.com/players/a/anderky01.html'),
 ('Giannis Antetokounmpo',
  'https://www.basketball-reference.com/players/a/antetgi01.html'),
 ('Thanasis Antetokounmpo',
  'https://www.basketball-reference.com/players/a/antetth01.html'),
 ('Paolo Banchero',
  'https://www.basketball-reference.com/players/b/banchpa01.html'),
 ('Dominick Barlow',
  'https://www.basketball-reference.com/players/b/barlodo01.html'),
 ('Harrison Barnes',
  'https://www.basketball-reference.com/players/b/barneha02.html'),
 ('Nicolas Batum',
  'https://www.basketball-reference.com/players/b/batumni01.html'),
 ('Nicolas Batum',
  'https://www.basketball-reference.com/players/b/batumni01.html'),
 ('Nicolas Batum

In [11]:
# Build the 2023-24 shooting table: one scrape per distinct power-forward name.
shooting_data_dfs_2024 = {}
for player_name, player_url in pf_players_2024:
    # The player list can repeat a name; only the first occurrence is scraped.
    if player_name in shooting_data_dfs_2024:
        continue
    shooting_data_dfs_2024[player_name] = get_shooting_data(player_url, '2023-24', player_name)

# Stack the per-player frames into a single table and display it.
shooting_data_2024 = pd.concat(shooting_data_dfs_2024.values())
shooting_data_2024

Unnamed: 0_level_0,Age,Tm,Lg,Pos,G,MP,FG%,Dist.,% of FGA by Distance: 2P,% of FGA by Distance: 0-3ft,...,% of FGA by Distance: 16-3P,% of FGA by Distance: 3P,FG% by Distance: 2P,FG% by Distance: 0-3ft,FG% by Distance: 3-10ft,FG% by Distance: 10-16ft,FG% by Distance: 16-3P,FG% by Distance: 3P,% of FG Ast'd: 2P,% of FG Ast'd: 3P
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Precious Achiuwa,24,TOT,NBA,"C,PF",74,1624,.501,7.6,.793,.520,...,.011,.207,.562,.697,.317,.273,.200,.268,.593,.962
Santi Aldama,23,MEM,NBA,PF,61,1618,.435,15.8,.465,.211,...,.016,.535,.534,.692,.411,.393,.333,.349,.674,.962
Kyle Anderson,30,MIN,NBA,PF,79,1782,.460,9.0,.889,.192,...,.028,.111,.488,.627,.438,.476,.500,.229,.511,1.000
Giannis Antetokounmpo,29,MIL,NBA,PF,73,2567,.611,6.7,.909,.519,...,.072,.091,.645,.818,.452,.323,.374,.274,.509,.500
Thanasis Antetokounmpo,31,MIL,NBA,PF,34,155,.533,3.7,.967,.767,...,.033,.033,.552,.609,.400,,.000,.000,.750,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zion Williamson,23,NOP,NBA,PF,70,2207,.570,4.0,.984,.547,...,.003,.016,.574,.717,.396,.394,.333,.333,.474,.833
D.J. Wilson,27,PHI,NBA,PF,2,15,.667,10.3,.667,.333,...,.000,.333,.500,1.000,.000,,,1.000,1.000,1.000
Jalen Wilson,23,BRK,NBA,PF,43,664,.425,14.0,.587,.229,...,.022,.413,.495,.683,.356,.400,.500,.324,.615,1.000
Christian Wood,28,LAL,NBA,PF,50,872,.466,13.8,.568,.261,...,.042,.432,.587,.812,.424,.455,.182,.307,.761,.971


In [None]:
# Persist the 2013-14 table as CSV.
# NOTE(review): absolute, machine-specific path -- this cell only runs as-is
# on the original author's Windows machine.
csv_path_2014 = r'C:\Users\vaugh\Desktop\basketball-pf-research\bball-ref_data_2014.csv'
shooting_data_2014.to_csv(csv_path_2014)

In [12]:
# Persist the 2023-24 table as CSV.
# NOTE(review): absolute, machine-specific path -- this cell only runs as-is
# on the original author's Windows machine.
csv_path_2024 = r'C:\Users\vaugh\Desktop\basketball-pf-research\bball-ref_data_2024.csv'
shooting_data_2024.to_csv(csv_path_2024)