In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import unicodedata

In [2]:
def convert_foreign_characters(text):
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')

In [3]:
# Extracting Data from BBall Reference
year = 2024
url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
data = requests.get(url) #retrieve data from basketball reference
page = data.text #extract html
soup = BeautifulSoup(page, 'html.parser') # parse html

# Adding all td elements to list
link_list = []
for link in soup.findAll('td'):
    link_list.append(link)

In [4]:
# Taking the extracted data and parsing strings
player_name_list = []
url_params_list = []

for link in link_list:
    if "data-append-csv" in str(link):
        player_name = re.search(r'csk="(\S+)"', str(link)) 
        url_param = re.search(r'data-append-csv="(\S+)"', str(link))
        player_name_list.append(player_name)
        url_params_list.append(url_param)

In [5]:
# Clean and transform player names
player_list = []

for player in player_name_list:
    clean_name = str(player).split("csk=")[-1].replace('"', '').replace("'>", "")
    split_names = clean_name.split(",")
    first_last = split_names[-1] + " " + split_names[0]
    player_list.append(first_last)

In [6]:
# Clean and transform url parameters
param_list = []

for url in url_params_list:
    param = str(url).split("data-append-csv=")[-1].replace('"', '').replace("'>", "")
    param_list.append(param)

# 12/1/2023 Update - Bring in Player Position

In [7]:
position_list = []

for link in link_list:
    if 'data-stat="pos"' in str(link):
        position_list.append(str(link).split(" ")[-1])

In [8]:
clean_positions = []

for pos in position_list:
    prefix_clean = pos.split('data-stat="pos"')[-1].replace(">", "")
    suffix_clean = prefix_clean.split("<")[0]
    clean_positions.append(suffix_clean)

In [15]:
team_list = []

for link in link_list:
    if 'data-stat="team_id"' in str(link):
        team_list.append(str(link).split('2024.html">')[-1][:3])

In [17]:
# combine and read into pandas
player_param_pair = dict(zip(player_list, param_list))
player_position_pair = dict(zip(player_list,clean_positions))
player_team_pair = dict(zip(player_list, team_list))

# Create Mappings for each player based on position and URL parameter
player_pos_df = pd.DataFrame(player_position_pair.items(), columns = ['Name', 'Pos'])
player_param_df = pd.DataFrame(player_param_pair.items(), columns =['Name', 'URL_Param'])
player_team_df = pd.DataFrame(player_team_pair.items(), columns = ['Name', 'Team'])

# join both data sources on position
df = player_param_df.merge(player_pos_df, on = 'Name')
df = df.merge(player_team_df, on = 'Name')
df['Name'] = df['Name'].apply(convert_foreign_characters) # Clean foreign characters
df['Name'] = df['Name'].str.replace("'", "")
df['Name'] = df['Name'].str.lower()

file_name = f'REFERENCE TABLE - Player URL Mapping {year-1}-{year} Season.xlsx'
df.to_excel(file_name, index = False)
print(f'File Saved to {os.getcwd()}!')

File Saved to C:\Users\loudr\Desktop\NBA Sports Betting Models\Scripts!


In [18]:
df[df['Name'].str.contains("fox", case = False)] # Clean "\"

Unnamed: 0,Name,URL_Param,Pos,Team
134,de\aaron fox,foxde01,PG,SAC


In [22]:
pd.DataFrame(team_list)[0].unique() # Checking for any erroneous team assignments

array(['TOR', 'MIA', 'UTA', 'MEM', 'MIN', 'PHO', 'CLE', 'NOP', 'MIL',
       'ORL', 'NYK', 'WAS', 'POR', 'DET', 'CHO', 'PHI', 'BOS', 'SAS',
       'SAC', '<td', 'LAC', 'OKC', 'ATL', 'CHI', 'DEN', 'BRK', 'HOU',
       'IND', 'DAL', 'LAL', 'GSW'], dtype=object)