In [28]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playerprofilev2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
def fetch_player_attributes(player_ids):
    """
    Fetch position, height, and weight for each player ID.

    One ``CommonPlayerInfo`` request is made per ID. A failed request is
    reported on stdout and that player gets ``'N/A'`` for every attribute,
    so one bad ID never aborts the whole batch.

    Parameters:
        player_ids (iterable): Player IDs to look up.

    Returns:
        dict: ``{player_id: {'Position': ..., 'Height': ..., 'Weight': ...}}``.
        Each value is the scalar found in the first row of the endpoint's
        first data frame, or ``'N/A'`` when the column or row is missing.
    """
    def first_value(frame, column):
        # Indexing a DataFrame by column yields the whole column (a Series);
        # callers need the single value stored in the first row instead.
        if frame.empty or column not in frame.columns:
            return 'N/A'
        return frame[column].iloc[0]

    attributes = {}
    for player_id in player_ids:
        try:
            player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)
            player_info_df = player_info.get_data_frames()[0]

            attributes[player_id] = {
                'Position': first_value(player_info_df, 'POSITION'),
                'Height': first_value(player_info_df, 'HEIGHT'),
                'Weight': first_value(player_info_df, 'WEIGHT')
            }
        except Exception as e:
            # Best effort: record placeholders and keep going with the next ID.
            print(f"Error fetching attributes for player ID {player_id}: {e}")
            attributes[player_id] = {
                'Position': 'N/A',
                'Height': 'N/A',
                'Weight': 'N/A'
            }

    return attributes

def fetch_player_stats_by_season_with_attributes(start_year, end_year):
    """
    Collect league-wide player stats for a range of seasons, with attributes.

    For every year in ``[start_year, end_year]`` the season label
    ``"YYYY-YY"`` is built, the league player stats are requested, and the
    POSITION / HEIGHT / WEIGHT columns are filled from
    ``fetch_player_attributes``. A season that fails for any reason is
    reported on stdout and skipped.

    Parameters:
        start_year (int): The starting season year (e.g., 2000).
        end_year (int): The ending season year (e.g., 2024).

    Returns:
        pd.DataFrame: All successfully fetched seasons stacked together, or
        an empty DataFrame when nothing could be fetched.
    """
    # (output column name, key inside each per-player attribute dict)
    attribute_columns = (
        ('POSITION', 'Position'),
        ('HEIGHT', 'Height'),
        ('WEIGHT', 'Weight'),
    )
    season_frames = []

    for first_year in range(start_year, end_year + 1):
        season = f"{first_year}-{str(first_year + 1)[-2:]}"
        print(f"Fetching stats for season {season}...")

        try:
            endpoint = leaguedashplayerstats.LeagueDashPlayerStats(season=season)
            season_df = endpoint.get_data_frames()[0]
            season_df['SEASON'] = season

            lookup = fetch_player_attributes(season_df['PLAYER_ID'].unique())
            for column, key in attribute_columns:
                season_df[column] = season_df['PLAYER_ID'].map(
                    lambda pid, key=key: lookup[pid][key]
                )
        except Exception as e:
            print(f"Failed to fetch stats for season {season}: {e}")
            continue

        season_frames.append(season_df)

    if not season_frames:
        return pd.DataFrame()
    return pd.concat(season_frames, ignore_index=True)

# Season range to fetch; both bounds are inclusive (the helper iterates
# range(start_year, end_year + 1)).
start_year = 2000
end_year = 2023
all_stats = fetch_player_stats_by_season_with_attributes(start_year, end_year)

# Persist the result only when at least one season was fetched.
if not all_stats.empty:
    print(f"Successfully fetched stats for seasons {start_year} to {end_year}")
    all_stats.to_csv("player_stats_with_attributes_2000_2023.csv", index=False)
    print("Stats saved to player_stats_with_attributes_2000_2023.csv")
else:
    print("No data was fetched.")

# NOTE: This code was ultimately not used because of issues fetching player
# positions through nba_api (see the per-player errors in the output below).


Fetching stats for season 2000-01...
Error fetching attributes for player ID 344: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 239: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 2052: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 1051: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 1901: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 2109: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 961: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 84: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 1718: Expecting value: line 1 column 1 (char 0)
Error fetching attributes for player ID 895: Expecting value: line 1 column 1 (char 0)
Fetching stats for season 2001-02...
Error fetching attributes for player ID 239: Expecti

KeyboardInterrupt: 

In [9]:
# Request league-wide player stats for the 2000-01 season and preview the
# first rows of the first data frame the endpoint returns.
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season='2000-01')
stats_df = player_stats.get_data_frames()[0]
print(stats_df.head())

   PLAYER_ID     PLAYER_NAME NICKNAME     TEAM_ID TEAM_ABBREVIATION   AGE  GP  \
0        920      A.C. Green     A.C.  1610612748               MIA  37.0  82   
1       2062     A.J. Guyton     A.J.  1610612741               CHI  23.0  33   
2        243     Aaron McKie    Aaron  1610612755               PHI  28.0  76   
3       1425  Aaron Williams    Aaron  1610612751               NJN  29.0  82   
4        228      Adam Keefe     Adam  1610612744               GSW  31.0  67   

    W   L  W_PCT  ...  BLK_RANK  BLKA_RANK  PF_RANK  PFD_RANK  PTS_RANK  \
0  50  32  0.610  ...       271        263      220        39       212   
1   6  27  0.182  ...       311         57       92       112       292   
2  51  25  0.671  ...       271        376      316       112        87   
3  26  56  0.317  ...        24        404      441       112        91   
4  14  53  0.209  ...       169        209      200       112       307   

   PLUS_MINUS_RANK  NBA_FANTASY_PTS_RANK  DD2_RANK  TD3_RANK  

In [None]:
def flatten_position(value):
    """
    Collapse a Series-valued cell to its first element.

    Returns ``value.iloc[0]`` when ``value`` is a pandas Series; any other
    value (string, None, ...) is returned unchanged.
    """
    return value.iloc[0] if isinstance(value, pd.Series) else value

# Replace any Series-valued POSITION cells with their first element.
# NOTE(review): relies on `all_stats` already existing in the kernel; this
# cell overwrites the column in place.
all_stats['POSITION'] = all_stats['POSITION'].apply(flatten_position)


In [None]:
# Sanity check: list the distinct Python types stored in the POSITION column.
print(all_stats['POSITION'].apply(type).unique())

[<class 'str'> <class 'NoneType'>]


In [None]:
# Distinct non-null position labels; the output below shows that an empty
# string is among them.
unique_positions = all_stats['POSITION'].dropna().unique()
print("Unique positions in the dataset:", unique_positions)

# Bare expression: displays the full frame as the cell output.
all_stats

Unique positions in the dataset: ['Forward' 'Guard' 'Center-Forward' 'Center' 'Guard-Forward'
 'Forward-Center' 'Forward-Guard' '']


Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,SEASON,POSITION
0,920,A.C. Green,A.C.,1610612748,MIA,37.0,82,50,32,0.610,...,220,39,212,95,218,126,26,223,2000-01,Forward
1,2062,A.J. Guyton,A.J.,1610612741,CHI,23.0,33,6,27,0.182,...,92,112,292,378,317,224,26,314,2000-01,Guard
2,243,Aaron McKie,Aaron,1610612755,PHI,28.0,76,51,25,0.671,...,316,112,87,20,84,77,4,81,2000-01,Guard
3,1425,Aaron Williams,Aaron,1610612751,NJN,29.0,82,26,56,0.317,...,441,112,91,434,77,53,26,79,2000-01,Center-Forward
4,228,Adam Keefe,Adam,1610612744,GSW,31.0,67,14,53,0.209,...,200,112,307,385,274,224,26,280,2000-01,Forward
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12152,1641744,Zach Edey,Zach,1610612763,MEM,22.0,14,8,6,0.571,...,385,155,199,91,203,67,20,211,2024-25,Center
12153,203897,Zach LaVine,Zach,1610612741,CHI,29.0,21,10,11,0.476,...,339,68,28,216,50,91,20,38,2024-25,Guard
12154,1630192,Zeke Nnaji,Zeke,1610612743,DEN,23.0,14,9,5,0.643,...,102,343,405,439,408,146,20,412,2024-25,Forward-Center
12155,1630533,Ziaire Williams,Ziaire,1610612751,BKN,23.0,21,9,12,0.429,...,456,108,173,436,177,91,20,180,2024-25,Forward


In [None]:
#New code to scrape the basketballreference website

def download_webpage(url, output_file, timeout=30):
    """
    Download a webpage and save its text to a local file.

    Parameters:
        url (str): Address of the page to fetch.
        output_file (str): Path the response text is written to (UTF-8).
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` waits indefinitely, which can
            hang the notebook on a stalled connection.

    Returns:
        bool: True when the page was downloaded and saved; False when the
        request failed (the error is printed and no file is written).
    """
    print(f"Downloading data from: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the webpage: {e}")
        return False

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"Webpage successfully downloaded and saved as {output_file}")
    return True

def html_to_csv(input_file, output_csv):
    """
    Extract the table with id ``shooting`` from a saved HTML file into a CSV.

    Column names come from the last ``<tr>`` of the table's ``<thead>``.
    Only body rows that contain a ``<th scope="row">`` cell are kept; each
    kept row is padded with empty strings or truncated so that it has
    exactly one value per column name. Any error is printed, not raised.

    Parameters:
        input_file (str): Path of the HTML file to parse.
        output_csv (str): Path of the CSV file to write.
    """
    try:
        print(f"Parsing HTML file: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')

        table = soup.find('table', {'id': 'shooting'})
        if not table:
            print("Shooting table not found in the HTML file.")
            return

        header_row = table.find('thead').find_all('tr')[-1]
        column_names = [cell.get_text(strip=True) for cell in header_row.find_all('th')]
        width = len(column_names)

        records = []
        for tr in table.find('tbody').find_all('tr'):
            if not tr.find('th', {"scope": "row"}):
                continue
            # Leading <th> cell first, then every <td> cell of the row.
            record = [tr.find('th').get_text(strip=True)]
            record.extend(td.get_text(strip=True) for td in tr.find_all('td'))
            # Pad short rows with '' and cut long rows to the header width.
            records.append((record + [''] * (width - len(record)))[:width])

        pd.DataFrame(records, columns=column_names).to_csv(output_csv, index=False)
        print(f"Data successfully converted to CSV and saved as {output_csv}")
    except Exception as e:
        print(f"Error while converting HTML to CSV: {e}")

# Trial run on a single page (NBA_2001 shooting) before scraping every season.
url = "https://www.basketball-reference.com/leagues/NBA_2001_shooting.html#shooting"
html_file = "NBA_2001_shooting.html"
csv_file = "NBA_2001_shooting.csv"

download_webpage(url, html_file)
html_to_csv(html_file, csv_file)

# This was successful, so the next code cell repeats it for every season.



Downloading data from: https://www.basketball-reference.com/leagues/NBA_2001_shooting.html#shooting
Webpage successfully downloaded and saved as NBA_2001_shooting.html
Parsing HTML file: NBA_2001_shooting.html
Data successfully converted to CSV and saved as NBA_2001_shooting.csv


In [None]:
def download_webpage(url, output_file, timeout=30):
    """
    Download a webpage and save its text to a local file.

    Parameters:
        url (str): Address of the page to fetch.
        output_file (str): Path the response text is written to (UTF-8).
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` waits indefinitely, which can
            hang the notebook on a stalled connection.

    Returns:
        bool: True when the page was downloaded and saved; False when the
        request failed (the error is printed and no file is written).
    """
    print(f"Downloading data from: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the webpage: {e}")
        return False

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"Webpage successfully downloaded and saved as {output_file}")
    return True

def html_to_csv(input_file, output_csv):
    """
    Extract the table with id ``shooting`` from a saved HTML file into a CSV.

    Column names come from the last ``<tr>`` of the table's ``<thead>``.
    Only body rows that contain a ``<th scope="row">`` cell are kept; each
    kept row is padded with empty strings or truncated so that it has
    exactly one value per column name. Any error is printed, not raised.

    Parameters:
        input_file (str): Path of the HTML file to parse.
        output_csv (str): Path of the CSV file to write.
    """
    try:
        print(f"Parsing HTML file: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')

        table = soup.find('table', {'id': 'shooting'})
        if not table:
            print("Shooting table not found in the HTML file.")
            return

        header_row = table.find('thead').find_all('tr')[-1]
        column_names = [cell.get_text(strip=True) for cell in header_row.find_all('th')]
        width = len(column_names)

        records = []
        for tr in table.find('tbody').find_all('tr'):
            if not tr.find('th', {"scope": "row"}):
                continue
            # Leading <th> cell first, then every <td> cell of the row.
            record = [tr.find('th').get_text(strip=True)]
            record.extend(td.get_text(strip=True) for td in tr.find_all('td'))
            # Pad short rows with '' and cut long rows to the header width.
            records.append((record + [''] * (width - len(record)))[:width])

        pd.DataFrame(records, columns=column_names).to_csv(output_csv, index=False)
        print(f"Data successfully converted to CSV and saved as {output_csv}")
    except Exception as e:
        print(f"Error while converting HTML to CSV: {e}")

def scrape_seasons(start_year, end_year, delay_seconds=3.0):
    """
    Download and convert the league shooting page for each year in a range.

    For every ``year`` in ``[start_year, end_year]`` the page
    ``NBA_{year}_shooting.html`` is downloaded into the working directory
    and converted to ``NBA_{year}_shooting.csv``.

    Parameters:
        start_year (int): First year to scrape (inclusive).
        end_year (int): Last year to scrape (inclusive).
        delay_seconds (float): Pause inserted between consecutive page
            requests so the site is not hit in a tight loop. Pass 0 to
            disable the pause.
    """
    for year in range(start_year, end_year + 1):
        # Throttle between requests; no pause is needed before the first one.
        if year != start_year and delay_seconds > 0:
            time.sleep(delay_seconds)

        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_shooting.html#shooting"
        html_file = f"NBA_{year}_shooting.html"
        csv_file = f"NBA_{year}_shooting.csv"

        # An explicit False from the downloader means the page was not saved;
        # skip parsing so a stale local file is never mistaken for fresh
        # data. Any other return value (including None) proceeds to parsing.
        if download_webpage(url, html_file) is False:
            continue
        html_to_csv(html_file, csv_file)

# Scrape the shooting pages NBA_2001 through NBA_2024 (both bounds inclusive).
scrape_seasons(2001, 2024)


Downloading data from: https://www.basketball-reference.com/leagues/NBA_2001_shooting.html#shooting
Webpage successfully downloaded and saved as NBA_2001_shooting.html
Parsing HTML file: NBA_2001_shooting.html
Data successfully converted to CSV and saved as NBA_2001_shooting.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2002_shooting.html#shooting
Webpage successfully downloaded and saved as NBA_2002_shooting.html
Parsing HTML file: NBA_2002_shooting.html
Data successfully converted to CSV and saved as NBA_2002_shooting.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2003_shooting.html#shooting
Webpage successfully downloaded and saved as NBA_2003_shooting.html
Parsing HTML file: NBA_2003_shooting.html
Data successfully converted to CSV and saved as NBA_2003_shooting.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2004_shooting.html#shooting
Webpage successfully downloaded and saved as NBA_2004_shooti

In [None]:
#Merges the individual csvs into one csv

def merge_csv_files(directory, output_file):
    """
    Merge the per-season CSV files of a directory into a single CSV file.

    A file is treated as a per-season file when its name ends in ``.csv``
    and the second ``_``-separated token of the name is all digits (e.g.
    ``NBA_2001_shooting.csv`` -> season ``2001``). That token is stored in
    a new ``Season`` column. Other CSV files, including ``output_file``
    itself when it lives in ``directory``, are skipped so that re-running
    the merge never folds earlier merged or cleaned results back in.
    Files are processed in sorted name order, making the output
    deterministic. Any error is printed, not raised.

    Parameters:
        directory (str): Folder containing the per-season CSV files.
        output_file (str): Path of the merged CSV to write.
    """
    try:
        output_path = os.path.abspath(output_file)

        season_files = []
        for name in sorted(os.listdir(directory)):
            if not name.endswith('.csv'):
                continue
            parts = name.split('_')
            is_output = os.path.abspath(os.path.join(directory, name)) == output_path
            if is_output or len(parts) < 2 or not parts[1].isdigit():
                print(f"Skipping {name} (not a per-season file)")
                continue
            season_files.append((name, parts[1]))

        if not season_files:
            print(f"No per-season CSV files found in {directory}")
            return

        all_data = []
        for csv_file, season in season_files:
            print(f"Loading {csv_file}...")
            df = pd.read_csv(os.path.join(directory, csv_file))
            df['Season'] = season
            all_data.append(df)

        merged_data = pd.concat(all_data, ignore_index=True)

        merged_data.to_csv(output_file, index=False)
        print(f"All CSV files merged successfully into {output_file}")

    except Exception as e:
        print(f"Error while merging CSV files: {e}")

# Folder holding the per-season shooting CSVs and the path of the merged file.
# NOTE(review): these are absolute paths on one machine, and the scraper above
# writes its CSVs to the working directory, so the files were presumably moved
# here by hand — confirm.
directory = '/Users/nicholasrichards/Desktop/nba_shootingstats_csvs'
output_file = '/Users/nicholasrichards/Desktop/nba_shootingstats_merged.csv'

merge_csv_files(directory, output_file)


Loading NBA_2010_shooting.csv...
Loading NBA_2017_shooting.csv...
Loading NBA_2002_shooting.csv...
Loading NBA_2005_shooting.csv...
Loading NBA_2021_shooting.csv...
Loading NBA_2004_shooting.csv...
Loading NBA_2003_shooting.csv...
Loading NBA_2016_shooting.csv...
Loading NBA_2011_shooting.csv...
Loading NBA_2020_shooting.csv...
Loading NBA_2009_shooting.csv...
Loading NBA_2008_shooting.csv...
Loading NBA_2018_shooting.csv...
Loading NBA_2019_shooting.csv...
Loading NBA_2001_shooting.csv...
Loading NBA_2006_shooting.csv...
Loading NBA_2013_shooting.csv...
Loading NBA_2014_shooting.csv...
Loading NBA_2022_shooting.csv...
Loading NBA_2015_shooting.csv...
Loading NBA_2012_shooting.csv...
Loading NBA_2007_shooting.csv...
Loading NBA_2024_shooting.csv...
Loading NBA_2023_shooting.csv...
All CSV files merged successfully into /Users/nicholasrichards/Desktop/nba_shootingstats_merged.csv


In [None]:
import pandas as pd

def remove_players_with_low_mp(input_file, output_file):
    """
    Removes players with MP (minutes played) less than 1000 from the CSV file and saves the updated file.
    """
    try:
        df = pd.read_csv(input_file)

        if 'MP' in df.columns:
            df_filtered = df[df['MP'] >= 1000]
            print(f"Removed players with MP less than 1000. Remaining players: {len(df_filtered)}")
        else:
            print("'MP' column not found in the file.")
            return
        
        df_filtered.to_csv(output_file, index=False)
        print(f"Updated CSV saved as {output_file}")
    
    except Exception as e:
        print(f"Error while removing players with low MP: {e}")

# Input and output are the same path, so the cleaned shooting file is
# overwritten in place with the filtered rows.
# NOTE(review): the merge step above wrote nba_shootingstats_merged.csv; the
# step that produced nba_shootingstats_cleaned.csv is not shown — confirm.
input_file = '/Users/nicholasrichards/Desktop/nba_shootingstats_csvs/nba_shootingstats_cleaned.csv'
output_file = '/Users/nicholasrichards/Desktop/nba_shootingstats_csvs/nba_shootingstats_cleaned.csv'

remove_players_with_low_mp(input_file, output_file)


Removed players with MP less than 1000. Remaining players: 6600
Updated CSV saved as /Users/nicholasrichards/Desktop/nba_shootingstats_csvs/nba_shootingstats_cleaned.csv


In [None]:
def report_rows_in_csv(file_path):
    """
    Print how many data rows a CSV file contains.

    Parameters:
        file_path (str): CSV file to load.

    A load failure is printed instead of being raised.
    """
    try:
        row_count = pd.read_csv(file_path).shape[0]
        print(f"The CSV file has {row_count} rows.")

    except Exception as e:
        print(f"Error while loading the CSV file: {e}")

# Confirm the row count of the cleaned shooting file written by the previous cell.
file_path = '/Users/nicholasrichards/Desktop/nba_shootingstats_csvs/nba_shootingstats_cleaned.csv'

report_rows_in_csv(file_path)


The CSV file has 6600 rows.


In [None]:
#This is the same as the previous webscraper for basketball reference but now scraping for total stats not just shooting stats

#There was probably an easier way to scrape each url but pressed for time so did it this way

def download_webpage(url, output_file, timeout=30):
    """
    Download a webpage and save its text to a local file.

    Parameters:
        url (str): Address of the page to fetch.
        output_file (str): Path the response text is written to (UTF-8).
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` waits indefinitely, which can
            hang the notebook on a stalled connection.

    Returns:
        bool: True when the page was downloaded and saved; False when the
        request failed (the error is printed and no file is written).
    """
    print(f"Downloading data from: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the webpage: {e}")
        return False

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"Webpage successfully downloaded and saved as {output_file}")
    return True

def html_to_csv(input_file, output_csv):
    """
    Extract the table with id ``totals_stats`` from a saved HTML file into a CSV.

    Column names come from the last ``<tr>`` of the table's ``<thead>``.
    Only body rows that contain a ``<th scope="row">`` cell are kept; each
    kept row is padded with empty strings or truncated so that it has
    exactly one value per column name. Any error is printed, not raised.

    Parameters:
        input_file (str): Path of the HTML file to parse.
        output_csv (str): Path of the CSV file to write.
    """
    try:
        print(f"Parsing HTML file: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')

        table = soup.find('table', {'id': 'totals_stats'})
        if not table:
            print("Totals table not found in the HTML file.")
            return

        header_row = table.find('thead').find_all('tr')[-1]
        column_names = [cell.get_text(strip=True) for cell in header_row.find_all('th')]
        width = len(column_names)

        records = []
        for tr in table.find('tbody').find_all('tr'):
            if not tr.find('th', {"scope": "row"}):
                continue
            # Leading <th> cell first, then every <td> cell of the row.
            record = [tr.find('th').get_text(strip=True)]
            record.extend(td.get_text(strip=True) for td in tr.find_all('td'))
            # Pad short rows with '' and cut long rows to the header width.
            records.append((record + [''] * (width - len(record)))[:width])

        pd.DataFrame(records, columns=column_names).to_csv(output_csv, index=False)
        print(f"Data successfully converted to CSV and saved as {output_csv}")
    except Exception as e:
        print(f"Error while converting HTML to CSV: {e}")

def scrape_seasons(start_year, end_year, delay_seconds=3.0):
    """
    Download and convert the league totals page for each year in a range.

    For every ``year`` in ``[start_year, end_year]`` the page
    ``NBA_{year}_totals.html`` is downloaded into the working directory
    and converted to ``NBA_{year}_totals.csv``.

    Parameters:
        start_year (int): First year to scrape (inclusive).
        end_year (int): Last year to scrape (inclusive).
        delay_seconds (float): Pause inserted between consecutive page
            requests so the site is not hit in a tight loop. Pass 0 to
            disable the pause.
    """
    for year in range(start_year, end_year + 1):
        # Throttle between requests; no pause is needed before the first one.
        if year != start_year and delay_seconds > 0:
            time.sleep(delay_seconds)

        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_totals.html#totals_stats"
        html_file = f"NBA_{year}_totals.html"
        csv_file = f"NBA_{year}_totals.csv"

        # An explicit False from the downloader means the page was not saved;
        # skip parsing so a stale local file is never mistaken for fresh
        # data. Any other return value (including None) proceeds to parsing.
        if download_webpage(url, html_file) is False:
            continue
        html_to_csv(html_file, csv_file)

# Scrape the totals pages NBA_2001 through NBA_2024 (both bounds inclusive).
scrape_seasons(2001, 2024)



Downloading data from: https://www.basketball-reference.com/leagues/NBA_2001_totals.html#totals_stats
Webpage successfully downloaded and saved as NBA_2001_totals.html
Parsing HTML file: NBA_2001_totals.html
Data successfully converted to CSV and saved as NBA_2001_totals.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2002_totals.html#totals_stats
Webpage successfully downloaded and saved as NBA_2002_totals.html
Parsing HTML file: NBA_2002_totals.html
Data successfully converted to CSV and saved as NBA_2002_totals.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2003_totals.html#totals_stats
Webpage successfully downloaded and saved as NBA_2003_totals.html
Parsing HTML file: NBA_2003_totals.html
Data successfully converted to CSV and saved as NBA_2003_totals.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2004_totals.html#totals_stats
Webpage successfully downloaded and saved as NBA_2004_totals.html
Pars

In [None]:
def merge_csv_files(directory, output_file):
    """
    Merge the per-season CSV files of a directory into a single CSV file.

    A file is treated as a per-season file when its name ends in ``.csv``
    and the second ``_``-separated token of the name is all digits (e.g.
    ``NBA_2001_totals.csv`` -> season ``2001``). That token is stored in
    a new ``Season`` column. Other CSV files, including ``output_file``
    itself when it lives in ``directory``, are skipped so that re-running
    the merge never folds earlier merged or cleaned results back in.
    Files are processed in sorted name order, making the output
    deterministic. Any error is printed, not raised.

    Parameters:
        directory (str): Folder containing the per-season CSV files.
        output_file (str): Path of the merged CSV to write.
    """
    try:
        output_path = os.path.abspath(output_file)

        season_files = []
        for name in sorted(os.listdir(directory)):
            if not name.endswith('.csv'):
                continue
            parts = name.split('_')
            is_output = os.path.abspath(os.path.join(directory, name)) == output_path
            if is_output or len(parts) < 2 or not parts[1].isdigit():
                print(f"Skipping {name} (not a per-season file)")
                continue
            season_files.append((name, parts[1]))

        if not season_files:
            print(f"No per-season CSV files found in {directory}")
            return

        all_data = []
        for csv_file, season in season_files:
            print(f"Loading {csv_file}...")
            df = pd.read_csv(os.path.join(directory, csv_file))
            df['Season'] = season
            all_data.append(df)

        merged_data = pd.concat(all_data, ignore_index=True)

        merged_data.to_csv(output_file, index=False)
        print(f"All CSV files merged successfully into {output_file}")

    except Exception as e:
        print(f"Error while merging CSV files: {e}")

# Folder holding the per-season totals CSVs and the path of the merged file.
# NOTE(review): the merged file is written into the same folder that is being
# merged, so it is itself a .csv in `directory` on any later run.
directory = '/Users/nicholasrichards/Desktop/nba_totalstats_csvs'
output_file = '/Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_merged.csv'

merge_csv_files(directory, output_file)


Loading NBA_2013_totals.csv...
Loading NBA_2024_totals.csv...
Loading NBA_2002_totals.csv...
Loading NBA_2007_totals.csv...
Loading NBA_2021_totals.csv...
Loading NBA_2016_totals.csv...
Loading NBA_2011_totals.csv...
Loading NBA_2005_totals.csv...
Loading NBA_2019_totals.csv...
Loading NBA_2008_totals.csv...
Loading NBA_2014_totals.csv...
Loading NBA_2023_totals.csv...
Loading NBA_2006_totals.csv...
Loading NBA_2020_totals.csv...
Loading NBA_2017_totals.csv...
Loading NBA_2012_totals.csv...
Loading NBA_2003_totals.csv...
Loading NBA_2004_totals.csv...
Loading NBA_2018_totals.csv...
Loading NBA_2009_totals.csv...
Loading NBA_2015_totals.csv...
Loading NBA_2022_totals.csv...
Loading NBA_2010_totals.csv...
Loading NBA_2001_totals.csv...
All CSV files merged successfully into /Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_merged.csv


In [None]:
def sort_csv_by_season(input_file, output_file):
    """
    Rewrite a CSV with its rows ordered by ascending ``Season``.

    ``Season`` is converted to a number first; values that cannot be
    converted become NaN. Any error is printed, not raised.

    Parameters:
        input_file (str): CSV file to read.
        output_file (str): CSV file to write the sorted rows to.
    """
    try:
        table = pd.read_csv(input_file)
        table['Season'] = pd.to_numeric(table['Season'], errors='coerce')

        table.sort_values(by='Season', ascending=True).to_csv(output_file, index=False)
        print(f"CSV file sorted by season and saved as {output_file}")

    except Exception as e:
        print(f"Error while sorting the CSV: {e}")

# Input and output are the same path: the cleaned totals file is sorted in place.
input_file = '/Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_cleaned.csv'
output_file = '/Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_cleaned.csv'

sort_csv_by_season(input_file, output_file)

# NOTE: Some of the cleaning code was overwritten in this cell and is no longer
# shown. All of the cleaning done for the shooting stats data was also done
# for the totals and advanced stats data.



CSV file sorted by season and saved as /Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_cleaned.csv


In [None]:
def download_webpage(url, output_file, timeout=30):
    """
    Download a webpage and save its text to a local file.

    Parameters:
        url (str): Address of the page to fetch.
        output_file (str): Path the response text is written to (UTF-8).
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` waits indefinitely, which can
            hang the notebook on a stalled connection.

    Returns:
        bool: True when the page was downloaded and saved; False when the
        request failed (the error is printed and no file is written).
    """
    print(f"Downloading data from: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error while downloading the webpage: {e}")
        return False

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"Webpage successfully downloaded and saved as {output_file}")
    return True

def html_to_csv(input_file, output_csv):
    """
    Extract the table with id ``advanced`` from a saved HTML file into a CSV.

    Column names come from the last ``<tr>`` of the table's ``<thead>``.
    Only body rows that contain a ``<th scope="row">`` cell are kept; each
    kept row is padded with empty strings or truncated so that it has
    exactly one value per column name. Any error is printed, not raised.

    Parameters:
        input_file (str): Path of the HTML file to parse.
        output_csv (str): Path of the CSV file to write.
    """
    try:
        print(f"Parsing HTML file: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')

        table = soup.find('table', {'id': 'advanced'})
        if not table:
            print("Advanced table not found in the HTML file.")
            return

        header_row = table.find('thead').find_all('tr')[-1]
        column_names = [cell.get_text(strip=True) for cell in header_row.find_all('th')]
        width = len(column_names)

        records = []
        for tr in table.find('tbody').find_all('tr'):
            if not tr.find('th', {"scope": "row"}):
                continue
            # Leading <th> cell first, then every <td> cell of the row.
            record = [tr.find('th').get_text(strip=True)]
            record.extend(td.get_text(strip=True) for td in tr.find_all('td'))
            # Pad short rows with '' and cut long rows to the header width.
            records.append((record + [''] * (width - len(record)))[:width])

        pd.DataFrame(records, columns=column_names).to_csv(output_csv, index=False)
        print(f"Data successfully converted to CSV and saved as {output_csv}")
    except Exception as e:
        print(f"Error while converting HTML to CSV: {e}")

def scrape_seasons(start_year, end_year, delay_seconds=3.0):
    """
    Download and convert the league advanced-stats page for each year in a range.

    For every ``year`` in ``[start_year, end_year]`` the page
    ``NBA_{year}_advanced.html`` is downloaded into the working directory
    and converted to ``NBA_{year}_advanced.csv``.

    Parameters:
        start_year (int): First year to scrape (inclusive).
        end_year (int): Last year to scrape (inclusive).
        delay_seconds (float): Pause inserted between consecutive page
            requests so the site is not hit in a tight loop. Pass 0 to
            disable the pause.
    """
    for year in range(start_year, end_year + 1):
        # Throttle between requests; no pause is needed before the first one.
        if year != start_year and delay_seconds > 0:
            time.sleep(delay_seconds)

        url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html#advanced"
        html_file = f"NBA_{year}_advanced.html"
        csv_file = f"NBA_{year}_advanced.csv"

        # An explicit False from the downloader means the page was not saved;
        # skip parsing so a stale local file is never mistaken for fresh
        # data. Any other return value (including None) proceeds to parsing.
        if download_webpage(url, html_file) is False:
            continue
        html_to_csv(html_file, csv_file)
        
# Scrape the advanced-stats pages NBA_2001 through NBA_2024 (both bounds inclusive).
scrape_seasons(2001, 2024)

Downloading data from: https://www.basketball-reference.com/leagues/NBA_2001_advanced.html#advanced
Webpage successfully downloaded and saved as NBA_2001_advanced.html
Parsing HTML file: NBA_2001_advanced.html
Data successfully converted to CSV and saved as NBA_2001_advanced.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2002_advanced.html#advanced
Webpage successfully downloaded and saved as NBA_2002_advanced.html
Parsing HTML file: NBA_2002_advanced.html
Data successfully converted to CSV and saved as NBA_2002_advanced.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2003_advanced.html#advanced
Webpage successfully downloaded and saved as NBA_2003_advanced.html
Parsing HTML file: NBA_2003_advanced.html
Data successfully converted to CSV and saved as NBA_2003_advanced.csv
Downloading data from: https://www.basketball-reference.com/leagues/NBA_2004_advanced.html#advanced
Webpage successfully downloaded and saved as NBA_2004_advanc

In [None]:
def merge_csv_files(directory, output_file):
    """
    Merge the per-season CSV files of a directory into a single CSV file.

    A file is treated as a per-season file when its name ends in ``.csv``
    and the second ``_``-separated token of the name is all digits (e.g.
    ``NBA_2001_advanced.csv`` -> season ``2001``). That token is stored in
    a new ``Season`` column. Other CSV files, including ``output_file``
    itself when it lives in ``directory``, are skipped so that re-running
    the merge never folds earlier merged or cleaned results back in.
    Files are processed in sorted name order, making the output
    deterministic. Any error is printed, not raised.

    Parameters:
        directory (str): Folder containing the per-season CSV files.
        output_file (str): Path of the merged CSV to write.
    """
    try:
        output_path = os.path.abspath(output_file)

        season_files = []
        for name in sorted(os.listdir(directory)):
            if not name.endswith('.csv'):
                continue
            parts = name.split('_')
            is_output = os.path.abspath(os.path.join(directory, name)) == output_path
            if is_output or len(parts) < 2 or not parts[1].isdigit():
                print(f"Skipping {name} (not a per-season file)")
                continue
            season_files.append((name, parts[1]))

        if not season_files:
            print(f"No per-season CSV files found in {directory}")
            return

        all_data = []
        for csv_file, season in season_files:
            print(f"Loading {csv_file}...")
            df = pd.read_csv(os.path.join(directory, csv_file))
            df['Season'] = season
            all_data.append(df)

        merged_data = pd.concat(all_data, ignore_index=True)

        merged_data.to_csv(output_file, index=False)
        print(f"All CSV files merged successfully into {output_file}")

    except Exception as e:
        print(f"Error while merging CSV files: {e}")

# Folder holding the per-season advanced CSVs and the path of the merged file.
# NOTE(review): the merged file is written into the same folder that is being
# merged, so it is itself a .csv in `directory` on any later run.
directory = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs'
output_file = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_merged.csv'


merge_csv_files(directory, output_file)


Loading NBA_2019_advanced.csv...
Loading NBA_2018_advanced.csv...
Loading NBA_2023_advanced.csv...
Loading NBA_2024_advanced.csv...
Loading NBA_2012_advanced.csv...
Loading NBA_2015_advanced.csv...
Loading NBA_2007_advanced.csv...
Loading NBA_2022_advanced.csv...
Loading NBA_2006_advanced.csv...
Loading NBA_2001_advanced.csv...
Loading NBA_2014_advanced.csv...
Loading NBA_2013_advanced.csv...
Loading NBA_2020_advanced.csv...
Loading NBA_2003_advanced.csv...
Loading NBA_2004_advanced.csv...
Loading NBA_2011_advanced.csv...
Loading NBA_2016_advanced.csv...
Loading NBA_2021_advanced.csv...
Loading NBA_2017_advanced.csv...
Loading NBA_2010_advanced.csv...
Loading NBA_2005_advanced.csv...
Loading NBA_2002_advanced.csv...
Loading NBA_2008_advanced.csv...
Loading NBA_2009_advanced.csv...
All CSV files merged successfully into /Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_merged.csv


In [None]:
def remove_players_with_low_mp(input_file, output_file):
    """
    Removes players with MP (minutes played) less than 1000 from the CSV file and saves the updated file.
    """
    try:
        df = pd.read_csv(input_file)

        if 'MP' in df.columns:
            df_filtered = df[df['MP'] >= 1000]
            print(f"Removed players with MP less than 1000. Remaining players: {len(df_filtered)}")
        else:
            print("'MP' column not found in the file.")
            return
        
        df_filtered.to_csv(output_file, index=False)
        print(f"Updated CSV saved as {output_file}")
    
    except Exception as e:
        print(f"Error while removing players with low MP: {e}")

# Filter the advanced stats file and write the result to the cleaned file.
# NOTE(review): the step that produced nba_advancedstats_noawards.csv is not
# shown in this notebook — confirm where it comes from.
input_file = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_noawards.csv'
output_file = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv'

remove_players_with_low_mp(input_file, output_file)



Removed players with MP less than 1000. Remaining players: 6600
Updated CSV saved as /Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv


In [None]:
def sort_csv_by_season(input_file, output_file):
    """
    Rewrite a CSV with its rows ordered by ascending ``Season``.

    ``Season`` is converted to a number first; values that cannot be
    converted become NaN. Any error is printed, not raised.

    Parameters:
        input_file (str): CSV file to read.
        output_file (str): CSV file to write the sorted rows to.
    """
    try:
        table = pd.read_csv(input_file)
        table['Season'] = pd.to_numeric(table['Season'], errors='coerce')

        table.sort_values(by='Season', ascending=True).to_csv(output_file, index=False)
        print(f"CSV file sorted by season and saved as {output_file}")

    except Exception as e:
        print(f"Error while sorting the CSV: {e}")

# Input and output are the same path: the cleaned advanced file is sorted in place.
input_file = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv'
output_file = '/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv'

sort_csv_by_season(input_file, output_file)


CSV file sorted by season and saved as /Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv


In [None]:
# Merge the three cleaned CSVs (shooting, totals, advanced) into one master CSV.

shooting_csv = "/Users/nicholasrichards/Desktop/nba_shootingstats_csvs/nba_shootingstats_cleaned.csv"
totals_csv = "/Users/nicholasrichards/Desktop/nba_totalstats_csvs/nba_totalstats_cleaned.csv"
advanced_csv = "/Users/nicholasrichards/Desktop/nba_advancedstats_csvs/nba_advancedstats_cleaned.csv"

output_csv = "/Users/nicholasrichards/Desktop/nba_combinedstats.csv"

# Key columns used to match rows across the three tables.
merge_columns = ["Player", "Age", "Team", "Pos", "G", "GS", "MP", "Season"]

shooting_df = pd.read_csv(shooting_csv)
totals_df = pd.read_csv(totals_csv)
advanced_df = pd.read_csv(advanced_csv)

# Inner joins: only rows whose key columns match in all three tables survive.
# Non-key columns that clash between shooting and totals get the suffixes
# below; the second merge uses pandas' default suffixes.
merged_df = pd.merge(
    pd.merge(
        shooting_df,
        totals_df,
        how="inner",
        on=merge_columns,
        suffixes=("_shooting", "_totals"),
    ),
    advanced_df,
    how="inner",
    on=merge_columns,
)

merged_df.to_csv(output_csv, index=False)

print(f"Merged data has been saved to {output_csv}")
print(f"Number of rows in the merged dataset: {merged_df.shape[0]}")


Merged data has been saved to /Users/nicholasrichards/Desktop/nba_combinedstats.csv
Number of rows in the merged dataset: 6600


In [None]:
# Cleaning the master CSV: drop the columns listed below and save the result.

# Input and output file paths
input_csv = "/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats.csv"
output_csv = "/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv"

columns_to_remove = [
    "2P.2", "3P.2", "#", "Att.", "Md.", "Rk_totals", "FG%_totals",
    "eFG", "PF", "PTS", "Trp-Dbl", "Rk", "TS%", "USG%",
    "OWS", "DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP"
]

try:
    df = pd.read_csv(input_csv)

    # errors="ignore": listed columns that are absent from the file are skipped.
    cleaned_df = df.drop(labels=columns_to_remove, axis=1, errors="ignore")

    cleaned_df.to_csv(output_csv, index=False)

    print(f"Cleaned dataset saved to {output_csv}")
    print(f"Number of remaining columns: {cleaned_df.shape[1]}")
    print(f"Number of rows: {cleaned_df.shape[0]}")
except Exception as e:
    # Report the failure instead of raising so the notebook keeps running.
    print(f"Error processing the dataset: {e}")

Cleaned dataset saved to /Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv
Number of remaining columns: 55
Number of rows: 6600


In [None]:
# First pass at the random-forest position classifier. This was only the
# beginning of the training work; it was continued on Wendy's computer.
file_path = "/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv"

df = pd.read_csv(file_path)

# Model inputs.
features = [
    "FG%", "AvgDist", "2PA%", "0-3A%", "3-10A%", "10-16A%", "16-3PA%",
    "3PA%", "2P%", "0-3%", "3-10%", "10-16%", "16-3P%", "3P%",
    "DnkFGA%", "Cnr3PA%", "Cnr3P%", "FT%", "FTr", "ORB%", "AST%",
    "STL%", "BLK%", "TOV%"
]

# Fail fast if any feature, the target ('Pos') or the split column ('Season')
# is absent from the file.
required_columns = features + ["Pos", "Season"]
missing_features = [col for col in required_columns if col not in df.columns]
if missing_features:
    raise ValueError(f"Missing columns in the dataset: {missing_features}")

# Hold out the 2024 season for evaluation; train on every other season.
is_holdout = df["Season"] == 2024
train_data = df[~is_holdout]
test_data = df[is_holdout]

X_train, y_train = train_data[features], train_data["Pos"]
X_test, y_test = test_data[features], test_data["Pos"]

# Fixed random_state keeps the fitted forest reproducible between runs.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Classification Report:
              precision    recall  f1-score   support

           C       0.85      0.83      0.84        47
          PF       0.46      0.59      0.52        59
          PG       0.70      0.60      0.65        58
          SF       0.54      0.41      0.47        61
          SG       0.46      0.51      0.48        61

    accuracy                           0.58       286
   macro avg       0.60      0.59      0.59       286
weighted avg       0.59      0.58      0.58       286

Accuracy: 0.58


In [None]:
# Re-attach the 'PTS' column from the combined stats file to the cleaned
# dataset. The result overwrites the cleaned file in place, so this cell is
# written to be safe to re-run.
original_file = '/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats.csv'
cleaned_file = '/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv'
output_file = '/Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv'

original_data = pd.read_csv(original_file)
cleaned_data = pd.read_csv(cleaned_file)

# Match on 'Season' too whenever both files carry it: Player/Age/Team/Pos
# alone can pair a row with another season's (or another player's) points.
candidate_keys = ['Player', 'Age', 'Team', 'Pos', 'Season']
join_keys = [
    key for key in candidate_keys
    if key in cleaned_data.columns and key in original_data.columns
]

# Exactly one PTS value per key: duplicate keys on the right side would
# multiply rows in the left merge below.
points_lookup = original_data[join_keys + ['PTS']].drop_duplicates(subset=join_keys)

merged_data = pd.merge(
    # Drop any 'PTS' left by an earlier run; otherwise a re-run would produce
    # 'PTS_x' / 'PTS_y' instead of a single 'PTS' column.
    cleaned_data.drop(columns=['PTS'], errors='ignore'),
    points_lookup,
    on=join_keys,
    how='left',
    # Cheap assertion that the lookup really is unique per key.
    validate='many_to_one',
)

merged_data.to_csv(output_file, index=False)

print(f"Merged dataset saved to: {output_file}")


Merged dataset saved to: /Users/nicholasrichards/Desktop/machine_learning/nba_combinedstats_cleaned.csv
