In [None]:
import os
import random
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
import streamlit as st
from streamlit_option_menu import option_menu

from nba_stats_predictor import NBAStatsPredictor


In [29]:
# Basketball Reference player IDs in "letter/id" form (e.g. "j/jamesle01").
# Kept as a whitespace-separated block and split into a list of 50 strings;
# the order is the order of the original list.
player_ids = """
    j/jamesle01 c/curryst01 d/duranke01 a/antetgi01 d/doncilu01
    j/jokicni01 e/embiijo01 t/tatumja01 b/butleji01 l/leonaka01
    l/lillada01 h/hardeja01 d/davisan02 b/bookede01 m/mitchdo01
    w/willizi01 m/moranja01 y/youngtr01 t/townska01 b/bealbr01
    g/georgpa01 i/irvinky01 p/paulch01 d/derozde01 w/westbru01
    a/adebaba01 h/holidjr01 m/middlkh01 s/siakapa01 v/vanvlfr01
    g/gilgesh01 i/ingrabr01 m/mccolcj01 b/ballla01 h/halibty01
    r/randlju01 b/barrerj01 f/foxde01 s/sabondo01 t/turnemy01
    p/portemi01 m/murraja01 w/wiggian01 g/greenra01 v/vucicni01
    m/mobleev01 s/smithja02 b/barnesc01 b/banchpa01 s/suggsca01
""".split()



In [38]:


def scrape_player_game_logs(player_id, season="2025", timeout=30):
    """
    Scrapes a player's NBA season game logs from Basketball Reference.

    The page is fetched once. The table with HTML id 'pgl_basic' is preferred;
    if it cannot be located, the table with the most rows on the page is used
    instead. The chosen table is cleaned with process_dataframe() and tagged
    with a 'player_id' column.

    Args:
        player_id (str): Player ID from Basketball Reference (e.g., 'jamesle01' or 'j/jamesle01')
        season (str): Season year (e.g., '2024' for 2023-2024 season)
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        pandas.DataFrame or None: The processed game log, or None when the
        player ID is empty, the request does not return HTTP 200, no table is
        found, or any error occurs while scraping/processing.
    """
    # An empty ID cannot be turned into a URL (player_id[0] would raise).
    if not player_id:
        print("Error during scraping: empty player_id")
        return None

    # Check if player_id already contains the first letter path
    if '/' in player_id:
        # Player ID already includes the letter path (e.g., 'j/jamesle01')
        url = f"https://www.basketball-reference.com/players/{player_id}/gamelog/{season}"
    else:
        # Player ID is just the ID part (e.g., 'jamesle01')
        url = f"https://www.basketball-reference.com/players/{player_id[0]}/{player_id}/gamelog/{season}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    print(f"Sending request to Basketball Reference for player {player_id}...")
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Request failed with status code: {response.status_code}")
            return None

        print(f"Request successful! Status code: {response.status_code}")
        print("Parsing tables with pandas...")

        game_log_df = None
        # Only the table lookup is guarded here, so that a failure inside
        # process_dataframe() is not misreported as a missing table.
        try:
            tables = pd.read_html(StringIO(response.text), attrs={'id': 'pgl_basic'})
            if tables:
                print("Found game log table by ID!")
                game_log_df = tables[0]
        except ValueError as e:
            # pandas raises ValueError when no table matches the attrs filter.
            print(f"Couldn't find table by ID: {e}")

        if game_log_df is None:
            # Fallback: take the table with the most rows on the page.
            tables = pd.read_html(StringIO(response.text))
            if not tables:
                print("No tables found on the page.")
                return None
            game_log_df = max(tables, key=len)
            print(f"Table found! Dimensions: {game_log_df.shape}")

        processed_df = process_dataframe(game_log_df)
        processed_df['player_id'] = player_id  # Add player_id column
        return processed_df
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller move on.
        print(f"Error during scraping: {e}")
        return None

def process_dataframe(game_log_df):
    """
    Process the raw dataframe from Basketball Reference to clean and format it properly.

    Steps, in order:
      1. Flatten MultiIndex column headers into single strings, ignoring
         auto-generated 'Unnamed: N_level_M' placeholder levels.
      2. Drop repeated header rows (first column contains "Rk").
      3. Drop columns whose name contains 'Unnamed'.
      4. Normalise the team column to 'Team' and fill gaps with 'Unknown'.
      5. Locate the minutes column ('MP', 'MIN' or a known variant) and
         convert it to decimal minutes via convert_minutes_format().
      6. Coerce every remaining non-descriptive column to numeric
         (unparseable values become NaN).

    The input frame is never modified; a new DataFrame is returned.

    Args:
        game_log_df (pandas.DataFrame): Raw table as parsed by pandas.read_html.

    Returns:
        pandas.DataFrame: Cleaned frame. An 'MP' column is always present
        (filled with None when no minutes column could be found).
    """
    print("Processing the game log dataframe...")
    # Print the first few rows to help with debugging
    print("First few rows of raw data:")
    print(game_log_df.head(3))

    # Work on a private copy so the caller's frame (and its columns) stay intact.
    game_log_df = game_log_df.copy()

    if isinstance(game_log_df.columns, pd.MultiIndex):
        # Handle multi-level columns: join the level labels, skipping the
        # placeholder labels pandas generates for empty header cells.
        flat_names = []
        for levels in game_log_df.columns.values:
            labels = [str(level) for level in levels]
            named = [
                label for label in labels
                if not (label.startswith('Unnamed:') and '_level_' in label)
            ]
            # If every level is a placeholder keep the raw name, so the column
            # is still recognised (and dropped) as unnamed further down.
            flat_names.append(' '.join(named if named else labels).strip())
        game_log_df.columns = flat_names

    print("Raw columns:", game_log_df.columns.tolist())

    # Remove header rows (where Rk column appears again in the data)
    if game_log_df.shape[1] > 0:
        is_header_row = game_log_df.iloc[:, 0].astype(str).str.contains("Rk")
        # .copy() so later column assignments do not write into a slice.
        game_log_df = game_log_df[~is_header_row].copy()

    # Remove unnamed columns
    unnamed_cols = [col for col in game_log_df.columns if 'Unnamed' in str(col)]
    if unnamed_cols:
        game_log_df = game_log_df.drop(columns=unnamed_cols)

    # Handle team column naming
    if 'Tm' in game_log_df.columns and 'Team' not in game_log_df.columns:
        game_log_df = game_log_df.rename(columns={'Tm': 'Team'})
    elif 'Tm' in game_log_df.columns and 'Team' in game_log_df.columns and game_log_df['Team'].isna().all():
        game_log_df['Team'] = game_log_df['Tm']
        game_log_df = game_log_df.drop(columns=['Tm'])

    # Fill missing team values
    if 'Team' in game_log_df.columns and game_log_df['Team'].isna().any():
        game_log_df['Team'] = game_log_df['Team'].fillna('Unknown')
    elif 'Team' not in game_log_df.columns and 'Tm' not in game_log_df.columns:
        game_log_df['Team'] = 'Unknown'

    # Resolve the minutes column BEFORE the numeric coercion below: coercing
    # first would turn 'MM:SS' strings in an alternate column into NaN.
    if 'MP' not in game_log_df.columns:
        if 'MIN' in game_log_df.columns:
            game_log_df = game_log_df.rename(columns={'MIN': 'MP'})
        else:
            minute_column_variants = ['Minutes', 'Min', 'Mins', 'Minutes Played']
            for col_name in minute_column_variants:
                if col_name in game_log_df.columns:
                    game_log_df['MP'] = game_log_df[col_name]
                    print(f"Found minutes data in column: {col_name}")
                    break

    if 'MP' in game_log_df.columns:
        game_log_df['MP'] = game_log_df['MP'].apply(lambda x: convert_minutes_format(x) if pd.notna(x) else x)

    # Convert all possible numeric columns
    for col in game_log_df.columns:
        if col not in ['Date', 'Tm', 'Team', 'Opp', 'Result', 'GS', 'player_id']:
            game_log_df[col] = pd.to_numeric(game_log_df[col], errors='coerce')

    # Guarantee an MP column even when no minutes data was available
    if 'MP' not in game_log_df.columns:
        print("MP column not found in the data. Available columns:", game_log_df.columns.tolist())
        game_log_df['MP'] = None  # Create the column with None values if it doesn't exist

    expected_columns = ['Date', 'Team', 'Opp', 'Result', 'MP']
    missing_cols = [col for col in expected_columns if col not in game_log_df.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}")

    return game_log_df

def convert_minutes_format(minutes_str):
    """
    Turn a minutes-played value into decimal minutes.

    Accepted inputs:
      * int / float          -> returned as float
      * 'MM:SS' string       -> MM + SS/60 (e.g. '36:12' -> 36.2)
      * plain numeric string -> float(value)

    Anything else (other types, strings with more than one ':', or values
    that fail to parse) yields None; parse failures are also printed.
    """
    try:
        # Already numeric: just normalise to float.
        if isinstance(minutes_str, (int, float)):
            return float(minutes_str)

        # Only strings are handled beyond this point.
        if not isinstance(minutes_str, str):
            return None

        # No colon means the string should be a plain number.
        if ':' not in minutes_str:
            return float(minutes_str)

        pieces = minutes_str.split(':')
        if len(pieces) != 2:
            return None

        whole_minutes = int(pieces[0])
        leftover_seconds = int(pieces[1])
        return whole_minutes + leftover_seconds/60
    except Exception as e:
        print(f"Error converting minutes format: {e}, value was: {minutes_str}")
        return None

def save_to_csv(df, filename='nba_player_game_logs.csv'):
    """
    Write a DataFrame to disk as CSV (without the index column).

    Args:
        df: DataFrame to write, or None.
        filename (str): Destination path.

    Returns:
        bool: True when the file was written; False when df is None or the
        write raised an exception (the error is printed, not re-raised).
    """
    # Nothing to write: report and bail out early.
    if df is None:
        print("No data to save.")
        return False

    try:
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
        return True
    except Exception as err:
        print(f"Error saving CSV: {err}")
        return False

def main():
    """
    Scrape the season game logs for a fixed list of players and save them.

    For each player the game log is fetched with scrape_player_game_logs().
    Successful results are collected, a checkpoint CSV is written whenever the
    running player count hits a multiple of five on a successful scrape, and a
    randomised pause is taken before the next player. At the end all collected
    logs are concatenated, written to 'nba_game_logs_<season>.csv' and summarised.
    """
    # Players to scrape, as Basketball Reference "letter/id" paths
    player_ids = [
        "j/jamesle01", "c/curryst01", "d/duranke01", "a/antetgi01", "d/doncilu01",
        "j/jokicni01", "e/embiijo01", "t/tatumja01", "b/butleji01", "l/leonaka01",
        "l/lillada01", "h/hardeja01", "d/davisan02", "b/bookede01", "m/mitchdo01",
        "w/willizi01", "m/moranja01", "y/youngtr01", "t/townska01", "b/bealbr01",
        "g/georgpa01", "i/irvinky01", "p/paulch01", "d/derozde01", "w/westbru01",
        "a/adebaba01", "h/holidjr01", "m/middlkh01", "s/siakapa01", "v/vanvlfr01",
        "g/gilgesh01", "i/ingrabr01", "m/mccolcj01", "b/ballla01", "h/halibty01",
        "r/randlju01", "b/barrerj01", "f/foxde01", "s/sabondo01", "t/turnemy01",
        "p/portemi01", "m/murraja01", "w/wiggian01", "g/greenra01", "v/vucicni01",
        "m/mobleev01", "s/smithja02", "b/barnesc01", "b/banchpa01", "s/suggsca01"
    ]
    season = "2025"
    total_players = len(player_ids)

    print(f"Starting NBA game log scraper for {total_players} players...")
    start_time = time.time()

    all_game_logs = []
    checkpoint_interval = 5  # write a checkpoint when the player count is a multiple of this

    for position, player_id in enumerate(player_ids, start=1):
        print(f"\nScraping data for player {position}/{total_players}: {player_id}")
        player_game_logs = scrape_player_game_logs(player_id, season)
        more_players_remaining = position < total_players

        if player_game_logs is None:
            print(f"Failed to scrape data for {player_id}")

            # Back off a little longer after a failure before trying the next player
            if more_players_remaining:
                wait_time = 15 + random.randint(5, 15)
                print(f"Waiting {wait_time} seconds before next player...")
                time.sleep(wait_time)
            continue

        all_game_logs.append(player_game_logs)
        print(f"Successfully scraped {len(player_game_logs)} games for {player_id}")

        # Periodic checkpoint of everything collected so far
        if position % checkpoint_interval == 0:
            checkpoint_df = pd.concat(all_game_logs, ignore_index=True)
            checkpoint_filename = f"checkpoint_nba_logs_{position}_players.csv"
            save_to_csv(checkpoint_df, checkpoint_filename)
            print(f"Checkpoint saved to {checkpoint_filename} after {position} players")

        # Pause between players to stay clear of rate limiting
        if more_players_remaining:
            wait_time = 10 + random.randint(5, 15)
            print(f"Waiting {wait_time} seconds before next player to avoid rate limiting...")
            time.sleep(wait_time)

    if not all_game_logs:
        print("Scraping failed for all players.")
        return

    # Merge every player's log into one frame and persist it
    combined_df = pd.concat(all_game_logs, ignore_index=True)
    print(f"\nCombined dataset created with {len(combined_df)} total game logs")
    save_to_csv(combined_df, f"nba_game_logs_{season}.csv")

    # Quick look at the result
    print("\nSample of the scraped data:")
    print(combined_df.head())

    print("\nBasic statistics:")
    print(f"Total games: {len(combined_df)}")
    print(f"Players included: {combined_df['player_id'].nunique()}")
    print("\nAvailable columns:")
    print(combined_df.columns.tolist())

    print("\nTotal time elapsed:", time.time() - start_time)


Starting NBA game log scraper for 50 players...

Scraping data for player 1/50: j/jamesle01
Sending request to Basketball Reference for player j/jamesle01...
Request failed with status code: 429
Failed to scrape data for j/jamesle01
Waiting 29 seconds before next player...


KeyboardInterrupt: 

In [None]:


def load_existing_data(filename='nba_game_logs_2025.csv'):
    """
    Read a previously saved game-log CSV.

    Args:
        filename (str): Path of the CSV file to read.

    Returns:
        pandas.DataFrame or None: The loaded data, or None when the file does
        not exist or reading it raises an exception (the error is printed).
    """
    try:
        # Missing file is not an error: the caller starts a fresh dataset.
        if not os.path.exists(filename):
            print(f"File {filename} not found. Will create a new dataset.")
            return None

        existing_records = pd.read_csv(filename)
        print(f"Loaded existing dataset with {len(existing_records)} records from {filename}")
        return existing_records
    except Exception as err:
        print(f"Error loading existing data: {err}")
        return None

def scrape_additional_players(new_player_ids, existing_filename='nba_game_logs_2025.csv', season="2025",
                              wait_range=(5, 15)):
    """
    Scrapes game logs for new players and combines with existing data.

    Players already present in the existing dataset (matched on the
    'player_id' column) and duplicate entries within new_player_ids are
    skipped. Newly scraped logs are appended to the existing data and the
    result is written back to existing_filename.

    Args:
        new_player_ids (list[str]): Player IDs to add.
        existing_filename (str): CSV holding the existing dataset; also the
            output path for the combined dataset.
        season (str): Season year passed to scrape_player_game_logs().
        wait_range (tuple[float, float] or None): (min, max) seconds to pause
            between consecutive requests; the actual pause is drawn uniformly
            from this range. Pass None to disable pausing entirely.

    Returns:
        pandas.DataFrame or None: The combined dataset. When nothing new was
        scraped this is the existing dataset unchanged (None if there was none).
    """
    # Load existing data
    existing_df = load_existing_data(existing_filename)

    # Get list of players already in the dataset to avoid duplicates
    existing_players = set()
    if existing_df is not None and 'player_id' in existing_df.columns:
        existing_players = set(existing_df['player_id'].unique())
        print(f"Existing dataset contains {len(existing_players)} players")

    # Filter out players that are already in the dataset. dict.fromkeys also
    # removes repeats inside new_player_ids while keeping the original order,
    # so no player is scraped (and appended) twice.
    players_to_scrape = [p_id for p_id in dict.fromkeys(new_player_ids) if p_id not in existing_players]

    if not players_to_scrape:
        print("All players in the new list are already in the dataset. Nothing to scrape.")
        return existing_df

    print(f"Scraping data for {len(players_to_scrape)} new players...")

    # Scrape the new players
    start_time = time.time()
    new_game_logs = []
    checkpoint_interval = 5

    for i, player_id in enumerate(players_to_scrape):
        print(f"\nScraping data for new player {i+1}/{len(players_to_scrape)}: {player_id}")
        player_game_logs = scrape_player_game_logs(player_id, season)

        if player_game_logs is not None:
            new_game_logs.append(player_game_logs)
            print(f"Successfully scraped {len(player_game_logs)} games for {player_id}")

            # Save checkpoint after every few players
            if (i + 1) % checkpoint_interval == 0 and new_game_logs:
                checkpoint_df = pd.concat(new_game_logs, ignore_index=True)
                checkpoint_filename = f"new_checkpoint_nba_logs_{i+1}_players.csv"
                save_to_csv(checkpoint_df, checkpoint_filename)
                print(f"Checkpoint saved to {checkpoint_filename} after {i+1} new players")
        else:
            print(f"Failed to scrape data for {player_id}")

        # Throttle between requests (after success and failure alike):
        # back-to-back requests get rejected with HTTP 429 by the site.
        if i < len(players_to_scrape) - 1:
            if wait_range:
                wait_time = random.uniform(*wait_range)
                print(f"Waiting {wait_time:.1f} seconds before next player to avoid rate limiting...")
                time.sleep(wait_time)
            else:
                print("Continuing to next player immediately...")

    # Combine all new player data
    if new_game_logs:
        new_combined_df = pd.concat(new_game_logs, ignore_index=True)
        print(f"\nNew dataset created with {len(new_combined_df)} total game logs")

        # Merge with existing data if it exists
        if existing_df is not None:
            # Combine new data with existing data
            final_df = pd.concat([existing_df, new_combined_df], ignore_index=True)
            print(f"Combined dataset now has {len(final_df)} total game logs from {final_df['player_id'].nunique()} players")
        else:
            final_df = new_combined_df
            print(f"No existing data to merge. New dataset has {len(final_df)} game logs")

        # Save combined data
        save_to_csv(final_df, existing_filename)

        # Print sample and statistics
        print("\nSample of the updated dataset:")
        print(final_df.head())

        print("\nBasic statistics:")
        print(f"Total games: {len(final_df)}")
        print(f"Players included: {final_df['player_id'].nunique()}")
        print("Total scraping time:", time.time() - start_time)

        return final_df
    else:
        print("Scraping failed for all new players.")
        return existing_df

def main_additional():
    """
    Extend the saved 2025 dataset with an extra batch of players.

    Hands a hard-coded list of additional player IDs to
    scrape_additional_players() and reports whether a dataset came back.
    """
    existing_filename = "nba_game_logs_2025.csv"  # dataset to extend (and overwrite)
    season = "2025"

    # Extra players to add; edit this list to change the batch
    additional_player_ids = [
        "e/edwaran01", "g/giddesh01", "h/hendeco01", "m/maxeyty01", "m/murraja01",
        "h/holmgri01", "b/brogdma01", "a/aytonde01", "j/johnsja05", "r/reeveau01",
        "b/brownja02", "b/banchpa01", "w/wagnefr01", "s/sengaal01", "t/thomptr01",
        "p/porzikr01", "m/murrays01", "b/brunjan01", "h/hartjo01", "a/aldrila01",
        "r/russeda01", "h/hayesjk01", "w/washinpj01", "c/claxcni01", "d/davisan03",
        "g/gainesd01", "m/millspa01", "l/lowryky01", "w/whitede01", "c/claxtca01",
        "g/goberru01", "a/allenja01", "p/poweljn01", "b/brissoo01", "p/poeleja01"
    ]

    # Scrape the batch and merge it into the existing file
    updated_dataset = scrape_additional_players(additional_player_ids, existing_filename, season)

    if updated_dataset is None:
        print("\nFailed to update the dataset.")
        return

    print("\nData aggregation complete! Updated dataset is saved to", existing_filename)

# Entry point: run main_additional() (add players to the existing dataset)
# rather than main() (full scrape of the base player list).
if __name__ == "__main__":
    main_additional()


In [39]:
# Load the saved 2025 game-log CSV and show it as the cell's output.
df = pd.read_csv("nba_game_logs_2025.csv")
df

Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Opp,Result,GS,MP,FG,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,player_id
0,1.0,1493.0,1.0,2024-10-22,LAL,MIN,"W, 110-103",*,34.650000,7.0,...,5.0,4.0,0.0,2.0,2.0,3.0,16.0,10.1,-6.0,j/jamesle01
1,2.0,1494.0,2.0,2024-10-25,LAL,PHO,"W, 123-116",*,34.700000,7.0,...,4.0,8.0,0.0,0.0,2.0,1.0,21.0,17.9,14.0,j/jamesle01
2,3.0,1495.0,3.0,2024-10-26,LAL,SAC,"W, 131-127",*,33.766667,12.0,...,14.0,10.0,0.0,1.0,5.0,3.0,32.0,27.1,13.0,j/jamesle01
3,4.0,1496.0,4.0,2024-10-28,LAL,PHO,"L, 105-109",*,35.800000,3.0,...,5.0,8.0,1.0,0.0,2.0,1.0,11.0,6.9,-17.0,j/jamesle01
4,5.0,1497.0,5.0,2024-10-30,LAL,CLE,"L, 110-134",*,28.966667,9.0,...,6.0,3.0,0.0,0.0,6.0,2.0,26.0,18.0,-17.0,j/jamesle01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068,72.0,1234.0,79.0,2025-04-06,DEN,IND,"L, 120-125",,23.583333,6.0,...,5.0,4.0,0.0,0.0,1.0,4.0,16.0,12.7,-14.0,w/westbru01
2069,73.0,1235.0,80.0,2025-04-09,DEN,SAC,"W, 124-116",,,...,4.0,4.0,1.0,0.0,3.0,2.0,5.0,0.1,-4.0,w/westbru01,
2070,74.0,1236.0,81.0,2025-04-11,DEN,MEM,"W, 117-109",,,...,3.0,4.0,2.0,0.0,2.0,0.0,14.0,13.0,11.0,w/westbru01,
2071,75.0,1237.0,82.0,2025-04-13,DEN,HOU,"W, 126-111",,22.633333,5.0,...,0.0,6.0,0.0,0.0,1.0,1.0,17.0,15.1,8.0,w/westbru01
