In [36]:
import pandas as pd
import requests
import time

Starting NBA game log scraper for 25 players...

Scraping data for player 1/25: a/adebaba01
Sending request to Basketball Reference for player a/adebaba01...
Request failed with status code: 429
Failed to scrape data for a/adebaba01
Waiting 23 seconds before next player...


KeyboardInterrupt: 

In [38]:
df=pd.read_csv('lebron_james_game_logs_2024.csv')

In [39]:
df.head()

Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Opp,Result,GS,MP,FG,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1.0,1422.0,1.0,2023-10-24,LAL,DEN,"L, 107-119",*,,10.0,...,7.0,8.0,5.0,1.0,0.0,0.0,1.0,21.0,20.3,7.0
1,2.0,1423.0,2.0,2023-10-26,LAL,PHO,"W, 100-95",*,,7.0,...,7.0,8.0,9.0,2.0,2.0,5.0,1.0,21.0,20.3,22.0
2,3.0,1424.0,3.0,2023-10-29,LAL,SAC,"L, 127-132 (OT)",*,,11.0,...,15.0,15.0,8.0,0.0,0.0,8.0,2.0,27.0,17.8,-5.0
3,4.0,1425.0,4.0,2023-10-30,LAL,ORL,"W, 106-103",*,,7.0,...,3.0,3.0,4.0,3.0,1.0,5.0,0.0,19.0,11.9,5.0
4,5.0,1426.0,5.0,2023-11-01,LAL,LAC,"W, 130-125 (OT)",*,,13.0,...,12.0,12.0,7.0,1.0,2.0,4.0,3.0,35.0,30.6,6.0


In [42]:

df.drop('GS',axis=1 ,inplace=True)

KeyError: "['GS'] not found in axis"

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      82 non-null     float64
 1   Gcar    71 non-null     float64
 2   Gtm     82 non-null     float64
 3   Date    82 non-null     object 
 4   Team    83 non-null     object 
 5   Opp     82 non-null     object 
 6   Result  83 non-null     object 
 7   FG      72 non-null     float64
 8   FGA     72 non-null     float64
 9   FG%     72 non-null     float64
 10  3P      72 non-null     float64
 11  3PA     72 non-null     float64
 12  3P%     72 non-null     float64
 13  2P      72 non-null     float64
 14  2PA     72 non-null     float64
 15  2P%     72 non-null     float64
 16  eFG%    72 non-null     float64
 17  FT      72 non-null     float64
 18  FTA     72 non-null     float64
 19  FT%     71 non-null     float64
 20  ORB     72 non-null     float64
 21  DRB     72 non-null     float64
 22  TRB 

In [None]:
import requests
import pandas as pd
import time
from io import StringIO

def scrape_lebron_game_logs():
    """
    Scrapes LeBron James' 2023-2024 NBA season game logs from Basketball Reference.
    Uses pandas read_html which proved to be successful in previous attempts.
    """
    url = "https://www.basketball-reference.com/players/j/jamesle01/gamelog/2024"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    print("Sending request to Basketball Reference...")
    
    try:
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            print(f"Request successful! Status code: {response.status_code}")
            
            html_io = StringIO(response.text)
            print("Parsing tables with pandas...")
            
            try:
                tables = pd.read_html(html_io, attrs={'id': 'pgl_basic'})
                if tables and len(tables) > 0:
                    print("Found game log table by ID!")
                    game_log_df = tables[0]
                    return process_dataframe(game_log_df)
            except Exception as e:
                print(f"Couldn't find table by ID: {e}")
                html_io.seek(0)
            
            tables = pd.read_html(html_io)
            tables_sorted = sorted(tables, key=lambda x: len(x), reverse=True)
            
            if tables_sorted:
                game_log_df = tables_sorted[0]
                print(f"Table found! Dimensions: {game_log_df.shape}")
                return process_dataframe(game_log_df)
            else:
                print("No tables found on the page.")
                return None
        else:
            print(f"Request failed with status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error during scraping: {e}")
        return None

def process_dataframe(game_log_df):
    """
    Process the raw dataframe from Basketball Reference to clean and format it properly.
    """
    print("Processing the game log dataframe...")
    
    if isinstance(game_log_df.columns, pd.MultiIndex):
        game_log_df.columns = [' '.join(str(col) for col in cols if str(col) != 'Unnamed: 0_level_0').strip() 
                               for cols in game_log_df.columns.values]
    
    print("Raw columns:", game_log_df.columns.tolist())
    
    game_log_df = game_log_df[~game_log_df.iloc[:, 0].astype(str).str.contains("Rk")]
    
    unnamed_cols = [col for col in game_log_df.columns if 'Unnamed' in str(col)]
    if unnamed_cols:
        game_log_df = game_log_df.drop(columns=unnamed_cols)
    
    if 'Tm' in game_log_df.columns and 'Team' not in game_log_df.columns:
        game_log_df = game_log_df.rename(columns={'Tm': 'Team'})
    elif 'Tm' in game_log_df.columns and 'Team' in game_log_df.columns and game_log_df['Team'].isna().all():
        game_log_df['Team'] = game_log_df['Tm']
        game_log_df = game_log_df.drop(columns=['Tm'])
    
    if 'Team' in game_log_df.columns and game_log_df['Team'].isna().any():
        game_log_df['Team'] = game_log_df['Team'].fillna('LAL')
    elif 'Team' not in game_log_df.columns and 'Tm' not in game_log_df.columns:
        game_log_df['Team'] = 'LAL'
    
    numeric_cols = game_log_df.select_dtypes(include=['object']).columns
    for col in numeric_cols:
        if col not in ['Date', 'Tm', 'Team', 'Opp', 'Result', 'GS']:
            game_log_df[col] = pd.to_numeric(game_log_df[col], errors='coerce')
    
    expected_columns = ['Date', 'Team', 'Opp', 'Result']
    missing_cols = [col for col in expected_columns if col not in game_log_df.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}")
    
    return game_log_df

def save_to_csv(df, filename='lebron_james_game_logs_2024.csv'):
    """
    Saves the DataFrame to a CSV file.
    """
    if df is not None:
        try:
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            return True
        except Exception as e:
            print(f"Error saving CSV: {e}")
            return False
    else:
        print("No data to save.")
        return False

def main():
    """
    Main function to execute the scraping workflow.
    """
    print("Starting LeBron James game log scraper...")
    start_time = time.time()
    
    game_logs = scrape_lebron_game_logs()
    
    if game_logs is not None:
        save_to_csv(game_logs)
        
        print("\nSample of the scraped data:")
        print(game_logs.head())
        
        print("\nBasic statistics:")
        print(f"Total games: {len(game_logs)}")
        
        print("\nAvailable columns:")
        print(game_logs.columns.tolist())
        
        for stat_col in [('PTS', 'pts'), ('TRB', 'trb'), ('AST', 'ast'), 
                         ('FG%', 'fg_pct'), ('3P%', 'fg3_pct'), ('FT%', 'ft_pct')]:
            for col in stat_col:
                if col in game_logs.columns and not game_logs[col].isna().all():
                    print(f"Average {col.lower()}: {game_logs[col].mean():.1f}")
                    break
        
        print(f"\nScraping completed in {time.time() - start_time:.2f} seconds")
    else:
        print("Scraping failed. The website structure might have changed or access is blocked.")

if __name__ == '__main__':
    main()