In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup # Now we'll definitely use this
import io

# URL for the 2024-2025 Premier League Standard Stats
url = "https://fbref.com/en/comps/9/stats/Premier-League-Stats"

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Seconds to wait on the server before giving up. Without an explicit timeout
# requests.get() may block indefinitely on a stalled connection, hanging the kernel.
REQUEST_TIMEOUT_SECONDS = 30

# Remains None if the download fails; the parsing step checks this before running.
html_content = None
try:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    # Convert 4xx/5xx responses into an exception handled below.
    response.raise_for_status()
    print(f"Successfully fetched URL: {url}\n")
    html_content = response.text
except requests.exceptions.RequestException as e:
    # Connection errors, HTTP errors and timeouts all derive from RequestException.
    print(f"Error fetching URL {url}: {e}")

def _table_to_dataframe(table_tag):
    """Convert a BeautifulSoup ``<table>`` tag into a DataFrame.

    The tag is serialized back to an HTML string and parsed by ``pd.read_html``
    using the ``html5lib`` flavor.

    Returns:
        The first DataFrame pandas produced, or ``None`` if it returned an
        empty list.
    """
    table_html_string = str(table_tag)
    list_of_dataframes = pd.read_html(io.StringIO(table_html_string), flavor='html5lib')
    return list_of_dataframes[0] if list_of_dataframes else None


def _flatten_column_label(col_tuple):
    """Collapse one MultiIndex column tuple into a single string label.

    If the first level is a pandas "Unnamed:" placeholder and the last level is
    not, the last level alone is used. Otherwise all non-empty, non-"Unnamed:"
    levels are joined with ``_``; if none qualify, the last level is used as is.
    Leading/trailing underscores are stripped from the result.
    """
    levels = [str(level) for level in col_tuple]
    if "Unnamed:" in levels[0] and len(levels) > 1 and "Unnamed:" not in levels[-1]:
        flat_col = levels[-1]
    else:
        meaningful_levels = [
            level for level in levels
            if "Unnamed:" not in level and level.strip() != ''
        ]
        flat_col = '_'.join(meaningful_levels) if meaningful_levels else levels[-1]
    return flat_col.strip('_')


def _clean_column_name(name):
    """Replace characters that are awkward in column names.

    ``name`` is coerced to ``str`` first so that non-string labels (for example
    integer labels from a table without a header row) do not raise.
    """
    cleaned = str(name)
    for old, new in ((' ', '_'), ('-', '_'), ('%', 'Pct'), ('#', 'Num'), ('/', '_per_')):
        cleaned = cleaned.replace(old, new)
    return cleaned


# Final result of this step; stays None if the table cannot be located or parsed.
player_stats_df = None

if html_content:
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Attempt 1: look for the table directly in the parsed document.
        stats_table_element = soup.find('table', id='stats_standard')

        if stats_table_element:
            print("Found <table> element with id='stats_standard' using BeautifulSoup.")
            player_stats_df = _table_to_dataframe(stats_table_element)
            if player_stats_df is not None:
                print("Successfully parsed table string into DataFrame using pandas.")
            else:
                print("Pandas could not parse the table string from BeautifulSoup element.")

        else:
            # Attempt 2: FBref sometimes ships tables inside HTML comments, so
            # search the comment nodes for the table markup.
            print("Table with id='stats_standard' not found directly. Checking for commented out tables...")

            from bs4 import Comment  # only needed for this fallback path
            comments = soup.find_all(string=lambda text: isinstance(text, Comment))

            found_commented_table = False
            for comment in comments:
                # Cheap pre-filter: skip comments that do not even mention the
                # table id instead of running a full HTML parse on each one.
                if 'stats_standard' not in comment:
                    continue
                comment_soup = BeautifulSoup(comment, 'html.parser')
                potential_table = comment_soup.find('table', id='stats_standard')
                if potential_table:
                    print("Found <table> with id='stats_standard' INSIDE an HTML comment.")
                    parsed_df = _table_to_dataframe(potential_table)
                    if parsed_df is not None:
                        player_stats_df = parsed_df
                        print("Successfully parsed commented table string into DataFrame.")
                        found_commented_table = True
                        break  # Found it

            if not found_commented_table:
                print("Table with id='stats_standard' not found directly or within comments.")
                print("This might indicate a different table ID, page structure, or dynamic loading not captured by requests.")

        # If player_stats_df was successfully created by either method:
        if player_stats_df is not None:
            print("\n--- Player Stats DataFrame ---")
            print(f"Initial shape: {player_stats_df.shape}")

            if isinstance(player_stats_df.columns, pd.MultiIndex):
                print("Detected MultiIndex columns. Flattening...")
                # Flatten each tuple, normalize characters, then strip any
                # underscores the replacements left at the edges.
                player_stats_df.columns = [
                    _clean_column_name(_flatten_column_label(col_tuple)).strip('_')
                    for col_tuple in player_stats_df.columns.values
                ]
                print("--- After Column Flattening & Cleaning ---")
            else:
                print("Columns are not MultiIndex. Applying basic cleaning.")
                player_stats_df.columns = [
                    _clean_column_name(col) for col in player_stats_df.columns
                ]

            print(player_stats_df.head())
            print(f"Shape after processing: {player_stats_df.shape}")
            print(f"Columns after processing: {player_stats_df.columns.tolist()}")

            raw_csv_path = 'fbref_2024_2025_pl_standard_stats_raw.csv'
            player_stats_df.to_csv(raw_csv_path, index=False)
            print(f"\nRaw player stats DataFrame saved to {raw_csv_path}")
        else:
            print("\nFailed to extract player_stats_df.")

    except ImportError as ie:
        # pd.read_html needs the optional html5lib parser for flavor='html5lib'.
        print(f"ImportError during processing (check dependencies like html5lib): {ie}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()  # Print full traceback for debugging
else:
    print("Could not fetch HTML content, skipping table parsing.")

Successfully fetched URL: https://fbref.com/en/comps/9/stats/Premier-League-Stats

Table with id='stats_standard' not found directly. Checking for commented out tables...
Found <table> with id='stats_standard' INSIDE an HTML comment.
Successfully parsed commented table string into DataFrame.

--- Player Stats DataFrame ---
Initial shape: (591, 37)
Detected MultiIndex columns. Flattening...
--- After Column Flattening & Cleaning ---
  Rk             Player   Nation    Pos        Squad     Age  Born  \
0  1         Max Aarons  eng ENG     DF  Bournemouth  25-132  2000   
1  2  Joshua Acheampong  eng ENG     DF      Chelsea  19-011  2006   
2  3        Tyler Adams   us USA     MF  Bournemouth  26-091  1999   
3  4   Tosin Adarabioyo  eng ENG     DF      Chelsea  27-234  1997   
4  5      Simon Adingra   ci CIV  FW,MF     Brighton  23-135  2002   

  Playing_Time_MP Playing_Time_Starts Playing_Time_Min  ...  \
0               3                   1               86  ...   
1               4