In [None]:
import pandas as pd
import time
import random
import os
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up logging
# Timestamp, level and message on every log line.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Step 1: Team class
class Team:
    """Container for everything scraped for a single team.

    Every stat attribute starts as ``None`` and is expected to be filled in
    later by the scraper; ``html_content`` holds the raw page the tables were
    read from.
    """

    def __init__(self, name):
        self.name = name

        # One slot per stat table, created in a fixed order and left empty
        # until the corresponding table has been extracted.
        for stat_attr in (
            "advanced_goalkeeping",
            "defensive_actions",
            "goalkeeping",
            "goal_and_shot_creation",
            "misc_stats",
            "passing",
            "pass_types",
            "player_summaries",
            "playing_time",
            "possession",
            "shooting",
        ):
            setattr(self, stat_attr, None)

        # Store the HTML content
        self.html_content = None

# Step 2: FBref team IDs, keyed by the hyphenated team name that is also used
# to build each squad URL. This mapping is the single source of truth for
# which teams are scraped and in what order.
team_ids = {
    "Liverpool": "822bd0ba",
    "Arsenal": "18bb7c10",
    "Newcastle-United": "b2b47a98",  # Corrected from Newcastle-Utd
    "Manchester-City": "b8fd03ef",
    "Chelsea": "cff3d9bb",
    "Aston-Villa": "8602292d",
    "Nottingham-Forest": "e4a775cb",
    "Brentford": "cd051869",
    "Brighton": "d07537b9",
    "Bournemouth": "4ba7cbea",
    "Fulham": "fd962109",
    "Crystal-Palace": "47c64c55",
    "Everton": "d3fd31cc",
    "Wolves": "8cec06e1",
    "West-Ham": "7c21e445",
    "Manchester-United": "19538871",  # Corrected from Manchester-Utd
    "Tottenham": "361ca564",
    "Ipswich-Town": "b74092de",
    "Leicester-City": "a2d435b3",
    "Southampton": "33c895d4",
}

# Step 3: Team names, derived from team_ids (same names, same order) so the
# list of names and the ID mapping can never drift apart.
teams = list(team_ids)

# Step 4: Create Team objects
team_dict = {name: Team(name) for name in teams}

# Step 5: Stat table IDs on FBref
# Maps each Team attribute name to the HTML ``id`` of the table that is read
# into it.
stat_ids = dict(
    advanced_goalkeeping="stats_keeper_adv_9",
    defensive_actions="stats_defense_9",
    goalkeeping="stats_keeper_9",
    goal_and_shot_creation="stats_gca_9",
    misc_stats="stats_misc_9",
    passing="stats_passing_9",
    pass_types="stats_passing_types_9",
    player_summaries="stats_standard_9",
    playing_time="stats_playing_time_9",
    possession="stats_possession_9",
    shooting="stats_shooting_9",
)

# Function to create cache directory if it doesn't exist
def ensure_cache_dir():
    """Make sure the HTML cache directory exists and return its path.

    The directory is created relative to the current working directory.

    Returns:
        The cache directory path, ``"fbref_cache"``.

    Raises:
        FileExistsError: If ``fbref_cache`` exists but is not a directory.
    """
    cache_dir = "fbref_cache"
    # exist_ok=True avoids the check-then-create race and fails loudly when
    # the path is occupied by a regular file instead of silently returning a
    # path that cannot be used as a directory.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir

# Function to check if cached data exists
def get_cached_html(team_name, cache_dir):
    """Return the cached page for ``team_name``, or ``None`` if none exists.

    The cache entry is looked up as ``<cache_dir>/<team_name>.html`` and read
    as UTF-8 text.
    """
    path = os.path.join(cache_dir, f"{team_name}.html")

    # Nothing cached yet for this team.
    if not os.path.exists(path):
        return None

    with open(path, 'r', encoding='utf-8') as handle:
        cached = handle.read()
    return cached

# Function to save HTML to cache
def save_to_cache(team_name, html_content, cache_dir):
    """Write ``html_content`` to ``<cache_dir>/<team_name>.html`` as UTF-8.

    An existing cache file for the same team is overwritten.
    """
    target = os.path.join(cache_dir, f"{team_name}.html")
    with open(target, mode='w', encoding='utf-8') as out:
        out.write(html_content)

# Function to extract tables from HTML content
def extract_tables_from_html(html_content, team_name):
    """Read every table listed in ``stat_ids`` out of a team page.

    Args:
        html_content: The page HTML as a string.
        team_name: Team name, used only in log messages.

    Returns:
        A dict with one entry per key of ``stat_ids``. Each value is the
        first DataFrame parsed from the table with the matching HTML ``id``,
        or ``None`` when that table is missing or could not be parsed.
    """
    # Imported locally so this function does not depend on a change to the
    # file's import block.
    from io import StringIO

    tables = {}
    for attr, table_id in stat_ids.items():
        try:
            # Wrap the markup in a file-like object: passing literal HTML to
            # read_html is deprecated and emits a FutureWarning on each call.
            dfs = pd.read_html(StringIO(html_content), attrs={"id": table_id})
            if dfs:
                tables[attr] = dfs[0]
            else:
                logger.warning(f"  ✗ Table {table_id} not found for {team_name}")
                tables[attr] = None
        except Exception as e:
            # Best effort: a missing or unparsable table must not prevent the
            # remaining tables from being extracted.
            logger.error(f"  ✗ Failed to extract {attr} for {team_name}: {e}")
            tables[attr] = None

    return tables

# Function to initialize Selenium WebDriver
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager``.
    """
    chrome_flags = (
        "--headless",  # Run in headless mode
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",  # Set window size
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    # Initialize WebDriver
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=options)

# Step 6: Fetch HTML for each team once and extract all needed tables
def _download_team_page(driver, url):
    """Load ``url`` in ``driver`` and return the resulting page source."""
    driver.get(url)

    # Wait (up to 20 s) until the element with id "meta" is present.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "meta"))
    )

    # Add some random scrolling to mimic human behavior
    scroll_amount = random.randint(300, 1000)
    driver.execute_script(f"window.scrollTo(0, {scroll_amount});")
    time.sleep(random.uniform(1.0, 2.0))

    return driver.page_source


def _pause_after_request(web_requests_count):
    """Sleep before the next web request: 3 minutes after every 5th, else 15-25 s."""
    if web_requests_count % 5 == 0:
        break_minutes = 3
        logger.info(f"\n==== Taking a {break_minutes}-minute break after {web_requests_count} web requests ====")
        time.sleep(break_minutes * 60)  # Convert minutes to seconds
        logger.info("==== Break complete, continuing with next team ====\n")
    else:
        # Be nice to the server with a longer random delay between requests
        delay = random.uniform(15.0, 25.0)
        logger.info(f"Waiting {delay:.2f} seconds before next request...")
        time.sleep(delay)


def fetch_and_process_teams():
    """Fetch each team's page once and populate the Team objects.

    For every entry in ``team_ids`` the page HTML is taken from the on-disk
    cache when available and otherwise downloaded with Selenium and cached.
    The HTML is stored on the matching ``Team`` in ``team_dict``, along with
    every table returned by ``extract_tables_from_html``.

    A team whose download (or cache write) fails is logged and skipped; the
    remaining teams are still processed. The browser is started only when the
    first download is actually needed, so a fully cached run never launches
    Chrome, and it is always closed on exit.
    """
    cache_dir = ensure_cache_dir()

    driver = None  # created lazily on the first cache miss
    # Count only real web requests (not cache hits) for scheduling breaks.
    web_requests_count = 0

    team_items = list(team_ids.items())
    total_teams = len(team_items)

    try:
        for position, (team_name, team_id) in enumerate(team_items, start=1):
            logger.info(f"\nProcessing {team_name}...")

            # Check cache first
            html_content = get_cached_html(team_name, cache_dir)

            if not html_content:
                url = f"https://fbref.com/en/squads/{team_id}/{team_name}-Stats"
                logger.info(f"Fetching HTML from {url}")

                # Outside the per-team try on purpose: if the browser cannot
                # be started at all, abort instead of failing once per team.
                if driver is None:
                    driver = setup_driver()

                try:
                    html_content = _download_team_page(driver, url)
                    save_to_cache(team_name, html_content, cache_dir)
                except Exception as e:
                    logger.error(f"Failed to fetch data for {team_name}: {e}")
                    continue

                web_requests_count += 1

                # Only wait when another team follows; judged by position in
                # the sequence being iterated rather than by a separate list.
                if position < total_teams:
                    _pause_after_request(web_requests_count)

            team = team_dict[team_name]

            # Store the HTML content in the Team object
            team.html_content = html_content

            # Extract all tables and assign them to the team attributes
            tables = extract_tables_from_html(html_content, team_name)
            for attr, df in tables.items():
                setattr(team, attr, df)

    finally:
        # Always close the driver, if one was ever started
        if driver is not None:
            driver.quit()

# Run the scraper
if __name__ == "__main__":
    try:
        fetch_and_process_teams()

        # Example of accessing data: preview the first team's player
        # summaries and write them to a CSV in the working directory.
        logger.info("\nExample data access:")
        sample_team = teams[0]
        sample_team_df = team_dict[sample_team].player_summaries
        if sample_team_df is not None:
            logger.info(f"{sample_team} player summaries preview:")
            logger.info(sample_team_df.head())

            # Save as CSV example
            sample_team_df.to_csv(f"{sample_team}_player_summaries.csv", index=False)

    except Exception as e:
        # Top-level boundary: log with the full traceback so the failure can
        # be diagnosed, rather than only the exception message.
        logger.exception(f"Error in main execution: {e}")

2025-05-13 22:30:19,443 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-13 22:30:20,075 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-13 22:30:20,572 - INFO - Driver [C:\Users\pebou\.wdm\drivers\chromedriver\win64\136.0.7103.92\chromedriver-win32/chromedriver.exe] found in cache
2025-05-13 22:30:21,542 - INFO - 
Processing Liverpool...
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_content, attrs={"id": table_id})
  dfs = pd.read_html(html_conten

Flatten Tables

In [None]:
def flatten_header(df):
    """Collapse a multi-level column header into single-level names.

    Each column's level values are joined with a single space and the result
    is stripped of leading/trailing whitespace. Level values that are not
    strings (for example integers) are converted with ``str`` first, so they
    no longer cause a ``TypeError``. A frame whose columns are not a
    ``MultiIndex`` is left untouched.

    Args:
        df: The DataFrame to flatten. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            ' '.join(str(level) for level in col).strip()
            for col in df.columns
        ]
    return df


Save each table

In [None]:
def save_all_tables(team_dict):
    """Write every available stat table of every team to its own CSV file.

    One directory per key of ``stat_ids`` is created in the current working
    directory, and each team's table is saved inside it as
    ``<team_name with '-' replaced by '_'>-<stat_name>.csv``. Teams whose
    table is ``None`` are skipped. A failed write is logged and does not stop
    the remaining files from being written.

    Args:
        team_dict: Mapping of team name to an object exposing one attribute
            per key of ``stat_ids``.
    """
    for stat_name in stat_ids:
        # Make directory for each stat table
        os.makedirs(stat_name, exist_ok=True)

        for team_name, team in team_dict.items():
            table = getattr(team, stat_name)
            if table is None:
                continue

            # Remove double headers before writing.
            table = flatten_header(table)

            csv_name = f"{team_name.replace('-', '_')}-{stat_name}.csv"
            filepath = os.path.join(stat_name, csv_name)
            try:
                table.to_csv(filepath, index=False)
                logger.info(f"Saved: {filepath}")
            except Exception as e:
                logger.error(f"Failed to save {filepath}: {e}")
