# 📊 Football Player Analytics Pipeline
## Notebook 1B: Scrape ALL Stat Types from FBref

The initial scrape collected only **standard stats**. This notebook scrapes the remaining stat types:
- **Shooting**: Shots, SoT%, xG, distance
- **Passing**: Key Passes, xA, pass completion
- **Possession**: Touches, Carries, Dribbles, Dispossessed
- **Defense**: Tackles, Interceptions, Blocks
- **Miscellaneous**: Aerials, Fouls
- **Goal and Shot Creation (GCA)**: Goal-creating and shot-creating actions

This gives us 50+ features for better clustering.

In [None]:
import pandas as pd
import numpy as np
import time
import random
from pathlib import Path
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Visible confirmation that every import in this cell succeeded
print("‚úÖ Libraries loaded!")

In [None]:
# Output locations: raw scraped CSVs are written to ../data/raw, which is
# created on first run (including any missing parent directories).
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Stat pages to scrape. Each key is the label used for bookkeeping and file
# names; each value is the segment placed in the page URL. Today the two are
# identical for every stat type.
STAT_TYPES = {
    slug: slug
    for slug in (
        'shooting',    # Shots, SoT, xG, etc.
        'passing',     # Key passes, xA, completion%
        'possession',  # Touches, carries, dribbles
        'defense',     # Tackles, interceptions
        'misc',        # Aerials, fouls
        'gca',         # Goal/shot creating actions
    )
}

# Leagues to scrape, keyed by league name. Each value is a pair of
# (competition id used in the URL, league name used in the URL).
LEAGUES = {
    name: (comp_id, name)
    for name, comp_id in (
        ("Premier-League", "9"),
        ("La-Liga", "12"),
        ("Serie-A", "11"),
        ("Bundesliga", "20"),
        ("Ligue-1", "13"),
        ("Championship", "10"),
        ("Eredivisie", "23"),
    )
}

print(f"üìä Will scrape {len(STAT_TYPES)} stat types")
print(f"üèÜ Across {len(LEAGUES)} leagues")

In [None]:
def create_browser():
    """Build and return a headless Chrome WebDriver for scraping.

    The ChromeDriver binary is obtained through ``ChromeDriverManager`` and
    Chrome is started with a fixed set of command-line flags (headless mode,
    1920x1080 window, custom user agent, automation switches removed).

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it when finished.
    """
    # Applied in this order, one add_argument call per flag
    launch_flags = (
        "--headless=new",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    chrome_options = Options()
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    # Redefine navigator.webdriver as undefined in the current document.
    # NOTE(review): execute_script runs in the page loaded at call time;
    # confirm whether this override is still in effect after navigation.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver

def get_stat_url(league_id, league_name, stat_type):
    """Return the FBref stats-page URL for one league and one stat type.

    Args:
        league_id: Competition id placed directly after ``/comps/``.
        league_name: League name used as the prefix of the final path
            segment (``<league_name>-Stats``).
        stat_type: Stat-type path segment, e.g. ``"shooting"``.

    Returns:
        The URL as a string, shaped like
        ``https://fbref.com/en/comps/<league_id>/<stat_type>/<league_name>-Stats``.
    """
    return "https://fbref.com/en/comps/{}/{}/{}-Stats".format(
        league_id, stat_type, league_name
    )

def _pick_player_table(tables, min_rows):
    """Return the first table with a "player" column and more than min_rows rows.

    Multi-level column headers are flattened in place to single strings
    joined by ``_``. An empty DataFrame is returned when nothing qualifies.
    """
    for table in tables:
        if isinstance(table.columns, pd.MultiIndex):
            table.columns = ['_'.join(str(part) for part in col).strip() for col in table.columns]

        # Stringify each label before matching so tables with non-string
        # (e.g. integer) column labels are skipped instead of raising.
        player_cols = [c for c in table.columns if 'player' in str(c).lower()]
        if not player_cols or len(table) <= min_rows:
            continue

        player_col = player_cols[0]
        # Drop header rows repeated inside the table body, then rows that
        # have no player value at all.
        cleaned = table[table[player_col] != 'Player'].dropna(subset=[player_col])
        # Hand back an independent frame so callers can add columns without
        # writing into a filtered slice of the parsed table.
        return cleaned.copy()
    return pd.DataFrame()


def scrape_page(driver, url, min_rows=10):
    """Load a stats page and return its player-level table.

    The page is opened with ``driver``, the rendered HTML is parsed with
    ``pandas.read_html``, and the first table that has a column whose name
    contains "player" and more than ``min_rows`` rows is returned. Rows that
    repeat the header (player cell equal to ``'Player'``) and rows with a
    missing player value are removed.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to scrape.
        min_rows: A candidate table must have strictly more rows than this.
            Defaults to 10, the threshold this function has always used.

    Returns:
        A DataFrame with the cleaned player table, or an empty DataFrame when
        no table qualifies or any step fails. Failures are printed, never
        raised.
    """
    try:
        driver.get(url)
        # Wait up to 15 seconds for at least one <table> element to exist
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        # Randomized 2-4 s pause; presumably leaves time for tables that are
        # rendered after the first one appears.
        time.sleep(random.uniform(2, 4))

        tables = pd.read_html(StringIO(driver.page_source))
        return _pick_player_table(tables, min_rows)
    except Exception as e:
        # Deliberately broad: scraping is best-effort, so one bad page is
        # reported and skipped rather than aborting the whole run.
        print(f"  Error: {e}")
        return pd.DataFrame()

# Visible confirmation that the helper definitions in this cell ran
print("‚úÖ Functions defined!")

In [None]:
# Start one headless Chrome session via create_browser() and bind it to the
# module-level name `driver`, which the scraping loop and the cleanup cell use.
driver = create_browser()
print("‚úÖ Browser created")

In [None]:
# Scrape every stat type for every league.
# all_stats maps each stat-type label to the list of per-league DataFrames
# collected for it; the lists are combined in a later cell.
all_stats = {stat_type: [] for stat_type in STAT_TYPES}

print("üöÄ Starting comprehensive data collection...")
# Estimate assumes roughly 12 seconds per page (load + pauses)
print(f"‚è±Ô∏è Estimated time: {len(LEAGUES) * len(STAT_TYPES) * 12 / 60:.0f} minutes\n")

for league_name, (league_id, url_name) in LEAGUES.items():
    # Banner for the league being processed
    print(f"\n{'='*60}")
    print(f"üèÜ {league_name}")
    print(f"{'='*60}")

    for stat_type, stat_url_name in STAT_TYPES.items():
        url = get_stat_url(league_id, url_name, stat_url_name)
        # No newline yet: the result is printed on the same line
        print(f"  ‚Üí {stat_type}...", end=" ")

        df = scrape_page(driver, url)

        if df.empty:
            print("‚ùå Failed")
        else:
            # Tag each row with where it came from before storing it
            df = df.assign(_league=league_name, _stat_type=stat_type)
            all_stats[stat_type].append(df)
            print(f"‚úÖ {len(df)} players")

            # Also persist this league/stat combination on its own, so a
            # later failure does not lose what was already scraped
            filename = RAW_DIR / f"{league_name}_{stat_type}.csv"
            df.to_csv(filename, index=False)

        # Randomized 6-10 s pause between requests (rate limiting)
        time.sleep(random.uniform(6, 10))

print("\n" + "="*60)
print("‚úÖ SCRAPING COMPLETE!")

In [None]:
# Shut down the Chrome session started earlier; quit() ends the WebDriver
# session and closes its browser windows.
driver.quit()
print("‚úÖ Browser closed")

In [None]:
# Merge the per-league frames of each stat type into one master CSV
# (all_leagues_<stat_type>.csv) in the raw data directory.
for stat_type, dfs in all_stats.items():
    if not dfs:
        # Nothing was scraped successfully for this stat type
        continue

    combined = pd.concat(dfs, ignore_index=True)
    filename = RAW_DIR / f"all_leagues_{stat_type}.csv"
    combined.to_csv(filename, index=False)
    print(f"üíæ Saved: {filename.name} ({len(combined)} rows, {len(combined.columns)} cols)")

In [None]:
# List every CSV now present in the raw directory, sorted by path, with its
# size in kilobytes (bytes / 1024).
csv_files = sorted(RAW_DIR.glob("*.csv"))

print("\nüìÅ Files in raw directory:")
for f in csv_files:
    size_kb = f.stat().st_size / 1024
    print(f"  {f.name} ({size_kb:.1f} KB)")

---
## ✅ Now run Notebook 02 to process all the new data!