# 📊 Football Player Analytics Pipeline
## Notebook 1: Data Collection from FBref

This notebook collects player statistics from 8 major football leagues.

### ⚠️ FBref Bot Protection
FBref blocks automated requests (403 Forbidden). We solve this by:
1. Using Selenium WebDriver to control a real browser
2. Adding realistic delays between requests
3. Using proper browser headers

### Leagues We'll Scrape:
1. Premier League (England)
2. La Liga (Spain)
3. Serie A (Italy)
4. Bundesliga (Germany)
5. Ligue 1 (France)
6. Championship (England - 2nd tier)
7. MLS (USA)
8. Eredivisie (Netherlands)

In [None]:
# Install Selenium if not already installed
!pip install selenium webdriver-manager

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import time
import random
from pathlib import Path
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

# Selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

print("‚úÖ Libraries imported successfully!")

## 1. Configuration

In [None]:
# === CONFIGURATION ===

# All league pages share one FBref URL pattern. The URL carries no season,
# so FBref serves the current season's stats.
_STATS_URL = "https://fbref.com/en/comps/{comp_id}/stats/{slug}-Stats"

# (league key used in file names and the `_league` column,
#  FBref competition id, URL slug)
_LEAGUE_PAGES = [
    ("Premier-League", 9, "Premier-League"),
    ("La-Liga", 12, "La-Liga"),
    ("Serie-A", 11, "Serie-A"),
    ("Bundesliga", 20, "Bundesliga"),
    ("Ligue-1", 13, "Ligue-1"),
    ("Championship", 10, "Championship"),
    ("MLS", 22, "Major-League-Soccer"),
    ("Eredivisie", 23, "Eredivisie"),
]

# League key -> stats page URL, in scraping order
LEAGUE_URLS = {
    key: _STATS_URL.format(comp_id=comp_id, slug=slug)
    for key, comp_id, slug in _LEAGUE_PAGES
}

# Ghana Black Stars forwards to look up in the scraped data.
# NOTE(review): "I√±aki Williams" looks mis-encoded (probably "Iñaki");
# confirm the notebook's file encoding before changing the literal.
GHANA_FORWARDS = [
    "Mohammed Kudus",
    "Antoine Semenyo",
    "Jordan Ayew",
    "Ernest Nuamah",
    "Osman Bukari",
    "Abdul Fatawu Issahaku",
    "Kamaldeen Sulemana",
    "Ibrahim Osman",
    "Brandon Thomas-Asante",
    "I√±aki Williams",
    "Joseph Paintsil",
    "Jerry Afriyie",
    "Christopher Bonsu Baah",
]

# Output locations, relative to the notebook's working directory
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

# Make sure both output folders exist before anything is written
for _data_dir in (RAW_DIR, PROCESSED_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

print("‚úÖ Configuration set!")
print(f"üìÅ Data directory: {DATA_DIR.absolute()}")
print(f"üèÜ Leagues to scrape: {len(LEAGUE_URLS)}")

## 2. Setup Selenium Browser

We use a real Chrome browser in headless mode to bypass FBref's bot protection.

In [None]:
def create_browser():
    """
    Create a headless Chrome WebDriver with anti-detection settings.

    The matching chromedriver binary is resolved through
    ``ChromeDriverManager().install()``.

    Returns:
        A started Selenium Chrome WebDriver. The caller is responsible for
        calling ``quit()`` on it.

    Raises:
        Exception: Whatever Selenium / webdriver-manager raises when the
            driver cannot be started or configured. If configuration fails
            after Chrome has started, the browser is quit before re-raising.
    """
    chrome_options = Options()

    # Run in headless mode (no visible browser window)
    chrome_options.add_argument("--headless=new")

    # Anti-detection options
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Disable automation flags
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Create driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Hide navigator.webdriver on every page. A plain execute_script()
        # here would only patch the initial blank document and be lost on the
        # first navigation; registering the script through CDP re-applies it
        # before each new document's own scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave an orphaned Chrome process behind if setup fails.
        driver.quit()
        raise

    return driver

print("‚úÖ Browser setup function defined!")

In [None]:
def scrape_fbref_page(driver, url: str) -> pd.DataFrame:
    """
    Scrape a FBref stats page using Selenium.

    Loads ``url`` in the given browser, parses every HTML table on the
    rendered page and returns the largest one that has a column whose name
    contains "player" and more than 10 rows. Multi-level column headers are
    flattened by joining the levels with "_". Repeated header rows (player
    cell equal to "Player") and rows without a player name are removed.

    Args:
        driver: Selenium WebDriver instance
        url: FBref URL to scrape

    Returns:
        DataFrame with player stats (an independent copy that the caller may
        modify freely). An empty DataFrame is returned when no suitable
        table is found or when any error occurs; the error is printed, not
        raised.
    """
    try:
        # Navigate to page
        driver.get(url)

        # Wait until at least one <table> element is present in the DOM
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        # Add random delay to appear more human
        time.sleep(random.uniform(2, 4))

        # Parse all tables from the rendered HTML
        tables = pd.read_html(StringIO(driver.page_source))

        # Pick the largest qualifying table, keeping the first one on ties
        best_table = None
        best_player_col = None
        for table in tables:
            # Flatten multi-level columns if present
            if isinstance(table.columns, pd.MultiIndex):
                table.columns = ['_'.join(str(c) for c in col).strip() for col in table.columns]

            # str() guards against non-string labels (e.g. integer headers)
            player_cols = [c for c in table.columns if 'player' in str(c).lower()]
            if not player_cols or len(table) <= 10:
                continue

            if best_table is None or len(table) > len(best_table):
                best_table = table
                best_player_col = player_cols[0]

        if best_table is None:
            print("  ‚ö†Ô∏è No suitable table found")
            return pd.DataFrame()

        # Drop repeated header rows and rows with no player name
        names = best_table[best_player_col]
        keep = names.notna() & (names != 'Player')

        # .copy() returns an independent frame, so callers can add columns
        # without triggering pandas' SettingWithCopyWarning.
        return best_table[keep].copy()

    except Exception as e:
        # Best-effort scraper: report the problem and let the caller carry on
        print(f"  ‚ùå Error: {e}")
        return pd.DataFrame()

print("‚úÖ Scraper function defined!")

## 3. Test Scraping (Single Page)

In [None]:
# Smoke test: scrape one league before starting the full run
print("üß™ Testing scraper with Premier League...")
print("   (This will open Chrome in background)\n")

# Start the browser; later cells reuse this `driver`
driver = create_browser()
print("‚úÖ Browser created")

# Scrape the Premier League page only
test_url = LEAGUE_URLS["Premier-League"]
print(f"üîó URL: {test_url}")

test_df = scrape_fbref_page(driver, test_url)

# An empty frame is the scraper's failure signal
if test_df.empty:
    print("‚ùå Test failed")
else:
    print(f"\n‚úÖ SUCCESS! Got {len(test_df)} players")
    print(f"\nüìã Columns ({len(test_df.columns)}):")
    # Show only the first 15 column names to keep the output short
    print(test_df.columns.tolist()[:15])
    print("\nüîç Sample data:")
    display(test_df.head())

# The browser is deliberately left open for the following cells

## 4. Full Data Collection

Now scrape all 8 leagues. We'll use the same browser session to be more efficient.

In [None]:
# === MAIN SCRAPING LOOP ===
# Scrapes every league in LEAGUE_URLS, writes one CSV per league into RAW_DIR
# and collects the frames in `all_data` for the master file built later.

# Season label written into every row.
# NOTE(review): the league URLs contain no season, so FBref decides which
# season is served; update this label by hand when the season changes.
SEASON_LABEL = '2024-2025'

all_data = []

print("üöÄ Starting full data collection...")
print(f"üìä Leagues: {len(LEAGUE_URLS)}")
print(f"‚è±Ô∏è Estimated time: {len(LEAGUE_URLS) * 10 / 60:.1f} minutes")
print("\n" + "="*60)

# Reuse the existing browser if it is still alive, otherwise start a new one.
# `except Exception` covers both a missing `driver` (NameError) and a dead
# session, without swallowing KeyboardInterrupt / SystemExit.
try:
    driver.title
except Exception:
    print("Creating new browser...")
    driver = create_browser()

league_count = len(LEAGUE_URLS)
for position, (league_name, url) in enumerate(LEAGUE_URLS.items(), start=1):
    print(f"\nüèÜ Scraping: {league_name}")
    print(f"   URL: {url}")

    df = scrape_fbref_page(driver, url)

    if not df.empty:
        print(f"   ‚úÖ Got {len(df)} players")

        # Tag each row with its league and season
        df['_league'] = league_name
        df['_season'] = SEASON_LABEL

        # Save individual file
        filename = RAW_DIR / f"{league_name}_standard.csv"
        df.to_csv(filename, index=False)
        print(f"   üíæ Saved: {filename.name}")

        all_data.append(df)
    else:
        print(f"   ‚ùå Failed to scrape {league_name}")

    # Delay between leagues (important!). There is nothing left to request
    # after the last league, so the wait is skipped there.
    if position < league_count:
        delay = random.uniform(8, 15)
        print(f"   ‚è≥ Waiting {delay:.1f}s...")
        time.sleep(delay)

print("\n" + "="*60)
print("‚úÖ SCRAPING COMPLETE!")
print("="*60)

In [None]:
# Close browser when done. This stays best-effort (the cell never raises),
# but it now reports why the browser could not be closed instead of failing
# silently.
try:
    driver.quit()
except NameError:
    # No browser was ever created in this session
    print("No browser to close")
except Exception as e:
    print(f"Browser did not shut down cleanly: {e}")
else:
    print("‚úÖ Browser closed")

In [None]:
# Merge the per-league frames into one master CSV
if not all_data:
    # Nothing was scraped successfully, so there is nothing to write
    print("‚ùå No data collected!")
else:
    # Stack all leagues with a fresh 0..n-1 index
    master_df = pd.concat(all_data, ignore_index=True)
    master_file = RAW_DIR / "all_leagues_master.csv"
    master_df.to_csv(master_file, index=False)

    print("\nüìä MASTER FILE CREATED")
    print(f"üìÅ Location: {master_file}")
    print(f"üë• Total player records: {len(master_df)}")
    print("\nüèÜ Records per league:")
    print(master_df['_league'].value_counts())

## 5. Quick Data Check

In [None]:
# List every CSV in the raw data folder with its size
print("üìÅ Files in raw data directory:")
csv_files = list(RAW_DIR.glob("*.csv"))
print(f"Total files: {len(csv_files)}")
for csv_path in sorted(csv_files):
    # Size on disk, converted from bytes to kilobytes
    print(f"  - {csv_path.name} ({csv_path.stat().st_size / 1024:.1f} KB)")

In [None]:
# Read the master CSV back from disk and show a preview
master_file = RAW_DIR / "all_leagues_master.csv"

if not master_file.exists():
    # The scraping cells have not produced the file yet
    print("‚ùå Master file not found. Run scraping cells first!")
else:
    df = pd.read_csv(master_file)
    print(f"üìä Master dataset: {len(df)} player records")
    print("\nüìã Columns:")
    print(df.columns.tolist())
    print("\nüîç Sample data:")
    display(df.head())

## 6. Find Ghana Players

In [None]:
# Search for Ghana players
import unicodedata


def normalize_player_name(name) -> str:
    """Return ``name`` with accents removed, case-folded and with runs of
    whitespace collapsed, so that names can be compared reliably."""
    decomposed = unicodedata.normalize("NFKD", str(name))
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return " ".join(without_marks.casefold().split())


def find_player_matches(frame, name_col, player):
    """Find the rows of ``frame`` that refer to ``player``.

    An exact match on the normalized full name is tried first. If that finds
    nothing, rows whose *last* name token equals the player's last name are
    returned instead. Comparing whole tokens (rather than searching for a
    substring) stops "Ibrahim Osman" from matching "Osman Bukari".

    Args:
        frame: DataFrame holding the scraped player rows.
        name_col: Label of the column in ``frame`` that holds player names.
        player: Full name of the player to look for.

    Returns:
        Tuple ``(rows, exact)``: ``rows`` is the matching subset of
        ``frame`` (possibly empty) and ``exact`` is True only when the full
        name matched.
    """
    target = normalize_player_name(player)
    if not target:
        return frame.iloc[0:0], False

    normalized = frame[name_col].map(normalize_player_name)

    exact_rows = frame[(normalized == target).astype(bool)]
    if not exact_rows.empty:
        return exact_rows, True

    surname = target.split()[-1]
    same_surname = normalized.map(lambda n: n.split()[-1:] == [surname]).astype(bool)
    return frame[same_surname], False


if 'df' in dir() and not df.empty:
    # Find player column (str() guards against non-string column labels)
    player_cols = [c for c in df.columns if 'player' in str(c).lower()]

    if not player_cols:
        print("No player column found in the dataset")
    else:
        player_col = player_cols[0]

        print(f"üá¨üá≠ Searching for Ghana players...")
        print("\n" + "="*60)

        found = []
        not_found = []

        for player in GHANA_FORWARDS:
            matches, exact = find_player_matches(df, player_col, player)

            if not matches.empty:
                found.append(player)
                match_name = matches[player_col].iloc[0]
                match_league = matches['_league'].iloc[0] if '_league' in matches.columns else 'Unknown'
                print(f"‚úÖ {player}")
                print(f"   ‚Üí Found as: {match_name} ({match_league})")
                if not exact:
                    # A surname-only hit may be a different player
                    print("   (surname match only - verify this is the right player)")
            else:
                not_found.append(player)
                print(f"‚ùå {player} - NOT FOUND")

        print("\n" + "="*60)
        print(f"\nüìä Found {len(found)}/{len(GHANA_FORWARDS)} Ghana players")

        if not_found:
            print(f"\n‚ö†Ô∏è Not found (may be in leagues not scraped):")
            for p in not_found:
                print(f"   - {p}")

---
## ✅ Next Steps

Data collection is complete! Move on to **Notebook 02** for data processing.