In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Base URL for fbref
base_url = "https://fbref.com"

# Main Bundesliga statistics page (in German)
main_url = "https://fbref.com/de/wettbewerbe/20/stats/Bundesliga-Statistiken"

# Directory to store CSV files
output_dir = "data/bundesliga"
os.makedirs(output_dir, exist_ok=True)

# Request the main page
response = requests.get(main_url)
if response.status_code != 200:
    print(f"Error fetching main page: {main_url}")
    exit()

soup = BeautifulSoup(response.content, "html.parser")

# Find all team links (hrefs starting with /de/mannschaften/ and containing "Statistiken")
team_links = soup.find_all("a", href=True)
teams = {}
for link in team_links:
    href = link["href"]
    if href.startswith("/de/mannschaften/") and "Statistiken" in href:
        team_name = link.get_text(strip=True)
        team_url = base_url + href
        teams[team_name] = team_url

print("Found teams:")
for name, url in teams.items():
    print(f"  {name}: {url}")

# Iterate over each team and scrape the players table
for team_name, team_url in teams.items():
    print(f"\nScraping team: {team_name} from {team_url}")
    resp = requests.get(team_url)
    if resp.status_code != 200:
        print(f"Error retrieving {team_url}")
        continue

    team_soup = BeautifulSoup(resp.content, "html.parser")
    
    # Find the table containing player data (using id "stats_misc_20")
    table = team_soup.find("table", id="stats_misc_20")
    if not table:
        print(f"No table with id 'stats_misc_20' found for {team_name}")
        continue

    # Extract header: sometimes the last <tr> in <thead> holds the actual column names
    thead = table.find("thead")
    header_rows = thead.find_all("tr")
    if header_rows:
        headers = [th.get_text(strip=True) for th in header_rows[-1].find_all("th")]
    else:
        headers = []

    # Extract body rows
    rows_data = []
    tbody = table.find("tbody")
    for row in tbody.find_all("tr"):
        # Extract text from both header cells (if any) and td cells
        row_data = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
        if row_data:
            rows_data.append(row_data)

    # Create DataFrame and save as CSV
    df = pd.DataFrame(rows_data, columns=headers)
    # Replace any problematic characters in the team name for the filename
    safe_team_name = "".join([c if c.isalnum() or c in " -_" else "_" for c in team_name])
    output_path = os.path.join(output_dir, f"{safe_team_name}.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved {team_name} data to {output_path}")
    
    # Pause to be polite to the server
    time.sleep(3)


Error fetching main page: https://fbref.com/de/wettbewerbe/20/stats/Bundesliga-Statistiken
Found teams:


: 