In [1]:
!pip install selenium webdriver-manager



In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [4]:
# Setup Chrome options
options = Options()
options.add_argument("--headless")  # Remove this line if you want to see the browser
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Define seasons and FBref URLs for "All Competitions" → "Playing Time"
seasons = {
    "2020-2021": "https://fbref.com/en/squads/18bb7c10/2020-2021/all_comps/Arsenal-Stats-All-Competitions#all_stats_playing_time",
    "2021-2022": "https://fbref.com/en/squads/18bb7c10/2021-2022/all_comps/Arsenal-Stats-All-Competitions#all_stats_playing_time",
    "2022-2023": "https://fbref.com/en/squads/18bb7c10/2022-2023/all_comps/Arsenal-Stats-All-Competitions#all_stats_playing_time",
    "2023-2024": "https://fbref.com/en/squads/18bb7c10/2023-2024/all_comps/Arsenal-Stats-All-Competitions#all_stats_playing_time",
    "2024-2025": "https://fbref.com/en/squads/18bb7c10/2024-2025/all_comps/Arsenal-Stats-All-Competitions#all_stats_playing_time"
}

# Store all player data
all_data = []

for season, url in seasons.items():
    print(f"Scraping: {season}")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(5)

    try:
        # Wait for the "Playing Time" table to appear
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "all_stats_playing_time"))
        )

        # Find table body
        table = driver.find_element(By.ID, "all_stats_playing_time")
        rows = table.find_elements(By.TAG_NAME, "tr")

        for row in rows:
            # Skip header or total rows
            if row.find_elements(By.TAG_NAME, "th") and "Player" in row.text:
                continue

            player_cell = row.find_element(By.TAG_NAME, "th")  # Player name lives here
            player_name = player_cell.text.strip()

            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) == 0:
                continue  # Skip empty rows

            all_data.append({
                "Season": season,
                "Player": player_name,
                "Nation": cells[0].text.strip(),
                "Pos": cells[1].text.strip(),
                "Age": cells[2].text.strip(),
                "MP": cells[3].text.strip(),
                "Min": cells[4].text.strip(),
                "Mn/MP": cells[5].text.strip(),
                "Min%": cells[6].text.strip(),
                "90s": cells[7].text.strip(),
                "Starts": cells[8].text.strip(),
                "Mn/Start": cells[9].text.strip(),
                "Compl": cells[10].text.strip(),
                "Subs": cells[11].text.strip(),
                "Mn/Sub": cells[12].text.strip(),
                "unSub": cells[13].text.strip(),
                "PPM": cells[14].text.strip(),
                "onG": cells[15].text.strip(),
                "onGA": cells[16].text.strip(),
                "+/-": cells[17].text.strip(),
                "+/-90": cells[18].text.strip(),
                "On-Off": cells[19].text.strip(),
                "onxG": cells[20].text.strip(),
                "onxGA": cells[21].text.strip(),
                "xG+/-": cells[22].text.strip(),
                "xG+/-90": cells[23].text.strip(),
                "On-Off_xG": cells[24].text.strip()
            })

    except Exception as e:
        print(f"Failed to scrape {season}: {e}")
    
    driver.quit()

# Create and save DataFrame
df = pd.DataFrame(all_data)
df.to_csv("arsenal_minutes_played_2020_2025_fbref.csv", index=False)
print("Scrape complete! Data saved to 'arsenal_minutes_played_2020_2025_fbref.csv'.")

Scraping: 2020-2021
⚠️ Failed to scrape 2020-2021: list index out of range
Scraping: 2021-2022
⚠️ Failed to scrape 2021-2022: list index out of range
Scraping: 2022-2023
⚠️ Failed to scrape 2022-2023: list index out of range
Scraping: 2023-2024
⚠️ Failed to scrape 2023-2024: list index out of range
Scraping: 2024-2025
⚠️ Failed to scrape 2024-2025: list index out of range
✅ Scrape complete! Data saved to 'arsenal_minutes_played_2020_2025_fbref.csv'.
