In [1]:
!pip install selenium webdriver-manager



In [10]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Scrape the Arsenal salary table for each season from capology.com with a
# Selenium-driven Chrome browser, then write all rows to a single CSV file.

# Define the seasons and URLs
seasons = {
    "2020-2021": "https://www.capology.com/club/arsenal/salaries/2020-2021/",
    "2021-2022": "https://www.capology.com/club/arsenal/salaries/2021-2022/",
    "2022-2023": "https://www.capology.com/club/arsenal/salaries/2022-2023/",
    "2023-2024": "https://www.capology.com/club/arsenal/salaries/2023-2024/",
    "2024-2025": "https://www.capology.com/club/arsenal/salaries/2024-2025/"
}

# Output path. The completion message is built from this constant so the
# reported file name can never drift from the file actually written.
OUTPUT_CSV = "arsenal_salaries_2020_2025_selenium.csv"

# Column names for the first seven <td> cells of each table row, in cell order.
CELL_COLUMNS = [
    "Player Name",
    "Gross P/W (GBP)",
    "Gross P/Y (GBP)",
    "Adj. Gross (2025 GBP)",
    "Position",
    "Age",
    "Country",
]
CSV_COLUMNS = ["Season"] + CELL_COLUMNS

# Timing knobs (seconds) and scroll limit, gathered here so they are easy to tune.
PAGE_SETTLE_SECONDS = 5    # fixed pause after navigation
ROW_WAIT_TIMEOUT = 15      # max wait for the first table row to appear
SCROLL_PAUSE_SECONDS = 2   # pause after each scroll so new rows can render
MAX_SCROLLS = 3            # upper bound on scroll-to-bottom attempts
ROW_SELECTOR = "table tbody tr"

# Browser options. This must be defined here: nothing else in the notebook
# creates `options`, so a fresh kernel would otherwise raise NameError.
# Add flags (e.g. options.add_argument("--headless=new")) here if desired.
options = Options()

# Resolve the chromedriver binary once instead of once per season.
driver_path = ChromeDriverManager().install()

# Store all player data
all_data = []

for season, url in seasons.items():
    print(f"Scraping: {season}")
    rows_before = len(all_data)

    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    try:
        # Navigation is inside the try so a failed load is reported and the
        # browser is still closed by the finally clause.
        driver.get(url)
        time.sleep(PAGE_SETTLE_SECONDS)

        # Wait for at least one row to appear
        WebDriverWait(driver, ROW_WAIT_TIMEOUT).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ROW_SELECTOR))
        )

        # Scroll down to load more rows (lazy loading fix); stop early once the
        # page height no longer grows.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(MAX_SCROLLS):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_SECONDS)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        time.sleep(SCROLL_PAUSE_SECONDS)

        # Now scrape all visible rows
        for row in driver.find_elements(By.CSS_SELECTOR, ROW_SELECTOR):
            cells = row.find_elements(By.TAG_NAME, "td")
            # Rows with fewer cells than expected columns are skipped.
            if len(cells) < len(CELL_COLUMNS):
                continue

            record = {"Season": season}
            # zip() stops at the shorter input, so any extra cells are ignored.
            record.update(
                (column, cell.text.strip())
                for column, cell in zip(CELL_COLUMNS, cells)
            )
            all_data.append(record)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so Ctrl-C /
        # kernel interrupt still stops the loop, and report the actual cause.
        print(f"Window closed or failed to load for {season}: {type(exc).__name__}: {exc}")
    finally:
        # Always release the browser, even when scraping failed part-way.
        driver.quit()

    if len(all_data) == rows_before:
        print(f"Warning: no rows scraped for {season}")


# Create and save DataFrame. Passing the column list keeps the CSV header
# present and in a stable order even when nothing was scraped.
df = pd.DataFrame(all_data, columns=CSV_COLUMNS)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Scrape complete! Data saved to '{OUTPUT_CSV}'.")

🔄 Scraping: 2020-2021
🔄 Scraping: 2021-2022
🔄 Scraping: 2022-2023
🔄 Scraping: 2023-2024
🔄 Scraping: 2024-2025
✅ Scrape complete! Data saved to 'arsenal_salaries_2020_2025_selenium.csv'.
