In [9]:
import os
import re
import time
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# HTTP headers passed to every requests.get call in this notebook.
# The User-Agent mimics a desktop Chrome browser (presumably so the site
# does not reject the default client identifier - confirm if it matters).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/133.0.0.0 Safari/537.36'
    ),
}

def scrape_data_for_year(code, league, year, max_retries = 5, timeout = 30):
    """Download one league/season overview page and collect the team links on it.

    Parameters
    ----------
    code : str
        Competition code placed in the URL path (e.g. 'GB1').
    league : str
        League slug placed in the URL path (e.g. 'premier-league').
    year : int
        Value sent as the `saison_id` query parameter.
    max_retries : int, optional
        How many times a failed request is retried after the first attempt.
        The default of 5 means at most six requests are sent.
    timeout : float, optional
        Seconds to wait for the server on each request before the attempt is
        treated as a request error (default 30).

    Returns
    -------
    tuple
        `(season, team_links)` where `season` is the stripped text of the
        second `h2.content-box-headline` element and `team_links` is a list
        with the `href` of the first link found in each row of the first
        `table.items` body (empty when that table has no `tbody`).
        `(None, None)` is returned when the page could not be downloaded or
        does not contain the expected table / headline.
    """
    url = f'https://www.transfermarkt.co.uk/{league}/startseite/wettbewerb/{code}/plus/?saison_id={year}'
    wait_time = 2  # base of the exponential back-off: 2, 4, 8, ... seconds

    for attempt in range(max_retries + 1):
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, headers = headers, timeout = timeout)
        except requests.exceptions.RequestException as e:
            if attempt < max_retries:
                print(f'Request error for {league} in {year} - Attempt {attempt + 1} - Error code {e}')
                time.sleep(wait_time ** (attempt + 1))
                continue
            print(f'Request error for {league} in {year} after {max_retries} retries: {e}')
            return None, None

        if response.status_code == 200:
            break

        if attempt < max_retries:
            print(f'Status {response.status_code} for {league} in {year} - Attempt {attempt + 1} - Error code {response.status_code}')
            time.sleep(wait_time ** (attempt + 1))
        else:
            print(f'Failed to retrieve data for {league} in {year}')
            return None, None
    else:
        # Only reached when no request was made at all (negative max_retries).
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', class_ = 'items')
    headlines = soup.find_all('h2', class_ = 'content-box-headline')

    # A 200 response can still be a page without the expected layout; report it
    # through the same (None, None) convention instead of raising IndexError.
    if table is None or len(headlines) < 2:
        print(f'Unexpected page layout for {league} in {year} - no team table or season headline found')
        return None, None

    season = headlines[1].text.strip()

    team_links = []
    tbody = table.find('tbody')
    if tbody is not None:
        for row in tbody.find_all('tr'):
            # Only the first anchor carrying an href in each row is kept.
            team_link = row.find('a', href = True)
            if team_link:
                team_links.append(team_link['href'])

    print(f'\n{league} and {year} - Successfully scraped\n')
    return season, team_links

In [24]:
# (league slug, competition code) pairs; both values are inserted into the page URL.
leagues = [('premier-league', 'GB1'), ('laliga', 'ES1'), ('serie-a', 'IT1'), ('bundesliga', 'L1'), ('ligue-1', 'FR1'),
           ('championship', 'GB2'), ('laliga2', 'ES2'), ('serie-b', 'IT2'), ('2-bundesliga', 'L2'), ('ligue-2', 'FR2')]

# Create the output folder up front so a long scrape is not lost to a
# "non-existent directory" error when the CSV is written.
output_dir = 'test'
os.makedirs(output_dir, exist_ok = True)

for league, code in leagues:
    data = []
    for year in range(2011, 2025):  # saison_id values 2011 through 2024
        print(f'Scraping data for {league} in {year}...')
        season, team_links = scrape_data_for_year(code, league, year)

        # Seasons that failed to download or yielded no links are skipped.
        if season and team_links:
            data.extend({'Season': season, 'Team Link': link} for link in team_links)

    # One CSV per league, one row per (season, team link).
    df = pd.DataFrame(data)
    df.to_csv(f'{output_dir}/{league}_team_links.csv', index = False)
    print(f'Team links for {league} saved.')


Scraping data for premier-league in 2011...

premier-league and 2011 - Successfully scraped

Scraping data for premier-league in 2012...

premier-league and 2012 - Successfully scraped

Scraping data for premier-league in 2013...

premier-league and 2013 - Successfully scraped

Scraping data for premier-league in 2014...

premier-league and 2014 - Successfully scraped

Scraping data for premier-league in 2015...

premier-league and 2015 - Successfully scraped

Scraping data for premier-league in 2016...

premier-league and 2016 - Successfully scraped

Scraping data for premier-league in 2017...

premier-league and 2017 - Successfully scraped

Scraping data for premier-league in 2018...

premier-league and 2018 - Successfully scraped

Scraping data for premier-league in 2019...

premier-league and 2019 - Successfully scraped

Scraping data for premier-league in 2020...

premier-league and 2020 - Successfully scraped

Scraping data for premier-league in 2021...
Status 403 for premier-lea