In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import warnings
from tqdm import tqdm
import time
import random
from requests.exceptions import RequestException, ConnectionError
from urllib3.exceptions import ProtocolError
from http.client import RemoteDisconnected
import os

warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
league_to_scrape = 'serie_b'

df = pd.read_csv(f'links for all teams in a league/{league_to_scrape}_team_links.csv')

In [3]:
df.head()

Unnamed: 0,Season,Team Link
0,Clubs - Serie B 11/12,/sampdoria-genua/startseite/verein/1038/saison...
1,Clubs - Serie B 11/12,/delfino-pescara-1936/startseite/verein/2921/s...
2,Clubs - Serie B 11/12,/fc-turin/startseite/verein/416/saison_id/2011
3,Clubs - Serie B 11/12,/brescia-calcio/startseite/verein/19/saison_id...
4,Clubs - Serie B 11/12,/us-sassuolo/startseite/verein/6574/saison_id/...


In [3]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
}


def scrape_team_data(team_url):
    base_url = 'https://www.transfermarkt.co.uk'
    url = f'{base_url}{team_url}'

    attempt = 0
    max_retries = 10

    while attempt < max_retries:
        try:
            response = requests.get(url, headers = headers)

            if response.status_code in [502, 503, 504, 403]:
                wait_time = (2 ** attempt) + random.uniform(1, 3)
                print(f'\nRecieved status {response.status_code}. Retrying in {wait_time:.2f} seconds...')
                time.sleep(wait_time)
                attempt += 1
                continue

            elif response.status_code != 200:
                print(f'Failed to retrieve data for {team_url}. Status code: {response.status_code}.')
                return None
            
            break

        except (RemoteDisconnected, ConnectionError, ProtocolError) as e:
            wait_time = (2 ** attempt) + random.uniform(1, 3)
            print(f'\n Connection error ({type(e).__name__}: {e}). Retrying in {wait_time: .2f} seconds...')
            time.sleep(wait_time)
            attempt += 1

    else:
        print(f'\nSkipping team {team_url} due to repeated errors.')
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', class_ = 'items')
    tbody = table.find('tbody')

    team = soup.find('div', class_ = 'data-header__headline-container').text.strip()

    player_ids = []
    player_names = []
    player_names_for_urls = []
    player_numbers = []

    season = team_url.split('/')[-1]

    if tbody:
        for row in tbody.find_all('tr'):
            player_number = row.find('td', class_ = re.compile('zentriert rueckennummer'))
            if player_number:
                player_number_tag = player_number.find('div', class_ = 'rn_nummer')
                if player_number_tag:
                    player_numbers.append(player_number_tag.text.strip())

            player_name = row.find('td', class_ = 'posrela')
            if player_name:
                inner_table = player_name.find('table', class_ = 'inline-table')
                player_link = inner_table.find('a', href = True) if inner_table else None

                if player_link:
                    href = player_link['href']
                    if '/profil/' in href:
                        player_id = href.split('/')[-1]
                        player_ids.append(player_id)

                        player_name_for_url = href.split('/')[1]
                        player_names_for_urls.append(player_name_for_url)

                        player_name_text = player_link.get_text(strip = True)
                        player_names.append(player_name_text)


    return pd.DataFrame({
        'Team Name': team,
        'Player ID': player_ids,
        'Player Name': player_names, 
        'Player Name for URL': player_names_for_urls,
        'Player Number': player_numbers,
        'Season': season
    })

In [4]:
def print_elapsed_time(start_time, iteration_count, total_iterations):
    elapsed_time = time.time() - start_time
    avg_time_per_iteration = elapsed_time/iteration_count
    remaining_iterations = total_iterations - iteration_count
    estimated_remaining_time = avg_time_per_iteration * remaining_iterations

    hours = int(estimated_remaining_time // 3600)
    mins = int((estimated_remaining_time % 3600) // 60)
    secs = int(estimated_remaining_time % 60)

    print(f'Avg time per iteration - {avg_time_per_iteration: .2f} seconds')
    print(f'Estimated time remaining - {hours} hours, {mins} minutes, and {secs} seconds')

In [None]:
ignore_backup = False
save_every = 50

backup_path = f'backup squad lists/{league_to_scrape}_squad_lists_test.csv'
save_path = f'squad lists/squad_lists_{league_to_scrape}.csv'

if os.path.exists(backup_path) and ignore_backup:
    print('Backup found but it is being ignored. Starting fresh Data Frame.')
    squad_lists = pd.DataFrame()
    iteration_count = 0
    previous_saved_iteration = 0
else:
    if os.path.exists(backup_path):
        squad_lists = pd.read_csv(backup_path)
        if 'previous_saved_iteration' in globals():
            iteration_count = previous_saved_iteration
        else:
            iteration_count = squad_lists.drop_duplicates(subset=['Team Name', 'Season']).shape[0]
        print(f'Loaded backup with {iteration_count} iterations')
    else:
        squad_lists = pd.DataFrame()
        print('Starting a fresh Data Frame')
        iteration_count = 0
        previous_saved_iteration = 0

total_iterations = len(df) - iteration_count

start_time = time.time()


for index, row in tqdm(df.iloc[iteration_count: ].iterrows(), total = total_iterations, desc = 'Scraping Teams', dynamic_ncols = True):
    try:
        team_url = row['Team Link']
        team_data = scrape_team_data(team_url)

        if team_data is not None:
            squad_lists = pd.concat([squad_lists, team_data], ignore_index = True)

        iteration_count += 1

        if (iteration_count) % save_every == 0:
            squad_lists.to_csv(backup_path, index = False)
            previous_saved_iteration = iteration_count
            print(f'\nProgress saved after {iteration_count} iterations.')

            print_elapsed_time(start_time, iteration_count, total_iterations)

        time.sleep(random.uniform(1, 3))

    except Exception as e:
        print(f'\nError at iteration {index}: {e}')
        break

squad_lists.to_csv(save_path)
end_time = time.time()
total_elapsed_time = end_time - start_time

print(f'\nCompleted scraping in {int(total_elapsed_time // 60)} min and {int(total_elapsed_time % 60)} secs')


Loaded backup with 15 iterations


Scraping Teams:   1%|▏         | 4/278 [00:14<17:15,  3.78s/it]


Progress saved after 20 iterations.
Avg time per iteration -  0.82 seconds
Estimated time remaining - 0 hours, 3 minutes, and 30 seconds


Scraping Teams:   3%|▎         | 9/278 [00:35<19:55,  4.44s/it]


Recieved status 403. Retrying in 2.82 seconds...

Recieved status 403. Retrying in 3.94 seconds...

Progress saved after 25 iterations.
Avg time per iteration -  2.58 seconds
Estimated time remaining - 0 hours, 10 minutes, and 53 seconds


Scraping Teams:   4%|▎         | 10/278 [01:06<56:05, 12.56s/it]


Recieved status 403. Retrying in 2.22 seconds...

Recieved status 403. Retrying in 4.55 seconds...


Scraping Teams:   4%|▍         | 11/278 [01:39<1:24:03, 18.89s/it]


Recieved status 403. Retrying in 2.07 seconds...

Recieved status 403. Retrying in 3.76 seconds...

Recieved status 403. Retrying in 6.92 seconds...


Scraping Teams:   4%|▍         | 12/278 [02:26<2:01:41, 27.45s/it]


Recieved status 403. Retrying in 3.48 seconds...

Recieved status 403. Retrying in 4.54 seconds...

Recieved status 403. Retrying in 6.59 seconds...

Recieved status 403. Retrying in 9.09 seconds...

Recieved status 403. Retrying in 17.86 seconds...

Recieved status 403. Retrying in 33.66 seconds...


Scraping Teams:   5%|▌         | 14/278 [04:50<3:13:52, 44.06s/it]


Progress saved after 30 iterations.
Avg time per iteration -  9.73 seconds
Estimated time remaining - 0 hours, 40 minutes, and 13 seconds


Scraping Teams:   5%|▌         | 15/278 [04:53<2:18:54, 31.69s/it]


Recieved status 403. Retrying in 4.00 seconds...

Recieved status 403. Retrying in 4.46 seconds...


Scraping Teams:   7%|▋         | 19/278 [05:58<1:13:49, 17.10s/it]


Progress saved after 35 iterations.
Avg time per iteration -  10.28 seconds
Estimated time remaining - 0 hours, 41 minutes, and 37 seconds


Scraping Teams:   7%|▋         | 19/278 [06:01<1:22:10, 19.04s/it]


KeyboardInterrupt: 

In [37]:
squad_lists

Unnamed: 0,Team Name,Player ID,Player Name,Player Name for URL,Player Number,Season
0,Inter Milan,22412,Júlio César,julio-cesar,1,2011
1,Inter Milan,24316,Emiliano Viviano,emiliano-viviano,33,2011
2,Inter Milan,6058,Luca Castellazzi,luca-castellazzi,12,2011
3,Inter Milan,70580,Raffaele Di Gennaro,raffaele-di-gennaro,91,2011
4,Inter Milan,45583,Paolo Tornaghi,paolo-tornaghi,70,2011
...,...,...,...,...,...,...
11953,Venezia FC,936684,Saad El Haddad,saad-el-haddad,80,2024
11954,Venezia FC,433584,Alessio Zerbin,alessio-zerbin,24,2024
11955,Venezia FC,666268,Daniel Fila,daniel-fila,18,2024
11956,Venezia FC,76274,Christian Gytkjaer,christian-gytkjaer,9,2024
