In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import warnings
from tqdm import tqdm
import time
import random
import os
from requests.exceptions import RequestException, ConnectionError
from urllib3.exceptions import ProtocolError
from http.client import RemoteDisconnected

warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [None]:
# Pool of desktop browser User-Agent strings. scrape_player_statistics picks one
# at random (random.choice) for every request attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Running total of players that could not be fetched. It lives at module level so
# the "rest every N failures" rule spans the whole scraping run, not one call.
_scrape_failures = {'count': 0}

# Table header title -> output key, for cells whose stripped text is stored as is.
_PLAIN_TEXT_FIELDS = {
    'Date': 'Date',
    'Goals': 'Goals',
    'Assists': 'Assists',
    'Own goals': 'Own Goals',
    'Yellow cards': 'Yellow Cards',
    'Second yellow cards': 'Second Yellow Cards',
    'Red cards': 'Red Cards',
    'Substitutions on': 'Subbed On',
    'Minutes played': 'Minutes Played',
}

# Synthetic header name -> output key, for the league-position cells.
_POSITION_FIELDS = {
    'Home Team Position': 'Home League Position',
    'Away Team Position': 'Away League Position',
}


def _rest_after_failure(every, message):
    """Count one failed player; on every `every`-th failure print `message` and sleep 120 s."""
    _scrape_failures['count'] += 1
    if _scrape_failures['count'] % every == 0:
        print(message)
        time.sleep(120)  # Extended rest period


def _fetch_player_page(detailed_url, player_name_tag, season, max_retries=10):
    """Download `detailed_url` with retries; return the body bytes, or None on failure."""
    retryable_errors = (RemoteDisconnected, ConnectionError, ProtocolError, requests.exceptions.Timeout)

    # The context manager closes the session's connection pool on every exit path.
    with requests.Session() as session:
        for attempt in range(max_retries):
            try:
                # A fresh User-Agent is drawn for every attempt.
                session.headers.update({'User-Agent': random.choice(user_agents)})

                if attempt > 0:
                    time.sleep(random.uniform(0.5, 1.5))  # small jitter before a retry

                response = session.get(detailed_url, timeout=(10, 30))
            except retryable_errors as e:
                wait_time = (2 ** (attempt + 1)) + random.uniform(2, 5)
                print(f'\nConnection error ({type(e).__name__}: {e}). Retrying in {wait_time:.2f} secs...')
                time.sleep(wait_time)
                continue

            status = response.status_code
            if status == 200:
                return response.content

            if status == 503:
                wait_time = (2 ** (attempt + 1)) + random.uniform(2, 6)
                print(f'\n503 Service unavailable for {player_name_tag} in season {season}. Backing off for {wait_time:.2f} secs...')
                time.sleep(wait_time)
                continue

            if status in (403, 429, 502, 504):
                wait_time = (2 ** (attempt + 1)) + random.uniform(1, 3)
                print(f'\nRecieved status {status} for {player_name_tag} in season {season}. Retrying in {wait_time:.2f} secs...')
                time.sleep(wait_time)
                continue

            # Any other status is treated as permanent for this player.
            print(f"Failed to retrieve data for player - {player_name_tag} - in season {season}")
            print(f"Status Code: {status}")
            _rest_after_failure(10, 'Ran into too many errors, resting the code for 2 minutes')
            return None

    print(f'\nSkipping player {player_name_tag} in season {season} due to repeated failures.')
    _rest_after_failure(5, 'Ran into multiple errors, resting the code for two minutes')
    return None


def _parse_player_profile(soup):
    """Read name, shirt number and the itemprop-tagged header fields from the page."""
    headline = soup.find('h1', class_="data-header__headline-wrapper")
    # A page without the headline yields empty name/number instead of an AttributeError.
    name_and_number = headline.text.strip() if headline is not None else ''

    match = re.match(r"#(\d+)\s*(.*)", name_and_number)
    if match:
        player_number, player_name = match.group(1), match.group(2)
    else:
        player_number, player_name = '', name_and_number

    profile = {
        'Player Name': player_name,
        'Player No.': player_number,
        'Date of Birth': '',
        'Place of Birth': '',
        'citizenship': '',
        'height': '',
    }

    for item in soup.find_all('span', class_='data-header__content'):
        itemprop = item.get('itemprop')
        if itemprop == 'birthDate':
            # Keep only the part before " (", dropping the trailing parenthesised text.
            profile['Date of Birth'] = item.get_text(strip=True).split(' (')[0]
        elif itemprop == 'birthPlace':
            profile['Place of Birth'] = item.get_text(strip=True)
        elif itemprop == 'nationality':
            profile['citizenship'] = item.get_text(strip=True).strip()
        elif itemprop == 'height':
            profile['height'] = item.get_text(strip=True)

    return profile


def _index_table_columns(table):
    """Map each header name of `table` to the index of its first body cell."""
    col_index = {}
    thead = table.find('thead')
    if thead is None:
        return col_index

    idx = 0
    for th in thead.find_all('th'):
        title_holder = th.find(attrs={'title': True})
        colspan = int(th.get('colspan', 1))

        if title_holder:
            header_name = title_holder['title']
        else:
            header_name = th.get_text(strip=True) or 'Unnamed Header'

        if colspan > 1 and "Home team" in header_name:
            # A spanning team header is split into a name cell and a position cell.
            col_index["Home Team"] = idx
            col_index["Home Team Position"] = idx + 1
            idx += 2
        elif colspan > 1 and "Away team" in header_name:
            col_index["Away Team"] = idx
            col_index["Away Team Position"] = idx + 1
            idx += 2
        else:
            col_index[header_name] = idx
            idx += max(colspan, 1)

    return col_index


def _parse_match_row(columns, col_index, template):
    """Return a copy of `template` filled from the `td` cells of one table row."""
    match_data = dict(template)

    for field_name, idx in col_index.items():
        if idx >= len(columns):
            continue
        cell = columns[idx]

        if field_name in _PLAIN_TEXT_FIELDS:
            match_data[_PLAIN_TEXT_FIELDS[field_name]] = cell.text.strip()
        elif field_name == 'Matchday':
            match_data['Matchday'] = cell.text.strip()
            link = cell.find('a', href=True)
            if link is not None:
                # The first path segment of the link is used as the competition name.
                href_parts = link['href'].split('/')
                if len(href_parts) > 1:
                    match_data['Competition'] = href_parts[1].replace('-', ' ').title()
        elif field_name in ('Home Team', 'Away Team'):
            anchor = cell.find('a')
            match_data[field_name] = anchor.get('title') if anchor is not None else cell.text.strip()
        elif field_name in _POSITION_FIELDS:
            badge = cell.find('span', class_='tabellenplatz')
            match_data[_POSITION_FIELDS[field_name]] = badge.text.strip('().') if badge is not None else ''
        elif field_name == 'Result':
            score = cell.find('span', class_=('greentext', 'redtext', ''))
            if score is not None:
                match_data['Result'] = score.text.strip()
                match_link = cell.find('a', class_='ergebnis-link')
                match_data['Match ID'] = match_link.get('id') if match_link and match_link.get('id') else ''
        elif field_name == 'Pos.':
            match_data['Position'] = cell.text.strip()
            match_data['Captain'] = 1 if cell.find('span', class_='kapitaenicon-table icons_sprite') else ''
        elif field_name == 'Substitutions off':
            match_data['Subbed Off'] = cell.text.strip()
            match_data['Injury Related Substitution'] = 1 if cell.find('span', class_='verletzt-table mittig-vom-text icons_sprite') else ''

    return match_data


def scrape_player_statistics(player_name_tag, player_id, season):
    """Scrape one player's per-match statistics for one season from transfermarkt.com.

    Parameters
    ----------
    player_name_tag : str
        Player slug used in the URL path (e.g. 'gerard-pique').
    player_id : str or int
        Player id used in the URL path; stored in the output as a string.
    season : str or int
        Season value placed in the URL path.

    Returns
    -------
    pandas.DataFrame or None
        One row per table-body row that has more than one cell, taken from every
        'responsive-table' div except the first. None when the page could not be
        downloaded or no such rows were found.

    Notes
    -----
    Sleeps between retries (exponential backoff plus jitter) and rests for two
    minutes on every 10th permanent-status failure / 5th exhausted-retries failure.
    """
    detailed_url = f'https://www.transfermarkt.com/{player_name_tag}/leistungsdatendetails/spieler/{player_id}/saison/{season}/verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'

    page = _fetch_player_page(detailed_url, player_name_tag, season)
    if page is None:
        return None

    soup = BeautifulSoup(page, 'html.parser')
    profile = _parse_player_profile(soup)

    # Key order here fixes the column order of the returned frame.
    template = {
        'Player ID': str(player_id),
        'Player Name': profile['Player Name'],
        'Player No.': profile['Player No.'],
        'Date of Birth': profile['Date of Birth'],
        'Place of Birth': profile['Place of Birth'],
        'citizenship': profile['citizenship'],
        'height': profile['height'],
        'Match ID': '',
        'Competition': '',
        'Matchday': '',
        'Date': '',
        'Home Team': '',
        'Away Team': '',
        'Home League Position': '',
        'Away League Position': '',
        'Result': '',
        'Position': '',
        'Captain': '',
        'Goals': '',
        'Assists': '',
        'Own Goals': '',
        'Yellow Cards': '',
        'Second Yellow Cards': '',
        'Red Cards': '',
        'Subbed On': '',
        'Subbed Off': '',
        'Injury Related Substitution': '',
        'Minutes Played': ''
    }

    rows = []
    # The first 'responsive-table' div is skipped; the rest are parsed row by row.
    for table in soup.find_all('div', class_='responsive-table')[1:]:
        col_index = _index_table_columns(table)
        tbody = table.find('tbody')
        if tbody is None:
            continue
        for row in tbody.find_all('tr'):
            columns = row.find_all('td')
            if len(columns) > 1:
                rows.append(_parse_match_row(columns, col_index, template))

    return pd.DataFrame(rows) if rows else None

In [4]:
# Sample inputs (URL slug, player id, season) for a single-player smoke test.
player_name_tag, player_id, season = 'gerard-pique', '18944', '2011'

In [5]:
# Smoke test: scrape the sample player and display the resulting frame.
scrape_player_statistics(
    player_name_tag=player_name_tag,
    player_id=player_id,
    season=season,
)

Unnamed: 0,Player ID,Player Name,Player No.,Date of Birth,Place of Birth,citizenship,height,Match ID,Competition,Matchday,...,Goals,Assists,Own Goals,Yellow Cards,Second Yellow Cards,Red Cards,Subbed On,Subbed Off,Injury Related Substitution,Minutes Played
0,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1141846,Uefa Champions League,Group H,...,,,,,,,,,,
1,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1141870,Uefa Champions League,Group H,...,,,,,,,,,,
2,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1141887,Uefa Champions League,Group H,...,,,,,,,,,,
3,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1141895,Uefa Champions League,Group H,...,,1,,,,,,,,90'
4,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1141924,Uefa Champions League,Group H,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1169007,Uefa Euro,Group C,...,,,,,,,,,,90'
75,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",1169008,Uefa Euro,Group C,...,,,,,,,,,,90'
76,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",2219615,Uefa Euro,Quarter-Finals,...,,,,,,,,,,90'
77,18944,Gerard Piqué,,02/02/1987,Barcelona,Spain,"1,94 m",2221241,Uefa Euro,Semi-Finals,...,,,,,,,,,,120'


In [6]:
def calculate_remaining_time(start_time, iteration_count, total_iterations):
    """Estimate the time left from the average time per finished iteration.

    Parameters
    ----------
    start_time : float
        `time.time()` timestamp taken when the run started.
    iteration_count : int
        Iterations finished since `start_time`.
    total_iterations : int
        Iterations planned for the run.

    Returns
    -------
    tuple
        `(hours, mins, secs, avg_time)`: the estimate split into whole hours,
        minutes and seconds, plus the average seconds per iteration.
        `(0, 0, 0, 0.0)` when nothing has finished yet; the estimate is clamped
        to zero when `iteration_count` exceeds `total_iterations`.
    """
    # No finished iteration means no average to extrapolate from.
    if iteration_count <= 0:
        return 0, 0, 0, 0.0

    elapsed_time = time.time() - start_time
    avg_time = elapsed_time / iteration_count

    # Never report a negative remainder when the count overshoots the total.
    remaining_iterations = max(total_iterations - iteration_count, 0)
    estimated_remaining_time = avg_time * remaining_iterations

    hours, leftover = divmod(int(estimated_remaining_time), 3600)
    mins, secs = divmod(leftover, 60)

    return hours, mins, secs, avg_time

In [8]:
# League slug; interpolated into the squad-list, backup and output CSV paths below.
league = 'la_liga_2'

In [9]:
# NOTE(review): not referenced by any cell visible in this notebook; confirm it is still needed.
skipped_players = []

In [10]:
# Run configuration.
ignore_backup = True   # when True an existing backup is not loaded and every row is scraped again
save_every = 50        # write the backup CSV after every `save_every` handled rows
data_buffer = []       # per-player frames waiting to be merged into player_data
buffer_size = 50       # merge the buffer once it holds this many frames
batch_size = 10        # take a longer pause after every `batch_size` handled rows


def _flush_buffer(frame, buffer):
    """Merge every buffered frame into `frame`, empty `buffer` in place, return the result."""
    if not buffer:
        return frame
    # An empty accumulator is left out so concat only sees frames that carry rows.
    parts = list(buffer) if frame.empty else [frame] + list(buffer)
    merged = pd.concat(parts, ignore_index=True)
    buffer.clear()
    return merged


# Disable AC standby so a long run is not suspended. powercfg is a Windows tool,
# so the call is skipped elsewhere. The setting is not restored afterwards.
if os.name == 'nt':
    os.system('powercfg -change -standby-timeout-ac 0')

data = pd.read_csv(f'squad lists/squad_lists_{league}.csv')
backup_path = f'backups/player_data_{league}.csv'
save_path = f'player data/player_data_{league}.csv'

# DataFrame.to_csv does not create missing folders.
for output_path in (backup_path, save_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

player_data = pd.DataFrame()
iteration_count = 0            # rows handled during this run
previous_saved_iteration = 0   # value of iteration_count at the last periodic save
completed_pairs = set()        # (player id, season) pairs already present in the backup

if os.path.exists(backup_path) and ignore_backup:
    print('Backup found but it is being ignored. Starting fresh Data Frame.')
elif os.path.exists(backup_path):
    try:
        player_data = pd.read_csv(backup_path)
    except pd.errors.EmptyDataError:
        player_data = pd.DataFrame()
    # Resume from what the backup actually contains instead of a row offset or
    # leftover kernel variables. Keys are compared as strings because the CSV
    # round trip may change dtypes.
    if {'Player ID', 'Season'}.issubset(player_data.columns):
        completed_pairs = set(zip(player_data['Player ID'].astype(str),
                                  player_data['Season'].astype(str)))
    print(f'Loaded backup with {len(completed_pairs)} iterations')
else:
    print('Starting a fresh Data Frame')

# Rows still to scrape: season 2024 is excluded, as is anything already backed up.
# Players whose earlier scrape returned no rows are not in the backup and are retried.
pending_mask = pd.Series(
    [
        bool(row_season != 2024) and (str(row_player_id), str(row_season)) not in completed_pairs
        for row_player_id, row_season in zip(data['Player ID'], data['Season'])
    ],
    index=data.index,
    dtype=bool,
)
pending_rows = data[pending_mask]

total_iterations = len(pending_rows)
start_time = time.time()
consecutive_errors = 0
max_consecutive_errors = 10

try:
    for _, row in tqdm(pending_rows.iterrows(), total=total_iterations, desc='Scraping Players', dynamic_ncols=True):
        try:
            player_id = row['Player ID']
            season = row['Season']
            team = row['Team Name']
            player_name_tag = row['Player Name for URL']

            # After every batch_size iterations, take a longer break
            if iteration_count % batch_size == 0 and iteration_count != 0:
                batch_pause = random.uniform(8, 15)
                print(f"\nTaking a batch pause of {batch_pause:.2f} seconds...")
                time.sleep(batch_pause)

            # Extra backoff after encountering errors
            if consecutive_errors > 3:
                cooldown = consecutive_errors * 20  # Longer cooldown based on error count
                print(f"\nToo many consecutive errors. Cooling down for {cooldown} seconds...")
                time.sleep(cooldown)
                consecutive_errors = 0  # Reset after cooling down

            single_player_data = scrape_player_statistics(player_name_tag, player_id, season)

            if single_player_data is not None:
                single_player_data['Season'] = season
                single_player_data['Team'] = team
                data_buffer.append(single_player_data)
                consecutive_errors = 0  # Reset on success
            else:
                # None means either a failed download or a page without match rows.
                consecutive_errors += 1

            # Check if we need to abort due to too many errors
            if consecutive_errors >= max_consecutive_errors:
                print(f"\nAborting due to {consecutive_errors} consecutive errors.")
                break

            iteration_count += 1

            # Show initial time estimate after 10 iterations
            if iteration_count == 10:
                hours, mins, secs, _ = calculate_remaining_time(start_time, iteration_count, total_iterations)
                print(f'Initial code run time estimate - {hours} hours, {mins} minutes, and {secs} seconds.')

            # Process buffer when it reaches threshold
            if len(data_buffer) >= buffer_size:
                player_data = _flush_buffer(player_data, data_buffer)

            # Save progress periodically
            if iteration_count % save_every == 0:
                player_data = _flush_buffer(player_data, data_buffer)
                player_data.to_csv(backup_path, index=False)
                previous_saved_iteration = iteration_count
                print(f'\nProgress saved after {iteration_count} iterations.')

                hours, mins, secs, avg_time = calculate_remaining_time(start_time, iteration_count, total_iterations)
                print(f'Avg time per iteration - {avg_time:.2f} seconds.')
                print(f'Estimated time remaining - {hours} hours, {mins} minutes, and {secs} seconds.')

            # Variable delay between requests
            delay = random.uniform(3, 8)
            time.sleep(delay)

        except Exception as e:
            print(f'\nError at iteration {iteration_count}: {e}')
            consecutive_errors += 1

            # Emergency save on exception. The buffer is emptied as part of the
            # merge, so a failed write cannot cause the same frames to be merged twice.
            if data_buffer:
                try:
                    player_data = _flush_buffer(player_data, data_buffer)
                    player_data.to_csv(backup_path, index=False)
                    print(f"Emergency backup saved at iteration {iteration_count}")
                except Exception as save_error:
                    print(f"Could not save emergency backup: {save_error}")

            # Break if too many consecutive errors
            if consecutive_errors >= max_consecutive_errors:
                print(f"Too many consecutive errors ({consecutive_errors}). Stopping.")
                break

            # Cooldown after an error
            error_cooldown = random.uniform(10, 20)
            print(f"Cooling down for {error_cooldown:.2f} seconds after error...")
            time.sleep(error_cooldown)
finally:
    # Runs on normal completion, on abort and on KeyboardInterrupt alike, so
    # frames still sitting in the buffer always reach the backup file.
    player_data = _flush_buffer(player_data, data_buffer)
    if not player_data.empty:
        player_data.to_csv(backup_path, index=False)

# Save final data (skipped when the loop was interrupted; the backup holds the progress)
player_data.to_csv(save_path, index=False)
print(f'Final data saved at {save_path}')

# Report total time
total_elapsed_time = time.time() - start_time
elapsed_hours = int(total_elapsed_time // 3600)
elapsed_minutes = int((total_elapsed_time % 3600) // 60)
elapsed_seconds = int(total_elapsed_time % 60)
print(f'\nCompleted scraping in {elapsed_hours} hr, {elapsed_minutes} mins, and {elapsed_seconds} secs.')

Backup found but it is being ignored. Starting fresh Data Frame.


Scraping Players:   0%|          | 0/10331 [00:00<?, ?it/s]


Taking a batch pause of 8.88 seconds...


Scraping Players:   0%|          | 5/10331 [00:52<27:05:34,  9.45s/it]


Recieved status 403 for sergi-gomez in season 2011. Retrying in 3.93 secs...


Scraping Players:   0%|          | 7/10331 [01:21<32:33:30, 11.35s/it]


Recieved status 403 for patric in season 2011. Retrying in 3.29 secs...

Recieved status 403 for patric in season 2011. Retrying in 3.06 secs...

Recieved status 403 for patric in season 2011. Retrying in 6.37 secs...


Scraping Players:   0%|          | 8/10331 [02:13<69:28:44, 24.23s/it]


Recieved status 403 for carles-planas in season 2011. Retrying in 2.17 secs...

Recieved status 403 for carles-planas in season 2011. Retrying in 4.28 secs...

Recieved status 403 for carles-planas in season 2011. Retrying in 6.24 secs...

Recieved status 403 for carles-planas in season 2011. Retrying in 10.48 secs...

Recieved status 403 for carles-planas in season 2011. Retrying in 18.17 secs...


Scraping Players:   0%|          | 9/10331 [03:56<139:54:55, 48.80s/it]

Initial code run time estimate - 68 hours, 3 minutes, and 12 seconds.


Scraping Players:   0%|          | 10/10331 [04:03<103:42:12, 36.17s/it]


Taking a batch pause of 8.74 seconds...

Recieved status 403 for martin-montoya in season 2011. Retrying in 3.49 secs...


Scraping Players:   0%|          | 11/10331 [04:35<99:24:57, 34.68s/it] 


Recieved status 403 for kiko-femenia in season 2011. Retrying in 2.06 secs...


Scraping Players:   0%|          | 13/10331 [05:14<69:23:07, 24.21s/it]


KeyboardInterrupt: 