# 03 Transfermarkt Scraper

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}

In [4]:
page = 'https://www.transfermarkt.de/valencia-cf/startseite/verein/1049'

In [5]:
def get_transfermarkt_club_url(club_name, country='DE'):
    # Format search query
    base_search_url = 'https://www.transfermarkt.de/schnellsuche/ergebnis/schnellsuche'
    query = {'query': club_name}
    search_url = f"{base_search_url}?{urllib.parse.urlencode(query)}"

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find club link in search results
    club_links = soup.select('a[href*="/startseite/verein/"]')
    for link in club_links:
        href = link.get('href', '')
        if '/startseite/verein/' in href:
            return urllib.parse.urljoin("https://www.transfermarkt.de", href)

    return None  # If no match found

In [6]:
club_url = get_transfermarkt_club_url("Valencia CF")
print(club_url)

https://www.transfermarkt.de/fc-valencia/startseite/verein/1049


In [7]:
page = club_url

In [8]:
page_tree = requests.get(page, headers=headers)
page_content = BeautifulSoup(page_tree.content, 'html.parser')

In [9]:
# Elements
players = page_content.find_all('img', {'class': 'bilderrahmen-fixed lazy lazy'})
age = page_content.find_all('td', {'class': 'zentriert'})
positions = page_content.find_all('td', {'class': [
    'zentriert rueckennummer bg_Torwart',
    'zentriert rueckennummer bg_Abwehr',
    'zentriert rueckennummer bg_Mittelfeld',
    'zentriert rueckennummer bg_Sturm']})
nations = page_content.find_all('td', {'class': 'zentriert'})
market_values = page_content.find_all('td', {'class': 'hauptlink'})

In [10]:
# Extract player names
PlayersList = [img.get("alt") for img in players if img.get("alt")]

# # Extract ages
# AgeList = []
# for i in range(1, len(players)*3, 3):
#     AgeList.append(str(age[i]).split(">", 1)[1].split("<", 1)[0])

# Extract ages (only the number inside parentheses)
AgeList = []
for i in range(1, len(players)*3, 3):
    raw_text = str(age[i]).split(">", 1)[1].split("<", 1)[0]
    age_str = raw_text.strip()
    if "(" in age_str and ")" in age_str:
        AgeList.append(age_str.split("(")[-1].split(")")[0])
    else:
        AgeList.append(None)

# Extract positions
PositionsList = []
for i in range(len(positions)):
    PositionsList.append(str(positions[i]).split('title="', 1)[1].split('"><div')[0])

# Extract nationalities
NationList = []
for i in range(2, len(players)*3, 3):
    NationList.append(str(nations[i]).split('title="', 1)[1].split('"', 1)[0])

In [11]:
# Extract only relevant market values
ValuesList = []

for tag in market_values:
    value = tag.text.strip()
    if '€' in value:
        ValuesList.append(value)

In [12]:
cleaned_values = []

for value in ValuesList:
    try:
        value = value.replace('.', '').replace(',', '.').replace('€', '').strip()

        if 'Mio' in value:
            numeric_part = value.replace('Mio', '').strip()
            cleaned_values.append(float(numeric_part) * 1_000_000)

        elif 'Tsd' in value:
            numeric_part = value.replace('Tsd', '').strip()
            cleaned_values.append(float(numeric_part) * 1_000)

        else:
            cleaned_values.append(None)

    except ValueError:
        cleaned_values.append(None)

In [13]:
final_df = pd.DataFrame({
    "Player": PlayersList,
    "Age": AgeList,
    "Position": PositionsList,
    "Nation": NationList,
    "Value": cleaned_values
})

In [14]:
print("Players:", len(PlayersList))
print("Ages:", len(AgeList))
print("Positions:", len(PositionsList))
print("Nations:", len(NationList))
print("Values:", len(cleaned_values))
display(final_df.head())

Players: 26
Ages: 26
Positions: 26
Nations: 26
Values: 26


Unnamed: 0,Player,Age,Position,Nation,Value
0,Giorgi Mamardashvili,24,Torwart,Georgien,30000000.0
1,Stole Dimitrievski,31,Torwart,Nordmazedonien,2500000.0
2,Jaume Doménech,34,Torwart,Spanien,400000.0
3,Cristhian Mosquera,20,Abwehr,Spanien,30000000.0
4,César Tárrega,23,Abwehr,Spanien,10000000.0


---

In [15]:
# TODO: Pipeline for english version // 

In [16]:
# Multi-Team Multi-Season Scraper
# Extended functionality to scrape any team data across multiple seasons

import ssl
import time
import re
from urllib.request import Request, urlopen
import random

In [17]:
def get_team_squad_url(team_name: str, season: int) -> str:
    """Generate squad URL for any team and season using English Transfermarkt."""
    # Use English Transfermarkt URL - Valencia CF specific
    if team_name.lower() == "valencia cf":
        return f"https://www.transfermarkt.com/fc-valencia/kader/verein/1049/saison_id/{season}/plus/1"
    else:
        # For other teams, you'd need to get their team ID first
        team_slug = team_name.lower().replace(' ', '-').replace('cf', 'fc')
        return f"https://www.transfermarkt.com/{team_slug}/kader/verein/1049/saison_id/{season}/plus/1"

def random_delay(min_seconds: float = 1.0, max_seconds: float = 3.0):
    """Add random delay to avoid being blocked."""
    delay = random.uniform(min_seconds, max_seconds)
    time.sleep(delay)

In [18]:
def extract_age_from_cell(age_text: str) -> int:
    """Extract age from the date of birth/age cell."""
    if pd.isna(age_text) or age_text == '':
        return None
    
    # Look for age pattern like "(24)" or "24"
    age_match = re.search(r'\((\d+)\)', str(age_text))
    if age_match:
        return int(age_match.group(1))
    
    return None

def extract_nationality_from_cell(nat_cell) -> str:
    """Extract nationality from the nationality cell using flag alt attribute."""
    if pd.isna(nat_cell) or nat_cell == '':
        return 'Unknown'
    
    # Look for flaggenrahmen img tags
    if hasattr(nat_cell, 'find'):
        flag_imgs = nat_cell.find_all('img', {'class': 'flaggenrahmen'})
        if flag_imgs:
            # Get the first nationality (primary)
            alt_text = flag_imgs[0].get('alt', '')
            if alt_text:
                return alt_text
    
    return 'Unknown'

In [19]:
def scrape_team_season(team_name: str, season: int) -> pd.DataFrame:
    """Scrape team player data for a specific season from English Transfermarkt."""
    url = get_team_squad_url(team_name, season)
    
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
        req = Request(url, headers=headers)
        html = urlopen(req)
        
        # Parse with BeautifulSoup to get the exact table structure
        soup = BeautifulSoup(html, 'html.parser')
        
        # Find the squad table using the exact CSS selector
        squad_table = soup.select_one('div.responsive-table table.items')
        if not squad_table:
            print(f"No squad table found for {team_name} season {season}")
            return pd.DataFrame()
        
        # Find all player rows using the exact CSS selector
        player_rows = squad_table.select('tbody > tr')
        
        # Process the data
        processed_data = []
        
        for row in player_rows:
            try:
                # Extract shirt number
                number_cell = row.select_one('td.rn_nummer')
                shirt_number = number_cell.text.strip() if number_cell else None
                
                # Extract player name and profile link
                name_link = row.select_one('td.hauptlink a')
                player_name = name_link.text.strip() if name_link else 'Unknown'
                profile_url = name_link.get('href') if name_link else None
                
                # Extract player image
                player_img = row.select_one('td.hauptlink img')
                player_photo = None
                if player_img:
                    player_photo = player_img.get('data-src') or player_img.get('src')
                
                # Extract position from inline table
                posrela_cell = row.select_one('td.posrela')
                position = 'Unknown Position'
                if posrela_cell:
                    inline_table = posrela_cell.select_one('table.inline-table')
                    if inline_table:
                        position_rows = inline_table.select('tr')
                        if len(position_rows) > 1:
                            position_cell = position_rows[1].select_one('td')
                            if position_cell:
                                position = position_cell.text.strip()
                
                # Extract age from zentriert cells (find the one with age pattern)
                zentriert_cells = row.select('td.zentriert')
                age = None
                for cell in zentriert_cells:
                    if re.search(r'\(\d+\)', cell.text):
                        age = extract_age_from_cell(cell.text)
                        break
                
                # Extract nationality from flag images
                nationality = 'Unknown'
                flag_imgs = row.select('td img.flaggenrahmen')
                if flag_imgs:
                    nationality = flag_imgs[0].get('alt', 'Unknown')
                
                # Extract market value
                market_value_cell = row.select_one('td.rechts')
                market_value = '€0'
                if market_value_cell:
                    market_value_link = market_value_cell.select_one('a')
                    if market_value_link:
                        market_value = market_value_link.text.strip()
                
                # Extract contract (if available)
                contract = None
                # Look for contract info in the last zentriert cell or specific contract column
                contract_cells = row.select('td.zentriert')
                if len(contract_cells) > 2:  # Assuming contract might be in later zentriert cells
                    for cell in contract_cells[-2:]:  # Check last two zentriert cells
                        cell_text = cell.text.strip()
                        if cell_text and not re.search(r'\(\d+\)', cell_text) and not cell_text.isdigit():
                            contract = cell_text
                            break
                
                # Create player record
                player_record = {
                    'Player': [player_name, position],
                    'Age': age,
                    'Current club': team_name,
                    'Market value': market_value,
                    'Nat.': nationality,
                    'Season': season,
                    'Contract': contract,
                    'Shirt Number': shirt_number,
                    'Profile URL': profile_url,
                    'Photo URL': player_photo
                }
                
                processed_data.append(player_record)
                
            except Exception as e:
                print(f"Error processing player row: {str(e)}")
                continue
        
        result_df = pd.DataFrame(processed_data)
        print(f"Successfully scraped {len(result_df)} players for {team_name} season {season}")
        return result_df
        
    except Exception as e:
        print(f"Error scraping {team_name} season {season}: {str(e)}")
        return pd.DataFrame()

In [20]:
def scrape_team_multiple_seasons(team_name: str, min_season: int, max_season: int) -> pd.DataFrame:
    """Scrape team data for multiple seasons with random delays."""
    all_data = []
    
    for season in range(min_season, max_season + 1):
        print(f"Scraping {team_name} season {season}...")
        season_data = scrape_team_season(team_name, season)
        
        if not season_data.empty:
            all_data.append(season_data)
        
        # Add random delay to avoid being blocked
        if season < max_season:  # Don't delay after the last season
            print(f"Waiting {random.uniform(1.0, 3.0):.1f} seconds before next request...")
            random_delay(1.0, 3.0)
    
    if all_data:
        combined_data = pd.concat(all_data, ignore_index=True)
        combined_data.index = range(len(combined_data.index))
        return combined_data
    else:
        return pd.DataFrame()

In [21]:
# Function to scrape any team with custom parameters
def scrape_any_team(team_name: str, min_season: int, max_season: int, output_filename: str = None):
    """
    Scrape any team's data across multiple seasons from English Transfermarkt.
    """
    print(f"Starting to scrape {team_name} players from season {min_season} to {max_season}")
    print("-" * 60)
    
    team_data = scrape_team_multiple_seasons(team_name, min_season, max_season)
    
    if not team_data.empty:
        print(f"\nScraped {len(team_data)} player records")
        print("\nFirst few records:")
        display(team_data.head())
        
        # Generate output filename if not provided
        if output_filename is None:
            output_filename = f"{team_name.lower().replace(' ', '_')}_players_{min_season}_{max_season}.xlsx"
        
        # Save to Excel
        team_data.to_excel(output_filename, encoding='utf-8', index=False)
        print(f"\nData saved to {output_filename}")
        print(f"Total records: {len(team_data)}")
        
        return team_data
    else:
        print("No data was scraped. Please check the team name and season range.")
        return pd.DataFrame()

In [23]:
# Example usage: scrape Valencia CF from 2020 to 2024
team_name = "Valencia CF"
min_season = 2020
max_season = 2024

print(f"Starting to scrape {team_name} players from season {min_season} to {max_season}")

team_data = scrape_team_multiple_seasons(team_name, min_season, max_season)

if not team_data.empty:
    print(f"\nScraped {len(team_data)} player records")
    print("\nFirst few records:")
    display(team_data.head())
else:
    print("No data was scraped. Please check the team name and season range.")

Starting to scrape Valencia CF players from season 2020 to 2024
Scraping Valencia CF season 2020...
Successfully scraped 35 players for Valencia CF season 2020
Waiting 1.5 seconds before next request...
Scraping Valencia CF season 2021...
Successfully scraped 44 players for Valencia CF season 2021
Waiting 1.1 seconds before next request...
Scraping Valencia CF season 2022...
Successfully scraped 40 players for Valencia CF season 2022
Waiting 2.7 seconds before next request...
Scraping Valencia CF season 2023...
Successfully scraped 36 players for Valencia CF season 2023
Waiting 2.5 seconds before next request...
Scraping Valencia CF season 2024...
Successfully scraped 26 players for Valencia CF season 2024

Scraped 181 player records

First few records:


Unnamed: 0,Player,Age,Current club,Market value,Nat.,Season,Contract,Shirt Number,Profile URL,Photo URL
0,"[Jasper Cillessen, Goalkeeper]",32,Valencia CF,€5.00m,Netherlands,2020,"Jul 1, 2019",,/jasper-cillessen/profil/spieler/146227,
1,"[Jaume Doménech, Goalkeeper]",30,Valencia CF,€4.00m,Spain,2020,"Jul 1, 2015",,/jaume-domenech/profil/spieler/227805,
2,"[Cristian Rivero, Goalkeeper]",23,Valencia CF,€300k,Spain,2020,"Aug 1, 2020",,/cristian-rivero/profil/spieler/398131,
3,"[Unai Etxebarria, Goalkeeper]",24,Valencia CF,€150k,Spain,2020,,,/unai-etxebarria/profil/spieler/288376,
4,"[Gabriel Paulista, Centre-Back]",30,Valencia CF,€15.00m,Brazil,2020,"Aug 18, 2017",,/gabriel-paulista/profil/spieler/149498,



Data saved to valencia_cf_players_2020_2024.xlsx
Total records: 181


NOTE: We can remove the Shirt Number, Photo URL and Profile URL columns as they are not needed for our analysis

In [26]:
team_data.drop(columns=['Shirt Number', 'Photo URL', 'Profile URL'], inplace=True)

In [27]:
team_data.head()

Unnamed: 0,Player,Age,Current club,Market value,Nat.,Season,Contract
0,"[Jasper Cillessen, Goalkeeper]",32,Valencia CF,€5.00m,Netherlands,2020,"Jul 1, 2019"
1,"[Jaume Doménech, Goalkeeper]",30,Valencia CF,€4.00m,Spain,2020,"Jul 1, 2015"
2,"[Cristian Rivero, Goalkeeper]",23,Valencia CF,€300k,Spain,2020,"Aug 1, 2020"
3,"[Unai Etxebarria, Goalkeeper]",24,Valencia CF,€150k,Spain,2020,
4,"[Gabriel Paulista, Centre-Back]",30,Valencia CF,€15.00m,Brazil,2020,"Aug 18, 2017"


In [24]:
# Save to Excel (removed encoding parameter)
output_filename = f"data/{team_name.lower().replace(' ', '_')}_players_{min_season}_{max_season}.xlsx"
team_data.to_excel(output_filename, index=False)
print(f"\nData saved to {output_filename}")
print(f"Total records: {len(team_data)}")


Data saved to data/valencia_cf_players_2020_2024.xlsx
Total records: 181


NOTE: This is our raw data, which we will save before cleaning it in the next pipeline step and merging it with the other data sources