# 03 Transfermarkt Scraper

In [137]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse

In [138]:
# Browser-like User-Agent sent when fetching the club page.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/124.0.0.0 Safari/537.36'
    ),
}

In [139]:
# Hard-coded club overview URL; a later cell reassigns `page` to the URL
# resolved through the site search, so this value only matters if that cell is skipped.
page = 'https://www.transfermarkt.de/valencia-cf/startseite/verein/1049'

In [140]:
def get_transfermarkt_club_url(club_name, country='DE', timeout=10):
    """Resolve a club name to its Transfermarkt overview URL via the quick search.

    Parameters
    ----------
    club_name : str
        Search text; it is URL-encoded into the ``query`` parameter.
    country : str, optional
        Currently unused. Kept so existing callers keep working.
    timeout : float, optional
        Seconds to wait for the search request before ``requests`` gives up.
        Without a timeout a stalled connection blocks the kernel indefinitely.

    Returns
    -------
    str or None
        Absolute URL of the first ``/startseite/verein/`` link found in the
        search results, or ``None`` when the server answers with an HTTP
        error status or no such link is present.
    """
    # Build the quick-search URL with a properly encoded query string.
    base_search_url = 'https://www.transfermarkt.de/schnellsuche/ergebnis/schnellsuche'
    search_url = f"{base_search_url}?{urllib.parse.urlencode({'query': club_name})}"

    # Local headers on purpose: the function does not depend on notebook globals.
    request_headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    response = requests.get(search_url, headers=request_headers, timeout=timeout)
    if not response.ok:
        # An error page contains no usable results; report "not found".
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The selector already guarantees the href contains the club path,
    # so the first match is the answer.
    first_club_link = soup.select_one('a[href*="/startseite/verein/"]')
    if first_club_link is None:
        return None  # If no match found

    return urllib.parse.urljoin("https://www.transfermarkt.de", first_club_link.get('href', ''))

In [141]:
# Resolve the club's overview page through the site search and show the result
# (the function returns None when no club link is found).
club_url = get_transfermarkt_club_url("Valencia CF")
print(club_url)

https://www.transfermarkt.de/fc-valencia/startseite/verein/1049


In [142]:
# Scrape the URL resolved by the search instead of the hard-coded one.
page = club_url

In [143]:
# Fetch the club page; the timeout keeps a stalled connection from hanging the kernel.
page_tree = requests.get(page, headers=headers, timeout=10)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page
# into empty result lists further down.
page_tree.raise_for_status()
page_content = BeautifulSoup(page_tree.content, 'html.parser')

In [144]:
# Raw element collections pulled from the parsed club page.

# Player portrait images.
players = page_content.find_all('img', class_='bilderrahmen-fixed lazy lazy')

# Centered table cells; the age and nationality extraction both index into these.
age = page_content.find_all('td', class_='zentriert')

# Shirt-number cells, one CSS class string per position group.
position_classes = [
    f'zentriert rueckennummer bg_{group}'
    for group in ('Torwart', 'Abwehr', 'Mittelfeld', 'Sturm')
]
positions = page_content.find_all('td', class_=position_classes)

nations = page_content.find_all('td', class_='zentriert')

# 'hauptlink' cells; filtered down to market values in a later step.
market_values = page_content.find_all('td', class_='hauptlink')

In [145]:
# Number of 'zentriert' cells consumed per player when indexing `age` / `nations`:
# offset 1 is read as the birth-date/age cell and offset 2 as the nationality cell.
# NOTE(review): this mirrors the stride the notebook has always used - confirm
# against the live page layout if the lists ever come out misaligned.
CELLS_PER_PLAYER = 3


def _first_title(cell):
    """Return the first ``title`` attribute on ``cell`` or its descendants, else None.

    Reading the attribute through the tag API yields the decoded value
    (e.g. ``&`` rather than ``&amp;``), unlike splitting ``str(cell)``.
    """
    if cell.has_attr('title'):
        return cell['title']
    titled = cell.find(attrs={'title': True})
    return titled['title'] if titled is not None else None


# Extract player names from the portrait images' alt text (images without alt are skipped)
PlayersList = [img.get("alt") for img in players if img.get("alt")]

cell_count = len(players) * CELLS_PER_PLAYER

# Extract ages (only the number inside parentheses); None when the cell has no "(...)"
AgeList = []
for cell in age[1:cell_count:CELLS_PER_PLAYER]:
    cell_text = cell.get_text(strip=True)
    if "(" in cell_text and ")" in cell_text:
        AgeList.append(cell_text.split("(")[-1].split(")")[0])
    else:
        AgeList.append(None)

# Extract positions from the title of each shirt-number cell
PositionsList = [_first_title(cell) for cell in positions]

# Extract nationalities: the first title found inside each nationality cell
NationList = [_first_title(cell) for cell in nations[2:cell_count:CELLS_PER_PLAYER]]

In [146]:
# Keep only the cells whose stripped text shows a euro amount.
ValuesList = [
    cell_text
    for cell_text in (cell.text.strip() for cell in market_values)
    if '€' in cell_text
]

In [147]:
# German unit abbreviations and the factor that converts them to euros.
UNIT_FACTORS = {
    'Mrd': 1_000_000_000,
    'Mio': 1_000_000,
    'Tsd': 1_000,
}


def _parse_market_value(text):
    """Convert a German-formatted amount such as '30,00 Mio. €' to euros as a float.

    Returns None when the text carries no known unit or the numeric part
    cannot be parsed.
    """
    # Drop thousands dots / abbreviation dots, switch the decimal comma to a
    # point and remove the currency sign: '30,00 Mio. €' -> '30.00 Mio'.
    normalized = text.replace('.', '').replace(',', '.').replace('€', '').strip()

    for unit, factor in UNIT_FACTORS.items():
        if unit in normalized:
            numeric_part = normalized.replace(unit, '').strip()
            try:
                # Round to cents so float artefacts such as
                # 1.1 * 1_000_000 == 1100000.0000000002 do not leak into the data.
                return round(float(numeric_part) * factor, 2)
            except ValueError:
                return None

    return None


# One entry per scraped value, in the same order as ValuesList.
cleaned_values = [_parse_market_value(value) for value in ValuesList]

In [149]:
# Combine the per-player lists into one table, one column per list.
final_df = pd.DataFrame(
    data=dict(
        Player=PlayersList,
        Age=AgeList,
        Position=PositionsList,
        Nation=NationList,
        Value=cleaned_values,
    )
)

In [150]:
# Sanity check: print the length of every column list, then preview the table.
for label, column in (
    ("Players", PlayersList),
    ("Ages", AgeList),
    ("Positions", PositionsList),
    ("Nations", NationList),
    ("Values", cleaned_values),
):
    print(f"{label}:", len(column))
display(final_df.head())

Players: 26
Ages: 26
Positions: 26
Nations: 26
Values: 26


Unnamed: 0,Player,Age,Position,Nation,Value
0,Giorgi Mamardashvili,24,Torwart,Georgien,30000000.0
1,Stole Dimitrievski,31,Torwart,Nordmazedonien,2500000.0
2,Jaume Doménech,34,Torwart,Spanien,400000.0
3,Cristhian Mosquera,20,Abwehr,Spanien,30000000.0
4,César Tárrega,23,Abwehr,Spanien,10000000.0
