<a href="https://colab.research.google.com/github/nickklos10/SerieA_Machine_Learning_Predictions_2025/blob/main/Transfer_SerieA_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

# Transfermarkt transfer overview for Serie A (competition code IT1 in the URL),
# requested for saison_id=1990.
# NOTE(review): the query string sets `intern` twice (`intern=0&intern=1`) — confirm
# which value is intended.
url_market = "https://www.transfermarkt.us/serie-a/transfers/wettbewerb/IT1/plus/?saison_id=1990&s_w=&leihe=0&intern=0&intern=1"

# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url_market, headers=headers, timeout=30)

# Fail loudly on anything other than 200: every later cell needs `html_content`,
# and merely printing here would leave it undefined (or stale from an earlier run),
# which only surfaces as a confusing error further down the notebook.
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code {response.status_code}")

print("Request successful!")
html_content = response.text


Request successful!


In [None]:
# Keep a local copy of the downloaded HTML so the page can be inspected offline.
with open('page_source.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(html_content)

print("HTML content saved to 'page_source.html'")

HTML content saved to 'page_source.html'


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Parse the downloaded page into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [None]:
# Select the <h2> headlines that carry the inverted + logo headline classes.
# Note: `team_headers` is assigned again, with a broader class filter, by the
# scraping cell that follows, so this result is only useful for inspection.
team_headers = soup.find_all(
    name='h2',
    class_='content-box-headline content-box-headline--inverted content-box-headline--logo',
)

In [None]:
import re
from datetime import date


def _transfer_direction(table):
    """Classify a transfer table as 'In', 'Out' or 'Unknown' from its first header cell."""
    header_row = table.find('tr')
    # A table without any row/header cell is reported as 'Unknown' instead of raising.
    first_th = header_row.find('th') if header_row is not None else None
    if first_th is None:
        return 'Unknown'
    label = first_th.text.strip()
    if 'In' in label:
        return 'In'
    if 'Out' in label:
        return 'Out'
    return 'Unknown'


def _parse_transfer_row(cells, team_name, transfer_type):
    """Build one transfer record (dict) from the <td> cells of a table row.

    Callers must pass at least 9 cells; cells 3 and 6 are not used.
    """
    player_link = cells[0].find('a', title=True)
    flag_images = cells[2].find_all('img')
    club_cell = cells[7]
    club_link = club_cell.find('a', title=True)
    fee_cell = cells[8]
    fee_link = fee_cell.find('a')

    return {
        'Team': team_name,
        'Transfer Type': transfer_type,
        'Player': player_link['title'].strip() if player_link else '',
        'Age': cells[1].text.strip(),
        # One title per flag image, joined with ", ".
        'Nationality': ', '.join(img['title'] for img in flag_images if img.has_attr('title')),
        'Position': cells[4].text.strip(),
        'Market Value': cells[5].text.strip(),
        # Holds the counterpart club for both directions (source for 'In', destination
        # for 'Out'); the key name is kept so the CSV schema stays unchanged.
        'From Club': club_link['title'].strip() if club_link else club_cell.text.strip(),
        'Fee': fee_link.text.strip() if fee_link else fee_cell.text.strip(),
    }


def _season_start_year(season_text, today=None):
    """Return the four-digit start year (as a string) of a 'YY/YY' season label.

    The century is inferred rather than hard-coded: the two-digit year is placed in the
    current century unless that would lie in the future, in which case the previous
    century is used (so '90/91' -> '1990' and '05/06' -> '2005'). Returns 'Unknown'
    when no 'YY/YY' pattern is present.
    """
    match = re.search(r'(\d{2})/\d{2}', season_text)
    if not match:
        return 'Unknown'
    current_year = (today or date.today()).year
    year = current_year - current_year % 100 + int(match.group(1))
    if year > current_year:
        year -= 100
    return str(year)


all_transfers = []

team_headers = soup.find_all('h2', class_='content-box-headline')

for team_header in team_headers:
    # Prefer the linked title; fall back to the headline's own text.
    team_name_tag = team_header.find('a', title=True)
    team_name = team_name_tag['title'] if team_name_tag else team_header.text.strip()
    print(f"Extracting transfers for {team_name}...")

    # Walk the elements following this headline until the next <h2> sibling.
    for sibling in team_header.find_next_siblings():
        if sibling.name == 'h2':
            break
        if sibling.name != 'div' or 'responsive-table' not in sibling.get('class', []):
            continue

        transfer_table = sibling.find('table')
        if transfer_table is None:
            continue

        transfer_type = _transfer_direction(transfer_table)
        for row in transfer_table.find_all('tr'):
            cells = row.find_all('td')
            # Rows with fewer than 9 <td> cells (e.g. header rows) are skipped.
            if len(cells) >= 9:
                all_transfers.append(_parse_transfer_row(cells, team_name, transfer_type))

df_transfers = pd.DataFrame(all_transfers)

# Derive the season from the page's <h1> headline.
season_header = soup.find('h1', class_='content-box-headline')
season_text = season_header.text.strip() if season_header else ''
season_year = _season_start_year(season_text)

df_transfers['Year'] = season_year

print(f"Season Year: {season_year}")

csv_filename = f'serie_a_transfers{season_year}.csv'

df_transfers.to_csv(csv_filename, index=False)

print(f"Transfer data saved to '{csv_filename}'")

print(df_transfers.head())

Extracting transfers for Transfer record...
Extracting transfers for Juventus FC...
Extracting transfers for SSC Napoli...
Extracting transfers for AC Pisa 1909...
Extracting transfers for AC Cesena...
Extracting transfers for Cagliari Calcio...
Extracting transfers for UC Sampdoria...
Extracting transfers for Bologna FC 1909...
Extracting transfers for US Lecce...
Extracting transfers for Atalanta BC...
Extracting transfers for AC Milan...
Extracting transfers for AC Fiorentina...
Extracting transfers for Torino Calcio...
Extracting transfers for SS Lazio...
Extracting transfers for AS Bari...
Extracting transfers for Genoa 1893...
Extracting transfers for AC Parma...
Extracting transfers for Inter Milan...
Extracting transfers for AS Roma...
Extracting transfers for Transfer record...
Season Year: 1990
Transfer data saved to csv file'
          Team Transfer Type           Player Age Nationality Position  \
0  Juventus FC            In   Roberto Baggio  23       Italy       SS   
1  

In [None]:
# Convert the scraped age strings to numbers; values that cannot be parsed are
# coerced to NaN rather than raising.
age_as_number = pd.to_numeric(df_transfers['Age'], errors='coerce')
df_transfers['Age'] = age_as_number

In [None]:
def parse_fee(value):
    """Convert a scraped transfer-fee label into a number.

    Parameters
    ----------
    value : str, float or int
        Raw fee text (e.g. "€1.50m", "Loan fee:€500k", "free transfer") or an
        already-numeric value.

    Returns
    -------
    float, int or str
        * numeric input is returned unchanged (keeps the function idempotent);
        * 0.0 for 'free transfer', 'loan', 'loan transfer' and 'end of loan';
        * the parsed amount for fee strings, including "Loan fee:" entries;
        * the string 'Other' for anything that cannot be interpreted
          (e.g. "?", "-", or a non-string such as None).
    """
    # Already numeric (this includes NaN): nothing to do.
    if isinstance(value, (float, int)):
        return value

    # None and other non-string inputs used to raise AttributeError on .replace().
    if not isinstance(value, str):
        return 'Other'

    cleaned = value.replace('€', '').replace('£', '').replace('$', '').strip()
    lowered = cleaned.lower()

    # Labels describing moves without a fee.
    if lowered in ('free transfer', 'loan', 'loan transfer', 'end of loan'):
        return 0.0

    # "Loan fee:<amount>" (any capitalisation): keep only the amount after the colon.
    if 'loan fee:' in lowered:
        cleaned = cleaned.split(':', 1)[1]

    # Delegate all numeric parsing to parse_numeric_value so that suffix handling is
    # case-insensitive and plain amounts without a k/m suffix are also recognised.
    # The ValueError guard covers text that merely contains an 'm' or 'k'
    # (e.g. "unknown") and would otherwise crash the conversion.
    try:
        return parse_numeric_value(cleaned)
    except ValueError:
        return 'Other'

def parse_numeric_value(value):
    """Convert a money string such as "€1.50m" or "500k" into a float.

    Parameters
    ----------
    value : str, float or int
        Raw text or an already-numeric value.

    Returns
    -------
    float, int or str
        * numeric input is returned unchanged (keeps the function idempotent);
        * for strings: currency symbols are removed, a comma is treated as the
          decimal separator, and an 'm' / 'k' marker (any case) scales the number
          by 1e6 / 1e3;
        * the string 'Other' when the input is not a string or cannot be parsed.
    """
    # Already numeric (this includes NaN): nothing to do.
    if isinstance(value, (float, int)):
        return value

    if not isinstance(value, str):
        return 'Other'

    # Remove currency symbols, replace a decimal comma with a dot, normalise case.
    text = (
        value.replace('€', '')
        .replace('$', '')
        .replace('£', '')
        .replace(',', '.')
        .strip()
        .lower()
    )

    multiplier = 1.0
    if 'm' in text:
        multiplier = 1e6
        text = text.replace('m', '')
    elif 'k' in text:
        multiplier = 1e3
        text = text.replace('k', '')

    # Guard every conversion: text that merely contains an 'm' or 'k'
    # (e.g. "unknown") used to raise ValueError instead of yielding 'Other'.
    try:
        return float(text) * multiplier
    except ValueError:
        return 'Other'

# Run every 'Market Value' entry through parse_numeric_value and store the result
# back in the same column.
market_value_raw = df_transfers['Market Value']
df_transfers['Market Value'] = market_value_raw.apply(parse_numeric_value)



In [None]:
# Normalise the monetary columns in order: 'Fee' via parse_fee, then
# 'Market Value' via parse_numeric_value.
for column_name, parser in (('Fee', parse_fee), ('Market Value', parse_numeric_value)):
    df_transfers[column_name] = df_transfers[column_name].apply(parser)

# Write the cleaned frame to `csv_filename`, without the index column.
df_transfers.to_csv(csv_filename, index=False)

print("Data cleaned and saved to csv file")

Data cleaned and saved to csv file
