## Simplified Transfermarkt Data Cleaning Pipeline

In [5]:
import sys
sys.path.append("..")

from src.preprocessing.clean_transfermarkt_data import clean_transfermarkt_dataframe, explode_nationalities
from pathlib import Path

# Raw Valencia CF market-value export, addressed relative to the notebook folder.
tm_path = Path("../data/raw/Valencia CF/transfermarkt") / "valencia_market_value_22_25.csv"

# Clean the raw CSV first; the exploded frame is derived from the cleaned one.
df_tm_clean = clean_transfermarkt_dataframe(tm_path)
df_tm_exploded = explode_nationalities(df_tm_clean)

# Bare trailing expression so the notebook renders the resulting frame.
df_tm_exploded

Unnamed: 0,Rank,Name,Position,Age,Nationality,Current club,Season,MarketValueEuro,PrimaryNationality
0,25.0,Giorgi Mamardashvili,Goalkeeper,22,Georgia,Valencia CF,2022,25000000.0,Georgia
1,23.0,Jaume Doménech,Goalkeeper,32,Spain,Valencia CF,2022,1000000.0,Spain
2,1.0,Iago Herrerín,Goalkeeper,35,Spain,Sestao River,2022,500000.0,Spain
3,42.0,Emilio Bernad,Goalkeeper,23,Spain,Racing Ferrol,2022,300000.0,Spain
4,13.0,Cristian Rivero,Goalkeeper,25,Spain,Albacete Balompié,2022,200000.0,Spain
...,...,...,...,...,...,...,...,...,...
124,16.0,Diego López,Right Winger,23,Spain,,2024,15000000.0,Spain
125,23.0,Fran Pérez,Right Winger,22,Spain,,2024,3000000.0,Spain
126,9.0,Hugo Duro,Centre-Forward,25,Spain,,2024,14000000.0,Spain
127,12.0,Umar Sadiq,Centre-Forward,28,Nigeria,,2024,5000000.0,Nigeria


In [None]:
# Save the cleaned Valencia CF frame under data/interim, creating the folder
# hierarchy first if it does not exist yet.
# NOTE(review): this writes df_tm_clean, while the multi-team loop in this
# notebook saves the exploded frame -- confirm which one is intended here.
interim_dir = Path("../data/interim/Valencia CF/transfermarkt")
interim_dir.mkdir(parents=True, exist_ok=True)
output_path = interim_dir / "valencia_market_value_22_25.csv"
df_tm_clean.to_csv(output_path, index=False)

---

# Clean Transfermarkt Data for Multiple Teams

In [3]:
import sys
sys.path.append("..")

from src.preprocessing.clean_transfermarkt_data import clean_transfermarkt_dataframe, explode_nationalities
from pathlib import Path

# Clubs whose raw Transfermarkt exports are cleaned and written to data/interim.
teams_list: list[str] = [
    "Real Madrid CF",
    "FC Barcelona",
    "Atlético Madrid",
    "Sevilla FC",
    "Athletic Club",
    "Villarreal CF",
    "Real Sociedad",
    "Real Betis",
    "Valencia CF",
]

for team_name in teams_list:
    if team_name == "FC Barcelona":
        # The suffix stripping in the other branch leaves a leading "FC "
        # untouched, so this club's file stem is spelled out explicitly.
        raw_stem = "barcelona"
    else:
        # " CF" / " FC" are dropped from the file name only; the folder
        # names below keep the full club name.
        raw_stem = team_name.replace(" CF", "").replace(" FC", "").lower().replace(" ", "_")

    raw_dir = Path(f"../data/raw/{team_name}/transfermarkt")
    input_file_path = raw_dir / f"{raw_stem}_market_values_2020_2024.csv"

    # Clean the raw export, then derive the nationality-exploded frame from it.
    cleaned_dataframe = clean_transfermarkt_dataframe(input_file_path)
    exploded_dataframe = explode_nationalities(cleaned_dataframe)

    # The output file name is built from the full club name (CF/FC included).
    # The folder is created only after cleaning has succeeded.
    interim_dir = Path(f"../data/interim/{team_name}/transfermarkt")
    interim_dir.mkdir(parents=True, exist_ok=True)
    output_file_path = interim_dir / f"{team_name.lower().replace(' ', '_')}_2020_2024.csv"
    exploded_dataframe.to_csv(output_file_path, index=False)