## 05 Merge Interim Data of all Players (FBREF and Transfermarkt)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
import sys
sys.path.append("..")
from src.preprocessing.merge_player_data import run_merge_pipeline

merge_result: tuple = run_merge_pipeline("Valencia CF", ["2223", "2324", "2425"])
df_all_players = merge_result[0] if isinstance(merge_result, tuple) else merge_result

print(f"Merge completed successfully!")
print(f"Total merged rows: {df_all_players.shape[0]}")
print(f"Total columns: {df_all_players.shape[1]}")

Current working directory: /Users/riadanas/football-talent-value-forecast/notebooks
Found project root: /Users/riadanas/football-talent-value-forecast
Trying possible paths: ['/Users/riadanas/football-talent-value-forecast/data', '../data', '../../data', '/Users/riadanas/football-talent-value-forecast/data', '/Users/riadanas/data', 'data']
Checking path: /Users/riadanas/football-talent-value-forecast/data - exists: True
Found data directory: /Users/riadanas/football-talent-value-forecast/data
Looking for FBref directory: /Users/riadanas/football-talent-value-forecast/data/interim/Valencia CF/fbref
Looking for Transfermarkt file: /Users/riadanas/football-talent-value-forecast/data/interim/Valencia CF/transfermarkt/valencia_cf_2020_2024.csv
Attempting fuzzy matching for 53 unmatched players...
Fuzzy match: 'nicolás gonzález' -> 'nico gonzález' (score: 92)
Fuzzy match: 'javier guerra' -> 'javi guerra' (score: 92)
Fuzzy match: 'jesus vazquez' -> 'jesús vázquez' (score: 92)
Fuzzy match: 'em

In [3]:
total_with_market_value = df_all_players['MarketValueEuro'].notna().sum()
print(f"Rows with market value data: {total_with_market_value}/{len(df_all_players)} ({total_with_market_value/len(df_all_players)*100:.1f}%)")

Rows with market value data: 78/114 (68.4%)


In [4]:
print("\nSample merged data:")
sample_cols = ['Player', 'Season', 'MarketValueEuro', 'Age_x', 'Position']
available_cols = [col for col in sample_cols if col in df_all_players.columns]
display(df_all_players[available_cols].head(20))


Sample merged data:


Unnamed: 0,Player,Season,MarketValueEuro,Position
21,Alberto Marí,2223,800000.0,Centre-Forward
43,Alberto Marí,2324,1500000.0,Centre-Forward
64,Alberto Marí,2425,1000000.0,Centre-Forward
18,Carlos Soler,2223,25000000.0,Central Midfield
12,Cenk Özkacar,2223,5000000.0,Centre-Back
35,Cenk Özkacar,2324,4000000.0,Centre-Back
66,Cenk Özkacar,2425,1200000.0,Centre-Back
76,Charlie Perez,2223,25000.0,Goalkeeper
19,Cristhian Mosquera,2223,1000000.0,Centre-Back
31,Cristhian Mosquera,2324,30000000.0,Centre-Back


In [5]:
# Group by player and sort by season to see all seasons for each player with missing market values
players_with_missing_market_values = df_all_players[df_all_players['MarketValueEuro'].isna()].sort_values(['Player', 'Season'])
display(players_with_missing_market_values[['Player', 'Season', 'Age', 'Position']].head(40))

Unnamed: 0,Player,Season,Age,Position
74,Cristian,2223,24,
88,Cristian,2324,25,
103,Dani Gómez,2425,26,
110,David Otorbi,2425,16,
68,Domingos André Ribeiro Almeida,2223,22,
84,Domingos André Ribeiro Almeida,2324,23,
97,Domingos André Ribeiro Almeida,2425,24,
96,Enzo Barrenechea,2425,23,
72,Francisco Perez,2223,19,
82,Francisco Perez,2324,20,


In [6]:
# Remove system/total rows
df_clean = df_all_players[~df_all_players['Player'].str.contains('Total|Squad|Opponent', case=False)]

# Only players with market values
df_with_market_values = df_all_players[df_all_players['MarketValueEuro'].notna()]

In [7]:
df_with_market_values

Unnamed: 0,Player,Season,Nation,Nationality,Pos,Position,Age,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,npxG/Sh,G-xG,np:G-xG,Cmp,Att,Cmp%,TotDist,PrgDist,Cmp.1,Att.1,Cmp%.1,Cmp.2,Att.2,Cmp%.2,Cmp.3,Att.3,Cmp%.3,xA,A-xAG,KP,1/3,PPA,CrsPA,Live,Dead,TB,Sw,Crs,TI,CK,In,Out,Str,Off,Blocks,SCA,SCA90,PassLive,PassDead,TO,Fld,Def,GCA,GCA90,PassLive.1,PassDead.1,TO.1,Sh.1,Fld.1,Def.1,Tkl,TklW,Def_3rd,Mid_3rd,Att_3rd,Tkl.1,Tkl%,Lost,Pass,Int,Tkl+Int,Clr,Err,Touches,Def_Pen,Att_Pen,Succ,Succ%,Tkld,Tkld%,Carries,CPA,Mis,Dis,Rec,MarketValueEuro,Current club
21,Alberto Marí,2223,es ESP,Spain,"FW,MF",Centre-Forward,21,5,0,86.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.2,1.0,1.0,6.0,1.05,0.0,1.05,1.05,1.05,0.25,0.0,0.25,0.25,0.25,2.0,1.0,50.0,2.09,1.05,0.5,1.0,10.7,0.0,0.12,0.8,0.8,8.0,20.0,40.0,102.0,7.0,7.0,12.0,58.3,1.0,4.0,25.0,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,30.0,0.0,7.0,1.0,25.0,3.0,75.0,21.0,1.0,2.0,1.0,26.0,800000.0,Valencia CF
43,Alberto Marí,2324,es ESP,Spain,FW,Centre-Forward,22,16,2,276.0,3.1,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,1.8,1.2,0.9,2.0,2.0,6.0,13.0,0.33,0.33,0.65,0.0,0.33,0.6,0.28,0.88,0.39,0.67,6.0,2.0,33.3,1.96,0.65,0.0,0.0,11.4,0.0,0.2,-0.8,-1.2,38.0,61.0,62.3,498.0,112.0,22.0,30.0,73.3,11.0,16.0,68.8,2.0,2.0,100.0,0.9,0.1,6.0,3.0,1.0,0.0,56.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0,2.62,6.0,0.0,0.0,1.0,0.0,3.0,0.98,1.0,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.0,1.0,3.0,1.0,100.0,0.0,3.0,0.0,4.0,5.0,0.0,102.0,6.0,15.0,3.0,60.0,2.0,40.0,44.0,2.0,11.0,3.0,62.0,1500000.0,Valencia CF
64,Alberto Marí,2425,es ESP,Spain,FW,Centre-Forward,23,1,0,7.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,,0.0,,0.0,0.0,2.0,4.0,50.0,33.0,7.0,1.0,1.0,100.0,1.0,1.0,100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12.86,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,,0.0,,2.0,0.0,0.0,0.0,4.0,1000000.0,Valencia CF
18,Carlos Soler,2223,es ESP,Spain,MF,Central Midfield,25,3,3,242.0,2.7,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.9,0.2,0.2,0.4,3.0,13.0,7.0,0.37,0.0,0.37,0.0,0.0,0.35,0.08,0.43,0.06,0.14,4.0,1.0,25.0,1.49,0.37,0.0,0.0,30.0,3.0,0.04,0.1,-0.2,92.0,123.0,74.8,1388.0,378.0,52.0,59.0,88.1,23.0,28.0,82.1,9.0,24.0,37.5,0.2,-0.2,4.0,5.0,4.0,0.0,102.0,21.0,0.0,1.0,16.0,0.0,11.0,5.0,5.0,0.0,0.0,3.0,6.0,2.23,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,66.7,1.0,1.0,0.0,2.0,2.0,0.0,143.0,2.0,4.0,1.0,50.0,0.0,0.0,83.0,0.0,2.0,1.0,98.0,25000000.0,Valencia CF
12,Cenk Özkacar,2223,tr TUR,Türkiye,DF,Centre-Back,21,17,14,1365.0,15.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.8,0.8,0.2,1.0,23.0,61.0,2.0,0.0,0.0,0.0,0.0,0.0,0.05,0.02,0.07,0.05,0.07,11.0,2.0,18.2,0.73,0.13,0.0,0.0,12.8,0.0,0.07,-0.8,-0.8,646.0,769.0,84.0,12014.0,4500.0,274.0,315.0,87.0,308.0,341.0,90.3,60.0,91.0,65.9,0.3,-0.2,4.0,54.0,3.0,1.0,744.0,23.0,0.0,16.0,3.0,2.0,0.0,0.0,0.0,0.0,2.0,8.0,18.0,1.19,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,20.0,17.0,10.0,0.0,14.0,73.7,5.0,12.0,12.0,39.0,47.0,4.0,908.0,85.0,16.0,10.0,100.0,0.0,0.0,522.0,0.0,8.0,4.0,549.0,5000000.0,Valencia CF
35,Cenk Özkacar,2324,tr TUR,Türkiye,DF,Centre-Back,22,23,18,1598.0,17.8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.3,0.3,0.1,0.3,17.0,56.0,5.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.02,6.0,3.0,50.0,0.34,0.17,0.0,0.0,14.3,0.0,0.04,-0.3,-0.3,698.0,860.0,81.2,13108.0,4662.0,265.0,295.0,89.8,364.0,398.0,91.5,58.0,135.0,43.0,0.3,-0.1,1.0,48.0,3.0,0.0,810.0,48.0,1.0,9.0,6.0,21.0,0.0,0.0,0.0,0.0,2.0,15.0,13.0,0.73,12.0,1.0,0.0,0.0,0.0,2.0,0.11,2.0,0.0,0.0,0.0,0.0,0.0,26.0,15.0,22.0,4.0,0.0,15.0,65.2,8.0,11.0,18.0,44.0,84.0,1.0,1035.0,109.0,12.0,3.0,37.5,4.0,50.0,532.0,0.0,4.0,3.0,617.0,4000000.0,Valencia CF
66,Cenk Özkacar,2425,tr TUR,Türkiye,DF,Centre-Back,23,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1200000.0,Valencia CF
76,Charlie Perez,2223,es ESP,Spain,GK,Goalkeeper,20,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25000.0,Valencia CF
19,Cristhian Mosquera,2223,es ESP,Spain,DF,Centre-Back,18,3,1,145.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12,2.0,0.0,0.0,1.24,0.0,0.0,,9.3,0.0,0.1,-0.2,-0.2,53.0,65.0,81.5,828.0,314.0,29.0,33.0,87.9,20.0,23.0,87.0,4.0,9.0,44.4,0.0,0.0,0.0,0.0,0.0,0.0,46.0,19.0,0.0,0.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,1.0,0.0,4.0,80.0,1.0,1.0,1.0,5.0,5.0,0.0,81.0,11.0,2.0,0.0,,0.0,,38.0,0.0,2.0,0.0,38.0,1000000.0,Valencia CF
31,Cristhian Mosquera,2324,es ESP,Spain,DF,Centre-Back,19,36,33,3075.0,34.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.7,0.7,0.0,0.7,14.0,77.0,4.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.02,9.0,2.0,22.2,0.26,0.06,0.0,0.0,14.3,0.0,0.07,-0.7,-0.7,1454.0,1646.0,88.3,27551.0,8196.0,500.0,543.0,92.1,808.0,859.0,94.1,134.0,211.0,63.5,0.2,0.0,1.0,45.0,0.0,0.0,1561.0,82.0,0.0,6.0,1.0,18.0,0.0,0.0,0.0,0.0,3.0,13.0,16.0,0.47,15.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,34.0,36.0,18.0,3.0,25.0,64.1,14.0,19.0,36.0,93.0,137.0,2.0,1980.0,274.0,15.0,6.0,85.7,1.0,14.3,1111.0,3.0,15.0,4.0,1240.0,30000000.0,Valencia CF


NOTE:
- Nation and Nationality contain different values
- Pos and Position are not the same and we have in Pos sometimes multiple values 
- We have new players with only one season or missing values 
    * it's best to look at players with 2-3+ seasons, so we'll skip these

In [8]:
#### Let's use this column instead of 'Pos' since it's cleaner
df_with_market_values['Position'].value_counts()

Position
Centre-Back           16
Centre-Forward        12
Central Midfield       9
Goalkeeper             9
Right Winger           7
Right-Back             6
Attacking Midfield     5
Left Winger            5
Defensive Midfield     4
Left-Back              4
Second Striker         1
Name: count, dtype: int64

In [9]:
df_with_market_values['Nation'].value_counts().head(3)

Nation
es ESP    48
fr FRA     4
br BRA     4
Name: count, dtype: int64

In [10]:
df_with_market_values['Nation'] = df_with_market_values['Nation'].str[-3:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_market_values['Nation'] = df_with_market_values['Nation'].str[-3:]


In [11]:
print(df_with_market_values.shape)
df_with_market_values.head(10)

(78, 122)


Unnamed: 0,Player,Season,Nation,Nationality,Pos,Position,Age,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,npxG/Sh,G-xG,np:G-xG,Cmp,Att,Cmp%,TotDist,PrgDist,Cmp.1,Att.1,Cmp%.1,Cmp.2,Att.2,Cmp%.2,Cmp.3,Att.3,Cmp%.3,xA,A-xAG,KP,1/3,PPA,CrsPA,Live,Dead,TB,Sw,Crs,TI,CK,In,Out,Str,Off,Blocks,SCA,SCA90,PassLive,PassDead,TO,Fld,Def,GCA,GCA90,PassLive.1,PassDead.1,TO.1,Sh.1,Fld.1,Def.1,Tkl,TklW,Def_3rd,Mid_3rd,Att_3rd,Tkl.1,Tkl%,Lost,Pass,Int,Tkl+Int,Clr,Err,Touches,Def_Pen,Att_Pen,Succ,Succ%,Tkld,Tkld%,Carries,CPA,Mis,Dis,Rec,MarketValueEuro,Current club
21,Alberto Marí,2223,ESP,Spain,"FW,MF",Centre-Forward,21,5,0,86.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.2,1.0,1.0,6.0,1.05,0.0,1.05,1.05,1.05,0.25,0.0,0.25,0.25,0.25,2.0,1.0,50.0,2.09,1.05,0.5,1.0,10.7,0.0,0.12,0.8,0.8,8.0,20.0,40.0,102.0,7.0,7.0,12.0,58.3,1.0,4.0,25.0,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,30.0,0.0,7.0,1.0,25.0,3.0,75.0,21.0,1.0,2.0,1.0,26.0,800000.0,Valencia CF
43,Alberto Marí,2324,ESP,Spain,FW,Centre-Forward,22,16,2,276.0,3.1,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,1.8,1.2,0.9,2.0,2.0,6.0,13.0,0.33,0.33,0.65,0.0,0.33,0.6,0.28,0.88,0.39,0.67,6.0,2.0,33.3,1.96,0.65,0.0,0.0,11.4,0.0,0.2,-0.8,-1.2,38.0,61.0,62.3,498.0,112.0,22.0,30.0,73.3,11.0,16.0,68.8,2.0,2.0,100.0,0.9,0.1,6.0,3.0,1.0,0.0,56.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0,2.62,6.0,0.0,0.0,1.0,0.0,3.0,0.98,1.0,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.0,1.0,3.0,1.0,100.0,0.0,3.0,0.0,4.0,5.0,0.0,102.0,6.0,15.0,3.0,60.0,2.0,40.0,44.0,2.0,11.0,3.0,62.0,1500000.0,Valencia CF
64,Alberto Marí,2425,ESP,Spain,FW,Centre-Forward,23,1,0,7.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,,0.0,,0.0,0.0,2.0,4.0,50.0,33.0,7.0,1.0,1.0,100.0,1.0,1.0,100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12.86,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,,0.0,,2.0,0.0,0.0,0.0,4.0,1000000.0,Valencia CF
18,Carlos Soler,2223,ESP,Spain,MF,Central Midfield,25,3,3,242.0,2.7,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.9,0.2,0.2,0.4,3.0,13.0,7.0,0.37,0.0,0.37,0.0,0.0,0.35,0.08,0.43,0.06,0.14,4.0,1.0,25.0,1.49,0.37,0.0,0.0,30.0,3.0,0.04,0.1,-0.2,92.0,123.0,74.8,1388.0,378.0,52.0,59.0,88.1,23.0,28.0,82.1,9.0,24.0,37.5,0.2,-0.2,4.0,5.0,4.0,0.0,102.0,21.0,0.0,1.0,16.0,0.0,11.0,5.0,5.0,0.0,0.0,3.0,6.0,2.23,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,66.7,1.0,1.0,0.0,2.0,2.0,0.0,143.0,2.0,4.0,1.0,50.0,0.0,0.0,83.0,0.0,2.0,1.0,98.0,25000000.0,Valencia CF
12,Cenk Özkacar,2223,TUR,Türkiye,DF,Centre-Back,21,17,14,1365.0,15.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.8,0.8,0.2,1.0,23.0,61.0,2.0,0.0,0.0,0.0,0.0,0.0,0.05,0.02,0.07,0.05,0.07,11.0,2.0,18.2,0.73,0.13,0.0,0.0,12.8,0.0,0.07,-0.8,-0.8,646.0,769.0,84.0,12014.0,4500.0,274.0,315.0,87.0,308.0,341.0,90.3,60.0,91.0,65.9,0.3,-0.2,4.0,54.0,3.0,1.0,744.0,23.0,0.0,16.0,3.0,2.0,0.0,0.0,0.0,0.0,2.0,8.0,18.0,1.19,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,20.0,17.0,10.0,0.0,14.0,73.7,5.0,12.0,12.0,39.0,47.0,4.0,908.0,85.0,16.0,10.0,100.0,0.0,0.0,522.0,0.0,8.0,4.0,549.0,5000000.0,Valencia CF
35,Cenk Özkacar,2324,TUR,Türkiye,DF,Centre-Back,22,23,18,1598.0,17.8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.3,0.3,0.1,0.3,17.0,56.0,5.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.02,6.0,3.0,50.0,0.34,0.17,0.0,0.0,14.3,0.0,0.04,-0.3,-0.3,698.0,860.0,81.2,13108.0,4662.0,265.0,295.0,89.8,364.0,398.0,91.5,58.0,135.0,43.0,0.3,-0.1,1.0,48.0,3.0,0.0,810.0,48.0,1.0,9.0,6.0,21.0,0.0,0.0,0.0,0.0,2.0,15.0,13.0,0.73,12.0,1.0,0.0,0.0,0.0,2.0,0.11,2.0,0.0,0.0,0.0,0.0,0.0,26.0,15.0,22.0,4.0,0.0,15.0,65.2,8.0,11.0,18.0,44.0,84.0,1.0,1035.0,109.0,12.0,3.0,37.5,4.0,50.0,532.0,0.0,4.0,3.0,617.0,4000000.0,Valencia CF
66,Cenk Özkacar,2425,TUR,Türkiye,DF,Centre-Back,23,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1200000.0,Valencia CF
76,Charlie Perez,2223,ESP,Spain,GK,Goalkeeper,20,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25000.0,Valencia CF
19,Cristhian Mosquera,2223,ESP,Spain,DF,Centre-Back,18,3,1,145.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12,2.0,0.0,0.0,1.24,0.0,0.0,,9.3,0.0,0.1,-0.2,-0.2,53.0,65.0,81.5,828.0,314.0,29.0,33.0,87.9,20.0,23.0,87.0,4.0,9.0,44.4,0.0,0.0,0.0,0.0,0.0,0.0,46.0,19.0,0.0,0.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,1.0,0.0,4.0,80.0,1.0,1.0,1.0,5.0,5.0,0.0,81.0,11.0,2.0,0.0,,0.0,,38.0,0.0,2.0,0.0,38.0,1000000.0,Valencia CF
31,Cristhian Mosquera,2324,ESP,Spain,DF,Centre-Back,19,36,33,3075.0,34.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.7,0.7,0.0,0.7,14.0,77.0,4.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.02,9.0,2.0,22.2,0.26,0.06,0.0,0.0,14.3,0.0,0.07,-0.7,-0.7,1454.0,1646.0,88.3,27551.0,8196.0,500.0,543.0,92.1,808.0,859.0,94.1,134.0,211.0,63.5,0.2,0.0,1.0,45.0,0.0,0.0,1561.0,82.0,0.0,6.0,1.0,18.0,0.0,0.0,0.0,0.0,3.0,13.0,16.0,0.47,15.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,34.0,36.0,18.0,3.0,25.0,64.1,14.0,19.0,36.0,93.0,137.0,2.0,1980.0,274.0,15.0,6.0,85.7,1.0,14.3,1111.0,3.0,15.0,4.0,1240.0,30000000.0,Valencia CF


In [12]:
df_with_market_values['Nation'].value_counts().head(3)

Nation
ESP    48
FRA     4
BRA     4
Name: count, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df_with_market_values['Nation'] = le.fit_transform(df_with_market_values['Nation'])
df_with_market_values['Position'] = le.fit_transform(df_with_market_values['Position'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_market_values['Nation'] = le.fit_transform(df_with_market_values['Nation'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_market_values['Position'] = le.fit_transform(df_with_market_values['Position'])


In [15]:
correlation_with_market_value = df_with_market_values.select_dtypes(include=['int64', 'float64']).corrwith(df_with_market_values['MarketValueEuro'])

print(correlation_with_market_value.sort_values(ascending=False).head(11))

MarketValueEuro    1.000000
Starts             0.712750
90s                0.685461
Min                0.685270
MP                 0.615856
Cmp.3              0.607253
PrgDist            0.596705
Att.3              0.591311
Carries            0.586538
TotDist            0.580317
Att                0.570538
dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


### Top 10 Features Most Correlated with Market Value

- **Starts (0.718)** — Indicates how often a player is trusted to start matches; more starts = more value.
- **90s (0.693)** — Reflects total playing time in 90-minute equivalents; regular play boosts visibility and importance.
- **Min (0.693)** — Total minutes played; more minutes typically means more experience and trust from coaches.
- **Cmp.3 (0.645)** — Completed passes in the attacking third; shows involvement in crucial offensive phases.
- **PrgDist (0.642)** — Total progressive carrying distance; highlights ability to move the ball forward effectively.
- **Att.3 (0.638)** — Passes attempted in the attacking third; shows a player’s role in build-up and attacking moves.
- **Def_Pen (0.621)** — Defensive actions inside the penalty area; key for evaluating defenders’ last-line contributions.
- **MP (0.610)** — Matches played; simple yet powerful indicator of consistency and fitness.
- **TotDist (0.592)** — Total distance covered while carrying the ball; proxies physical activity and involvement.
- **Carries (0.568)** — Number of ball carries; emphasizes players who actively retain and progress possession.


- Nation and Position don't have an impact on Market Value

### Interrogations:
- When scraping data from other teams to build a regression model to predict a player's next year's market value, I believe we should consider the ranking of the team. 
- A player for Valencia might not be value the same as a Real Madrid player even with the same performance.
- That's because Real Madrid plays bigger competitions and is more likely to win titles which boosts the player's exposure and therefore the market price.
- number of competitions played as well

## Other spanish teams to scrape for ML:

- https://fbref.com/en/squads/53a2f082/2024-2025/Real-Madrid-Stats
- https://fbref.com/en/squads/206d90db/Barcelona-Stats
- https://fbref.com/en/squads/ad2be733/Sevilla-Stats
- https://fbref.com/en/squads/db3b9613/Atletico-Madrid-Stats
- https://fbref.com/en/squads/2b390eca/Athletic-Club-Stats
- https://fbref.com/en/squads/2a8183b3/Villarreal-Stats
- https://fbref.com/en/squads/e31d1cd9/Real-Sociedad-Stats
- https://fbref.com/en/squads/fc536746/Real-Betis-Stats

---

# Merge all player data

In [None]:
import sys
import os
from pathlib import Path
sys.path.append("..")
from src.preprocessing.merge_player_data import run_merge_pipeline

teams_list: list[str] = [
    "Real Madrid CF",
    "FC Barcelona", 
    "Atlético Madrid",
    "Sevilla FC", 
    "Athletic Club",
    "Villarreal CF",
    "Real Sociedad",
    "Real Betis",
    "Valencia CF",
]

seasons_list: list[str] = ["2223", "2324", "2425"]

for team_name in teams_list:
    print(f"Processing {team_name}...")
    
    # Create directory structure
    team_directory: Path = Path(f"../data/interim/{team_name}/merged")
    team_directory.mkdir(parents=True, exist_ok=True)
    
    # Run merge pipeline for current team
    merge_result: tuple = run_merge_pipeline(team_name, seasons_list)
    df_all_players = merge_result[0] if isinstance(merge_result, tuple) else merge_result
    
    print(f"Merge completed for {team_name}!")
    print(f"Total merged rows: {df_all_players.shape[0]}")
    print(f"Total columns: {df_all_players.shape[1]}")
    
    # Remove system/total rows
    player_column_filled = df_all_players['Player'].fillna('Unknown')
    df_clean = df_all_players[~player_column_filled.str.contains('Total|Squad|Opponent', case=False)]

    # Only players with market values
    df_clean = df_clean[df_clean['MarketValueEuro'].notna()]
    
    # Clean nation column
    df_clean.loc[:, 'Nation'] = df_clean['Nation'].str.extract(r'([A-Z]{3})$').iloc[:, 0]

    # Save processed data
    clean_filename: str = f"{team_name.replace(' ', '_').lower()}_clean.csv"
    
    df_clean.to_csv(team_directory / clean_filename, index=False)
    
    print(f"Saved {clean_filename} to {team_directory}")
    print("-" * 50)

print("All teams processed successfully!")



Processing Real Madrid CF...
Current working directory: /Users/markuskuehnle/Documents/projects/talent-value-forecast/notebooks
Found project root: /Users/markuskuehnle/Documents/projects/talent-value-forecast
Trying possible paths: ['/Users/markuskuehnle/Documents/projects/talent-value-forecast/data', '../data', '../../data', '/Users/markuskuehnle/Documents/projects/talent-value-forecast/data', '/Users/markuskuehnle/Documents/projects/data', 'data']
Checking path: /Users/markuskuehnle/Documents/projects/talent-value-forecast/data - exists: True
Found data directory: /Users/markuskuehnle/Documents/projects/talent-value-forecast/data
Looking for FBref directory: /Users/markuskuehnle/Documents/projects/talent-value-forecast/data/interim/Real Madrid CF/fbref
Looking for Transfermarkt file: /Users/markuskuehnle/Documents/projects/talent-value-forecast/data/interim/Real Madrid CF/transfermarkt/real_madrid_cf_2020_2024.csv
Attempting fuzzy matching for 41 unmatched players...
Fuzzy match: 'v

---

# Concatenate all cleaned team datasets

In [None]:
# Concatenate all cleaned team datasets
import pandas as pd
from pathlib import Path

# Find all clean CSV files in the merged directories
merged_data_path: Path = Path("../data/interim")
clean_files_pattern: str = "*/merged/*_clean.csv"

all_clean_files: list[Path] = list(merged_data_path.glob(clean_files_pattern))

print(f"Found {len(all_clean_files)} clean CSV files:")
for file_path in all_clean_files:
    print(f"  - {file_path}")

# Load and concatenate all datasets
team_dataframes: list[pd.DataFrame] = []
for file_path in all_clean_files:
    team_df: pd.DataFrame = pd.read_csv(file_path)
    team_name: str = file_path.parent.name  # Extract team name from directory
    team_dataframes.append(team_df)

# Concatenate all dataframes
combined_dataset: pd.DataFrame = pd.concat(team_dataframes, ignore_index=True)

print(f"\nCombined dataset shape: {combined_dataset.shape}")
print(f"Seasons included: {sorted(combined_dataset['Season'].unique())}")

# Save combined dataset
output_path: Path = Path("../data/interim/datasets_combined")
output_path.mkdir(parents=True, exist_ok=True)
combined_filename: str = "all_teams_combined.csv"
combined_dataset: pd.DataFrame = combined_dataset.drop(columns=['Nation'])
combined_dataset.to_csv(output_path / combined_filename, index=False)

print(f"\nCombined dataset saved to: {output_path / combined_filename}")

Found 9 clean CSV files:
  - ../data/interim/Atlético Madrid/merged/atlético_madrid_clean.csv
  - ../data/interim/Valencia CF/merged/valencia_cf_clean.csv
  - ../data/interim/Sevilla FC/merged/sevilla_fc_clean.csv
  - ../data/interim/Villarreal CF/merged/villarreal_cf_clean.csv
  - ../data/interim/Real Madrid CF/merged/real_madrid_cf_clean.csv
  - ../data/interim/Real Betis/merged/real_betis_clean.csv
  - ../data/interim/FC Barcelona/merged/fc_barcelona_clean.csv
  - ../data/interim/Athletic Club/merged/athletic_club_clean.csv
  - ../data/interim/Real Sociedad/merged/real_sociedad_clean.csv

Combined dataset shape: (624, 122)
Seasons included: [np.int64(2223), np.int64(2324), np.int64(2425)]

Combined dataset saved to: ../data/interim/datasets_combined/all_teams_combined.csv


In [None]:
combined_dataset.head(30)

Unnamed: 0,Player,Season,Nationality,Pos,Position,Age,MP,Starts,Min,90s,...,Succ%,Tkld,Tkld%,Carries,CPA,Mis,Dis,Rec,MarketValueEuro,Current club
0,Abdellah Raihani,2324,Morocco,FW,Centre-Forward,19,1,0,10.0,0.1,...,,0.0,,1.0,0.0,1.0,0.0,1.0,300000.0,Atlético Madrid
1,Adrian Niño,2324,Spain,FW,Centre-Forward,19,0,0,,,...,,,,,,,,,500000.0,Atlético Madrid
2,Aitor Gismera,2324,Spain,MF,Central Midfield,19,0,0,,,...,,,,,,,,,250000.0,Atlético Madrid
3,Antoine Griezmann,2324,France,"FW,MF",Centre-Forward,32,33,31,2644.0,29.4,...,57.9,15.0,39.5,839.0,7.0,40.0,26.0,1194.0,25000000.0,Atlético Madrid
4,Antonio Gomis,2324,Spain,GK,Goalkeeper,20,0,0,,,...,,,,,,,,,150000.0,Atlético Madrid
5,Arthur Vermeeren,2324,Belgium,MF,Defensive Midfield,18,5,2,162.0,1.8,...,0.0,1.0,100.0,68.0,0.0,6.0,2.0,75.0,25000000.0,Atlético Madrid
6,Axel Witsel,2324,Belgium,DF,Centre-Back,34,35,30,2783.0,30.9,...,62.5,3.0,37.5,1364.0,0.0,14.0,6.0,1467.0,3500000.0,Atlético Madrid
7,César Azpilicueta,2324,Spain,DF,Centre-Back,33,25,14,1423.0,15.8,...,16.7,3.0,50.0,624.0,3.0,10.0,6.0,752.0,2000000.0,Atlético Madrid
8,Gabriel Paulista,2324,Brazil,DF,Centre-Back,32,5,5,405.0,4.5,...,,0.0,,187.0,0.0,0.0,1.0,212.0,2000000.0,Atlético Madrid
9,Horaţiu Moldovan,2324,Romania,GK,Goalkeeper,25,0,0,,,...,,,,,,,,,2500000.0,Atlético Madrid


NOTE: Data cleaning ideas for next iteration
 
Nationality field needs work:
- Maybe use FIFA country codes? More standardized

Position data is messy:
- POS column has multiple values separated by commas
    * Could split into separate columns or create boolean flags
- Position column has "Centre-Forward" vs "Center Forward" spelling differences
    * Should group into Attack/Midfield/Defense/Goalkeeper categories

Missing values everywhere:
- Need to figure out imputation strategy based on position/age
- Or just flag missing data for separate analysis
- Some players have no stats but high market values - need to investigate