## 05 Merge Interim Data of all Players (FBREF and Transfermarkt)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
import sys
sys.path.append("..")
from src.preprocessing.merge_player_data import run_merge_pipeline

merge_result: tuple = run_merge_pipeline("Valencia CF", ["2223", "2324", "2425"])
df_all_players = merge_result[0] if isinstance(merge_result, tuple) else merge_result

print(f"Merge completed successfully!")
print(f"Total merged rows: {df_all_players.shape[0]}")
print(f"Total columns: {df_all_players.shape[1]}")

Attempting fuzzy matching for 51 unmatched players...
Fuzzy match: 'nicolás gonzález' -> 'nico gonzález' (score: 92)
Fuzzy match: 'javier guerra' -> 'javi guerra' (score: 92)
Fuzzy match: 'jesus vazquez' -> 'jesús vázquez' (score: 92)
Fuzzy match: 'emilio bernard' -> 'emilio bernad' (score: 96)
Fuzzy match: 'charlie perez' -> 'charlie pérez' (score: 96)
Fuzzy match: 'javier guerra' -> 'javi guerra' (score: 92)
Fuzzy match: 'jesus vazquez' -> 'jesús vázquez' (score: 92)
Fuzzy match: 'pablo gozalbez' -> 'pablo gozálbez' (score: 96)
Fuzzy match: 'marco camús' -> 'marco camus' (score: 95)
Fuzzy match: 'javier guerra' -> 'javi guerra' (score: 92)
Fuzzy match: 'jesus vazquez' -> 'jesús vázquez' (score: 92)
Dropped Nation_* columns: ['Nation_player_shooting', 'Nation_player_passing', 'Nation_player_passing_types', 'Nation_player_gca', 'Nation_player_defense', 'Nation_player_possession']
Dropped Age columns: ['Age_player_shooting', 'Age_player_passing', 'Age_player_passing_types', 'Age_player_



In [3]:
total_with_market_value = df_all_players['MarketValueEuro'].notna().sum()
print(f"Rows with market value data: {total_with_market_value}/{len(df_all_players)} ({total_with_market_value/len(df_all_players)*100:.1f}%)")

Rows with market value data: 80/114 (70.2%)


In [4]:
print("\nSample merged data:")
sample_cols = ['Player', 'Season', 'MarketValueEuro', 'Age_x', 'Position']
available_cols = [col for col in sample_cols if col in df_all_players.columns]
display(df_all_players[available_cols].head(20))


Sample merged data:


Unnamed: 0,Player,Season,MarketValueEuro,Position
21,Alberto Marí,2223,800000.0,Centre-Forward
43,Alberto Marí,2324,1500000.0,Centre-Forward
109,Alberto Marí,2425,,
18,Carlos Soler,2223,25000000.0,Central Midfield
12,Cenk Özkacar,2223,5000000.0,Centre-Back
35,Cenk Özkacar,2324,4000000.0,Centre-Back
117,Cenk Özkacar,2425,,
78,Charlie Perez,2223,25000.0,Goalkeeper
19,Cristhian Mosquera,2223,1000000.0,Centre-Back
31,Cristhian Mosquera,2324,30000000.0,Centre-Back


In [5]:
# Group by player and sort by season to see all seasons for each player with missing market values
players_with_missing_market_values = df_all_players[df_all_players['MarketValueEuro'].isna()].sort_values(['Player', 'Season'])
display(players_with_missing_market_values[['Player', 'Season', 'Age', 'Position']].head(40))

Unnamed: 0,Player,Season,Age,Position
109,Alberto Marí,2425,23,
117,Cenk Özkacar,2425,23,
76,Cristian,2223,24,
90,Cristian,2324,25,
101,Dani Gómez,2425,26,
108,David Otorbi,2425,16,
70,Domingos André Ribeiro Almeida,2223,22,
86,Domingos André Ribeiro Almeida,2324,23,
97,Domingos André Ribeiro Almeida,2425,24,
74,Francisco Perez,2223,19,


In [6]:
# Remove system/total rows
df_clean = df_all_players[~df_all_players['Player'].str.contains('Total|Squad|Opponent', case=False)]

# Only players with market values
df_with_market_values = df_all_players[df_all_players['MarketValueEuro'].notna()]

In [7]:
df_with_market_values.head(20)

Unnamed: 0,Player,Season,Nation,Nationality,Pos,Position,Age,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,npxG/Sh,G-xG,np:G-xG,Cmp,Att,Cmp%,TotDist,PrgDist,Cmp.1,Att.1,Cmp%.1,Cmp.2,Att.2,Cmp%.2,Cmp.3,Att.3,Cmp%.3,xA,A-xAG,KP,1/3,PPA,CrsPA,Live,Dead,TB,Sw,Crs,TI,CK,In,Out,Str,Off,Blocks,SCA,SCA90,PassLive,PassDead,TO,Fld,Def,GCA,GCA90,PassLive.1,PassDead.1,TO.1,Sh.1,Fld.1,Def.1,Tkl,TklW,Def_3rd,Mid_3rd,Att_3rd,Tkl.1,Tkl%,Lost,Pass,Int,Tkl+Int,Clr,Err,Touches,Def_Pen,Att_Pen,Succ,Succ%,Tkld,Tkld%,Carries,CPA,Mis,Dis,Rec,MarketValueEuro,Current club
21,Alberto Marí,2223,es ESP,Spain,"FW,MF",Centre-Forward,21,5,0,86.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.2,1.0,1.0,6.0,1.05,0.0,1.05,1.05,1.05,0.25,0.0,0.25,0.25,0.25,2.0,1.0,50.0,2.09,1.05,0.5,1.0,10.7,0.0,0.12,0.8,0.8,8.0,20.0,40.0,102.0,7.0,7.0,12.0,58.3,1.0,4.0,25.0,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,30.0,0.0,7.0,1.0,25.0,3.0,75.0,21.0,1.0,2.0,1.0,26.0,800000.0,Real Zaragoza
43,Alberto Marí,2324,es ESP,Spain,FW,Centre-Forward,22,16,2,276.0,3.1,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,1.8,1.2,0.9,2.0,2.0,6.0,13.0,0.33,0.33,0.65,0.0,0.33,0.6,0.28,0.88,0.39,0.67,6.0,2.0,33.3,1.96,0.65,0.0,0.0,11.4,0.0,0.2,-0.8,-1.2,38.0,61.0,62.3,498.0,112.0,22.0,30.0,73.3,11.0,16.0,68.8,2.0,2.0,100.0,0.9,0.1,6.0,3.0,1.0,0.0,56.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0,2.62,6.0,0.0,0.0,1.0,0.0,3.0,0.98,1.0,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.0,1.0,3.0,1.0,100.0,0.0,3.0,0.0,4.0,5.0,0.0,102.0,6.0,15.0,3.0,60.0,2.0,40.0,44.0,2.0,11.0,3.0,62.0,1500000.0,Real Zaragoza
18,Carlos Soler,2223,es ESP,Spain,MF,Central Midfield,25,3,3,242.0,2.7,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.9,0.2,0.2,0.4,3.0,13.0,7.0,0.37,0.0,0.37,0.0,0.0,0.35,0.08,0.43,0.06,0.14,4.0,1.0,25.0,1.49,0.37,0.0,0.0,30.0,3.0,0.04,0.1,-0.2,92.0,123.0,74.8,1388.0,378.0,52.0,59.0,88.1,23.0,28.0,82.1,9.0,24.0,37.5,0.2,-0.2,4.0,5.0,4.0,0.0,102.0,21.0,0.0,1.0,16.0,0.0,11.0,5.0,5.0,0.0,0.0,3.0,6.0,2.23,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,66.7,1.0,1.0,0.0,2.0,2.0,0.0,143.0,2.0,4.0,1.0,50.0,0.0,0.0,83.0,0.0,2.0,1.0,98.0,25000000.0,West Ham United
12,Cenk Özkacar,2223,tr TUR,Türkiye,DF,Centre-Back,21,17,14,1365.0,15.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.8,0.8,0.2,1.0,23.0,61.0,2.0,0.0,0.0,0.0,0.0,0.0,0.05,0.02,0.07,0.05,0.07,11.0,2.0,18.2,0.73,0.13,0.0,0.0,12.8,0.0,0.07,-0.8,-0.8,646.0,769.0,84.0,12014.0,4500.0,274.0,315.0,87.0,308.0,341.0,90.3,60.0,91.0,65.9,0.3,-0.2,4.0,54.0,3.0,1.0,744.0,23.0,0.0,16.0,3.0,2.0,0.0,0.0,0.0,0.0,2.0,8.0,18.0,1.19,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,20.0,17.0,10.0,0.0,14.0,73.7,5.0,12.0,12.0,39.0,47.0,4.0,908.0,85.0,16.0,10.0,100.0,0.0,0.0,522.0,0.0,8.0,4.0,549.0,5000000.0,Real Valladolid CF
35,Cenk Özkacar,2324,tr TUR,Türkiye,DF,Centre-Back,22,23,18,1598.0,17.8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.3,0.3,0.1,0.3,17.0,56.0,5.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.02,6.0,3.0,50.0,0.34,0.17,0.0,0.0,14.3,0.0,0.04,-0.3,-0.3,698.0,860.0,81.2,13108.0,4662.0,265.0,295.0,89.8,364.0,398.0,91.5,58.0,135.0,43.0,0.3,-0.1,1.0,48.0,3.0,0.0,810.0,48.0,1.0,9.0,6.0,21.0,0.0,0.0,0.0,0.0,2.0,15.0,13.0,0.73,12.0,1.0,0.0,0.0,0.0,2.0,0.11,2.0,0.0,0.0,0.0,0.0,0.0,26.0,15.0,22.0,4.0,0.0,15.0,65.2,8.0,11.0,18.0,44.0,84.0,1.0,1035.0,109.0,12.0,3.0,37.5,4.0,50.0,532.0,0.0,4.0,3.0,617.0,4000000.0,Real Valladolid CF
78,Charlie Perez,2223,es ESP,Spain,GK,Goalkeeper,20,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25000.0,Real Unión Club
19,Cristhian Mosquera,2223,es ESP,"['Spain', 'Colombia']",DF,Centre-Back,18,3,1,145.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12,2.0,0.0,0.0,1.24,0.0,0.0,,9.3,0.0,0.1,-0.2,-0.2,53.0,65.0,81.5,828.0,314.0,29.0,33.0,87.9,20.0,23.0,87.0,4.0,9.0,44.4,0.0,0.0,0.0,0.0,0.0,0.0,46.0,19.0,0.0,0.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.62,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,1.0,0.0,4.0,80.0,1.0,1.0,1.0,5.0,5.0,0.0,81.0,11.0,2.0,0.0,,0.0,,38.0,0.0,2.0,0.0,38.0,1000000.0,Valencia CF
31,Cristhian Mosquera,2324,es ESP,"['Spain', 'Colombia']",DF,Centre-Back,19,36,33,3075.0,34.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.7,0.7,0.0,0.7,14.0,77.0,4.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.02,9.0,2.0,22.2,0.26,0.06,0.0,0.0,14.3,0.0,0.07,-0.7,-0.7,1454.0,1646.0,88.3,27551.0,8196.0,500.0,543.0,92.1,808.0,859.0,94.1,134.0,211.0,63.5,0.2,0.0,1.0,45.0,0.0,0.0,1561.0,82.0,0.0,6.0,1.0,18.0,0.0,0.0,0.0,0.0,3.0,13.0,16.0,0.47,15.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,34.0,36.0,18.0,3.0,25.0,64.1,14.0,19.0,36.0,93.0,137.0,2.0,1980.0,274.0,15.0,6.0,85.7,1.0,14.3,1111.0,3.0,15.0,4.0,1240.0,30000000.0,Valencia CF
50,Cristhian Mosquera,2425,es ESP,"['Spain', 'Colombia']",DF,Centre-Back,20,37,37,3319.0,36.9,1.0,0.0,1.0,1.0,0.0,0.0,6.0,0.0,0.4,0.4,0.0,0.4,35.0,123.0,6.0,0.03,0.0,0.03,0.03,0.03,0.01,0.0,0.01,0.01,0.01,5.0,2.0,40.0,0.14,0.05,0.2,0.5,10.5,0.0,0.08,0.6,0.6,1858.0,2053.0,90.5,32827.0,11556.0,766.0,812.0,94.3,947.0,1009.0,93.9,126.0,185.0,68.1,0.3,0.0,0.0,118.0,1.0,0.0,1931.0,117.0,0.0,6.0,2.0,49.0,0.0,0.0,0.0,0.0,5.0,9.0,14.0,0.38,12.0,0.0,0.0,0.0,2.0,1.0,0.03,1.0,0.0,0.0,0.0,0.0,0.0,56.0,36.0,41.0,13.0,2.0,31.0,77.5,9.0,13.0,35.0,91.0,128.0,4.0,2367.0,233.0,7.0,5.0,55.6,4.0,44.4,1401.0,0.0,18.0,10.0,1583.0,30000000.0,
47,César Tárrega,2324,es ESP,Spain,DF,Centre-Back,21,1,0,8.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,,0.0,,0.0,0.0,5.0,5.0,100.0,90.0,22.0,1.0,1.0,100.0,4.0,4.0,100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,2.0,0.0,7.0,2.0,0.0,0.0,,0.0,,3.0,0.0,0.0,0.0,3.0,1500000.0,Valencia CF


NOTE: Current issues: 
- Nation and Nationality contain different values
- Pos and Position are not the same and we have in Pos sometimes multiple values
- We have new players with only one season or missing values