In [1]:
import pandas as pd
import soccerdata as sd
import warnings
import matplotlib.pyplot as plt

import sys
import os
warnings.filterwarnings("ignore")

# Add project root to Python path
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from functions import *

season = ["2023-24","2024-25","2025-26"]
fbref = sd.FBref("Big 5 European Leagues Combined", season)

## Explore Data

In [83]:
#import functions from functions.py
result_df = pd.DataFrame()
temp_df = pd.DataFrame()

for i in ["shooting", "passing","defense", "possession","goal_shot_creation"]:
    globals()[f"{i}_df"] = read_players_data(fbref, i)
    
#forwards = df[df['pos_'].isin(['FW', 'FW,MF'])]

shooting_df = shooting_df[shooting_df['pos_'].isin(['FW', 'FW,MF'])]
shooting_df = shooting_df[shooting_df['90s_'] >= 15]

passing_df = passing_df[passing_df['pos_'].isin(['FW', 'FW,MF'])]
passing_df = passing_df[passing_df['90s_'] >= 15]

defense_df = defense_df[defense_df['pos_'].isin(['FW', 'FW,MF'])]
defense_df = defense_df[defense_df['90s_'] >= 15]

possession_df = possession_df[possession_df['pos_'].isin(['FW', 'FW,MF'])]
possession_df = possession_df[possession_df['90s_'] >= 15]

In [101]:
df = shooting_df.merge(passing_df, on=['player_', 'season_', 'team_'], how='outer', suffixes=('', '_sh'))
df = df.merge(defense_df, on=['player_', 'season_', 'team_'], how='outer', suffixes=('', '_pos'))
data = df.merge(possession_df, on=['player_', 'season_', 'team_'], how='outer', suffixes=('', '_gca'))

feat_90 = ["90s_","Standard_Sh/90", "Standard_SoT/90","Expected_xG", 'Total_Cmp', "Total_Att", "Total_PrgDist",
            "Ast_", "xAG_", "Expected_xA", "KP_","1/3_","PPA_","CrsPA_","PrgP_",
            "Tackles_Tkl","Tackles_TklW","Take-Ons_Att","Take-Ons_Succ","Carries_PrgDist",	"Carries_PrgC",	"Carries_1/3"]

df = data.groupby(['player_','born_'])[feat_90].sum().reset_index()

feat_90 = ["90s_","Expected_xG", 'Total_Cmp', "Total_Att", "Total_PrgDist",
            "Ast_", "xAG_", "Expected_xA", "KP_","1/3_","PPA_","CrsPA_","PrgP_",
            "Tackles_Tkl","Tackles_TklW","Take-Ons_Att","Take-Ons_Succ","Carries_PrgDist",	"Carries_PrgC",	"Carries_1/3"]

df[feat_90] = df[feat_90].div(df['90s_'], axis=0)


In [102]:
features = ["Standard_Sh/90", "Standard_SoT/90", "Expected_xG", 'Total_Cmp', "Total_Att", "Total_PrgDist",
            "Ast_", "xAG_", "Expected_xA", "KP_","1/3_","PPA_","CrsPA_","PrgP_",
            "Tackles_Tkl","Tackles_TklW","Take-Ons_Att","Take-Ons_Succ","Carries_PrgDist",	"Carries_PrgC",	"Carries_1/3"]

X = df[features]
# Handle missing values (simple imputation with median)
X = X.fillna(X.median())

In [103]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
salah_idx = df[df['player_'] == 'Mohamed Salah'].index[0]
salah_vector = X_scaled[salah_idx].reshape(1, -1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(salah_vector, X_scaled)[0]
df['similarity_score'] = similarities

ValueError: Length of values (367) does not match length of index (504)

In [106]:
similar_players = (
    data[data['player_'] != 'Mohamed Salah']
    .sort_values('similarity_score', ascending=False)
    .head(50)
)

similar_players[['player_','team_', 'born_', 'similarity_score']]


KeyError: 'similarity_score'

In [74]:
similar_players

Unnamed: 0,player_,90s_,Standard_Sh/90,Standard_SoT/90,Expected_xG,Total_Cmp,Total_Att,Total_PrgDist,Ast_,xAG_,...,CrsPA_,PrgP_,Tackles_Tkl,Tackles_TklW,Take-Ons_Att,Take-Ons_Succ,Carries_PrgDist,Carries_PrgC,Carries_1/3,similarity_score
390,Ousmane Dembélé,19.2,4.99,2.55,0.864583,43.958333,55.625,191.927083,0.3125,0.427083,...,0.15625,7.135417,0.572917,0.260417,4.947917,2.1875,179.479167,6.614583,4.53125,0.858327
410,Raphinha,31.5,3.55,1.27,0.609524,33.777778,47.52381,156.603175,0.285714,0.403175,...,0.571429,4.285714,1.142857,0.698413,3.269841,1.650794,90.952381,2.984127,2.095238,0.853013
453,Son Heung-min,32.6,2.55,1.17,0.368098,27.208589,32.97546,86.564417,0.306748,0.361963,...,0.06135,4.263804,0.644172,0.398773,2.822086,1.196319,91.687117,3.650307,1.349693,0.851764
112,Deniz Undav,23.2,4.57,2.2,0.646552,25.646552,34.439655,115.474138,0.431034,0.280172,...,0.086207,5.172414,1.206897,0.560345,2.155172,1.163793,62.887931,1.939655,1.551724,0.849898
409,Raphinha,15.2,3.82,1.32,0.486842,34.342105,45.592105,157.697368,0.592105,0.375,...,0.526316,3.75,0.921053,0.526316,3.026316,1.578947,93.289474,3.223684,1.447368,0.834895
63,Bradley Barcola,24.2,3.05,1.57,0.553719,30.950413,37.892562,102.727273,0.413223,0.417355,...,0.165289,3.966942,1.115702,0.702479,5.661157,1.404959,144.834711,5.661157,2.68595,0.830357
264,Kylian Mbappé,17.4,4.7,1.89,0.902299,31.149425,35.862069,73.045977,0.229885,0.316092,...,0.114943,4.655172,0.114943,0.057471,5.632184,2.931034,116.091954,4.195402,2.988506,0.827291
95,Cole Palmer,29.0,3.45,1.28,0.627586,38.655172,48.793103,194.551724,0.37931,0.382759,...,0.172414,6.793103,0.793103,0.413793,3.586207,1.793103,120.413793,4.034483,2.275862,0.826338
6,Ademola Lookman,21.0,3.18,1.14,0.433333,30.238095,39.904762,119.0,0.333333,0.247619,...,0.095238,4.619048,0.952381,0.714286,4.142857,1.857143,93.904762,3.428571,2.0,0.798678
454,Son Heung-min,23.4,2.35,1.02,0.307692,31.196581,39.529915,122.521368,0.384615,0.350427,...,0.25641,4.74359,0.726496,0.384615,3.547009,1.239316,111.666667,4.102564,2.094017,0.788223
