In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the processed player table from the sibling player-performance project
# (the path is relative to this notebook's location).
df = pd.read_csv(
    filepath_or_buffer="../../player-performance/data/processed/players_full_2425_with_score.csv"
)


In [3]:
MIN_90S = 5  # minimum number of full-match equivalents a player must have

if "90s" not in df.columns:
    # Without the column there is nothing to filter on; keep every row.
    print("[MIN FILTER] Column '90s' not found. Skipping minutes filter.")
else:
    before = len(df)
    # Missing values are treated as zero, so those rows fall below the threshold.
    enough_minutes = df["90s"].fillna(0).ge(MIN_90S)
    df = df.loc[enough_minutes].copy()
    after = len(df)
    print(f"[MIN FILTER] 90s >= {MIN_90S}: {before} -> {after}")


[MIN FILTER] 90s >= 5: 2854 -> 1987


In [4]:
# The previous cell already applies the MIN_90S filter (and tolerates a missing
# "90s" column). Re-declaring the threshold and filtering again here would be
# redundant and would raise KeyError when the column is absent, so this cell
# only verifies the filter and shows the resulting shape.
if "90s" in df.columns:
    assert df["90s"].fillna(0).ge(MIN_90S).all(), "minutes filter has not been applied"
df.shape


(1987, 324)

In [5]:
# Display the (rows, columns) shape of the filtered frame.
# NOTE(review): the previous cell already ends with the same expression; this
# cell looks redundant and is a candidate for removal.
df.shape


(1987, 324)

In [6]:
# Preview the identifying columns of the filtered frame. Only the last
# expression of a cell is displayed, so a bare `df.shape` before it would be a
# no-op and has been dropped.
df[["Player", "Squad", "Pos"]].head()


Unnamed: 0,Player,Squad,Pos
5,Yunis Abdelhamid,Saint-Étienne,DF
6,Himad Abdelli,Angers,"MF,FW"
7,Mohamed Abdelmoneim,Nice,DF
8,Ali Abdi,Nice,"DF,MF"
10,Abel,Osasuna,DF


In [7]:
# Columns that make up each player's similarity profile.
sim_features = [
    # context
    "Age",
    "90s",
    # goals / assists per 90
    "Gls_90",
    "Ast_90",
    "G+A_90",
    "G-PK_90",
    "G+A-PK_90",
    # expected goals / assists per 90
    "xG_90",
    "xAG_90",
    "xG+xAG_90",
    "npxG_90",
    "npxG+xAG_90",
    # progression columns
    "PrgC",
    "PrgP",
    "PrgR",
]

# Identifier columns expected downstream; any that are absent get a placeholder.
id_cols = ["Player", "Squad", "Pos", "Comp"]

# Keep only players with a complete feature vector.
df_sim = df.dropna(subset=sim_features).copy()

missing_id_cols = [col for col in id_cols if col not in df_sim.columns]
for col in missing_id_cols:
    df_sim[col] = "N/A"

df_sim.shape


(1984, 324)

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Feature matrix: one row per player in df_sim, one column per similarity feature.
X = df_sim[sim_features].to_numpy()

# Create both transformers up front, then apply them in sequence.
scaler = StandardScaler()
pca = PCA(n_components=8, random_state=42)

X_scaled = scaler.fit_transform(X)   # standardise every feature column
X_pca = pca.fit_transform(X_scaled)  # project onto the 8 principal components

# Fraction of the total variance retained by the kept components.
np.sum(pca.explained_variance_ratio_)


np.float64(0.9893714098014912)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of players in PCA space.
# With Y left as None the rows of X are compared against each other, so
# row/column i of the result lines up with the i-th row of df_sim.
sim_matrix = cosine_similarity(X=X_pca, Y=None)
sim_matrix.shape


(1984, 1984)

In [10]:
import numpy as np
import pandas as pd

def find_similar_players(player_name, top_n=10, same_position=False):
    """Return the players whose profile is most similar to ``player_name``.

    Player rows come from the module-level ``df_sim`` and similarities from the
    module-level ``sim_matrix``; row/column ``i`` of ``sim_matrix`` must
    correspond to the ``i``-th row (by position) of ``df_sim``.

    Parameters
    ----------
    player_name : str
        Full player name. Matching is case-insensitive but must be exact. If
        several rows share the name, the first one is used as the query.
    top_n : int, default 10
        Maximum number of similar players to return. Values <= 0 give an
        empty result.
    same_position : bool, default False
        If True, keep only players whose ``Pos`` value is identical to the
        query player's ``Pos`` value.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``Player``, ``Squad``, ``Pos`` and ``Similarity``
        sorted from most to least similar. When no exact name match exists, a
        "Player not found" message listing up to 10 names that contain
        ``player_name`` as a substring is returned instead.
    """
    names = df_sim["Player"].astype(str).str.lower()
    query = player_name.lower()
    exact = (names == query).to_numpy()

    if not exact.any():
        # regex=False: the query is a literal name fragment, not a pattern, so
        # characters such as "(" or "." must not be interpreted as regex syntax.
        contains = names.str.contains(query, na=False, regex=False)
        matches = df_sim.loc[contains, ["Player", "Squad", "Pos"]].head(10)
        return "Player not found. Closest matches:\n" + matches.to_string(index=False)

    # Positional row of the first exact match; sim_matrix is indexed by
    # position, not by df_sim's (non-contiguous) index labels.
    idx = int(np.flatnonzero(exact)[0])
    player_pos = df_sim["Pos"].iloc[idx]

    sims = sim_matrix[idx]
    ranked = np.argsort(sims)[::-1]  # most similar first

    # Drop the query player itself and, optionally, other positions.
    keep = ranked != idx
    if same_position:
        same_pos = (df_sim["Pos"] == player_pos).to_numpy()
        keep &= same_pos[ranked]

    limit = max(int(top_n), 0)
    top = ranked[keep][:limit]

    out = df_sim[["Player", "Squad", "Pos"]].iloc[top].reset_index(drop=True)
    out["Similarity"] = sims[top].astype(float)
    return out


In [11]:
# Ten most similar profiles to Raphinha, restricted to players whose "Pos"
# value is identical to his.
find_similar_players("Raphinha", top_n=10, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Christian Pulisic,Milan,"FW,MF",0.964009
1,Dennis Man,Parma,"FW,MF",0.958331
2,Florian Thauvin,Udinese,"FW,MF",0.946124
3,João Pedro,Brighton,"FW,MF",0.945169
4,Evann Guessand,Nice,"FW,MF",0.935625
5,Michael Olise,Bayern Munich,"FW,MF",0.929874
6,Álex Berenguer,Athletic Club,"FW,MF",0.920793
7,Serge Gnabry,Bayern Munich,"FW,MF",0.919502
8,Brahim Díaz,Real Madrid,"FW,MF",0.918731
9,Jarrod Bowen,West Ham,"FW,MF",0.918151


In [12]:
# Surname only: the lookup needs an exact (case-insensitive) match on the
# "Player" value, so this returns the "Player not found" message with
# substring-based suggestions instead of a result table.
find_similar_players("Bellingham", top_n=10, same_position=True)


'Player not found. Closest matches:\n         Player       Squad Pos\nJude Bellingham Real Madrid  MF'

In [13]:
# Same query using the full name, which matches exactly and yields the table.
find_similar_players("Jude Bellingham", top_n=10, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Morgan Gibbs-White,Nott'ham Forest,MF,0.931897
1,Justin Kluivert,Bournemouth,MF,0.920077
2,Dominik Szoboszlai,Liverpool,MF,0.90131
3,Emile Smith Rowe,Fulham,MF,0.892551
4,Xavi Simons,RB Leipzig,MF,0.887959
5,James Maddison,Tottenham,MF,0.876086
6,Enzo Fernández,Chelsea,MF,0.875659
7,Adrien Rabiot,Marseille,MF,0.874422
8,Hamed Junior Traorè,Auxerre,MF,0.856739
9,Corentin Tolisso,Lyon,MF,0.84944


This notebook implements a player similarity engine based on **playing style**, not reputation or raw output.

## Methodology
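- Player statistics (the 15 columns listed in `sim_features`) are standardized to zero mean and unit variance using `StandardScaler`, so that no single metric dominates because of its scale
- Dimensionality is reduced to 8 components using **PCA**, which retains about 98.9% of the variance while capturing the dominant playing patterns
- **Cosine similarity** is computed between every pair of players in the reduced feature space
- Players are compared based on how they contribute on the pitch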

## Notes
- Similarity reflects how players play, not how good they are
- This module can be used for scouting, recruitment, and player profiling


In [14]:
import os

import joblib

# joblib.dump opens the target file directly and does not create missing
# parent directories, so make sure the output folder exists first.
os.makedirs("../models", exist_ok=True)

# Persist the fitted transformers, the feature list and the player metadata.
# NOTE(review): the metadata frame keeps df_sim's original index labels, which
# are not contiguous after filtering — confirm consumers index it by position.
joblib.dump(scaler, "../models/similarity_scaler.pkl")
joblib.dump(pca, "../models/similarity_pca.pkl")
joblib.dump(sim_features, "../models/similarity_features.pkl")
joblib.dump(df_sim[["Player", "Squad", "Pos"]], "../models/similarity_metadata.pkl")


['../models/similarity_metadata.pkl']

In [15]:
# Repeat of the earlier Raphinha query with a shorter list (top 5).
find_similar_players("Raphinha", top_n=5, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Christian Pulisic,Milan,"FW,MF",0.964009
1,Dennis Man,Parma,"FW,MF",0.958331
2,Florian Thauvin,Udinese,"FW,MF",0.946124
3,João Pedro,Brighton,"FW,MF",0.945169
4,Evann Guessand,Nice,"FW,MF",0.935625
