In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


In [17]:
# Load the processed player dataset. The path is relative, so it resolves against
# the notebook's current working directory.
df = pd.read_csv("../../player-performance/data/processed/players_full_2425_with_score.csv")


In [19]:
# Minimum value of the `90s` column a row needs in order to be kept.
MIN_90S = 5

# Rows below the threshold (or with a missing `90s`) are discarded; `.copy()`
# detaches the result from the original frame.
# NOTE(review): the saved execution counts show this cell ran (In [19]) after
# df_sim and sim_matrix had already been built (In [10]-[12]), so the recorded
# similarity outputs were computed on the unfiltered table. A fresh top-to-bottom
# run will give different results.
df = df.loc[df["90s"].ge(MIN_90S)].copy()
df.shape


(1987, 324)

In [18]:
# (rows, columns) of df.
# NOTE(review): this cell's execution count (In [18]) is lower than the filter
# cell's (In [19]), so the saved output is the pre-filter shape.
df.shape


(2854, 324)

In [8]:
# Preview the identifying columns of the first rows.
# (A bare `df.shape` used to precede this line; its value was discarded because
# only the last expression of a cell is displayed.)
df[["Player", "Squad", "Pos"]].head()


Unnamed: 0,Player,Squad,Pos
0,Max Aarons,Bournemouth,DF
1,Max Aarons,Valencia,"DF,MF"
2,Rodrigo Abajas,Valencia,DF
3,James Abankwah,Udinese,"DF,MF"
4,Keyliane Abdallah,Marseille,FW


In [10]:
# Columns used to describe a player for the similarity model. The list order is
# the column order of the feature matrix built from it.
# NOTE(review): PrgC / PrgP / PrgR carry no `_90` suffix, unlike the rate columns.
# Confirm whether they are totals and whether mixing them with per-90 rates is intended.
sim_features = (
    ["Age", "90s"]
    + ["Gls_90", "Ast_90", "G+A_90", "G-PK_90", "G+A-PK_90"]
    + ["xG_90", "xAG_90", "xG+xAG_90", "npxG_90", "npxG+xAG_90"]
    + ["PrgC", "PrgP", "PrgR"]
)

# Keep only rows where every similarity feature is present.
df_sim = df.loc[df[sim_features].notna().all(axis=1)].copy()

# Identifier columns expected downstream; any that the table lacks is created
# with a constant "N/A" placeholder.
id_cols = ["Player", "Squad", "Pos", "Comp"]
for c in id_cols:
    if c in df_sim.columns:
        continue
    df_sim[c] = "N/A"

df_sim.shape


(2845, 324)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Feature matrix: one row per df_sim row, columns in sim_features order.
X = df_sim.loc[:, sim_features].values

# Standardise the features so that columns on different scales are comparable.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Project the standardised features onto 8 principal components.
pca = PCA(n_components=8, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance retained by the kept components.
pca.explained_variance_ratio_.sum()


np.float64(0.9913578275915063)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of X_pca. Row/column i corresponds
# to the i-th row of df_sim by position, which find_similar_players relies on, so
# this must be recomputed whenever df_sim changes.
sim_matrix = cosine_similarity(X_pca)
sim_matrix.shape


(2845, 2845)

In [14]:
import numpy as np
import pandas as pd

def find_similar_players(player_name, top_n=10, same_position=False):
    """Rank the players most similar to ``player_name``.

    Reads the notebook-level ``df_sim`` (player table) and ``sim_matrix``
    (pairwise similarities, aligned with ``df_sim`` by row position).

    Parameters
    ----------
    player_name : str
        Full player name. Matching is case-insensitive and ignores leading and
        trailing whitespace. If several rows carry the same name, the first one
        in ``df_sim`` is used as the reference.
    top_n : int, default 10
        Maximum number of similar players to return. Values <= 0 give an empty
        result.
    same_position : bool, default False
        When True, only rows whose ``Pos`` value is identical to the reference
        player's ``Pos`` are considered.

    Returns
    -------
    pandas.DataFrame or str
        On an exact name match: a frame with columns ``Player``, ``Squad``,
        ``Pos`` and ``Similarity``, sorted by decreasing similarity.
        Otherwise: a message string listing up to 10 rows whose name contains
        the query as a literal substring.

    Raises
    ------
    ValueError
        If ``sim_matrix`` does not have one row per ``df_sim`` row, i.e. the two
        were built from different data and positional lookups would be wrong.
    """
    columns = ["Player", "Squad", "Pos", "Similarity"]

    query = player_name.strip().lower()
    names = df_sim["Player"].astype(str).str.lower()
    exact = names.eq(query).to_numpy(dtype=bool, na_value=False)

    if not exact.any():
        # regex=False: the query is literal text, so names containing characters
        # such as "(" or "." neither raise nor over-match.
        contains = names.str.contains(query, regex=False, na=False).to_numpy(
            dtype=bool, na_value=False
        )
        matches = df_sim.loc[contains, ["Player", "Squad", "Pos"]].head(10)
        return "Player not found. Closest matches:\n" + matches.to_string(index=False)

    # Guard against stale state: the matrix is indexed by df_sim row position.
    if sim_matrix.shape[0] != len(df_sim):
        raise ValueError(
            "sim_matrix has %d rows but df_sim has %d; recompute sim_matrix from "
            "the current df_sim." % (sim_matrix.shape[0], len(df_sim))
        )

    # Position of the first exact match; independent of the index labels, so it
    # also works when the index contains duplicates.
    idx = int(np.flatnonzero(exact)[0])

    if top_n <= 0:
        return pd.DataFrame(columns=columns)

    sims = np.asarray(sim_matrix[idx])
    ranked = np.argsort(sims)[::-1]

    # Candidate filter, evaluated once over the whole ranking rather than row by row.
    keep = ranked != idx
    if same_position:
        player_pos = df_sim["Pos"].iloc[idx]
        same_pos = df_sim["Pos"].eq(player_pos).to_numpy(dtype=bool, na_value=False)
        keep &= same_pos[ranked]

    top = ranked[keep][:top_n]

    result = df_sim.iloc[top][["Player", "Squad", "Pos"]].reset_index(drop=True)
    result["Similarity"] = sims[top].astype(float)
    return result


In [20]:
# Example: the 10 most similar players to Raphinha, restricted to rows whose Pos
# value is identical to his.
find_similar_players("Raphinha", top_n=10, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Christian Pulisic,Milan,"FW,MF",0.970809
1,Dennis Man,Parma,"FW,MF",0.96064
2,Evann Guessand,Nice,"FW,MF",0.951266
3,Florian Thauvin,Udinese,"FW,MF",0.951209
4,João Pedro,Brighton,"FW,MF",0.950584
5,Brahim Díaz,Real Madrid,"FW,MF",0.949198
6,Michael Olise,Bayern Munich,"FW,MF",0.946432
7,Jarrod Bowen,West Ham,"FW,MF",0.939559
8,Mason Greenwood,Marseille,"FW,MF",0.936156
9,Khvicha Kvaratskhelia,Napoli,"FW,MF",0.934575


In [21]:
# Surname only: no row's full name equals the query, so the function returns its
# "Player not found" message listing names that contain it instead of a ranking.
find_similar_players("Bellingham", top_n=10, same_position=True)


'Player not found. Closest matches:\n         Player       Squad Pos\nJude Bellingham Real Madrid  MF'

In [22]:
# Same lookup with the full name, which matches exactly and yields the ranking.
find_similar_players("Jude Bellingham", top_n=10, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Morgan Gibbs-White,Nott'ham Forest,MF,0.952896
1,Dominik Szoboszlai,Liverpool,MF,0.937887
2,Emile Smith Rowe,Fulham,MF,0.926335
3,Xavi Simons,RB Leipzig,MF,0.921163
4,Justin Kluivert,Bournemouth,MF,0.919452
5,Enzo Fernández,Chelsea,MF,0.917786
6,Khéphren Thuram,Juventus,MF,0.907603
7,Brais Méndez,Real Sociedad,MF,0.904146
8,Adrien Rabiot,Marseille,MF,0.896972
9,James Maddison,Tottenham,MF,0.883634


This notebook implements a player similarity engine based on **playing style**, not reputation or raw output.

## Methodology
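- The 15 player statistics listed in `sim_features` are standardized using `StandardScaler`, so that no statistic dominates merely because of its scale
- Dimensionality is reduced to 8 components using **PCA** (retaining about 99% of the variance) to capture dominant playing patterns
- **Cosine similarity** is computed between every pair of players in the reduced feature space
- Players are compared based on how they contribute on the pitch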

## Notes
- Similarity reflects how players play, not how good they are
- This module can be used for scouting, recruitment, and player profiling


In [23]:
import os

import joblib

# Persist the fitted preprocessing objects, the feature list and the player
# identifiers so they can be reloaded outside this notebook.
# NOTE(review): neither X_pca nor sim_matrix is written here, so a consumer of
# these files has to rebuild the embeddings from the player data. Confirm that
# this is intended.

# Create the target directory first; joblib.dump does not create missing parents.
os.makedirs("../models", exist_ok=True)

joblib.dump(scaler, "../models/similarity_scaler.pkl")
joblib.dump(pca, "../models/similarity_pca.pkl")
joblib.dump(sim_features, "../models/similarity_features.pkl")
joblib.dump(df_sim[["Player", "Squad", "Pos"]], "../models/similarity_metadata.pkl")


['../models/similarity_metadata.pkl']

In [24]:
# Repeat of the Raphinha lookup with a shorter list (top 5, same Pos only).
find_similar_players("Raphinha", top_n=5, same_position=True)


Unnamed: 0,Player,Squad,Pos,Similarity
0,Christian Pulisic,Milan,"FW,MF",0.970809
1,Dennis Man,Parma,"FW,MF",0.96064
2,Evann Guessand,Nice,"FW,MF",0.951266
3,Florian Thauvin,Udinese,"FW,MF",0.951209
4,João Pedro,Brighton,"FW,MF",0.950584
