In [216]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from google.colab import files
import io

# Reached only if every import above succeeded; gives visible confirmation in the cell output.
print("Libraries loaded successfully.")

Libraries loaded successfully.


In [217]:
# Load data: choose the player to analyse, then read the players CSV into `df`.
target_player_name = "Lamine Yamal"

path = '/content/drive/MyDrive/Colab Notebooks/ML Final Project/players_data-2024_2025.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Normalize data: clean 'Min', filter/deduplicate players, and build the
# feature matrix `X_train` used by the model cells.

# 'Min' is text when the CSV values contain thousands separators (e.g. "1,234").
# Checking "not numeric" instead of "== object" also covers dedicated string
# dtypes, which would otherwise reach the numeric comparison below unconverted.
if not pd.api.types.is_numeric_dtype(df['Min']):
    df['Min'] = df['Min'].str.replace(',', '', regex=False).astype(float)

# Keep rows with at least 500 minutes. Sorting by minutes (descending) before
# dropping duplicates keeps, for each player name, the row with the most minutes.
df_filtered = df[df['Min'] >= 500].copy()
df_filtered = df_filtered.sort_values('Min', ascending=False).drop_duplicates(subset=['Player'])

# Stop here if the target player did not survive the filters: continuing would
# leave the later cells without a single positive example for the target.
if target_player_name not in df_filtered['Player'].values:
    raise ValueError(f"Player '{target_player_name}' not found in dataset.")

numeric_cols = df_filtered.select_dtypes(include=[np.number]).columns

# Numeric columns that are deliberately kept out of the model features.
exclude_cols = ['Rk', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast']
# Names listed in exclude_cols that do not exist in the data are simply ignored.
feature_cols = [c for c in numeric_cols if c not in exclude_cols]

# Missing feature values are replaced by 0 before modelling.
X_train = df_filtered[feature_cols].fillna(0)

In [218]:
# Random Forest: binary target where the positive label (1) marks the target player's row.
is_target = df_filtered['Player'] == target_player_name
y_train = is_target.astype(int)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Second predict_proba column = probability assigned to label 1 (the target player).
target_proba = rf.predict_proba(X_train)[:, 1]
df_filtered['RF_Score'] = target_proba

# Ten highest-scoring players other than the target.
others = df_filtered[~is_target]
rf_top = others.sort_values('RF_Score', ascending=False).head(10)
display(rf_top[['Player', 'Squad', 'Age', 'Pos', 'RF_Score', 'Gls', 'Ast']])

# Ten features with the largest importance in the fitted forest.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
display(importances.sort_values(ascending=False).head(10))

Unnamed: 0,Player,Squad,Age,Pos,RF_Score,Gls,Ast
2304,Mohamed Salah,Liverpool,32.0,FW,0.05,29,18
1960,Michael Olise,Bayern Munich,22.0,"FW,MF",0.04,12,15
1356,Joshua Kimmich,Bayern Munich,29.0,MF,0.02,3,7
1691,Kylian Mbappé,Real Madrid,25.0,FW,0.02,31,3
757,Jeremy Doku,Manchester City,22.0,"FW,MF",0.02,3,6
200,Alex Baena,Villarreal,23.0,"MF,FW",0.01,7,9
238,Bradley Barcola,Paris S-G,21.0,FW,0.01,14,10
1417,Nikola Krstović,Lecce,24.0,FW,0.01,11,5
1289,Vinicius Júnior,Real Madrid,24.0,FW,0.01,11,8
633,Pau Cubarsí,Barcelona,17.0,DF,0.01,0,3


Unnamed: 0,0
onG,0.1
Succ,0.085714
Att_stats_possession,0.068445
PPA,0.066662
TO,0.057143
TB,0.054292
FK,0.029784
Att 3rd_stats_possession,0.028571
GCA,0.028571
Sh,0.024984


In [219]:
# Stability check: append a noisy copy ("clone") of the target player's row to
# the full table and score every row with the already-fitted forest `rf`.
noise_level = 0.1   # each feature of the clone is scaled by a factor in [1 - 0.1, 1 + 0.1)
noise_seed = 42     # fixed seed so the clone (and the table below) is reproducible on re-run

df_test = df.copy()

# Fail with a clear message instead of an IndexError from .iloc[0].
target_rows = df_test[df_test['Player'] == target_player_name]
if target_rows.empty:
    raise ValueError(f"Player '{target_player_name}' not found in dataset.")
original_row = target_rows.iloc[0]

clone_name = f"Clone_{target_player_name}"
clone_row = original_row.copy()
clone_row['Player'] = clone_name

# Use exactly the columns (and order) the forest was fitted on rather than
# re-deriving the list here, so the prediction input cannot drift from training.
feature_cols = list(rf.feature_names_in_)

# Dedicated seeded generator: does not depend on, or disturb, the global NumPy RNG state.
rng = np.random.default_rng(noise_seed)
noise_factors = rng.uniform(1 - noise_level, 1 + noise_level, size=len(feature_cols))

clone_row[feature_cols] = clone_row[feature_cols] * noise_factors

clone_df = pd.DataFrame([clone_row])
df_with_clone = pd.concat([df_test, clone_df], ignore_index=True)

X = df_with_clone[feature_cols].fillna(0)
# Labels for the augmented table; not consumed by this cell (the clone is labelled 0).
y = (df_with_clone['Player'] == target_player_name).astype(int)

# Probability of label 1 (the target player) for every row, clone included.
scores = rf.predict_proba(X)[:, 1]
df_with_clone['Stability_Score'] = scores

results = df_with_clone.sort_values('Stability_Score', ascending=False)

display(results[['Player', 'Squad', 'Stability_Score']].head(10))

Unnamed: 0,Player,Squad,Stability_Score
2792,Lamine Yamal,Barcelona,0.7
2854,Clone_Lamine Yamal,Barcelona,0.53
2304,Mohamed Salah,Liverpool,0.05
1960,Michael Olise,Bayern Munich,0.04
1691,Kylian Mbappé,Real Madrid,0.02
1356,Joshua Kimmich,Bayern Munich,0.02
757,Jeremy Doku,Manchester City,0.02
1781,Mikey Moore,Tottenham,0.01
2812,Arsen Zakharyan,Real Sociedad,0.01
2818,Nicolò Zaniolo,Atalanta,0.01
