<a href="https://colab.research.google.com/github/vatsalyatandon/PCA_Player_Finder/blob/master/app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
class color:
        PURPLE = '\033[95m'
        CYAN = '\033[96m'
        DARKCYAN = '\033[36m'
        BLUE = '\033[94m'
        GREEN = '\033[92m'
        YELLOW = '\033[93m'
        RED = '\033[91m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'
        END = '\033[0m'

print('by Parth Athale (@ParthAthale)\n')
print('edited by Vatsalya Tandon (@aylastav)\n')
print('Data credits to FBref/StatsBomb')
print('Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/')
print('Read code here: https://github.com/parth1902/PCA_Player_Finder\n')
print('Some examples as a guide to do this:')
print('If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall')
print('If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall')
print('If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work')

by Parth Athale (@ParthAthale)

Data credits to FBref/StatsBomb
Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/
Read code here: https://github.com/parth1902/PCA_Player_Finder

Some examples as a guide to do this:
If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall
If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall
If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work


In [109]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_columns', 10)
from IPython.display import HTML, display
pd.options.mode.chained_assignment = None  # default='warn'

display(HTML('''<style>
    .widget-label { min-width: 20ex !important; }
</style>'''))


In [110]:
url_games = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'
url_players = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'

#For running locally can use path instead of loading from the repository online

df_games = pd.read_csv(url_games,sep = ',')
df_players = pd.read_csv(url_players,sep = ',')

#df_games = pd.read_csv('games.csv', sep = ',')
#df_players = pd.read_csv('players.csv', sep = ',')

print(df_players.shape)
df_players

(2732, 151)


Unnamed: 0,player,nationality,position,squad,age,...,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
0,Patrick van Aanholt,nl NED,DF,Crystal Palace,28,...,1.0,243.0,12.0,17.0,41.4
1,Max Aarons,eng ENG,DF,Norwich City,19,...,0.0,281.0,23.0,42.0,35.4
2,Yunis Abdelhamid,ma MAR,DF,Reims,31,...,0.0,226.0,86.0,42.0,67.2
3,Suleiman Abdullahi,ng NGA,"FW,MF",Union Berlin,22,...,0.0,15.0,6.0,11.0,35.3
4,Mehdi Abeid,dz ALG,MF,Nantes,26,...,0.0,194.0,14.0,19.0,42.4
...,...,...,...,...,...,...,...,...,...,...,...
2727,Bongani Zungu,za RSA,MF,Amiens,26,...,0.0,133.0,18.0,25.0,41.9
2728,Szymon Żurkowski,pl POL,MF,Fiorentina,21,...,0.0,2.0,1.0,0.0,100.0
2729,David Zurutuza,es ESP,MF,Real Sociedad,33,...,0.0,14.0,7.0,8.0,46.7
2730,Martin Ødegaard,no NOR,MF,Real Sociedad,20,...,0.0,231.0,11.0,20.0,35.5


In [111]:
#Method to convert result of a game into a numerical feature
def f(df):
    if df['result'] == 'W':
        val = 3
    elif df['result'] == 'D':
        val = 1
    else:
        val = 0
    return val

In [112]:
#Looking at players who played > 500 minutes
df_players = df_players[df_players['minutes'] > 500]

#Not including GKs to avoid anomalous similarities
df_players = df_players[df_players['position'] != 'GK']

#Converting results into numerical feature
df_games['Points taken'] = df_games.apply(f, axis = 1)

#Removing duplicate entries of players
df_players = df_players.drop_duplicates(subset=['player'], keep='last')

players = np.array(df_players['player'])
teams = np.array(df_players['squad'].unique())
teams = np.append('Overall',sorted(teams))

In [113]:
#Normalizing relevant features on a per90 basis

features = [ 'goals',
 'assists',
 'pens_made',
 'pens_att',
 'xg',
 'npxg',
 'xa',
 'shots_total',
 'shots_on_target',
 'shots_free_kicks',
 'xg_net',
 'npxg_net',
 'passes_completed',
 'passes',
 'passes_total_distance',
 'passes_progressive_distance',
 'passes_completed_short',
 'passes_short',
 'passes_completed_medium',
 'passes_medium',
 'passes_completed_long',
 'passes_long',
 'assisted_shots',
 'passes_into_final_third',
 'passes_into_penalty_area',
 'crosses_into_penalty_area',
 'progressive_passes',
 'passes_live',
 'passes_dead',
 'passes_free_kicks',
 'through_balls',
 'passes_pressure',
 'passes_switches',
 'crosses',
 'corner_kicks',
 'corner_kicks_in',
 'corner_kicks_out',
 'corner_kicks_straight',
 'passes_ground',
 'passes_low',
 'passes_high',
 'passes_left_foot',
 'passes_right_foot',
 'passes_head',
 'throw_ins',
 'passes_other_body',
 'passes_offsides',
 'passes_oob',
 'passes_intercepted',
 'passes_blocked',
 'sca',
 'sca_passes_live',
 'sca_passes_dead',
 'sca_dribbles',
 'sca_shots',
 'sca_fouled',
 'gca',
 'gca_passes_live',
 'gca_passes_dead',
 'gca_dribbles',
 'gca_shots',
 'gca_fouled',
 'gca_og_for',
 'tackles',
 'tackles_won',
 'tackles_def_3rd',
 'tackles_mid_3rd',
 'tackles_att_3rd',
 'dribble_tackles',
 'dribbles_vs',
 'dribbled_past',
 'pressures',
 'pressure_regains',
 'pressures_def_3rd',
 'pressures_mid_3rd',
 'pressures_att_3rd',
 'blocks',
 'blocked_shots',
 'blocked_shots_saves',
 'blocked_passes',
 'interceptions',
 'clearances',
 'errors',
 'touches',
 'touches_def_pen_area',
 'touches_def_3rd',
 'touches_mid_3rd',
 'touches_att_3rd',
 'touches_att_pen_area',
 'touches_live_ball',
 'dribbles_completed',
 'dribbles',
 'players_dribbled_past',
 'nutmegs',
 'carries',
 'carry_distance',
 'carry_progressive_distance',
 'pass_targets',
 'miscontrols',
 'dispossessed']

for i in range (0,len(features)):
        df_players[features[i]] = (df_players[features[i]]/df_players['minutes'])*90

In [114]:
#Dropping  irrelevant features 

df_players = df_players.drop(['goals_per90',
'cards_yellow',
'cards_red',
'assists_per90',
'goals_assists_per90',
'goals_pens_per90',
'goals_assists_pens_per90',
'xg_per90',
'xa_per90',
'xg_xa_per90',
'npxg_per90',
'npxg_xa_per90',
'minutes_90s',
'shots_total_per90',
'shots_on_target_per90',
'xa_net',
'sca_per90',
'gca_per90',
'passes_received',
'cards_yellow_red',
'fouls',
'fouled',
'offsides',
'pens_won',
'pens_conceded',
'own_goals',
'ball_recoveries',
'aerials_won',
'aerials_lost',
'aerials_won_pct'], axis = 1)

In [115]:
df_players_numerical = df_players.drop(['player',
 'nationality',
 'position',
 'squad',
 'age',
 'birth_year',
 'games',
 'games_starts',
 'minutes'], axis = 1)

numerical_columns = list(df_players_numerical.columns.values)

scaler = MinMaxScaler()

for i in range(0,len(numerical_columns)):
    df_players_numerical[numerical_columns[i]] = scaler.fit_transform(df_players_numerical[[numerical_columns[i]]])
       
df_players_numerical_1 = df_players_numerical.copy()

In [116]:
def shoot(df):
    return df[['goals',
     'xg',
     'npxg',
     'shots_total',
     'shots_on_target',
     'shots_free_kicks',
     'shots_on_target_pct',
     'goals_per_shot',
     'goals_per_shot_on_target',
     'npxg_per_shot',
     'xg_net',
     'npxg_net']]

def create(df):
    return df[['sca',
     'sca_passes_live',
     'sca_passes_dead',
     'sca_dribbles',
     'sca_shots',
     'sca_fouled',
     'assisted_shots',
     'through_balls',
     'gca',
     'gca_passes_live',
     'gca_passes_dead',
     'gca_dribbles',
     'gca_shots',
     'gca_fouled',
     'gca_og_for','assists','xa']]

def passes(df):
    return df[['passes_completed',
     'passes',
     'passes_pct',
     'passes_total_distance',
     'passes_progressive_distance',
     'passes_completed_short',
     'passes_short',
     'passes_pct_short',
     'passes_completed_medium',
     'passes_medium',
     'passes_pct_medium',
     'passes_completed_long',
     'passes_long',
     'passes_pct_long',
     'passes_into_final_third',
     'passes_into_penalty_area',
     'crosses_into_penalty_area',
     'progressive_passes',
     'passes_live',
     'passes_dead',
     'passes_free_kicks',
     'passes_pressure',
     'passes_switches',
     'crosses',
     'corner_kicks',
     'corner_kicks_in',
     'corner_kicks_out',
     'corner_kicks_straight',
     'passes_ground',
     'passes_low',
     'passes_high',
     'passes_left_foot',
     'passes_right_foot',
     'passes_head',
     'throw_ins',
     'passes_other_body',
     'passes_offsides',
     'passes_oob',
     'passes_intercepted',
     'passes_blocked']]

def defence(df):
    return df[['tackles',
     'tackles_won',
     'tackles_def_3rd',
     'tackles_mid_3rd',
     'tackles_att_3rd',
     'dribble_tackles',
     'dribbles_vs',
     'dribble_tackles_pct',
     'dribbled_past',
     'pressures',
     'pressure_regains',
     'pressure_regain_pct',
     'pressures_def_3rd',
     'pressures_mid_3rd',
     'pressures_att_3rd',
     'blocks',
     'blocked_shots',
     'blocked_shots_saves',
     'blocked_passes',
     'interceptions',
     'clearances',
     'errors']]

def possesion(df):
    return df[['touches',
     'touches_def_pen_area',
     'touches_def_3rd',
     'touches_mid_3rd',
     'touches_att_3rd',
     'touches_att_pen_area',
     'touches_live_ball',
     'dribbles_completed',
     'dribbles',
     'dribbles_completed_pct',
     'players_dribbled_past',
     'nutmegs',
     'carries',
     'carry_distance',
     'carry_progressive_distance',
     'pass_targets',
     'passes_received_pct',
     'miscontrols',
     'dispossessed']]0.9*400 - 300

SyntaxError: ignored

In [117]:
def skill_parser(df, skill):
  if skill == 'Possession':
        df_2 = possesion(df)
  elif skill == 'Shooting':
        df_2 = shoot(df)
  elif skill == 'Passing':
        df_2 = passes(df)
  elif skill == 'Creating':
        df_2 = create(df) 
  elif skill == 'Defensive work':
        df_2 = defence(df)
  else :
        df_2 = df
  return df_2

In [118]:
def find(player_name, team, skill, number_of_results):
    
    #Importing data
    
    global df_games
    global df_players
    global df_players_numerical
    global df_players_numerical_1
    global df_players_numerical_2
    global principalDf
    global finalDf
    global final
    
    df_players_numerical_1 = df_players_numerical.copy()
    
    #Calculating team correalations
    
    df_games.rename(columns={'xg_for':'xg'}, inplace=True) 
    df_games = df_games.loc[:,~df_games.T.duplicated(keep='first')]
    df_games = df_games.loc[:, ~df_games.columns.duplicated()]
    
    if team != "Overall":
        df_games_new = df_games[df_games['for'] == team]
        corrMatrix = df_games_new.corr()
    else:
        corrMatrix = df_games.corr()

    #Selecting relevant columns based on queries 
    
    numerical_columns = list(df_players_numerical_1.columns.values)
    
    for i in range(0,len(numerical_columns)):
        df_players_numerical_1[numerical_columns[i]] = (df_players_numerical_1[numerical_columns[i]]) * (corrMatrix['Points taken'][numerical_columns[i]])
    
    df_players_numerical_2 = skill_parser(df_players_numerical_1, skill)
    
    # Separating out the features
    x = df_players_numerical_2.values
    x = np.nan_to_num(x)
    
    # Separating out the target
    y = df_players.loc[:,['player']].values
    
    #Applying PCA
    pca = PCA(.90)
    principalComponents = pca.fit_transform(x)
    print('Number of PCA components:',pca.n_components_)
    print('\n')
    
    principalDf = pd.DataFrame(data = principalComponents)
    df_players = df_players.reset_index(drop=True)
    
    #Creating final dataframe with PCA features
    finalDf = pd.concat([principalDf, df_players[['player', 'squad', 'position', 'age']]], axis = 1)
    
    #a = current player, b = remaining players
    a = (finalDf[finalDf['player'] == player_name])[(finalDf[finalDf['player'] == player_name]).columns.drop(['player','squad','position','age'])]
    b = finalDf[finalDf.columns.drop(['player','squad','position','age'])]
    
    #Calculating euclidean distance
    finalDf['distance'] = (finalDf[list(b.columns.values)] - np.array(a)).pow(2).sum(1).pow(0.5)
    
    dist = finalDf['distance'].max()
    dist95 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100-(finalDf['distance']/dist95)*100
    
    final = ((finalDf.sort_values(['distance'], ascending=[True])))[1:number_of_results+1]
    final = final.reset_index(drop=True)
    
    print(color.BOLD + 'List of similar players:' + color.END)
    print('\n')
    print(final[['player','squad','position','age','% match']])


    if(skill == 'Overall'):
      print("Select skill to plot radar plot")
    else:
      stats = skill_parser(df_players, skill)

      scaler = MinMaxScaler()
      stats_columns = list(stats.columns.values)

      for i in range(0,len(stats_columns)):
        stats[stats_columns[i]] = scaler.fit_transform(stats[[stats_columns[i]]])

      stats['player'] = df_players['player']
      stats_1 = stats.loc[stats['player'] == player_name]
      stats_1 = stats_1.drop(['player'], axis = 1)
      stats_1_num = stats_1.iloc[0].values
      
      stats_2 = stats.loc[stats['player'] == final.loc[0,'player']]
      stats_2 = stats_2.drop(['player'], axis = 1)
      stats_2_num = stats_2.iloc[0].values

      angles = np.linspace(0, 2*np.pi, stats_1.shape[1], endpoint=False)
      stats_1_new = np.concatenate((stats_1_num, [stats_1_num[0]]))
      stats_2_new = np.concatenate((stats_2_num, [stats_2_num[0]]))
      angles = np.concatenate((angles,[angles[0]]))


      print('\n')
      print('\n')
      print('\n')

      fig = plt.figure(figsize=(10,10))
      ax = fig.add_subplot(111, polar=True)
      ax.plot(angles, stats_1_new, 'o-', linewidth=2)
      ax.fill(angles, stats_1_new, 'teal', alpha=0.25)
      ax.plot(angles, stats_2_new, 'o-', linewidth=2)
      ax.fill(angles, stats_2_new, 'red', alpha=0.25)
      ax.set_thetagrids(angles * 180/np.pi, stats_1.columns)
      title_name = "Plot of " + player_name + " in blue vs " +  final.loc[0,'player'] + " in red"
      ax.set_title(title_name)
     
      plt.show()
    

In [128]:
def find2(player_name, team, stat, number_of_results):
    
    print('Stats selected:',stat)
    print('\n')
    global df_games
    global df_players
    global df_players_numerical
    global df_players_numerical_1
    global df_players_numerical_2
    global principalDf
    global finalDf
    global final
    
    df_players_numerical_1 = df_players_numerical.copy()

    df_games.rename(columns={'xg_for':'xg'}, inplace=True) 
    df_games = df_games.loc[:,~df_games.T.duplicated(keep='first')]
    df_games = df_games.loc[:, ~df_games.columns.duplicated()]
    
    if team != "Overall":
        df_games_new = df_games[df_games['for'] == team]
        corrMatrix = df_games_new.corr()
    else:
        corrMatrix = df_games.corr()
        
    numerical_columns = list(df_players_numerical_1.columns.values)
    
    for i in range(0,len(numerical_columns)):
        df_players_numerical_1[numerical_columns[i]] = (df_players_numerical_1[numerical_columns[i]]) * (corrMatrix['Points taken'][numerical_columns[i]])
    if not stat:
        print("Choose at least one stat to see output")
        return
    else:
        df_players_numerical_2 = df_players_numerical_1[np.array(stat)]
        
    # Separating out the features
    x = df_players_numerical_2.values
    x = np.nan_to_num(x)
    # Separating out the target
    y = df_players.loc[:,['player']].values
    
    pca = PCA(.90)
    principalComponents = pca.fit_transform(x)
    print('Number of PCA components:',pca.n_components_)
    print('\n')
    
    principalDf = pd.DataFrame(data = principalComponents)
    df_players = df_players.reset_index(drop=True)
    
    finalDf = pd.concat([principalDf, df_players[['player', 'squad', 'position', 'age']]], axis = 1)
    
    #a - current player, b - remaining players
    a = (finalDf[finalDf['player'] == player_name])[(finalDf[finalDf['player'] == player_name]).columns.drop(['player','squad','position','age'])]
    b = finalDf[finalDf.columns.drop(['player','squad','position','age'])]
    
    finalDf['distance'] = (finalDf[list(b.columns.values)] - np.array(a)).pow(2).sum(1).pow(0.5)
    
    dist = finalDf['distance'].max()
    dist95 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100-(finalDf['distance']/dist95)*100
    
    final = ((finalDf.sort_values(['distance'], ascending=[True])))[1:number_of_results+1]
    final = final.reset_index(drop=True)
    
    print(color.BOLD + 'List of similar players:' + color.END)
    print('\n')
    print(final[['player','squad','position','age','% match']])

    if not stat:
      print("Select skill to plot radar plot")
    else:
      stats = df_players.loc[:, stat]

      scaler = MinMaxScaler()
      stats_columns = list(stats.columns.values)

      for i in range(0,len(stats_columns)):
        stats[stats_columns[i]] = scaler.fit_transform(stats[[stats_columns[i]]])

      stats['player'] = df_players['player']
      stats_1 = stats.loc[stats['player'] == player_name]
      stats_1 = stats_1.drop(['player'], axis = 1)
      stats_1_num = stats_1.iloc[0].values
      
      stats_2 = stats.loc[stats['player'] == final.loc[0,'player']]
      stats_2 = stats_2.drop(['player'], axis = 1)
      stats_2_num = stats_2.iloc[0].values

      angles = np.linspace(0, 2*np.pi, stats_1.shape[1], endpoint=False)
      stats_1_new = np.concatenate((stats_1_num, [stats_1_num[0]]))
      stats_2_new = np.concatenate((stats_2_num, [stats_2_num[0]]))
      angles = np.concatenate((angles,[angles[0]]))


      print('\n')
      print('\n')
      print('\n')

      fig = plt.figure(figsize=(10,10))
      ax = fig.add_subplot(111, polar=True)
      ax.plot(angles, stats_1_new, 'o-', linewidth=2)
      ax.fill(angles, stats_1_new, 'teal', alpha=0.25)
      ax.plot(angles, stats_2_new, 'o-', linewidth=2)
      ax.fill(angles, stats_2_new, 'red', alpha=0.25)
      ax.set_thetagrids(angles * 180/np.pi, stats_1.columns)
      title_name = "Plot of " + player_name + " in blue vs " +  final.loc[0,'player'] + " in red"
      ax.set_title(title_name)
     
      plt.show()

In [120]:
import ipywidgets as widgets
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


player_selected = widgets.Combobox(options = sorted(players.tolist()), description='Player', ensure_option=True)
output = widgets.Output()
team_selected = widgets.Dropdown(options = (teams), description='Teams')
skills = ['Overall','Passing','Creating','Shooting','Defensive work','Possession']
skill_selected = widgets.Dropdown(options = (skills), description = 'Skills')

def dropdown_player_eventhandler(change):
    output.clear_output()
    with output:
        display(find(player_name = change.new,team = team_selected.value, skill = skill_selected.value, number_of_results = 10))

def dropdown_team_eventhandler(change):
    output.clear_output()
    with output:
        display(find(player_name = player_selected.value,team = change.new, skill = skill_selected.value, number_of_results = 10))

def dropdown_skill_eventhandler(change):
    output.clear_output()
    with output:
        display(find(player_name = player_selected.value ,team = team_selected.value, skill = change.new , number_of_results = 10))
        
player_selected.observe(dropdown_player_eventhandler, names='value')
team_selected.observe(dropdown_team_eventhandler, names='value')
skill_selected.observe(dropdown_skill_eventhandler, names='value')

display(player_selected)
display(team_selected)
display(skill_selected)

display(output)


Combobox(value='', description='Player', ensure_option=True, options=('Aaron Connolly', 'Aaron Cresswell', 'Aa…

Dropdown(description='Teams', options=('Overall', 'Alavés', 'Amiens', 'Angers', 'Arsenal', 'Aston Villa', 'Ata…

Dropdown(description='Skills', options=('Overall', 'Passing', 'Creating', 'Shooting', 'Defensive work', 'Posse…

Output()

In [129]:
warnings.simplefilter(action='ignore', category=FutureWarning)


player_selected = widgets.Combobox(options = sorted(players.tolist()), description='Player', ensure_option=True)
output = widgets.Output()
team_selected = widgets.Dropdown(options = (teams), description='Teams')
stats = list(df_players_numerical_1.columns.values)
stats_selected = widgets.SelectMultiple(options = (stats), description = 'Stats')

def dropdown_player_eventhandler(change):
    output.clear_output()
    with output:
        display(find2(player_name = change.new,team = team_selected.value, stat = stats_selected.value, number_of_results = 10))
        # display(find2(player_name = change.new, team = teams, stat = widgets.SelectMultiple(options = list(df_players_numerical_1.columns.values), rows = 10), number_of_results = 10))

def dropdown_team_eventhandler(change):
    output.clear_output()
    with output:
        display(find2(player_name = player_selected.value,team = change.new, stat = stats_selected.value, number_of_results = 10))

def dropdown_stats_eventhandler(change):
    output.clear_output()
    with output:
        display(find2(player_name = player_selected.value ,team = team_selected.value, stat = change.new , number_of_results = 10))
        
player_selected.observe(dropdown_player_eventhandler, names='value')
team_selected.observe(dropdown_team_eventhandler, names='value')
stats_selected.observe(dropdown_stats_eventhandler, names='value')

display(player_selected)
display(team_selected)
display(stats_selected)

display(output)

Combobox(value='', description='Player', ensure_option=True, options=('Aaron Connolly', 'Aaron Cresswell', 'Aa…

Dropdown(description='Teams', options=('Overall', 'Alavés', 'Amiens', 'Angers', 'Arsenal', 'Aston Villa', 'Ata…

SelectMultiple(description='Stats', options=('goals', 'assists', 'pens_made', 'pens_att', 'xg', 'npxg', 'xa', …

Output()