In [1]:
#@title
#HIDDEN
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
class color:
    """ANSI escape sequences used to style text written with ``print``."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above
    END = '\033[0m'

# Credits and usage examples shown above the widgets; emitted as one block of text.
print('\n'.join((
    'by Parth Athale (@ParthAthale)\n',
    'Data credits to FBref/StatsBomb',
    'Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/',
    'Read code here: https://github.com/parth1902/PCA_Player_Finder\n',
    'Some examples as a guide to do this:',
    'If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall',
    'If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall',
    'If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work',
)))

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Printed frames: no wrapping of wide frames, at most 7 columns and 100 rows shown.
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = 7
pd.options.display.max_rows = 100
from IPython.display import HTML, display
# Turn off the chained-assignment check (pandas' default for this option is 'warn').
pd.options.mode.chained_assignment = None

# Give widget description labels a minimum width of 20ex.
display(HTML('''<style>
    .widget-label { min-width: 20ex !important; }
</style>'''))

# Locations of the raw CSV files in the online repositories.
# To run locally, read from a file path instead, e.g. pd.read_csv('games.csv')
# and pd.read_csv('players.csv').
url_games = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'
url_players = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'
url_players1718 = 'https://raw.githubusercontent.com/parth1902/PCA_Player_Finder/master/players1718.csv'
url_players1819 = 'https://raw.githubusercontent.com/parth1902/PCA_Player_Finder/master/players1819.csv'

# Comma is read_csv's default separator, so it is not spelled out.
df_games = pd.read_csv(url_games)
df_players = pd.read_csv(url_players)
df_players1718 = pd.read_csv(url_players1718)
df_players1819 = pd.read_csv(url_players1819)

# Label every player row with the season its file covers, then stack the three seasons.
df_players1718['season'] = '2017-18'
df_players1819['season'] = '2018-19'
df_players['season'] = '2019-20'
df_players = pd.concat(
    [df_players, df_players1819, df_players1718],
    ignore_index=True,
)
#Method to convert result of a game into a numerical feature
def f(df):
    """Return the points for one game row: 3 for 'W', 1 for 'D', 0 for anything else.

    ``df`` is a single row (anything indexable by 'result'), as passed by
    ``DataFrame.apply(f, axis=1)``.
    """
    points_by_result = {'W': 3, 'D': 1}
    return points_by_result.get(df['result'], 0)
# Keep players with more than 500 minutes; goalkeepers are left out to avoid
# anomalous similarities.
df_players = df_players[(df_players['minutes'] > 500) & (df_players['position'] != 'GK')]

# Turn each game's result into a numerical feature (see f).
df_games['Points taken'] = df_games.apply(f, axis=1)

# One row per player and season (the last occurrence wins), on a fresh 0..n-1 index.
df_players = (
    df_players
    .drop_duplicates(subset=['player', 'season'], keep='last')
    .reset_index(drop=True)
)

# Choices offered by the widgets: sorted player names, and 'Overall' followed by sorted squads.
players = sorted(np.array(df_players['player'].unique()))
teams = np.append('Overall', sorted(np.array(df_players['squad'].unique())))

#Normalizing relevant features on a per90 basis

# Columns that get converted to a per-90-minutes rate.
features = [
    'goals',
    'assists',
    'pens_made',
    'pens_att',
    'xg',
    'npxg',
    'xa',
    'shots_total',
    'shots_on_target',
    'shots_free_kicks',
    'xg_net',
    'npxg_net',
    'passes_completed',
    'passes',
    'passes_total_distance',
    'passes_progressive_distance',
    'passes_completed_short',
    'passes_short',
    'passes_completed_medium',
    'passes_medium',
    'passes_completed_long',
    'passes_long',
    'assisted_shots',
    'passes_into_final_third',
    'passes_into_penalty_area',
    'crosses_into_penalty_area',
    'progressive_passes',
    'passes_live',
    'passes_dead',
    'passes_free_kicks',
    'through_balls',
    'passes_pressure',
    'passes_switches',
    'crosses',
    'corner_kicks',
    'corner_kicks_in',
    'corner_kicks_out',
    'corner_kicks_straight',
    'passes_ground',
    'passes_low',
    'passes_high',
    'passes_left_foot',
    'passes_right_foot',
    'passes_head',
    'throw_ins',
    'passes_other_body',
    'passes_offsides',
    'passes_oob',
    'passes_intercepted',
    'passes_blocked',
    'sca',
    'sca_passes_live',
    'sca_passes_dead',
    'sca_dribbles',
    'sca_shots',
    'sca_fouled',
    'gca',
    'gca_passes_live',
    'gca_passes_dead',
    'gca_dribbles',
    'gca_shots',
    'gca_fouled',
    'gca_og_for',
    'tackles',
    'tackles_won',
    'tackles_def_3rd',
    'tackles_mid_3rd',
    'tackles_att_3rd',
    'dribble_tackles',
    'dribbles_vs',
    'dribbled_past',
    'pressures',
    'pressure_regains',
    'pressures_def_3rd',
    'pressures_mid_3rd',
    'pressures_att_3rd',
    'blocks',
    'blocked_shots',
    'blocked_shots_saves',
    'blocked_passes',
    'interceptions',
    'clearances',
    'errors',
    'touches',
    'touches_def_pen_area',
    'touches_def_3rd',
    'touches_mid_3rd',
    'touches_att_3rd',
    'touches_att_pen_area',
    'touches_live_ball',
    'dribbles_completed',
    'dribbles',
    'players_dribbled_past',
    'nutmegs',
    'carries',
    'carry_distance',
    'carry_progressive_distance',
    'pass_targets',
    'miscontrols',
    'dispossessed',
]

# Rescale every listed column from a total to a rate per 90 minutes played.
for feature in features:
    df_players[feature] = (df_players[feature] / df_players['minutes']) * 90
#Dropping  irrelevant features 

# Columns removed from the player table before the numerical features are built.
features_to_drop = [
    'goals_per90',
    'cards_yellow',
    'cards_red',
    'assists_per90',
    'goals_assists_per90',
    'goals_pens_per90',
    'goals_assists_pens_per90',
    'xg_per90',
    'xa_per90',
    'xg_xa_per90',
    'npxg_per90',
    'npxg_xa_per90',
    'minutes_90s',
    'shots_total_per90',
    'shots_on_target_per90',
    'xa_net',
    'sca_per90',
    'gca_per90',
    'passes_received',
    'cards_yellow_red',
    'fouls',
    'fouled',
    'offsides',
    'pens_won',
    'pens_conceded',
    'own_goals',
    'ball_recoveries',
    'aerials_won',
    'aerials_lost',
    'aerials_won_pct',
]

df_players = df_players.drop(columns=features_to_drop)

# Identity / bookkeeping columns that are excluded from the numerical feature table.
text_features = [
    'player',
    'nationality',
    'position',
    'squad',
    'age',
    'birth_year',
    'games',
    'games_starts',
    'minutes',
    'season',
]

df_players_numerical = df_players.drop(columns=text_features)

numerical_columns = list(df_players_numerical.columns.values)

# Min-max scale each column on its own; the scaler is re-fitted for every column.
scaler = MinMaxScaler()
for column in numerical_columns:
    df_players_numerical[column] = scaler.fit_transform(df_players_numerical[[column]])

df_players_numerical_1 = df_players_numerical.copy()


def shoot(df):
    """Return only the shooting columns of ``df``, in the order listed below."""
    shooting_columns = [
        'goals',
        'xg',
        'npxg',
        'shots_total',
        'shots_on_target',
        'shots_free_kicks',
        'shots_on_target_pct',
        'goals_per_shot',
        'goals_per_shot_on_target',
        'npxg_per_shot',
        'xg_net',
        'npxg_net',
    ]
    return df[shooting_columns]

def create(df):
    """Return only the chance-creation columns of ``df``, in the order listed below."""
    creation_columns = [
        'sca',
        'sca_passes_live',
        'sca_passes_dead',
        'sca_dribbles',
        'sca_shots',
        'sca_fouled',
        'assisted_shots',
        'through_balls',
        'gca',
        'gca_passes_live',
        'gca_passes_dead',
        'gca_dribbles',
        'gca_shots',
        'gca_fouled',
        'gca_og_for',
        'assists',
        'xa',
    ]
    return df[creation_columns]

def passes(df):
    """Return only the passing columns of ``df``, in the order listed below."""
    passing_columns = [
        'passes_completed',
        'passes',
        'passes_pct',
        'passes_total_distance',
        'passes_progressive_distance',
        'passes_completed_short',
        'passes_short',
        'passes_pct_short',
        'passes_completed_medium',
        'passes_medium',
        'passes_pct_medium',
        'passes_completed_long',
        'passes_long',
        'passes_pct_long',
        'passes_into_final_third',
        'passes_into_penalty_area',
        'crosses_into_penalty_area',
        'progressive_passes',
        'passes_live',
        'passes_dead',
        'passes_free_kicks',
        'passes_pressure',
        'passes_switches',
        'crosses',
        'corner_kicks',
        'corner_kicks_in',
        'corner_kicks_out',
        'corner_kicks_straight',
        'passes_ground',
        'passes_low',
        'passes_high',
        'passes_left_foot',
        'passes_right_foot',
        'passes_head',
        'throw_ins',
        'passes_other_body',
        'passes_offsides',
        'passes_oob',
        'passes_intercepted',
        'passes_blocked',
    ]
    return df[passing_columns]

def defence(df):
    """Return only the defensive-work columns of ``df``, in the order listed below."""
    defensive_columns = [
        'tackles',
        'tackles_won',
        'tackles_def_3rd',
        'tackles_mid_3rd',
        'tackles_att_3rd',
        'dribble_tackles',
        'dribbles_vs',
        'dribble_tackles_pct',
        'dribbled_past',
        'pressures',
        'pressure_regains',
        'pressure_regain_pct',
        'pressures_def_3rd',
        'pressures_mid_3rd',
        'pressures_att_3rd',
        'blocks',
        'blocked_shots',
        'blocked_shots_saves',
        'blocked_passes',
        'interceptions',
        'clearances',
        'errors',
    ]
    return df[defensive_columns]

def possesion(df):
    """Return only the possession columns of ``df``, in the order listed below."""
    possession_columns = [
        'touches',
        'touches_def_pen_area',
        'touches_def_3rd',
        'touches_mid_3rd',
        'touches_att_3rd',
        'touches_att_pen_area',
        'touches_live_ball',
        'dribbles_completed',
        'dribbles',
        'dribbles_completed_pct',
        'players_dribbled_past',
        'nutmegs',
        'carries',
        'carry_distance',
        'carry_progressive_distance',
        'pass_targets',
        'passes_received_pct',
        'miscontrols',
        'dispossessed',
    ]
    return df[possession_columns]
def skill_parser(df, skill):
    """Narrow ``df`` to the columns of the requested skill group.

    'Possession', 'Shooting', 'Passing', 'Creating' and 'Defensive work' each
    select their own column subset; any other value returns ``df`` unchanged.
    """
    if skill == 'Possession':
        return possesion(df)
    if skill == 'Shooting':
        return shoot(df)
    if skill == 'Passing':
        return passes(df)
    if skill == 'Creating':
        return create(df)
    if skill == 'Defensive work':
        return defence(df)
    # 'Overall' (or anything unrecognised): keep every column.
    return df

import ipywidgets as widgets
import warnings



  
def find(player_name, season, team, skill, number_of_results):
    """Print the player-seasons most similar to the chosen player-season.

    Each scaled player feature is multiplied by its correlation with
    'Points taken' in the games table (only ``team``'s games unless ``team`` is
    "Overall"). The weighted features are narrowed to one skill group, reduced
    with PCA (enough components for 90% of the variance) and ranked by
    Euclidean distance to the query row.

    Parameters
    ----------
    player_name : str
        Value of the 'player' column to search around.
    season : str
        Value of the 'season' column for that player, e.g. '2019-20'.
    team : str
        Value of the games table's 'for' column, or "Overall" to use every game.
    skill : str
        Skill group passed to ``skill_parser``.
    number_of_results : int
        How many similar player-seasons to print.

    Returns
    -------
    None
        The ranking is printed. The intermediate frames are also stored in the
        module-level names declared ``global`` below.
    """
    global df_games
    global df_players
    global df_players_numerical_1
    global df_players_numerical_2
    global principalDf
    global finalDf
    global final

    seasons1 = np.array(df_players[df_players['player'] == player_name]['season'].unique())
    if season not in seasons1:
        print('Check the player played in the corresponding season.')
        return

    df_players_numerical_1 = df_players_numerical.copy()

    # Calculating team correlations.
    # 'xg_for' is renamed to 'xg'; columns with duplicated contents or duplicated
    # names are then dropped so each stat appears once.
    df_games = df_games.rename(columns={'xg_for': 'xg'})
    df_games = df_games.loc[:, ~df_games.T.duplicated(keep='first')]
    df_games = df_games.loc[:, ~df_games.columns.duplicated()]

    games = df_games if team == "Overall" else df_games[df_games['for'] == team]
    # The games table carries text columns ('result', 'for'). pandas 2.x no longer
    # drops those silently in corr(), so restrict to numeric/bool columns explicitly.
    corrMatrix = games.select_dtypes(include=['number', 'bool']).corr()

    # Weight every player feature by how strongly it correlates with points taken.
    points_correlation = corrMatrix['Points taken']
    for column in list(df_players_numerical_1.columns.values):
        df_players_numerical_1[column] = df_players_numerical_1[column] * points_correlation[column]

    # Selecting relevant columns based on the query
    df_players_numerical_2 = skill_parser(df_players_numerical_1, skill)

    # Undefined correlations leave NaNs behind; nan_to_num turns them into zeros for PCA.
    x = np.nan_to_num(df_players_numerical_2.values)

    # Applying PCA
    pca = PCA(.90)
    principalDf = pd.DataFrame(data=pca.fit_transform(x))
    df_players = df_players.reset_index(drop=True)

    # Creating final dataframe with PCA features next to the identifying columns
    id_columns = ['player', 'squad', 'position', 'age', 'season']
    finalDf = pd.concat([principalDf, df_players[id_columns]], axis=1)

    component_columns = list(principalDf.columns.values)
    is_query = (finalDf['player'] == player_name) & (finalDf['season'] == season)
    query_vector = finalDf.loc[is_query, component_columns].iloc[0].values

    # Calculating euclidean distance from every row to the query row
    finalDf['distance'] = (finalDf[component_columns] - query_vector).pow(2).sum(1).pow(0.5)

    # 100% at zero distance, 0% at the 95th-percentile distance (negative beyond it).
    dist95 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100 - (finalDf['distance'] / dist95) * 100

    # Drop the query row explicitly instead of assuming it sorts first: a tie at
    # distance 0 could otherwise put the query player into their own results.
    final = finalDf[~is_query].sort_values('distance').head(number_of_results)
    final = final.reset_index(drop=True)

    print(color.BOLD + 'List of similar players:' + color.END)
    print('\n')
    print(final[['player','squad','position','age','% match','season']])
    
def find2(player_name, season, team, stat, number_of_results):
    """Print the player-seasons most similar to a player-season on hand-picked stats.

    Works like ``find`` but compares only the columns named in ``stat``. Each
    scaled feature is weighted by its correlation with 'Points taken' (only
    ``team``'s games unless ``team`` is "Overall"), reduced with PCA (enough
    components for 90% of the variance) and ranked by Euclidean distance to
    the query row.

    Parameters
    ----------
    player_name : str
        Value of the 'player' column to search around.
    season : str
        Value of the 'season' column for that player, e.g. '2019-20'.
    team : str
        Value of the games table's 'for' column, or "Overall" to use every game.
    stat : sequence of str
        Names of the numerical columns to compare on; must not be empty.
    number_of_results : int
        How many similar player-seasons to print.

    Returns
    -------
    None
        The ranking is printed. The intermediate frames are also stored in the
        module-level names declared ``global`` below.
    """
    global df_games
    global df_players
    global df_players_numerical_1
    global df_players_numerical_2
    global principalDf
    global finalDf
    global final

    print('Stats selected:',stat)
    print('\n')

    seasons1 = np.array(df_players[df_players['player'] == player_name]['season'].unique())
    if season not in seasons1:
        print('Check the player played in the corresponding season.')
        return

    # Checked before any heavy work: nothing can be compared without a stat.
    if not stat:
        print("Choose at least one stat to see output")
        return

    df_players_numerical_1 = df_players_numerical.copy()

    # 'xg_for' is renamed to 'xg'; columns with duplicated contents or duplicated
    # names are then dropped so each stat appears once.
    df_games = df_games.rename(columns={'xg_for': 'xg'})
    df_games = df_games.loc[:, ~df_games.T.duplicated(keep='first')]
    df_games = df_games.loc[:, ~df_games.columns.duplicated()]

    games = df_games if team == "Overall" else df_games[df_games['for'] == team]
    # The games table carries text columns ('result', 'for'). pandas 2.x no longer
    # drops those silently in corr(), so restrict to numeric/bool columns explicitly.
    corrMatrix = games.select_dtypes(include=['number', 'bool']).corr()

    # Weight every player feature by how strongly it correlates with points taken.
    points_correlation = corrMatrix['Points taken']
    for column in list(df_players_numerical_1.columns.values):
        df_players_numerical_1[column] = df_players_numerical_1[column] * points_correlation[column]

    df_players_numerical_2 = df_players_numerical_1[np.array(stat)]

    # Undefined correlations leave NaNs behind; nan_to_num turns them into zeros for PCA.
    x = np.nan_to_num(df_players_numerical_2.values)

    pca = PCA(.90)
    principalDf = pd.DataFrame(data=pca.fit_transform(x))
    df_players = df_players.reset_index(drop=True)

    id_columns = ['player', 'squad', 'position', 'age', 'season']
    finalDf = pd.concat([principalDf, df_players[id_columns]], axis=1)

    component_columns = list(principalDf.columns.values)
    is_query = (finalDf['player'] == player_name) & (finalDf['season'] == season)
    query_vector = finalDf.loc[is_query, component_columns].iloc[0].values

    # Euclidean distance from every row to the query row
    finalDf['distance'] = (finalDf[component_columns] - query_vector).pow(2).sum(1).pow(0.5)

    # 100% at zero distance, 0% at the 95th-percentile distance (negative beyond it).
    dist95 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100 - (finalDf['distance'] / dist95) * 100

    # Drop the query row explicitly instead of assuming it sorts first: a tie at
    # distance 0 could otherwise put the query player into their own results.
    final = finalDf[~is_query].sort_values('distance').head(number_of_results)
    final = final.reset_index(drop=True)

    print(color.BOLD + 'List of similar players:' + color.END)
    print('\n')
    print(final[['player','squad','position','age','% match','season']])
    
def find3(player_name1, season1, player_name2, season2, stat):
    """Print the % match between two player-seasons and draw a radar plot of them.

    The selected stats are weighted by their correlation with 'Points taken'
    over all games and reduced with PCA (enough components for 90% of the
    variance). The Euclidean distance from the first player-season to every
    row is turned into a '% match', and the second player's value is printed.
    The radar plot shows the selected stats of both players, min-max scaled
    over all players.

    Parameters
    ----------
    player_name1, player_name2 : str
        Values of the 'player' column for the two players to compare.
    season1, season2 : str
        Values of the 'season' column for each player, e.g. '2019-20'.
    stat : sequence of str
        Names of the numerical columns to compare on; must not be empty.

    Returns
    -------
    None
        Output is printed and plotted. The intermediate frames are also stored
        in the module-level names declared ``global`` below.
    """
    global df_games
    global df_players
    global df_players_numerical_1
    global df_players_numerical_2
    global principalDf
    global finalDf
    global final

    print('Stats selected:',stat)
    print('\n')

    seasons1 = np.array(df_players[df_players['player'] == player_name1]['season'].unique())
    seasons2 = np.array(df_players[df_players['player'] == player_name2]['season'].unique())
    if (season1 not in seasons1) or (season2 not in seasons2):
        print('Check if both players played in the corresponding seasons.')
        return

    # Checked once, before any heavy work; both the match score and the radar
    # plot need at least one stat.
    if not stat:
        print("Choose at least one stat to see output")
        return

    df_players_numerical_1 = df_players_numerical.copy()

    # 'xg_for' is renamed to 'xg'; columns with duplicated contents or duplicated
    # names are then dropped so each stat appears once.
    df_games = df_games.rename(columns={'xg_for': 'xg'})
    df_games = df_games.loc[:, ~df_games.T.duplicated(keep='first')]
    df_games = df_games.loc[:, ~df_games.columns.duplicated()]

    # The games table carries text columns ('result', 'for'). pandas 2.x no longer
    # drops those silently in corr(), so restrict to numeric/bool columns explicitly.
    corrMatrix = df_games.select_dtypes(include=['number', 'bool']).corr()

    # Weight every player feature by how strongly it correlates with points taken.
    points_correlation = corrMatrix['Points taken']
    for column in list(df_players_numerical_1.columns.values):
        df_players_numerical_1[column] = df_players_numerical_1[column] * points_correlation[column]

    df_players_numerical_2 = df_players_numerical_1[np.array(stat)]

    # Undefined correlations leave NaNs behind; nan_to_num turns them into zeros for PCA.
    x = np.nan_to_num(df_players_numerical_2.values)

    pca = PCA(.90)
    principalDf = pd.DataFrame(data=pca.fit_transform(x))
    df_players = df_players.reset_index(drop=True)

    id_columns = ['player', 'squad', 'position', 'age', 'season']
    finalDf = pd.concat([principalDf, df_players[id_columns]], axis=1)

    component_columns = list(principalDf.columns.values)
    is_first = (finalDf['player'] == player_name1) & (finalDf['season'] == season1)
    query_vector = finalDf.loc[is_first, component_columns].iloc[0].values

    # Euclidean distance from every row to the first player's row
    finalDf['distance'] = (finalDf[component_columns] - query_vector).pow(2).sum(1).pow(0.5)

    # 100% at zero distance, 0% at the 95th-percentile distance (negative beyond it).
    dist95 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100 - (finalDf['distance'] / dist95) * 100

    final = finalDf.sort_values('distance').reset_index(drop=True)

    is_second = (final['player'] == player_name2) & (final['season'] == season2)
    print(color.BOLD + '% match between these two players:' + color.END, final[is_second]['% match'].iloc[0])
    print('\n')

    # --- Radar plot of the selected stats, min-max scaled over all players ---
    stats = df_players.loc[:, list(stat)].copy()
    scaler = MinMaxScaler()
    for column in list(stats.columns.values):
        stats[column] = scaler.fit_transform(stats[[column]])

    row_1 = (df_players['player'] == player_name1) & (df_players['season'] == season1)
    row_2 = (df_players['player'] == player_name2) & (df_players['season'] == season2)
    stats_1_num = stats[row_1].iloc[0].values
    stats_2_num = stats[row_2].iloc[0].values

    # One spoke per stat; the first point is repeated at the end so each polygon closes.
    spoke_angles = np.linspace(0, 2*np.pi, stats.shape[1], endpoint=False)
    closed_angles = np.concatenate((spoke_angles, [spoke_angles[0]]))
    stats_1_new = np.concatenate((stats_1_num, [stats_1_num[0]]))
    stats_2_new = np.concatenate((stats_2_num, [stats_2_num[0]]))

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111, polar=True)
    ax.plot(closed_angles, stats_1_new, 'o-', linewidth=2)
    ax.fill(closed_angles, stats_1_new, 'teal', alpha=0.25)
    ax.plot(closed_angles, stats_2_new, 'o-', linewidth=2)
    ax.fill(closed_angles, stats_2_new, 'red', alpha=0.25)
    # Label the spokes with the un-closed angles: passing the closed array gives
    # one more tick than there are labels, which current matplotlib rejects.
    ax.set_thetagrids(spoke_angles * 180/np.pi, stats.columns)
    title_name = "Plot of " + player_name1 + " in blue vs " +  player_name2 + " in red"
    ax.set_title(title_name)
    plt.show()
    

by Parth Athale (@ParthAthale)

Data credits to FBref/StatsBomb
Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/
Read code here: https://github.com/parth1902/PCA_Player_Finder

Some examples as a guide to do this:
If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall
If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall
If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work


In [2]:
#HIDDEN
# Widget front-end for find(): dropdowns for every argument, 20 to 99 results.
interact(
    find,
    player_name=players,
    season=['2019-20', '2018-19', '2017-18'],
    team=teams,
    skill=['Overall', 'Passing', 'Creating', 'Shooting', 'Defensive work', 'Possession'],
    number_of_results=range(20, 100),
);



interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…

In [3]:
#HIDDEN
# Widget front-end for find2(): the stats are picked in a 15-row multi-select list.
interact(
    find2,
    player_name=players,
    season=['2019-20', '2018-19', '2017-18'],
    team=teams,
    stat=widgets.SelectMultiple(options=numerical_columns, rows=15),
    number_of_results=range(20, 100),
);



interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…

In [4]:
#HIDDEN
# Widget front-end for find3(): two player/season pairs plus a 15-row multi-select of stats.
interact(
    find3,
    player_name1=players,
    season1=['2019-20', '2018-19', '2017-18'],
    player_name2=players,
    season2=['2019-20', '2018-19', '2017-18'],
    stat=widgets.SelectMultiple(options=numerical_columns, rows=15),
);


interactive(children=(Dropdown(description='player_name1', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaro…