In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ScraperFC as sfc
from   sklearn.decomposition import PCA
import traceback

Run this cell if it's your first time using this notebook. Otherwise leave it commented out.

In [2]:
# One-time scrape: fetch the EPL 2022 scouting reports via ScraperFC's FBRef scraper and
# cache both returned frames as pickles under the filenames the next cell reads.
# NOTE(review): the bare `except:` below also catches KeyboardInterrupt/SystemExit;
# `except Exception:` is probably what is wanted. If the scrape fails, `per90`/`perc`
# are never assigned, so the two `to_pickle` lines would then raise NameError.
# scraper = sfc.FBRef()
# try:
#     per90, perc = scraper.scrape_complete_scouting_reports(year=2022, league='EPL')
# except:
#     traceback.print_exc()
# finally:
#     scraper.close()
# per90.to_pickle('epl2022_reports_per90.pkl')
# perc.to_pickle('epl2022_reports_perc.pkl')

In [3]:
# Load the cached per-90 and percentile scouting reports from their pickle files.
per90, perc = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('epl2022_reports_per90.pkl', 'epl2022_reports_perc.pkl')
)

The function below will find the 5 players most similar to the chosen player. To find the players that are "closest" to the chosen player in the data space we will use a norm, which is a way to measure how long a vector is.

The most common norm is the 2-norm, or the Euclidean norm. This is just the square root of the sum of the squares of every element in a vector: $\|x\|_2 = \sqrt{\sum_i x_i^2}$. Think about calculating the length of the hypotenuse of a right triangle. You square the lengths of the two legs (the sides that meet at the right angle), add them together, and take the square root of that. This is the 2-norm.

Another common norm is the 1-norm, which is just the sum of the absolute values of all elements in a vector. This is just another way to measure distance in the data space.

So, to calculate the distance between two players we subtract their 2 data vectors (or rows of stats in the dataframe), and then compute the norm of that resulting vector. Players with smaller norms are more similar to the player of interest.

The plot that gets output isn't a perfect representation of the norms between all of the players and the player of interest. To display the graph, each player's stats had to be reduced to 2 dimensions (done using PCA), so distances on the plot only approximate the distances in the full data space. But it's a good way to approximately visualize the results.

In [12]:
def _plot_similar_players(reduced, player_index, neighbour_index, names, player_name, subtitle):
    """Scatter all players in PCA space, highlighting the target (red) and its neighbours."""
    plt.figure(figsize=[8, 8])
    plt.scatter(reduced['PC1'], reduced['PC2'], color='white', edgecolor='black', alpha=0.5)
    plt.scatter(reduced.loc[player_index, 'PC1'], reduced.loc[player_index, 'PC2'], color='red')
    palette = ['blue', 'orange', 'yellow', 'salmon', 'green']
    # zip() stops at the shorter sequence, so fewer than five neighbours is fine.
    for idx, color in zip(neighbour_index, palette):
        plt.scatter(reduced.at[idx, 'PC1'],
                    reduced.at[idx, 'PC2'],
                    color=color,
                    label=names.at[idx])
    plt.legend()
    plt.suptitle(subtitle, y=0.93)
    plt.title('Players most similar to {} (in red)'.format(player_name))
    plt.xlabel('1st PC')
    plt.ylabel('2nd PC')
    plt.show()


def similar_players(player_name, per90, same_position=True, norm=2):
    """Find and plot the (up to) five players statistically closest to ``player_name``.

    Columns of ``per90`` containing any NaN are ignored. The distance between two
    players is the vector norm of the difference of their remaining stat rows
    (every column except 'Player', 'Position' and 'Minutes'). A 2-component PCA of
    the mean-centred stats is used only to draw the scatter plot.

    Parameters
    ----------
    player_name : str
        Value to look up in the 'Player' column. If several rows match, the first
        one is used.
    per90 : pandas.DataFrame
        Must contain 'Player', 'Position' and 'Minutes' columns; all other columns
        are treated as numeric stats. The frame is not modified.
    same_position : bool, default True
        If True, only compare against rows with the same 'Position' value.
    norm : default 2
        Passed as ``ord`` to ``numpy.linalg.norm`` (e.g. 1 or 2).

    Returns
    -------
    pandas.DataFrame
        The chosen player's row followed by the nearest players ordered by
        increasing distance, with columns 'Player', 'Position', 'Minutes',
        'Distance' and then the stats, on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``player_name`` does not appear in the 'Player' column.
    """
    # Drop columns with NaNs. The index is renumbered so that the label-based lookups
    # below are unambiguous even if the input index has duplicate labels; the returned
    # frame gets a fresh index anyway.
    df = per90.dropna(axis='columns', how='any').reset_index(drop=True)

    # Get the player's details
    matches = df[df['Player'] == player_name]
    if matches.empty:
        raise ValueError('Player {!r} not found in the report'.format(player_name))
    player_index = matches.index[:1]  # kept as an Index so .loc returns a 1-row frame
    pos = matches['Position'].iloc[0]

    # Only players in same position
    if same_position:
        df = df[df['Position'] == pos]

    # Separate the identifying columns from the stats used for the comparison
    meta_cols = ['Player', 'Position', 'Minutes']
    meta = df[meta_cols]
    stats = df.drop(columns=meta_cols)

    # Center about zero
    centered = stats.subtract(stats.mean(axis=0), axis=1)

    # PCA reduction (for plotting only)
    reduced = pd.DataFrame(data=PCA(n_components=2).fit_transform(centered),
                           columns=['PC1', 'PC2'],
                           index=centered.index)

    # Distance of every player from the chosen player in the full stat space
    offsets = centered.to_numpy() - centered.loc[player_index].to_numpy()
    distances = pd.Series(np.linalg.norm(offsets, ord=norm, axis=1),
                          index=centered.index,
                          name='Distance')

    # Un-centred stats with the identifying columns and the distance in front
    result = pd.concat([meta, distances, stats], axis=1)

    # Nearest players: remove the target explicitly instead of assuming it sorts first,
    # so a player with identical stats cannot push the target into its own results.
    neighbour_index = distances.drop(index=player_index).sort_values(kind='stable').index[:5]

    _plot_similar_players(reduced,
                          player_index,
                          neighbour_index,
                          meta['Player'],
                          player_name,
                          pos if same_position else 'All positions')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([result.loc[player_index], result.loc[neighbour_index]]).reset_index(drop=True)

Now, let's use the function and see which attacking midfielders/wingers (same_position=True) are most similar to Bruno Fernandes. We'll use the 2-norm.

In [None]:
# Bruno Fernandes against players in his own position, compared with norm=2.
top5 = similar_players(
    player_name='Bruno Fernandes',
    per90=per90,
    same_position=True,
    norm=2,
)
top5

What about across all positions?

In [None]:
# Same comparison for Bruno Fernandes, but without the position filter.
top5 = similar_players(
    player_name='Bruno Fernandes',
    per90=per90,
    same_position=False,
    norm=2,
)
top5

In [None]:
# Abdoulaye Doucoure against all positions, this time compared with norm=1.
top5 = similar_players(
    player_name='Abdoulaye Doucoure',
    per90=per90,
    same_position=False,
    norm=1,
)
top5