In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# IPython magic: select matplotlib's inline backend so figures are shown in the notebook output.
%matplotlib inline
sns.set_style('darkgrid') # set the grid style for the seaborn plots

In [None]:
# Show floats with five fixed decimals instead of scientific notation in pandas output.
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# Load the player-stats workbook; the first spreadsheet column is used as the initial index.
stats_workbook = 'EPL Player Stats All Time.xlsx'
df = pd.read_excel(stats_workbook, index_col=0)

In [None]:
# Index the rows by Player_ID while keeping Player_ID available as a regular column
# (drop=False), then replace every missing value with 0.
df = df.set_index('Player_ID', drop=False).fillna(0)

# Keep only players with at least 0.01 in '90s Played', i.e. drop players who have no appearances.
has_appearances = df['90s Played'] >= 0.01
df = df.loc[has_appearances]

In [None]:
#df.columns.values.tolist()

In [None]:
#salaries['Base Salary'] = salaries['Base Salary'].apply(lambda salary: int(salary[1:].replace(',', '')))
#salaries['Guaranteed Compensation'] = salaries['Guaranteed Compensation'].apply(lambda salary: int(salary[1:].replace(',', '')))
#salaries = salaries.sort_values('Date').groupby('Player').last()
#salaries.rename(columns = {'Base Salary': 'Base Salary (USD)', 'Guaranteed Compensation': 'Guaranteed Compensation (USD)'}, inplace = True)
#salaries.reset_index(inplace = True) # reset the index

In [None]:
# Columns used for clustering: the player identifier plus three performance metrics.
feature_columns = ['Non-Penalty xG', 'xA', 'Dribbles Completed']

# .copy() makes dft an independent frame rather than a slice of df, so columns can be
# added to it later (e.g. cluster labels) without pandas raising SettingWithCopyWarning.
dft = df[['Player_ID'] + feature_columns].copy()

# Feature matrix: the same rows without the identifier, which must not be scaled or clustered.
df1 = dft.drop(columns=['Player_ID'])

In [None]:
# Feature scaling
#
# StandardScaler is used with its default settings. fit() learns each column's mean and
# standard deviation from df1; transform() then applies that standardisation.
scaler = StandardScaler().fit(df1)
scaled_features = scaler.transform(df1)

# Wrap the scaled array in a DataFrame that carries the original feature names.
scaled_data = pd.DataFrame(scaled_features, columns=df1.columns)
scaled_data.head(3)

In [None]:
# KMeans clustering
#
# max_iter=300 caps the number of iterations of a single k-means run.
# n_init=10 runs the algorithm 10 times with different centroid seeds and keeps the
# run with the lowest inertia. Both values are passed explicitly rather than relying on
# library defaults, because the default for n_init changed to 'auto' in scikit-learn 1.4.
# random_state fixes the centroid initialisation, so cluster membership and the
# numbering of the clusters are the same on every run of the notebook.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=300, random_state=42)
kmeans.fit(X=scaled_data)

# labels_ holds one cluster id per row of scaled_data, in the same row order as dft.
# assign() returns a new frame instead of writing into dft in place, which avoids
# SettingWithCopyWarning when dft is a slice of another frame, and makes re-running
# this cell simply replace the column.
dft = dft.assign(**{'Cluster Label': kmeans.labels_})

In [None]:
# Player_ID values of the rows assigned to each of the three clusters (labels 0, 1, 2).
cluster_0, cluster_1, cluster_2 = (
    dft.loc[dft['Cluster Label'] == cluster_id, 'Player_ID']
    for cluster_id in range(3)
)

In [None]:
# Print the members of every cluster under its own heading. A single print call with
# sep='\n' emits each argument on its own line, exactly like one print() per item.
print(
    '\nCluster 0 players:\n',
    cluster_0,
    '\n\n',
    'Cluster 1 players:\n',
    cluster_1,
    '\n\n',
    'Cluster 2 players:\n',
    cluster_2,
    sep='\n',
)

In [None]:
# Per-cluster mean of every metric column.
#
# Player_ID is an identifier, not a metric, so it is removed before averaging: its mean
# is meaningless if the IDs are numeric, and if they are non-numeric DataFrame.mean()
# raises a TypeError on pandas >= 2.0 (NOTE: dtype of Player_ID is not visible here).
cluster_metrics = dft.drop(columns=['Player_ID'])

cluster_0_avg, cluster_1_avg, cluster_2_avg = (
    cluster_metrics[cluster_metrics['Cluster Label'] == cluster_id].mean()
    for cluster_id in range(3)
)

In [None]:
# Side-by-side comparison table: one column per cluster, one row per averaged quantity.
summary_columns = ['cluster 0 avg', 'cluster 1 avg', 'cluster 2 avg']
cluster_means = [cluster_0_avg, cluster_1_avg, cluster_2_avg]

avg_metric_values = pd.DataFrame(dict(zip(summary_columns, cluster_means)))
avg_metric_values