## Topological Analysis of Premier League Players

In [62]:
import pandas as pd
import numpy as np
import kmapper as km
import sklearn
from plotly.offline import init_notebook_mode, iplot
import igraph as ig

# Fix the global NumPy RNG so stochastic steps repeat on a fresh top-to-bottom run
SEED = 1234
np.random.seed(SEED)

# Switch plotly's offline notebook mode on before any figure is drawn
init_notebook_mode(connected=True)

In [63]:
%%capture
df=pd.read_csv("UpdatedPlayerStats.csv", encoding='ANSI')

# Filter out players who didn't play much
df = df[df['MinutesPlayed'] > 15]

In [64]:
# List the columns of the loaded frame to see which stats are available
df.columns

Index(['Points', 'MinutesPlayed', 'Goals', 'Assists', 'CleanSheet', 'Bonus',
       'YellowCards', 'RedCards', 'Crosses', 'BigChancesCreated',
       'ClearancesBlocksIntercepts', 'Recoveries', 'KeyPasses', 'Tackles',
       'AttemptedPasses', 'PassesCompleted', 'BigChancesMissed',
       'ErrorsToGoal', 'ErrorsToGoalAttempt', 'Tackled', 'Offside', 'Fouls',
       'Dribbles', 'Identifier'],
      dtype='object')

In [65]:
# Drop the mins played - want to base players on their performance rather than time on the pitch.
# NOTE(review): CleanSheet is dropped as well; presumably because it is not an
# individual performance stat - confirm the intent.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['MinutesPlayed', 'CleanSheet'], errors='ignore')

# Add some derived stats
# Rows with zero attempted passes give 0/0 = NaN; report those as 0 accuracy.
df = df.assign(
    PassAccuracy=(df['PassesCompleted'] / df['AttemptedPasses']).replace(np.nan, 0)
)

In [66]:
# Collapse the rows to one row per Identifier by averaging every stat.
# The built-in .mean() is what aggregate(np.mean) dispatched to; calling it
# directly avoids the FutureWarning recent pandas raises for the NumPy callable.
g = df.groupby('Identifier')
df2 = g.mean()
df2.head()

Unnamed: 0_level_0,Points,Goals,Assists,Bonus,YellowCards,RedCards,Crosses,BigChancesCreated,ClearancesBlocksIntercepts,Recoveries,...,AttemptedPasses,PassesCompleted,BigChancesMissed,ErrorsToGoal,ErrorsToGoalAttempt,Tackled,Offside,Fouls,Dribbles,PassAccuracy
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aarons-Newcastle-MID,1.25,0.0,0.0,0.0,0.0,0.0,0.865385,0.0,0.321429,6.84169,...,32.159341,25.119849,0.0,0.0,0.0,6.520261,0.321429,0.321429,2.989698,0.697161
Abraham-Swansea-FWD,3.26087,0.26821,0.206522,0.623377,0.0,0.0,0.050819,0.086957,0.491588,2.095963,...,19.042641,12.30239,0.134677,0.0,0.0,3.949741,0.488636,0.924908,1.40105,0.687141
Adam Smith-Bournemouth-DEF,2.16,0.08,0.12,0.12,0.272934,0.0,0.528649,0.04,4.012509,4.706673,...,39.81294,32.983725,0.0,0.0,0.0,1.312934,0.04,1.12953,1.4,0.803334
Adam-Stoke-MID,1.142857,0.0,0.0,0.0,1.017857,0.44335,1.030612,0.762755,5.716309,8.797942,...,57.5614,37.541608,0.0,0.142857,0.803571,0.92602,0.0,2.609166,2.258621,0.609755
Adrián-West Ham-GK,3.2,0.0,0.0,0.066667,0.133333,0.0,0.0,0.0,1.266667,9.133333,...,27.866667,12.266667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.446985


In [67]:
# Drop fantasy related columns - we only want to cluster players based on their in-game performance
fantasy_columns = ('Points', 'Bonus')
performance_columns = [col for col in df2.columns if col not in fantasy_columns]
X = df2[performance_columns]

# Remember the identifiers, then switch to a plain 0..n-1 index so that
# positions and labels coincide
names = X.index.values
X.index = list(range(X.shape[0]))

# Get averages for each stat (column-wise mean and population standard deviation)
feature_values = X.values
means = feature_values.mean(axis=0)
std_dev = feature_values.std(axis=0)

In [68]:
# Initialise mapper and create lens using TSNE
mapper = km.KeplerMapper(verbose=0)

# Seed t-SNE explicitly (same seed as the KMeans clusterer below) so that
# re-running this cell reproduces the same lens instead of depending on
# whatever state the global NumPy RNG happens to be in.
lens = mapper.fit_transform(X.values,
                            projection=sklearn.manifold.TSNE(random_state=1234),
                            scaler=None)

# Create the graph of the nerve of the corresponding pullback
# NOTE(review): nr_cubes / overlap_perc are the legacy cover keywords; newer
# kmapper releases expect cover=km.Cover(...) - confirm against the installed version.
graph = mapper.map(lens, X.values,
                   clusterer=sklearn.cluster.KMeans(n_clusters=2, random_state=1234),
                   nr_cubes=20, overlap_perc=0.9)

In [69]:
def get_cluster_summary(player_list, average_mean, average_std, dataset, columns, top_n=5):
    """Build an HTML tooltip describing how a cluster differs from the overall average.

    Parameters
    ----------
    player_list : sequence of int
        Positional row indices (into ``dataset``) of the cluster members.
    average_mean : array-like
        Per-column mean to compare the cluster against.
    average_std : array-like
        Per-column standard deviation used to scale the differences.
    dataset : pandas.DataFrame
        Frame holding one row per player and one column per stat.
    columns : iterable
        Column labels, in the same order as the columns of ``dataset``.
    top_n : int, optional
        Maximum number of stats listed in each section (default 5).

    Returns
    -------
    str
        ``'Above Mean:<br>...<br><br>Below Mean:<br>...'`` where each entry is
        ``'<column>: <cluster mean>'`` and both sections list the stats that
        deviate most (in standard deviations) from the average, largest first.
    """
    cluster_mean = np.mean(dataset.iloc[player_list].values, axis=0)
    diff = cluster_mean - average_mean

    # Size of the deviation in units of standard deviation. Columns with zero
    # spread get 0 rather than NaN/inf, which would otherwise corrupt the sort.
    spread = np.asarray(average_std, dtype=float)
    std_m = np.divide(np.abs(diff), spread,
                      out=np.zeros(spread.shape, dtype=float),
                      where=spread > 0)

    # Most unusual stats first
    stats = sorted(zip(columns, cluster_mean, average_mean, diff, std_m),
                   key=lambda row: row[4], reverse=True)
    above_stats = [f'{row[0]}: {row[1]:.2f}' for row in stats if row[3] > 0]
    below_stats = [f'{row[0]}: {row[1]:.2f}' for row in stats if row[3] < 0]

    # Create a string summary for the tooltips. Both lists are ordered by
    # decreasing deviation, so the leading entries are the most notable ones.
    cluster_summary = 'Above Mean:<br>' + '<br>'.join(above_stats[:top_n]) + \
                      '<br><br>Below Mean:<br>' + '<br>'.join(below_stats[:top_n])

    return cluster_summary

In [70]:
# Get the players per cluster and overall cluster stats
node_dict = {}    # node id -> names of the players in that node
node_list = []    # node ids in a fixed order; the position is the igraph vertex index
node_stats = []   # tooltip summary per node, aligned with node_list
for node, members in graph['nodes'].items():
    node_list.append(node)
    node_dict[node] = [names[i] for i in members]
    node_stats.append(get_cluster_summary(members, means, std_dev, X, X.columns))

# Look each node's position up once instead of scanning node_list for every edge
node_index = {node: idx for idx, node in enumerate(node_list)}

# Add the edges to a list for passing into iGraph:
# Need to base everything on indices for igraph
edge_list = [
    (node_index[node], node_index[nbr])
    for node, nbrs in graph['links'].items()
    for nbr in nbrs
]

In [71]:
# One igraph vertex per mapper node, in node_list order
n_nodes = len(node_list)
G = ig.Graph(n_nodes)
G.add_edges(edge_list)

# Mean 'Points' of the members of each node (used to colour the markers)
points_column = df2['Points'].values
avg_points = [
    np.average([points_column[i] for i in graph['nodes'][name]])
    for name in node_list
]

# Vertex size = number of players in the node
G.vs['size'] = [len(node_dict[name]) for name in node_list]

links = G.get_edgelist()
layt = G.layout('kk')

In [72]:
N = len(layt)
node_positions = [layt[k] for k in range(N)]
Xnodes = [pos[0] for pos in node_positions]  # x-coordinates of nodes
Ynodes = [pos[1] for pos in node_positions]  # y-coordinates of nodes

# Each edge contributes (start, end, None); the None breaks the line so that
# separate edges are not joined together
Xedges = []
Yedges = []
for src, dst in links:
    Xedges += [layt[src][0], layt[dst][0], None]
    Yedges += [layt[src][1], layt[dst][1], None]
    
# Thin grey line segments for the graph edges; no hover text on edges
edges_trace = dict(type='scatter',
                   x=Xedges,
                   y=Yedges,
                   mode='lines',
                   line=dict(color='rgb(200,200,200)',
                             width=0.5),
                   hoverinfo='none')

# One marker per node: coloured by the node's average points, sized by the
# number of players it contains. 'circle' is used because 'dot' is not one of
# plotly's scatter marker symbols and is rejected by figure validation.
nodes_trace = dict(type='scatter',
                   x=Xnodes,
                   y=Ynodes,
                   mode='markers',
                   opacity=0.8,
                   marker=dict(symbol='circle',
                               colorscale='Viridis',
                               showscale=True,
                               reversescale=False,
                               color=avg_points,
                               size=G.vs['size'],
                               line=dict(color='rgb(200,200,200)',
                                         width=0.5),
                               colorbar=dict(thickness=20,
                                             ticklen=4)),
                   text=[],  # tooltips are filled in by a later cell
                   hoverinfo='text')

# Shared axis settings: hide lines, grid, ticks and titles so that only the
# graph itself is drawn
axis = {
    'showline': False,
    'zeroline': False,
    'showgrid': False,
    'showticklabels': False,
    'title': '',
}

# Fixed-size square figure on a dark background; each axis gets its own copy
# of the shared settings
layout = {
    'title': 'Test',
    'font': {'size': 12},
    'showlegend': False,
    'autosize': False,
    'width': 700,
    'height': 700,
    'xaxis': dict(axis),
    'yaxis': dict(axis),
    'hovermode': 'closest',
    'plot_bgcolor': 'rgba(20,20,20, 0.8)',
}

In [73]:
# Add tooltips
nodes = G.vs.indices
div = '<br>-------<br>'

sizes = []   # number of players per node, in vertex order
trace = []   # tooltip text per node, in vertex order
for node in nodes:
    node_name = node_list[node]
    players = node_dict[node_name]
    sizes.append(len(players))
    # Tooltip = node id, then the member names, then the cluster summary
    trace.append(node_name + div + '<br>'.join(players) + div + node_stats[node])

# Assign a fresh list rather than appending to the existing one, so that
# re-running this cell does not duplicate the tooltips on the trace
nodes_trace['text'] = list(trace)

In [74]:
# Edges are listed first so that the node markers are drawn on top of them
figure = {'data': [edges_trace, nodes_trace], 'layout': layout}
iplot(figure)