# PCA Aplicado a Jogadores de Futebol

### Importando as bibliotecas necessárias

In [178]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from scipy.spatial import distance_matrix

### Lendo o dataframe

In [179]:
# Load the raw per-player statistics; the CSV's first column becomes the row index.
playersStats = pd.read_csv(
    'statsPlayers.csv',
    index_col=0,
)

In [180]:
# Rich-render the freshly loaded table to eyeball its columns and a few rows.
display(playersStats)

Unnamed: 0,goals,expectedGoals,bigChancesMissed,successfulDribbles,totalShots,shotsOnTarget,shotsOffTarget,blockedShots,goalConversionPercentage,penaltiesTaken,...,team__shortName,team__id,team__teamColors__primary,team__teamColors__secondary,team__teamColors__text,team__fieldTranslations__nameTranslation__ar,team__fieldTranslations__nameTranslation__ru,team__userCount,team__type,position
0,7.0,3.34,2.0,21.0,58.0,20.0,15.0,23.0,12.07,0.0,...,Arsenal,42.0,#52b030,#52b030,#ffffff,أرسنال,Арсенал,0.0,0.0,MC
1,1.0,2.62,4.0,16.0,25.0,5.0,15.0,5.0,4.00,0.0,...,Everton,48.0,#52b030,#52b030,#ffffff,إيفرتون,Эвертон,0.0,0.0,DC
2,0.0,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,Manchester United,35.0,#52b030,#52b030,#ffffff,مانتشستر يونايتد,Манчестер Юнайтед,0.0,0.0,GK
3,0.0,0.00,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,Everton,48.0,#52b030,#52b030,#ffffff,إيفرتون,Эвертон,0.0,0.0,GK
4,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,Fulham,43.0,#52b030,#52b030,#ffffff,فولهام,Фулхэм,0.0,0.0,GK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2772,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,RB Leipzig,36360.0,#374df5,#374df5,#ffffff,آر بي لايبزيغ,РБ Лейпциг,0.0,0.0,DC
2773,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,RB Leipzig,36360.0,#374df5,#374df5,#ffffff,آر بي لايبزيغ,РБ Лейпциг,0.0,0.0,empty
2774,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,SV Werder Bremen,2534.0,#374df5,#374df5,#ffffff,فيردر بريمن,Вердер Бремен,0.0,0.0,empty
2775,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,FC Augsburg,2600.0,#374df5,#374df5,#ffffff,إف سي أوجسبورغ,Аугсбург,0.0,0.0,empty


### Removendo goleiros

In [181]:
# Keep every row whose position code is anything other than 'GK'.
playersStats = playersStats.loc[playersStats['position'] != 'GK']

### Removendo jogadores com menos da metade do máximo de minutos jogados

In [182]:
# Reference point: the largest minutes total among the remaining players.
# Only the 'minutesPlayed' column is reduced (missing values are ignored), so
# text columns are never compared and a single NaN cannot turn the threshold
# into NaN, which would silently disable the filter below.
maxMinutesPlayed = playersStats['minutesPlayed'].max()

# Keep players with at least half of that total. Rows whose minutes are
# missing fail the comparison and are dropped as well, since they cannot be
# converted to per-90 rates later on.
playersStats = playersStats[playersStats['minutesPlayed'] >= maxMinutesPlayed / 2]

### Separando dados importantes

In [184]:
# List the column names available in the filtered table.
print(playersStats.columns)

Index(['goals', 'expectedGoals', 'bigChancesMissed', 'successfulDribbles',
       'totalShots', 'shotsOnTarget', 'shotsOffTarget', 'blockedShots',
       'goalConversionPercentage', 'penaltiesTaken', 'penaltyGoals',
       'freeKickGoal', 'penaltyWon', 'shotFromSetPiece',
       'goalsFromInsideTheBox', 'goalsFromOutsideTheBox', 'headedGoals',
       'leftFootGoals', 'rightFootGoals', 'hitWoodwork', 'offSides',
       'penaltyConversion', 'setPieceConversion', 'tackles', 'interceptions',
       'penaltyConceded', 'clearances', 'errorLeadToGoal', 'errorLeadToShot',
       'ownGoals', 'dribbledPast', 'cleanSheet', 'bigChancesCreated',
       'assists', 'accuratePasses', 'inaccuratePasses', 'totalPasses',
       'accuratePassesPercentage', 'accurateOwnHalfPasses',
       'accurateOppositionHalfPasses', 'accurateFinalThirdPasses', 'keyPasses',
       'accurateCrosses', 'accurateCrossesPercentage', 'accurateLongBalls',
       'accurateLongBallsPercentage', 'passToAssist', 'yellowCards',
   

In [185]:
# Pull out the label columns (player, club, position) before they are removed
# from the numeric table; they keep the same row index as playersStats.
playersName, playersClub, playersPosition = (
    playersStats[label] for label in ('player__name', 'team__name', 'position')
)

### Removendo colunas não utilizadas no PCA

In [186]:
# Positions of the first and last column of the contiguous block to discard.
nameIndex = playersStats.columns.get_loc('player__name')
lastIndex = playersStats.columns.get_loc('position')

# Drop every column from 'player__name' through 'position', both inclusive.
playersStats = playersStats.drop(columns=playersStats.columns[nameIndex:lastIndex + 1])

In [187]:
# Show which columns remain after the drop above.
playersStats.columns

Index(['goals', 'expectedGoals', 'bigChancesMissed', 'successfulDribbles',
       'totalShots', 'shotsOnTarget', 'shotsOffTarget', 'blockedShots',
       'goalConversionPercentage', 'penaltiesTaken', 'penaltyGoals',
       'freeKickGoal', 'penaltyWon', 'shotFromSetPiece',
       'goalsFromInsideTheBox', 'goalsFromOutsideTheBox', 'headedGoals',
       'leftFootGoals', 'rightFootGoals', 'hitWoodwork', 'offSides',
       'penaltyConversion', 'setPieceConversion', 'tackles', 'interceptions',
       'penaltyConceded', 'clearances', 'errorLeadToGoal', 'errorLeadToShot',
       'ownGoals', 'dribbledPast', 'cleanSheet', 'bigChancesCreated',
       'assists', 'accuratePasses', 'inaccuratePasses', 'totalPasses',
       'accuratePassesPercentage', 'accurateOwnHalfPasses',
       'accurateOppositionHalfPasses', 'accurateFinalThirdPasses', 'keyPasses',
       'accurateCrosses', 'accurateCrossesPercentage', 'accurateLongBalls',
       'accurateLongBallsPercentage', 'passToAssist', 'yellowCards',
   

### Transformando as estatísticas em estatísticas por 90 minutos

In [188]:
# Columns that are NOT rescaled to a per-90 basis: the percentage columns plus
# the playing-time counters themselves. ('appearances' used to be listed twice;
# membership is unchanged.)
percentageStats = (
    'goalConversionPercentage',
    'accuratePassesPercentage',
    'accurateCrossesPercentage',
    'accurateLongBallsPercentage',
    'groundDuelsWonPercentage',
    'aerialDuelsWonPercentage',
    'totalDuelsWonPercentage',
    'appearances',
    'matchesStarted',
    'minutesPlayed',
)

# Every other column is treated as a count to be scaled. Built in the frame's
# own column order so the list (and the printout) is identical on every run;
# a set difference would iterate in an arbitrary, run-dependent order.
# NOTE(review): 'penaltyConversion' and 'setPieceConversion' are not excluded
# and are therefore scaled per 90 -- confirm they are counts rather than rates.
numericalStats = [column for column in playersStats.columns if column not in percentageStats]

print(numericalStats)

['goals', 'accurateLongBalls', 'errorLeadToGoal', 'redCards', 'groundDuelsWon', 'leftFootGoals', 'penaltyGoals', 'bigChancesCreated', 'yellowCards', 'dispossessed', 'inaccuratePasses', 'ownGoals', 'headedGoals', 'interceptions', 'setPieceConversion', 'penaltyWon', 'rightFootGoals', 'totalShots', 'bigChancesMissed', 'accuratePasses', 'shotsOffTarget', 'penaltyConceded', 'keyPasses', 'totalPasses', 'shotsOnTarget', 'accurateFinalThirdPasses', 'aerialDuelsWon', 'goalsFromOutsideTheBox', 'goalsFromInsideTheBox', 'tackles', 'shotFromSetPiece', 'totalDuelsWon', 'penaltyConversion', 'cleanSheet', 'successfulDribbles', 'passToAssist', 'fouls', 'accurateOwnHalfPasses', 'accurateOppositionHalfPasses', 'penaltiesTaken', 'accurateCrosses', 'offSides', 'errorLeadToShot', 'expectedGoals', 'clearances', 'freeKickGoal', 'assists', 'wasFouled', 'possessionLost', 'hitWoodwork', 'blockedShots', 'dribbledPast']


In [189]:
# Work on a copy so the raw totals in playersStats stay untouched.
playersStatsPerMin = playersStats.copy()

# Row-wise division: each player's counts divided by that player's minutes.
playersStatsPerMin[numericalStats] = (
    playersStats[numericalStats]
    .divide(playersStats['minutesPlayed'], axis='index')
)

In [190]:
# Express the count columns per 90 minutes. The values are recomputed from the
# raw totals in playersStats instead of multiplying playersStatsPerMin in
# place, so running this cell more than once cannot compound the factor of 90.
playersStatsPerMin[numericalStats] = (
    playersStats[numericalStats].div(playersStats['minutesPlayed'], axis=0) * 90
)

In [191]:
# Rich-render the per-90 table for a visual sanity check.
display(playersStatsPerMin)

Unnamed: 0,goals,expectedGoals,bigChancesMissed,successfulDribbles,totalShots,shotsOnTarget,shotsOffTarget,blockedShots,goalConversionPercentage,penaltiesTaken,...,aerialDuelsWonPercentage,totalDuelsWon,totalDuelsWonPercentage,minutesPlayed,wasFouled,fouls,dispossessed,possessionLost,appearances,matchesStarted
0,0.195046,0.093065,0.055728,0.585139,1.616099,0.557276,0.417957,0.640867,12.07,0.000000,...,45.88,4.346749,52.17,3230.0,0.362229,0.947368,0.613003,9.167183,38.0,37.0
1,0.026323,0.068968,0.105294,0.421176,0.658087,0.131617,0.394852,0.131617,4.00,0.000000,...,70.05,6.370284,62.21,3419.0,0.658087,1.184557,0.421176,10.292483,38.0,38.0
5,0.513359,0.530381,0.378265,1.053738,2.945062,1.026719,1.080757,0.837586,17.43,0.081057,...,41.88,5.511858,37.99,3331.0,1.215851,1.134794,2.188532,11.510057,38.0,37.0
6,0.052632,0.042105,0.026316,0.263158,0.263158,0.105263,0.052632,0.105263,20.00,0.000000,...,59.50,3.657895,59.66,3420.0,0.421053,0.578947,0.342105,6.710526,38.0,38.0
8,0.345309,0.357865,0.219742,0.973143,2.228811,0.973143,0.878968,0.376700,15.49,0.156958,...,43.79,6.968957,43.19,2867.0,1.349843,1.538193,1.663760,12.462504,38.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2509,0.093701,0.080583,0.000000,0.140552,0.281104,0.140552,0.093701,0.046851,33.33,0.000000,...,46.99,3.607496,52.03,1921.0,0.234253,0.421655,0.093701,11.337845,23.0,21.0
2512,0.000000,0.030211,0.097455,0.389821,0.536004,0.146183,0.292366,0.097455,0.00,0.000000,...,30.56,4.726584,51.87,1847.0,1.608013,1.656741,0.438549,11.597185,23.0,23.0
2519,0.049669,0.030795,0.000000,1.142384,0.248344,0.049669,0.198675,0.000000,20.00,0.000000,...,57.50,4.122517,52.20,1812.0,0.347682,0.844371,0.745033,9.933775,22.0,21.0
2526,0.000000,0.017020,0.051576,0.051576,0.361032,0.000000,0.361032,0.000000,0.00,0.000000,...,68.57,4.435530,60.14,1745.0,0.154728,1.340974,0.103152,7.014327,22.0,21.0


### Normalizando valores

In [192]:
# Standardise every column to zero mean and unit (sample) standard deviation.
# A column with zero standard deviation would give 0/0 = NaN and make the
# covariance / eigen steps fail, so a zero std is replaced by 1: such a column
# simply becomes all zeros and carries no weight in the PCA.
normalizedStats = (
    (playersStatsPerMin - playersStatsPerMin.mean())
    / playersStatsPerMin.std().replace(0, 1)
)
display(normalizedStats)

# Persist the standardised table next to the notebook.
normalizedStats.to_csv('normalizedPlayerStats.csv')

Unnamed: 0,goals,expectedGoals,bigChancesMissed,successfulDribbles,totalShots,shotsOnTarget,shotsOffTarget,blockedShots,goalConversionPercentage,penaltiesTaken,...,aerialDuelsWonPercentage,totalDuelsWon,totalDuelsWonPercentage,minutesPlayed,wasFouled,fouls,dispossessed,possessionLost,appearances,matchesStarted
0,0.292713,-0.309597,-0.423172,-0.282793,0.367391,0.262833,-0.210230,1.114914,0.359596,-0.381903,...,-0.142143,-0.343799,0.101648,2.121640,-1.155645,-0.366710,-0.315381,-0.822726,1.793131,2.066341
1,-0.663807,-0.463661,-0.130659,-0.525080,-0.665253,-0.750847,-0.281837,-0.788414,-0.710919,-0.381903,...,1.616723,1.150004,1.410741,2.579215,-0.683483,0.146671,-0.635760,-0.495710,1.793131,2.271478
5,2.097285,2.486322,1.480263,0.409652,1.799884,1.380785,1.843982,1.850157,1.070620,1.257138,...,-0.433225,0.516301,-1.747250,2.366165,0.206658,0.038963,2.315981,-0.141878,1.793131,2.066341
6,-0.514661,-0.635401,-0.596744,-0.758582,-1.090948,-0.813609,-1.342480,-0.886914,1.411540,-0.381903,...,0.848993,-0.852322,1.078252,2.581636,-1.061768,-1.164138,-0.767819,-1.536639,1.793131,2.271478
8,1.144576,1.383366,0.544749,0.290557,1.027835,1.253196,1.218578,0.127588,0.813272,2.791941,...,-0.294234,1.591953,-1.069234,1.242806,0.420496,0.912099,1.439536,0.134906,1.793131,1.040654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2509,-0.281831,-0.389399,-0.752045,-0.939756,-1.071604,-0.729571,-1.215193,-1.105232,3.179815,-0.381903,...,-0.061368,-0.889527,0.083394,-1.047490,-1.359883,-1.504588,-1.182690,-0.191924,-2.005157,-1.215855
2512,-0.813039,-0.711445,-0.176918,-0.571413,-0.796846,-0.716160,-0.599472,-0.916095,-1.241534,-0.381903,...,-1.256989,-0.063400,0.062532,-1.226646,0.832512,1.168688,-0.606744,-0.116559,-2.005157,-0.805581
2519,-0.531458,-0.707714,-0.752045,0.540644,-1.106916,-0.946003,-0.889847,-1.280336,1.411540,-0.381903,...,0.703452,-0.509331,0.105559,-1.311383,-1.178861,-0.589643,-0.094871,-0.599951,-2.258376,-1.215855
2526,-0.813039,-0.795780,-0.447673,-1.071235,-0.985450,-1.064287,-0.386658,-1.280336,-1.241534,-0.381903,...,1.509022,-0.278260,1.140838,-1.473592,-1.486798,0.485228,-1.166906,-1.448353,-2.258376,-1.215855


### Convertendo para matriz

In [193]:
# Plain NumPy matrix of the standardised table (one row per player, one column per stat).
A = normalizedStats.to_numpy()

### PCA

In [194]:
# Covariance between the stat columns: rowvar=False tells NumPy that each
# column of A is a variable and each row an observation.
C = np.cov(A.astype(float), rowvar=False)

In [195]:
# C is a symmetric covariance matrix, so use the symmetric solver: eigh always
# returns real eigenvalues and orthonormal eigenvectors, whereas the general
# eig routine can return complex-typed results from rounding noise.
# Note: the sign of each eigenvector is arbitrary, so component signs may
# differ from those produced by eig.
eigenvalues, eigenvectors = np.linalg.eigh(C)

# Reorder the eigenvectors (columns) by decreasing eigenvalue.
idx = np.argsort(eigenvalues)[::-1]
ordered_eigenvectors = eigenvectors[:, idx]

# Project the standardised stats onto the three leading directions.
PC1, PC2, PC3 = (
    np.dot(normalizedStats, ordered_eigenvectors[:, component])
    for component in range(3)
)

### Plot

In [196]:
# One row per player: label columns plus the three component scores.
# The index is renumbered 0..n-1 so row labels coincide with row positions.
PCA = pd.DataFrame(
    {
        'name': playersName,
        'PC1': PC1,
        'PC2': PC2,
        'PC3': PC3,
        'position': playersPosition,
    }
).reset_index(drop=True)

display(PCA)

Unnamed: 0,name,PC1,PC2,PC3,position
0,Declan Rice,0.360018,2.392453,3.256241,MC
1,James Tarkowski,3.833847,-1.435548,0.712958,DC
2,Dominic Solanke,-7.570318,-4.100130,1.069762,ST
3,William Saliba,4.773619,-0.810412,4.881982,DC
4,Carlton Morris,-5.589216,-4.446300,0.295520,ST
...,...,...,...,...,...
831,Miloš Veljković,3.617256,-0.956125,1.731060,DC
832,Fabian Holland,2.060054,1.192595,-2.528609,DL
833,Odilon Kossounou,3.665666,0.483581,1.744107,DC
834,Moritz Jenz,5.517837,-2.836307,-0.012644,DC


In [197]:
# Fixed colour per position code: blue shades through to red shades
# (presumably ordered from defensive to attacking roles -- confirm).
colorMap = {
    "DC": "#0072c3",
    "DR": "#1192e8",
    "DL": "#33b1ff",
    "DM": "#bae6ff",
    "MC": "#e5f6ff",
    "AM": "#fff1f1",
    "MR": "#ffd7d9",
    "ML": "#ffb3b8",
    "RW": "#ff8389",
    "LW": "#fa4d56",
    "ST": "#da1e28",
}

# Players in the plane of the first two components, coloured by position.
fig = px.scatter(
    PCA,
    x='PC1',
    y='PC2',
    color='position',
    color_discrete_map=colorMap,
    hover_data="name",
    template="plotly_dark",
    title="PCA 2D",
)
fig.update_traces(textposition='top center')
fig.show()

In [198]:
# Same view with the third component added as the z axis.
fig = px.scatter_3d(
    PCA,
    x='PC1',
    y='PC2',
    z='PC3',
    color='position',
    color_discrete_map=colorMap,
    hover_data="name",
    template="plotly_dark",
    title="PCA 3D",
)
fig.update_traces(textposition='top center')
fig.show()

In [199]:
# Loadings table: one row per stat with its coefficient in each of the three
# leading eigenvectors, ordered by the PC1 coefficient (largest first).
eigenVectors = pd.DataFrame(
    {
        'stats': normalizedStats.columns,
        'PC1': ordered_eigenvectors[:, 0].real,
        'PC2': ordered_eigenvectors[:, 1].real,
        'PC3': ordered_eigenvectors[:, 2].real,
    }
).sort_values(by='PC1', ascending=False)

# Bar chart of the PC1 loadings.
fig = px.bar(eigenVectors, x='stats', y='PC1', width=600)
fig.update_layout(barmode='stack')
fig.show()

In [200]:
# Re-order the loadings table by the PC2 coefficient (largest first) and plot it.
eigenVectors = eigenVectors.sort_values('PC2', ascending=False)

fig = px.bar(data_frame=eigenVectors, x='stats', y='PC2', width=600)
fig.update_layout(barmode='stack')
fig.show()

In [201]:
# Re-order the loadings table by the PC3 coefficient (largest first) and plot it.
eigenVectors = eigenVectors.sort_values('PC3', ascending=False)

fig = px.bar(data_frame=eigenVectors, x='stats', y='PC3', width=600)
fig.update_layout(barmode='stack')
fig.show()

In [202]:
# Share of total variance carried by each component, largest first.
# The real part is taken in BOTH numerator and denominator: if the eigenvalue
# array is complex-typed, dividing by its raw sum would make every ratio
# complex as well.
ev = np.sort(np.real(eigenvalues))[::-1] / np.sum(np.real(eigenvalues))

In [203]:
# Per-component explained variance together with its running total.
dfev = pd.DataFrame({'Variância Explicada': ev}).assign(
    **{'Variância Explicada Acumulada': lambda frame: frame['Variância Explicada'].cumsum()}
)

# Bars: individual share per component. Red line: cumulative share.
fig = px.bar(dfev, y='Variância Explicada', width=1000)
fig.add_trace(
    go.Scatter(
        x=dfev.index,
        y=dfev['Variância Explicada Acumulada'],
        name='Variância Explicada Acumulada',
        mode='lines+markers',
        line={'color': 'red'},
    )
)
fig.show()

### Encontrando Jogadores Mais Próximos

In [204]:
def closestPlayers(playerName, PCA, distancePlayers, numPlayers=5):
    """Return the names of the players nearest to ``playerName``.

    Args:
        playerName: Value looked up in ``PCA['name']``. If several rows match,
            the first one is used.
        PCA: DataFrame with a ``'name'`` column. Its row ORDER must match the
            row/column order of ``distancePlayers``; its index labels are not
            used, so any index is accepted.
        distancePlayers: Square matrix where entry ``[i][j]`` is the distance
            between the i-th and j-th rows of ``PCA``.
        numPlayers: Maximum number of neighbours to return (default 5).
            Values below 1 yield an empty result.

    Returns:
        NumPy array of names ordered from nearest to farthest. The queried row
        itself is never included.

    Raises:
        ValueError: If no row of ``PCA`` has ``name == playerName``.
    """
    # Row POSITIONS (not index labels) of the matching rows, because both the
    # distance matrix and .iloc below are positional.
    matches = np.flatnonzero((PCA['name'] == playerName).to_numpy())
    if matches.size == 0:
        raise ValueError(f"Player '{playerName}' not found in PCA DataFrame")
    rowPos = int(matches[0])

    distances = np.asarray(distancePlayers[rowPos])

    # Stable sort keeps ties in row order. The queried row is removed
    # explicitly rather than assumed to be first: another player at distance
    # zero could otherwise push it into the result.
    order = np.argsort(distances, kind='stable')
    neighbours = order[order != rowPos][:max(numPlayers, 0)]

    return PCA.iloc[neighbours]['name'].values

In [205]:
# Player coordinates in the PC1/PC2 plane, one row per player.
matrixPCA = np.column_stack((PC1, PC2))

# Pairwise distances between all players in that plane.
distancePlayers = distance_matrix(matrixPCA, matrixPCA)

# Example query: the five nearest players to the given one.
closestPlayers('João Palhinha', PCA, distancePlayers)

array(['Christian Nørgaard', 'Juan Miranda', 'Marc Roca', 'Joakim Mæhle',
       'Fabian Holland'], dtype=object)