# [CDAF] Atividade 4

## Nome: Thiago Pádua de Carvalho

## Matrícula: 2020007066

## Referências
- [1] https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- [2] https://socceraction.readthedocs.io/en/latest/api/generated/socceraction.xthreat.ExpectedThreat.html#socceraction.xthreat.ExpectedThreat
- [3] https://socceraction.readthedocs.io/en/latest/api/generated/socceraction.xthreat.get_successful_move_actions.html#socceraction.xthreat.get_successful_move_actions
- [4] https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/xT.html

In [106]:
# Importando bibliotecas
from tqdm import tqdm
import numpy as np
import pandas as pd
import socceraction.spadl as spd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from socceraction import xthreat as xt

### LaLiga  p/ SPADL com pré-processamentos

In [107]:
# carregando os eventos
path = f'../data/events/events_Spain.json'
events = pd.read_json(path_or_buf=path)
events

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],3542,"[{'y': 61, 'x': 37}, {'y': 50, 'x': 50}]",2565548,Pass,682,1H,2.994582,85,180864419
1,8,Simple pass,[{'id': 1801}],274435,"[{'y': 50, 'x': 50}, {'y': 30, 'x': 45}]",2565548,Pass,682,1H,3.137020,85,180864418
2,8,Simple pass,[{'id': 1801}],364860,"[{'y': 30, 'x': 45}, {'y': 12, 'x': 38}]",2565548,Pass,682,1H,6.709668,85,180864420
3,8,Simple pass,[{'id': 1801}],3534,"[{'y': 12, 'x': 38}, {'y': 69, 'x': 32}]",2565548,Pass,682,1H,8.805497,85,180864421
4,8,Simple pass,[{'id': 1801}],3695,"[{'y': 69, 'x': 32}, {'y': 37, 'x': 31}]",2565548,Pass,682,1H,14.047492,85,180864422
...,...,...,...,...,...,...,...,...,...,...,...,...
628654,8,Simple pass,[{'id': 1801}],20623,"[{'y': 25, 'x': 66}, {'y': 2, 'x': 88}]",2565927,Pass,682,2H,2939.077491,85,253302671
628655,7,Acceleration,[{'id': 1801}],122832,"[{'y': 2, 'x': 88}, {'y': 21, 'x': 97}]",2565927,Others on the ball,682,2H,2940.515560,70,253302673
628656,8,Cross,"[{'id': 401}, {'id': 1802}]",122832,"[{'y': 21, 'x': 97}, {'y': 26, 'x': 92}]",2565927,Pass,682,2H,2942.098761,80,253302674
628657,8,Simple pass,[{'id': 1801}],40756,"[{'y': 74, 'x': 8}, {'y': 56, 'x': 9}]",2565927,Pass,675,2H,2943.089232,85,253302698


In [108]:
# pré processamento em colunas da tabela de eventos para facilitar a conversão p/ SPADL
events = events.rename(columns={'id': 'event_id', 'eventId': 'type_id', 'subEventId': 'subtype_id',
                                'teamId': 'team_id', 'playerId': 'player_id', 'matchId': 'game_id'})
events['milliseconds'] = events['eventSec'] * 1000
events['period_id'] = events['matchPeriod'].replace({'1H': 1, '2H': 2})

In [109]:
# carregando as partidas, pois vamos saber quais times jogam em casa e fora p/ usar como parametro do SPADL
path = f'../data/matches/matches_Spain.json'
matches = pd.read_json(path_or_buf=path)

In [110]:
# as informações dos times de cada partida estão em um dicionário dentro da coluna 'teamsData', então vamos separar essas informações
team_matches = []
for i in tqdm(range(len(matches))):
    match = pd.DataFrame(matches.loc[i, 'teamsData']).T
    match['matchId'] = matches.loc[i, 'wyId']
    team_matches.append(match)
team_matches = pd.concat(team_matches).reset_index(drop=True)

100%|██████████| 380/380 [00:00<00:00, 1159.99it/s]


In [111]:
# fazendo a conversão p/ SPADL, padronizando a direção de jogo da esquerda p/ a direita e adicionando os nomes dos tipos de ações
spadl = []
game_ids = events.game_id.unique().tolist()
for g in tqdm(game_ids):
    match_events = events.loc[events.game_id == g]
    match_home_id = team_matches.loc[(team_matches.matchId == g) & (team_matches.side == 'home'), 'teamId'].values[0]
    match_actions = spd.wyscout.convert_to_actions(events=match_events, home_team_id=match_home_id)
    match_actions = spd.play_left_to_right(actions=match_actions, home_team_id=match_home_id)
    match_actions = spd.add_names(match_actions)
    spadl.append(match_actions)
spadl = pd.concat(spadl).reset_index(drop=True)

100%|██████████| 380/380 [02:43<00:00,  2.32it/s]


In [112]:
# adicionando o nome dos jogadores
path = f'../data/players.json'
players = pd.read_json(path_or_buf=path)
players['player_name'] = players['firstName'] + ' ' + players['lastName']
players = players[['wyId', 'player_name']].rename(columns={'wyId': 'player_id'})
spadl = spadl.merge(players, on='player_id', how='left')

In [133]:
# correcting players' names
spadl['player_name'] = spadl['player_name'].str.decode('unicode-escape')

## Questão 1
- Crei um dataframe "shots" à partir do dataframe "spadl", contendo apenas os chutes.
- Crie 4 colunas no dataframe "shots" a serem usadas como features de um modelo de xG.
- Justifique a escolha das features.

In [134]:
def euclidean_distance(point1, point2=[105, 34]):
    point1 = np.array(point1)
    point2 = np.array(point2)
    return np.sqrt(np.sum((point2 - point1) ** 2)) 

In [135]:
def find_angle_to_goal(point):
    point = np.array(point)  # shot coordinates
    post1 = np.array([105, 37.66])
    post2 = np.array([105, 30.34])

    vector1 = post1 - point
    vector2 = post2 - point

    # Calculates the length of the vectors that goes from the player in the moment of the shot to the posts
    vector1_length = euclidean_distance(point, post1)  
    vector2_length = euclidean_distance(point, post2)

    cos_theta = np.dot(vector1, vector2) / (vector1_length * vector2_length)
    
    return np.arccos(cos_theta) * 180 / np.pi  # Returns the angle in degrees

In [136]:
shots = spadl.loc[spadl.type_name.apply(lambda x: x in ['shot', 'shot_freekick', 'shot_penalty'])].reset_index(drop=True)
shots.drop(columns=['action_id', 'game_id', 'original_event_id', 'bodypart_id', 'type_id'], inplace=True)

In [137]:
shots['distance'] = shots.apply(lambda row: euclidean_distance([row['start_x'], row['start_y']]), axis=1)
shots['angle'] = shots.apply(lambda row: find_angle_to_goal([row['start_x'], row['start_y']]), axis=1)
shots['distXangle'] = shots['distance'] * shots['angle']
shots['squared_distance'] = shots['distance'] ** 2
shots.tail()

Unnamed: 0,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,result_id,type_name,result_name,bodypart_name,player_name,distance,angle,distXangle,squared_distance
8540,2,1944.188119,682,267134,94.5,46.24,94.5,46.24,0,shot,fail,foot,Roger Beyker Martínez Tobinson,16.126612,17.3064,279.09359,260.0676
8541,2,2385.837008,682,134174,96.6,51.0,105.0,37.4,1,shot,success,foot,Samuel Castillejo Azuaga,18.962067,10.0722,190.989743,359.56
8542,2,2672.823612,682,134174,96.6,11.56,96.6,11.56,0,shot_freekick,fail,foot,Samuel Castillejo Azuaga,23.960668,6.258016,149.946247,574.1136
8543,2,2722.835144,675,3321,86.1,47.6,105.0,30.6,0,shot,fail,foot,Karim Benzema,23.284544,14.662142,341.401296,542.17
8544,2,2857.346465,675,4498,92.4,38.76,105.0,40.8,0,shot,fail,foot,Lucas Vázquez Iglesias,13.469135,28.763621,387.4211,181.4176


### Justificando Features
Para a criação do modelo de xG, foram escolhidas as seguintes features:
- **distance**: Distância entre o chute e o gol. \
Esta feature foi escolhida pois - desconsiderando jogadores no caminho - quanto mais próximo do gol, maior a chance de marcar.

- **angle**: Ângulo entre o chute e o gol. \
O ângulo entre as traves, visto pelo jogador que chuta, é aproximadamente proporcional à probabilidade de gol.

- **distXangle**: Multiplicação da distência pelo ângulo.
- **squared_distance**: Quadrado da distência do chute até o gol \
As duas últimas foram escolhidas como modo de evitar que durante a aplicação de modelos preditivos haja overfitting, isto é, quando o modelo não consegue generalizar e se adequa somente aos dados de treinamento.

## Questão 2
- Crie uma coluna numérica binária "goal" no dataframe "shots" indicando se o chute resultou em gol ou não.
- Use regressão logística [1] p/ treinar (.fit(X_train, y_train)) um modelo de xG usando as features criadas na questão 1.
- Use 70% dos dados para treino e 30% para teste.
- Reporte a acurácia do modelo para os conjuntos de treino (.score(X_train, y_train)) e teste (.score(X_test, y_test)).

In [138]:
shots['goal'] = shots['result_id'].isin([1])
train, test = train_test_split(shots, test_size=0.3, random_state=42)

model = LogisticRegression()
model.fit(train[['distance', 'angle', 'distXangle', 'squared_distance']], train['goal']).coef_

array([[-0.16699492,  0.0181    ,  0.00221405,  0.00159907]])

In [139]:
score = model.score(test[['distance', 'angle', 'distXangle', 'squared_distance']], test['goal'])
print(f"A precisão média observada foi de {score:.2f}")

A precisão média observada foi de 0.88


## Questão 3
- Use o modelo treinado na questão 2 p/ prever a probabilidade de gol de todos os chutes do dataframe "shots". Reporte essas probabilidades no dataframe "shots" em uma coluna "xG".
- Agrupe o dataframe "shots" por "player_name" e reporte a soma dos "goal" e "xG".
- Reporte os 10 jogadores com maior xG.
- Reporte os 10 jogadores com maior diferença de Gols e xG.

In [140]:
shots['xG'] = model.predict_proba(shots[['distance', 'angle', 'distXangle', 'squared_distance']])[:, 1]
shots.head()

Unnamed: 0,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,result_id,type_name,result_name,bodypart_name,player_name,distance,angle,distXangle,squared_distance,goal,xG
0,1,57.771186,695,225089,97.65,44.88,105.0,34.0,0,shot,fail,foot,José Luis Morales Nogales,13.13,18.694513,245.458956,172.3969,False,0.096875
1,1,60.727239,695,255738,84.0,27.88,84.0,27.88,0,shot,fail,foot,Jefferson Andrés Lerma Solís,21.873601,18.290727,400.084061,478.4544,False,0.053751
2,1,446.986112,682,37831,92.4,29.24,92.4,29.24,0,shot,fail,foot,Carlos Arturo Bacca Ahumada,13.469135,28.763621,387.4211,181.4176,False,0.144539
3,1,488.929113,682,15214,91.35,23.12,105.0,27.2,0,shot,fail,foot,Antonio Rukavina,17.455569,18.932259,330.473365,304.6969,False,0.072379
4,1,948.872079,695,225089,78.75,40.8,105.0,34.0,0,shot,fail,foot,José Luis Morales Nogales,27.116462,14.904849,404.166775,735.3025,False,0.032761


In [141]:
goal_xG = shots.groupby('player_name').agg({'goal': 'sum', 'xG': 'sum'})
goal_xG

Unnamed: 0_level_0,goal,xG
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aarón Martín Caricol,0,0.219013
Achraf Hakimi Mouh,2,0.414923
Adalberto Peñaranda Maestre,0,1.182779
Adnan Januzaj,3,4.988464
Adrián González Morales,3,3.792101
...,...,...
Óscar David Romero Villamayor,0,0.170497
Óscar Esau Duarte Gaitán,0,1.265735
Óscar Melendo Jiménez,0,0.617549
Óscar de Marcos Arana,1,1.284352


In [142]:
higher_xG = goal_xG.sort_values('xG', ascending=False).head(10)
higher_xG.index.name = 'Jogadores com maior xG'
higher_xG

Unnamed: 0_level_0,goal,xG
Jogadores com maior xG,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristiano Ronaldo dos Santos Aveiro,26,25.145335
Lionel Andrés Messi Cuccittini,34,19.506641
Luis Alberto Suárez Díaz,25,19.046561
Gerard Moreno Balaguero,16,16.907328
Cristhian Ricardo Stuani Curbelo,21,15.650481
Maximiliano Gómez González,18,14.670646
Iago Aspas Juncal,22,13.508836
Jonathan Calleri,9,11.93008
Willian José da Silva,15,11.281611
Raúl García Escudero,10,11.268914


In [143]:
diff_goals_xg = abs(shots.groupby('player_name')['goal'].sum() - shots.groupby('player_name')['xG'].sum())
higher_goal_xG_diff = diff_goals_xg.sort_values(ascending=False).head(10)
higher_goal_xG_diff.index.name = 'Jogadores com maior diferença entre gols e xG'
display(higher_goal_xG_diff)

Jogadores com maior diferença entre gols e xG
Lionel Andrés Messi Cuccittini      14.493359
Antoine Griezmann                   10.874384
Iago Aspas Juncal                    8.491164
Enis Bardhi                          6.800336
Gareth Frank Bale                    6.005716
Luis Alberto Suárez Díaz             5.953439
Mikel Oyarzabal Ugarte               5.948351
Rodrigo Moreno Machado               5.728634
Cristhian Ricardo Stuani Curbelo     5.349519
Ángel Luis Rodríguez Díaz            5.080552
dtype: float64

## Questão 4 [4]
- Instancie um objeto ExpectedThreat [2] com parâmetros l=25 e w=16.
- Faça o fit do modelo ExpectedThreat com o dataframe "spadl".

In [144]:
xT_model = xt.ExpectedThreat(l=25, w=16)
xT_model = xT_model.fit(spadl)

  return np.nan_to_num(a / b)


## Questão 5
- Crie um dataframe "prog_actions" à partir do dataframe "spadl", contendo apenas as ações de progressão e que são bem-sucedidas [3].
- Use o método rate do objeto ExpectedThreat p/ calcular o valor de cada ação de progressão do dataframe "prog_actions", em uma coluna chamada "action_value".
- Agrupe o dataframe "prog_actions" por "player_name" e reporte a soma dos "action_value".
- Reporte os 10 jogadores com maior "action_value".

In [145]:
prog_actions = xt.get_successful_move_actions(spadl)
prog_actions.reset_index(drop=True, inplace=True)
prog_actions

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,bodypart_id,type_id,result_id,action_id,type_name,result_name,bodypart_name,player_name
0,2565548,1,2.994582,682,3542,38.85,26.52,52.50,34.00,180864419,0,0,1,0,pass,success,foot,Manuel Trigueros Muñoz
1,2565548,1,3.137020,682,274435,52.50,34.00,47.25,47.60,180864418,0,0,1,1,pass,success,foot,Enes Ünal
2,2565548,1,6.709668,682,364860,47.25,47.60,39.90,59.84,180864420,0,0,1,2,pass,success,foot,Rodrigo Hernández Cascante
3,2565548,1,8.805497,682,3534,39.90,59.84,33.60,21.08,180864421,0,0,1,3,pass,success,foot,Jaume Vicent Costa Jordá
4,2565548,1,14.047492,682,3695,33.60,21.08,32.55,42.84,180864422,0,0,1,4,pass,success,foot,Álvaro González Soberón
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303200,2565927,2,2931.782904,682,3695,37.80,44.88,47.25,32.64,253302665,0,0,1,1480,pass,success,foot,Álvaro González Soberón
303201,2565927,2,2932.188168,682,20623,47.25,32.64,69.30,51.00,253302667,0,21,1,1481,dribble,success,foot,Roberto Soriano
303202,2565927,2,2939.077491,682,20623,69.30,51.00,92.40,66.64,253302671,0,0,1,1482,pass,success,foot,Roberto Soriano
303203,2565927,2,2940.515560,682,122832,92.40,66.64,101.85,53.72,253302673,0,21,1,1483,dribble,success,foot,Salem Mohammed Al Dawsari


In [146]:
prog_actions['action_value'] = xT_model.rate(prog_actions)
prog_actions

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,bodypart_id,type_id,result_id,action_id,type_name,result_name,bodypart_name,player_name,action_value
0,2565548,1,2.994582,682,3542,38.85,26.52,52.50,34.00,180864419,0,0,1,0,pass,success,foot,Manuel Trigueros Muñoz,0.001583
1,2565548,1,3.137020,682,274435,52.50,34.00,47.25,47.60,180864418,0,0,1,1,pass,success,foot,Enes Ünal,-0.000136
2,2565548,1,6.709668,682,364860,47.25,47.60,39.90,59.84,180864420,0,0,1,2,pass,success,foot,Rodrigo Hernández Cascante,-0.003240
3,2565548,1,8.805497,682,3534,39.90,59.84,33.60,21.08,180864421,0,0,1,3,pass,success,foot,Jaume Vicent Costa Jordá,-0.000069
4,2565548,1,14.047492,682,3695,33.60,21.08,32.55,42.84,180864422,0,0,1,4,pass,success,foot,Álvaro González Soberón,0.000376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303200,2565927,2,2931.782904,682,3695,37.80,44.88,47.25,32.64,253302665,0,0,1,1480,pass,success,foot,Álvaro González Soberón,0.001687
303201,2565927,2,2932.188168,682,20623,47.25,32.64,69.30,51.00,253302667,0,21,1,1481,dribble,success,foot,Roberto Soriano,0.005220
303202,2565927,2,2939.077491,682,20623,69.30,51.00,92.40,66.64,253302671,0,0,1,1482,pass,success,foot,Roberto Soriano,0.009285
303203,2565927,2,2940.515560,682,122832,92.40,66.64,101.85,53.72,253302673,0,21,1,1483,dribble,success,foot,Salem Mohammed Al Dawsari,0.007890


In [147]:
prog_actions.groupby('player_name')['action_value'].sum()

player_name
Aarón Martín Caricol             5.578629
Achraf Hakimi Mouh               1.369938
Adalberto Peñaranda Maestre      0.251660
Adnan Januzaj                    4.142670
Adrián  González Morales         1.053695
                                   ...   
Óscar David Romero Villamayor    0.088822
Óscar Esau Duarte Gaitán         1.011419
Óscar Melendo Jiménez            0.317399
Óscar de Marcos Arana            2.742232
Šime Vrsaljko                    3.160489
Name: action_value, Length: 556, dtype: float64

In [148]:
highest_action_value = prog_actions.groupby('player_name')['action_value'].sum().sort_values(ascending=False).head(10)
highest_action_value.index.name = 'Jogadores com maior valor de ação'
highest_action_value

Jogadores com maior valor de ação
Lionel Andrés Messi Cuccittini    10.650189
Marcelo Vieira da Silva Júnior    10.264535
Álvaro Odriozola Arzallus          8.708854
José Luis Morales Nogales          7.819040
Hugo Mallo Novegil                 7.431915
Juan Francisco Moreno Fuertes      7.281309
Éver Maximiliano David Banega      7.015160
Lucas Vázquez Iglesias             6.908507
Jordi Alba Ramos                   6.824937
José Luis Gayá Peña                6.811350
Name: action_value, dtype: float64