In [None]:
# Clone the tutorial repository into the current working directory; the cells
# below change into it and read data/JL.tar.gz from inside this checkout.
!git clone https://github.com/narame7/UOS-FootballDataAnalytics-Tutorial

In [None]:
# Make the "Week 13" folder the working directory: every relative path used
# later in this notebook is resolved against it.
%cd 'UOS-FootballDataAnalytics-Tutorial/Week 13'

In [None]:
!pip install matplotsoccer==0.0.8
!pip install matplotlib==3.7.5

In [None]:
%load_ext autoreload
%autoreload 2
import os; import sys; sys.path.insert(0, '../')
import pandas as pd
import tqdm
import pickle

import numpy as np
import warnings

In [None]:
import os
import tarfile

# Archive to unpack and the folder it is unpacked into (both relative to the
# current working directory).
tar_path = "data/JL.tar.gz"
extract_path = "data"

os.makedirs(extract_path, exist_ok=True)

with tarfile.open(tar_path, "r:gz") as tar:
    # Prefer the "data" extraction filter, which rejects members that would be
    # written outside extract_path. The filter is not present on every Python
    # build, so detect it as the tarfile documentation recommends and fall
    # back to the plain call otherwise.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=extract_path, filter="data")
    else:
        tar.extractall(path=extract_path)

print("압축 풀기 완료!")


모델 및 데이터 로드 단계
- Feather 파일로 저장한 이벤트/게임/선수 데이터를 읽어온다.

In [26]:
### Configure file and folder names
# NOTE(review): the archive is unpacked into ./data earlier in the notebook,
# while every path below points at ../data — confirm both resolve to the
# folder that actually holds these files.
data_dir = "../data"

# Feature table read with pd.read_feather further down.
data_feather = f"{data_dir}/soccermix_J1_all_data.feather"

# Pickle files; judging by the names: the fitted models and the category /
# location / direction weights (only models and d_weights are opened below).
models = f"{data_dir}/soccermix_J1_models.pkl"
c_weights = f"{data_dir}/soccermix_J1_all_catweights.pkl"
l_weights = f"{data_dir}/soccermix_J1_all_locweights.pkl"
d_weights = f"{data_dir}/soccermix_J1_all_dirweights.pkl"


In [27]:
def loadall(filename):
    """Lazily yield every object that was pickled back-to-back into ``filename``.

    This is a generator: the file is opened on the first ``next()`` and stays
    open until the generator is exhausted or closed. Iteration ends when
    ``pickle.load`` reports the end of the file.

    NOTE: unpickling can execute arbitrary code — only use this on files from
    a trusted source.
    """
    with open(filename, "rb") as handle:
        while True:
            try:
                obj = pickle.load(handle)
            except EOFError:
                # No further pickled object in the file.
                return
            yield obj


# loadall() is a generator function, so nothing is read from disk yet: objects
# are unpickled one at a time when next() is called on these generators.
items = loadall(models)
d_w = loadall(d_weights)

In [28]:
# Take the objects out of the pickle streams in the order they were written.
# NOTE(review): presumably the models file stores the category model, the
# location models and the direction models in exactly this order — confirm
# against the code that wrote the file.
cat_model = next(items)
loc_models = next(items)
dir_models = next(items)
# First object stored in the direction-weights file.
dir_weights = next(d_w)

In [29]:
# Feature table plus the game / player / player-game tables.
X = pd.read_feather(data_feather)
games = pd.read_feather('../data/JL_games.feather')
players = pd.read_feather('../data/JL_players.feather')
pg = pd.read_feather('../data/JL_player_games.feather')

# Total minutes played per player over all loaded games.
mp = pg.groupby("player_id")[["minutes_played"]].sum().reset_index()

# Home-team name -> team id lookup (a repeated name keeps its last id).
japan_id_map = {
    team_name: team_id
    for team_name, team_id in zip(games["home_team_name"], games["home_team_id"])
}

# Copy of the direction weights tagged with the team and game ids taken from X.
# NOTE(review): assumes dir_weights and X have the same number of rows in the
# same order — confirm.
merged_weights = dir_weights.assign(
    team_id=X.team_id.values,
    game_id=X.game_id.values,
)

전/후반기로 분리해서 선수 추출

In [8]:
# Split the games at game_day 19: up to and including 19 is the first half,
# everything after it the second half.
games_first_half = games.loc[games.game_day <= 19]
games_second_half = games.loc[games.game_day > 19]

# Player-game rows belonging to each half.
pg_first = pg.loc[pg.game_id.isin(games_first_half.game_id)]
pg_second = pg.loc[pg.game_id.isin(games_second_half.game_id)]

# Players with at least one player-game row in the respective half.
players_first = players.loc[players.player_id.isin(pg_first.player_id)]
players_second = players.loc[players.player_id.isin(pg_second.player_id)]

# Minutes played per player within each half.
mp_first = pg_first.groupby("player_id")[["minutes_played"]].sum().reset_index()
mp_second = pg_second.groupby("player_id")[["minutes_played"]].sum().reset_index()

In [None]:
# Get player vectors
# dir_weights holds the probability weights of the direction-based mixture
# model. Summing them over all rows of one player turns them into a
# high-dimensional vector that describes how often the player attempted each
# kind of action. The season is split in two, so every player gets one vector
# per half.
# NOTE(review): assumes dir_weights has one row per row of X, in the same
# order — confirm.

merged_weights = dir_weights.copy()
merged_weights["player_id"] = X.player_id.values
merged_weights["game_id"] = X.game_id.values


def _sum_weights_per_player(weights, player_ids, game_ids, weight_columns):
    """Sum the weight columns of every requested player over the given games.

    Parameters
    ----------
    weights : DataFrame holding the weight columns plus ``player_id`` and
        ``game_id``.
    player_ids : iterable of player ids to build a vector for.
    game_ids : ids of the games whose rows are included.
    weight_columns : labels of the columns to sum.

    Returns
    -------
    dict mapping ``int(player_id)`` to a 1-D array with the summed weights.
    """
    # The game filter is the same for every player, so apply it once instead
    # of re-evaluating it against the full table inside the loop.
    in_games = weights[weights.game_id.isin(game_ids)]
    return {
        int(p): in_games.loc[in_games.player_id == p, weight_columns].sum().values
        for p in tqdm.tqdm(list(player_ids))
    }


def _vectors_to_frame(vectors, weight_columns):
    """Stack a ``{player_id: vector}`` dict into a DataFrame indexed by player_id."""
    frame = pd.DataFrame.from_dict(vectors, orient="index", columns=weight_columns)
    frame.index.name = "player_id"
    return frame


vectors_first = _sum_weights_per_player(
    merged_weights,
    players_first.player_id.unique(),
    games_first_half.game_id,
    dir_weights.columns,
)
vectors_second = _sum_weights_per_player(
    merged_weights,
    players_second.player_id.unique(),
    games_second_half.game_id,
    dir_weights.columns,
)

vectors_first_pd = _vectors_to_frame(vectors_first, dir_weights.columns)
vectors_second_pd = _vectors_to_frame(vectors_second, dir_weights.columns)

100%|██████████| 510/510 [00:08<00:00, 62.80it/s]
100%|██████████| 523/523 [00:08<00:00, 61.56it/s]


In [10]:
# Normalize vectors per 90 min game time


def _per_90(vectors, minutes, weight_columns):
    """Scale summed player vectors to a per-90-minutes rate.

    Parameters
    ----------
    vectors : DataFrame indexed by player id holding the summed weight columns.
    minutes : DataFrame with ``player_id`` and ``minutes_played`` columns.
    weight_columns : labels of the columns to scale.

    Returns
    -------
    DataFrame indexed by ``player_id`` with ``weight * 90 / minutes_played``
    for every weight column. Only players present in both inputs are kept
    (inner merge).
    """
    merged = pd.merge(
        vectors, minutes, left_index=True, right_on="player_id"
    ).set_index("player_id")
    # Same operation order as before (multiply, then divide) so results match.
    return merged[weight_columns].mul(90).div(merged["minutes_played"], axis="rows")


vectors_first_norm = _per_90(vectors_first_pd, mp_first, dir_weights.columns)
vectors_second_norm = _per_90(vectors_second_pd, mp_second, dir_weights.columns)

In [11]:
# The evaluation below largely follows Pieter's soccer-player-vectors experiment:
# https://github.com/probberechts/soccer-player-vectors-thesis/blob/master/notebooks/5-experiments.ipynb

# Candidate players: first half of the season = train, second half = test.


def _summarise_player_games(player_games):
    """Per player: total minutes and the set of team ids, joined with ``players``."""
    per_player = player_games.groupby('player_id').agg({
        'minutes_played': 'sum',
        'team_id': set,
    })
    return per_player.merge(players, on="player_id", how='left')


train_players = _summarise_player_games(pg_first)
test_players = _summarise_player_games(pg_second)

In [12]:
# Players that appear in both halves (inner merge), with per-half columns
# suffixed _train / _test.
all_players = train_players.merge(test_players, on="player_id", suffixes=("_train", "_test"))

# Number of distinct teams a player appeared for across both halves.
all_players['nb_teams'] = [
    len(train_teams | test_teams)
    for train_teams, test_teams in zip(all_players.team_id_train, all_players.team_id_test)
]

# Keep only players who stayed with a single team.
all_players = all_players[all_players.nb_teams == 1]

In [13]:
# Only players who played at least MIN_MINUTES_PER_HALF minutes in both the
# train (first) and the test (second) half of the season.
MIN_MINUTES_PER_HALF = 500
all_players = all_players[
    (all_players.minutes_played_train >= MIN_MINUTES_PER_HALF)
    & (all_players.minutes_played_test >= MIN_MINUTES_PER_HALF)
]

In [14]:
# From here on all_players is a plain array of the selected player ids.
all_players = all_players["player_id"].unique()
print("Number of players: ", all_players.shape[0])

Number of players:  230


In [None]:
# Compute pairwise distances
# D[i, j] is the distance between player i's first-half vector and player j's
# second-half vector: Manhattan (L1) distance on L1-normalised vectors.
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances

first_half_l1 = preprocessing.normalize(vectors_first_norm.loc[all_players], norm="l1")
second_half_l1 = preprocessing.normalize(vectors_second_norm.loc[all_players], norm="l1")
D = pairwise_distances(first_half_l1, second_half_l1, metric="manhattan")

# Column order that sorts each row of D by ascending distance.
k_i = np.argsort(D, axis=1)
# The sorted distances themselves, row by row.
k_d = np.sort(D, axis=1)
# Same ranking expressed as player ids instead of column positions.
p_i = all_players[k_i]

In [None]:
# For every row, the position at which that row's own player id appears in
# the ranked ids of p_i (0 = the player is their own nearest neighbour).
rs = np.argmax(p_i == all_players[:, None], axis=1)
rs

In [None]:
# Retrieval metrics based on each player's (row's) distance ranking:
#    - rs holds the rank position of the player themself, so the metrics
#      measure how close to the top a player's own vector is ranked.
def mean_reciprocal_rank(rs):
    """Return the mean of ``1 / (rank + 1)`` over zero-based rank positions.

    Parameters
    ----------
    rs : array-like of zero-based ranks (0 means ranked first). Lists and
        tuples are accepted as well as NumPy arrays.

    Returns
    -------
    The mean reciprocal rank; 1.0 when every rank is 0.
    """
    ranks = np.asarray(rs)
    return np.mean(1. / (ranks + 1))

def top_k(rs, k):
    """Return the fraction of zero-based ranks that are strictly below ``k``.

    Parameters
    ----------
    rs : array-like of zero-based ranks. Lists and tuples are accepted as
        well as NumPy arrays.
    k : cut-off; a rank counts as a hit when ``rank < k``.
    """
    ranks = np.asarray(rs)
    return (ranks < k).sum() / len(ranks)

In [18]:
# Mean reciprocal rank of every player's own position in their ranked list
# (1.0 would mean each player is always ranked first).
mean_reciprocal_rank(rs)

0.5516326699010587

In [19]:
# Fraction of players whose own id is ranked within the 10 closest.
top_k(rs, 10)

0.8217391304347826

In [20]:
# Fraction of players whose own id is ranked within the top 5, 3 and 1.
for cutoff in (5, 3, 1):
    print(top_k(rs, cutoff))

0.7
0.591304347826087
0.43043478260869567


# Get the players most similar to a given player

In [21]:
def get_similar_players(player_id):
    """Rank the evaluated players by distance to ``player_id``.

    Uses the row of the sorted distance matrix that belongs to ``player_id``,
    i.e. the distances between this player's first-half vector and every
    evaluated player's second-half vector, closest first.

    Parameters
    ----------
    player_id : id of a player contained in ``all_players``.

    Returns
    -------
    DataFrame with columns ``name`` (player name) and ``dist`` (distance),
    ordered by ascending distance.

    Raises
    ------
    IndexError
        If ``player_id`` is not among the evaluated players.
    """
    matches = np.where(all_players == player_id)[0]
    if matches.size == 0:
        # Previously this surfaced as an opaque "index 0 is out of bounds".
        raise IndexError(
            f"player_id {player_id!r} is not among the evaluated players in all_players"
        )
    player_index = matches[0]
    # Echo the row position of the player inside all_players.
    print(player_index)
    sims = p_i[player_index, :]
    names = players_first.set_index("player_id").loc[sims, "player_name"].values
    dists = k_d[player_index, :]
    return pd.DataFrame({"name": names, "dist": dists})

In [22]:
# First-half (train) summary row of player 37655.
train_players.loc[train_players.player_id == 37655]

Unnamed: 0,player_id,minutes_played,team_id,player_name,nickname
130,37655,1390,{1891},Lucas Fernandes,Lucas Fernandes


In [None]:
# Players looked up below (English summary of the note that follows):
#   - Lucas Fernandes (Cerezo Osaka): top-rated player of the 2024 J1 League
#   - Yoshinori Muto (Vissel Kobe): 2024 J1 League MVP and Best XI member
# Each print shows the player_id of the train players whose name contains the
# given substring.
'''
2024 J1 League 평점 1위 Lucas Fernandes(세레소 오사카) 
2024 J1 League MVP, J1리그 베스트 일레븐 Yoshinori Muto(비셀 고베)
'''
print(train_players[train_players.player_name.str.contains('Lucas')].player_id) 
print(train_players[train_players.player_name.str.contains('Yoshinori M')].player_id) 


130    37655
Name: player_id, dtype: int64
9    6337
Name: player_id, dtype: int64


In [24]:
# Evaluated players ordered by distance to player 37655, closest first.
get_similar_players(37655) # Similar to Lucas Fernandes

58


Unnamed: 0,name,dist
0,Lucas Fernandes,0.274876
1,Kazuya Konno,0.332686
2,Yan Matheus Santos Souza,0.361114
3,Yoshinori Muto,0.388811
4,Tomoaki Okubo,0.409537
...,...,...
225,Takanori Sugeno,1.797248
226,William Popp,1.798747
227,Sung-Ryong Jung,1.802955
228,Shusaku Nishikawa,1.804692


In [25]:
# Evaluated players ordered by distance to player 6337, closest first.
get_similar_players(6337) # Similar to Yoshinori Muto

5


Unnamed: 0,name,dist
0,Marco Túlio Oliveira Lemos,0.143439
1,Yoshinori Muto,0.195712
2,Tomoya Koyamatsu,0.279200
3,Shu Morooka,0.292096
4,Masaya Shibayama,0.300099
...,...,...
225,Mitchell James Langerak,1.796698
226,Sung-Ryong Jung,1.798005
227,Keisuke Osako,1.799687
228,Kenta Matsumoto,1.800289
