In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
N_SHUFFLES = 1
N_PREV = 1
N_DISCRETIZATION = 101
CLIP_POINT = 2

# Data preparation

In [3]:
data = pd.read_feather('data/tracking.feather')
data.dropna(axis=0, how='any', inplace=True)
fixed_cols = ['gameId', 'playId', 'frameId'] + [col for col in data.columns if col.endswith(('_dx', '_dy'))]
player_cols = [col for col in data.columns if col.endswith(('_x', '_y'))]

data = pd.concat(
    [data[fixed_cols]] + 
    [data[player_cols].shift(i) for i in range(1, N_PREV + 1)]
, axis=1)

data.columns = fixed_cols + [f'{col}-{i}' for i in range(1, N_PREV + 1) for col in player_cols]
data.dropna(axis=0, how='any', inplace=True)

data = data[data['gameId'] < 2022100000]

In [4]:
ball_pair = [item for sublist in [[f'p0_x-{j}', f'p0_y-{j}'] for j in range(1, N_PREV + 1)] for item in sublist]
player_pairs = []
for i in range(1, 23):
    pair = []
    for j in range(1, N_PREV + 1):
        pair += [f'p{i}_x-{j}', f'p{i}_y-{j}']
    player_pairs.append(pair)

In [5]:
def shuffle_pairs(pairs, remove_id=None):
    if remove_id is None:
        tmp_list = pairs
    else:
        tmp_list = pairs[:remove_id] + pairs[remove_id + 1:]
    np.random.shuffle(tmp_list)
    return tmp_list

In [6]:
data

Unnamed: 0,gameId,playId,frameId,p0_dx,p1_dx,p2_dx,p3_dx,p4_dx,p5_dx,p6_dx,...,p13_y-1,p14_y-1,p15_y-1,p16_y-1,p17_y-1,p18_y-1,p19_y-1,p20_y-1,p21_y-1,p22_y-1
2,2022090800,56,3,-0.010002,-0.11,-0.21,-0.02,-0.11,-0.05,0.03,...,29.89,24.51,32.79,32.71,29.14,25.31,29.05,27.12,32.79,30.18
3,2022090800,56,4,0.000000,-0.13,-0.20,-0.03,-0.11,-0.05,0.05,...,29.95,24.43,32.96,32.82,29.17,25.17,29.08,27.05,32.92,30.24
4,2022090800,56,5,0.000000,-0.15,-0.19,-0.04,-0.13,-0.05,0.03,...,30.02,24.34,33.14,32.93,29.21,25.04,29.11,26.98,33.05,30.30
5,2022090800,56,6,0.000000,-0.16,-0.19,-0.04,-0.12,-0.05,0.04,...,30.10,24.26,33.32,33.06,29.23,24.92,29.14,26.92,33.19,30.34
6,2022090800,56,7,0.010002,-0.17,-0.18,-0.05,-0.12,-0.05,0.03,...,30.18,24.17,33.50,33.18,29.24,24.79,29.18,26.85,33.33,30.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20869,2022092900,3882,60,0.009995,0.02,0.03,0.00,0.06,0.00,0.03,...,29.66,30.47,28.34,29.60,30.98,27.52,29.73,26.10,33.41,28.59
20870,2022092900,3882,61,-0.009995,0.02,0.05,0.00,0.08,-0.01,0.04,...,29.65,30.49,28.35,29.61,30.95,27.54,29.74,26.16,33.41,28.59
20871,2022092900,3882,62,0.009995,0.02,0.05,-0.01,0.07,-0.02,0.05,...,29.65,30.50,28.35,29.62,30.91,27.56,29.74,26.17,33.40,28.60
20872,2022092900,3882,63,0.000000,0.02,0.06,0.00,0.08,-0.02,0.03,...,29.64,30.51,28.35,29.62,30.88,27.58,29.71,26.18,33.39,28.60


In [7]:
X, Y, X_ball_flag = [], [], []
for p_id in range(1, 23):
    for _ in range(N_SHUFFLES):
        X.append(data[sum([ball_pair] + [player_pairs[p_id-1]] + shuffle_pairs(player_pairs, p_id-1), [])].values)
        X_ball_flag.append(np.zeros((X[-1].shape[0], 1)))
        Y.append(data[[f'p{p_id}_dx', f'p{p_id}_dy']].values)

# Add ball
for _ in range(N_SHUFFLES):
    X.append(data[sum([ball_pair] + shuffle_pairs(player_pairs, None), [])].values)
    X_ball_flag.append(np.ones((X[-1].shape[0], 1)))
    Y.append(data[['p0_dx', 'p0_dy']].values)

X = np.concatenate(X, axis=0)
X_ball_flag = np.concatenate(X_ball_flag, axis=0)
X = np.concatenate([X, X_ball_flag], axis=1)
Y = np.concatenate(Y, axis=0).clip(-CLIP_POINT, CLIP_POINT)

In [8]:
discretizer_a = KBinsDiscretizer(n_bins=N_DISCRETIZATION, encode='ordinal', strategy='uniform')
discretizer_b = KBinsDiscretizer(n_bins=N_DISCRETIZATION, encode='ordinal', strategy='uniform')
YA = discretizer_a.fit_transform(Y[:, 0].reshape(-1, 1))
YB = discretizer_b.fit_transform(Y[:, 1].reshape(-1, 1))
Y = np.concatenate([YA, YB], axis=1)
pd.DataFrame(Y)

Unnamed: 0,0,1
0,47.0,50.0
1,47.0,50.0
2,46.0,50.0
3,46.0,50.0
4,46.0,49.0
...,...,...
21283804,50.0,50.0
21283805,50.0,50.0
21283806,50.0,50.0
21283807,50.0,50.0


In [9]:
pd.Series(discretizer_a.inverse_transform(YA).flatten())

0          -0.118812
1          -0.118812
2          -0.158416
3          -0.158416
4          -0.158416
              ...   
21283804    0.000000
21283805    0.000000
21283806    0.000000
21283807    0.000000
21283808    0.000000
Length: 21283809, dtype: float64

# Modeling

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=42)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((17027047, 47), (4256762, 47), (17027047, 2), (4256762, 2))

In [None]:
model = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42, n_jobs=-2)
model.fit(X_train, Y_train)

In [11]:
pred_proba = model.predict_proba(X_test)

In [12]:
pd.DataFrame(pred_proba[0]).mean().sort_values(ascending=False) * 100

50    12.345091
49     8.914230
51     8.878905
48     6.891566
52     6.775352
        ...    
94     0.000702
1      0.000657
2      0.000611
97     0.000426
98     0.000395
Length: 101, dtype: float64

In [13]:
import pickle
with open('models/rf_baseline/model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('models/rf_baseline/discretizer_a.pkl', 'wb') as f:
    pickle.dump(discretizer_a, f)
with open('models/rf_baseline/discretizer_b.pkl', 'wb') as f:
    pickle.dump(discretizer_b, f)