In [None]:
import copy
import json
import pickle
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from preprocessing import PreProcessing

## You'll see the data goes from 0 to 1 on each axis. The coordinate (0,0) is the top left, (1,1) is the bottom right, and (0.5,0.5) is the kick-off point.

In [None]:
game_1_pre = PreProcessing()
game_1_data = game_1_pre.load_and_process_data(data_home="data/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv", data_away="data/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv", 
                                               add_ball_data=True, half_period="both")

In [None]:
game_2_pre = PreProcessing()
game_2_data = game_2_pre.load_and_process_data(data_home="data/Sample_Game_2/Sample_Game_2_RawTrackingData_Home_Team.csv", data_away="data/Sample_Game_2/Sample_Game_2_RawTrackingData_Away_Team.csv", 
                                               add_ball_data=True, half_period="both")

In [None]:
game_1_data

In [None]:
game_2_data

## Tracking for individual Players for the First Period

In [None]:
game_2_pre.player_tracking(game_2_data.iloc[2:3], players=[1,2,3,4,5,6,7,8,9,10,11], sides=["Home"]*11, plot_ball=False)

In [None]:
# player_tracking(period_1_data)
# player_tracking(period_1_data_home, [2])
# player_tracking(period_1_data_home, [5])
# player_tracking(period_1_data_home, [8])

# Adding extra features to dataset

In [None]:
vel_acc_game_1_data = game_1_pre.vel_acc(game_1_data)
vel_acc_game_2_data = game_2_pre.vel_acc(game_2_data)

In [None]:
vel_acc_game_1_data

In [None]:
vel_acc_game_2_data

In [None]:
game_1_ply_tra = vel_acc_game_1_data.iloc[:, 3:]
game_2_ply_tra = vel_acc_game_2_data.iloc[:, 3:]

In [None]:
game_2_ply_tra

In [None]:
# game_1_players, game_1_vel_acc = game_1_ply_tra.loc[:,"Home-P_1-x":"Away-P_24-y"], game_1_ply_tra.loc[:,"P_1_velocity":]
# game_2_players, game_2_vel_acc = game_2_ply_tra.loc[:,"Home-P_1-x":"Away-P_24-y"], game_2_ply_tra.loc[:,"P_1_velocity":]

In [None]:
game_1_player_data = game_1_pre.get_frames(game_1_ply_tra, columns=game_1_ply_tra.columns[0:46:2], frame=1000000, frame_interval=1000000).fillna(0)
game_2_player_data = game_2_pre.get_frames(game_2_ply_tra, columns=game_2_ply_tra.columns[0:46:2], frame=1000000, frame_interval=1000000).fillna(0)

In [None]:
game_2_player_data

# Classification

In [None]:
def change_player_name(data: pd.Series) -> pd.Series:
    """Rewrite raw player identifiers as ``Home-P_<n>`` / ``Away-P_<n>`` labels.

    Rules applied to each string entry:

    * 8 characters long: the last two characters are parsed as the player
      number; numbers below 12 are labelled ``Home``, all others ``Away``.
    * any other length: the last character is used as the number and the
      label is always ``Home``.

    Missing values (``NaN`` / ``None``) are passed through unchanged instead
    of raising ``TypeError`` on ``len()``.

    Args:
        data: Series of player identifiers.

    Returns:
        A new Series with the same name and the same index as ``data`` (so it
        can be assigned back to the originating frame without misalignment).

    Raises:
        ValueError: if an 8-character entry does not end in two digits.
    """

    def _rename(item):
        if pd.isna(item):
            return item
        if len(item) == 8:
            number = item[-2:]
            side = "Home" if int(number) < 12 else "Away"
            return f"{side}-P_{number}"
        return f"Home-P_{item[-1:]}"

    renamed = [_rename(item) for item in data]
    return pd.Series(data=renamed, index=data.index, name=data.name)

In [None]:
game_1_event_data = pd.read_csv("data/Sample_Game_1/Sample_Game_1_RawEventsData.csv")
game_2_event_data = pd.read_csv("data/Sample_Game_2/Sample_Game_2_RawEventsData.csv")

In [None]:
game_1_event_data

In [None]:
game_2_event_data

In [None]:
game_1_event_data = game_1_event_data[["Type", "Start Frame", "End Frame", "From", "To"]]
game_2_event_data = game_2_event_data[["Type", "Start Frame", "End Frame", "From", "To"]]

In [None]:
game_1_event_data

In [None]:
def fix_events(event_dataset):
    """Return a copy of the events in which no row starts on the previous row's end frame.

    Whenever a row's "Start Frame" equals the "End Frame" of the row directly
    before it, that row's "Start Frame" is increased by one. The input frame
    is left untouched.
    """
    adjusted = event_dataset.copy()

    starts = adjusted["Start Frame"].to_numpy()
    ends = adjusted["End Frame"].to_numpy()

    # Compare row k's start with row k-1's end; the +1 maps the hit back to row k.
    clash_positions = np.flatnonzero(starts[1:] == ends[:-1]) + 1
    clash_labels = adjusted.index[clash_positions]

    adjusted.loc[clash_labels, "Start Frame"] = adjusted.loc[clash_labels, "Start Frame"] + 1

    return adjusted

In [None]:
game_1_event_data = fix_events(game_1_event_data)
game_2_event_data = fix_events(game_2_event_data)

In [None]:
game_1_event_data

In [None]:
# Build a per-frame event label table for game 1 (index = frame, column "Type").
# The first event row is skipped by the `.iloc[1:]` slices.
start_frames = game_1_event_data["Start Frame"].iloc[1:].to_numpy()
end_frames = game_1_event_data["End Frame"].iloc[1:].to_numpy()
event_types = game_1_event_data["Type"].iloc[1:].to_numpy()

assert start_frames.shape == end_frames.shape == event_types.shape

# Clip every event end to the last index value of the player data.
end = game_1_player_data.index[-1]
end_frames = np.minimum(end_frames, end)

# min/max keeps each range ascending even when a start lies after its end
# (possible after the +1 shift in fix_events or after the clipping above).
frame_ranges = [np.arange(min(i, j), max(i, j) + 1) for i, j in zip(start_frames, end_frames)]
unique_indices  = np.unique(np.concatenate(frame_ranges))
game_1_event = pd.DataFrame(index=unique_indices, columns=["Type"])

# Events are written in order, so a later event overwrites an earlier one on shared frames.
# NOTE(review): unlike frame_ranges, this slice is not reordered; for s > e it
# presumably selects nothing, leaving those frames NaN (dropped below) — confirm intended.
for s, e, e_t in zip(start_frames, end_frames, event_types):
    game_1_event.loc[s:e, "Type"] = e_t

# Discard frames that never received a label.
game_1_event = game_1_event.dropna()

In [None]:
game_1_event

In [None]:
# Build a per-frame event label table for game 2 (index = frame, column "Type").
# Same procedure as the game 1 cell; the first event row is skipped by `.iloc[1:]`.
start_frames = game_2_event_data["Start Frame"].iloc[1:].to_numpy()
end_frames = game_2_event_data["End Frame"].iloc[1:].to_numpy()
event_types = game_2_event_data["Type"].iloc[1:].to_numpy()

assert start_frames.shape == end_frames.shape == event_types.shape

# Clip every event end to the last index value of the player data.
end = game_2_player_data.index[-1]
end_frames = np.minimum(end_frames, end)

# min/max keeps each range ascending even when a start lies after its end.
frame_ranges = [np.arange(min(i, j), max(i, j) + 1) for i, j in zip(start_frames, end_frames)]
unique_indices  = np.unique(np.concatenate(frame_ranges))
game_2_event = pd.DataFrame(index=unique_indices, columns=["Type"])

# Events are written in order, so a later event overwrites an earlier one on shared frames.
# NOTE(review): this slice is not reordered like frame_ranges; for s > e it
# presumably selects nothing — confirm intended.
for s, e, e_t in zip(start_frames, end_frames, event_types):
    game_2_event.loc[s:e, "Type"] = e_t

# Discard frames that never received a label.
game_2_event = game_2_event.dropna()

In [None]:
game_2_event

In [None]:
valid_indices = game_1_event.index.intersection(game_1_player_data.index)
X_1 = game_1_player_data.loc[valid_indices]
y_1 = game_1_event.loc[valid_indices, "Type"]

In [None]:
valid_indices = game_2_event.index.intersection(game_2_player_data.index)
X_2 = game_2_player_data.loc[valid_indices]
y_2 = game_2_event.loc[valid_indices, "Type"]

In [None]:
X_1 = game_1_pre.expand_dataset(dataset=X_1, look_back=50)
X_2 = game_2_pre.expand_dataset(dataset=X_2, look_back=50)

y_1 = game_1_pre.expand_dataset(dataset=y_1, look_back=50)
y_2 = game_2_pre.expand_dataset(dataset=y_2, look_back=50)

In [None]:
X = pd.concat([X_1])#, X_2])
y = y_1#np.concatenate((y_1, y_2))

assert X.shape[0] == y.shape[0]

In [None]:
np.unique(y, return_counts=True)

In [None]:
# Oversample with SMOTE using a single nearest neighbour; the resampled data
# replaces X and y, so re-running this cell resamples already-resampled data.
# NOTE(review): this runs before the train/test split further down, so
# synthetic rows and the rows they were derived from can end up on opposite
# sides of a split — confirm this is intended.
sme = SMOTE(k_neighbors=1)
X, y = sme.fit_resample(X, y)

In [None]:
# Shape after oversampling: the resampled features are stored back into `X`
# (no `X_res` variable is ever assigned in this notebook).
X.shape

In [None]:
# Five independent 80/20 hold-out splits of (X, y). `splits` keeps the raw
# partitions, `scaled` keeps the same partitions after standardization.
splits = {"X_train": [], "X_test": [], "y_train": [], "y_test": []}
scaled = {"X_train": [], "X_test": [], "y_train": [], "y_test": []}

for i in range(5):
    # Seed with the split number: the five splits still differ from each
    # other, but a Restart & Run All reproduces exactly the same partitions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    splits["X_train"].append(X_train)
    splits["X_test"].append(X_test)
    splits["y_train"].append(y_train)
    splits["y_test"].append(y_test)

    # Fit the scaler on the training part only, then apply it to the test part.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Disabled: per-split oversampling of the training part only.
    # sme = SMOTE(n_jobs=-2, k_neighbors=1)
    # X_train_res, y_train_res = sme.fit_resample(X_train_scaled, y_train)

    scaled["X_train"].append(X_train_scaled)
    scaled["X_test"].append(X_test_scaled)
    scaled["y_train"].append(y_train)
    scaled["y_test"].append(y_test)


In [None]:
[i.shape for i in splits["X_train"]]

In [None]:
for k, v in scaled.items():
    for i in v:
        print(k, i.shape)

In [None]:
warnings.filterwarnings("always")

In [None]:
def conf_matrix(y_train, y_train_pred, y_test, y_pred, labels, split, model_name=""):
    """Plot the test-set confusion matrix for one split and print train/test accuracy.

    Each heatmap cell shows "<count> | <row-normalized share>"; the colour
    encodes the row-normalized share. `split` is zero-based and displayed
    one-based. The training arguments are only used for the accuracy printout.
    """
    split_no = split + 1

    counts = confusion_matrix(y_test, y_pred, labels=labels)
    shares = confusion_matrix(y_test, y_pred, labels=labels, normalize="true")

    # One "<count> | <share>" string per cell, laid out like the matrices.
    cell_text = [
        [f"{count} | {share:.2f}" for count, share in zip(count_row, share_row)]
        for count_row, share_row in zip(counts, shares)
    ]

    _, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        shares,
        annot=cell_text,
        fmt="",
        cmap="viridis",
        xticklabels=labels,
        yticklabels=labels,
        cbar_kws={"label": "Normalized Frequency"},
        ax=ax,
    )
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(f"{model_name.title()} Confusion Matrix Split #{split_no}")
    plt.tight_layout()
    plt.show()

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Split #{split_no} {model_name} training accuracy: {train_accuracy * 100:0.2f}%")
    print(f"Split #{split_no} {model_name} testing accuracy: {test_accuracy * 100:0.2f}%\n")

In [None]:
# `X_res` is never assigned in this notebook; the resampled features live in `X`.
X.shape

In [None]:
# Serialize the current `model` object to model.pickle with the highest pickle protocol.
# NOTE(review): reading the notebook top to bottom, nothing has assigned `model`
# before this cell (the LSTM is instantiated further down), so this raises
# NameError on a fresh Restart & Run All — confirm where this cell belongs.
with open("model.pickle", "wb") as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# New Dataset

In [None]:
new_data = pd.read_csv("data/Sample_Game_3/Sample_Game_3_tracking.txt", sep=r"[;,:]", header=None, engine="python")
new_data.index =  range(1, len(new_data) + 1)

In [None]:
# One timestamp per row, 0.04 s apart, starting at 0.04. Built from an integer
# ramp: np.arange with a float step can return one element more or fewer than
# expected, which would make the insert below fail on a length mismatch.
time = (np.arange(1, len(new_data) + 1) * 0.04).round(2)
new_data.insert(1, "Time [s]", time)

In [None]:
columns = ['Frame', 'Time [s]', 'Home-Player11-x', 'Home-Player11-y', 'Home-Player1-x', 'Home-Player1-y', 'Home-Player2-x', 'Home-Player2-y',
       'Home-Player3-x', 'Home-Player3-y', 'Home-Player4-x', 'Home-Player4-y', 'Home-Player5-x', 'Home-Player5-y', 'Home-Player6-x', 'Home-Player6-y',
       'Home-Player7-x', 'Home-Player7-y', 'Home-Player8-x', 'Home-Player8-y', 'Home-Player9-x', 'Home-Player9-y', 'Home-Player10-x',
       'Home-Player10-y', 'Away-Player25-x', 'Away-Player25-y', 'Away-Player15-x', 'Away-Player15-y', 'Away-Player16-x',
       'Away-Player16-y', 'Away-Player17-x', 'Away-Player17-y', 'Away-Player18-x', 'Away-Player18-y', 'Away-Player19-x',
       'Away-Player19-y', 'Away-Player20-x', 'Away-Player20-y', 'Away-Player21-x', 'Away-Player21-y', 'Away-Player22-x',
       'Away-Player22-y', 'Away-Player23-x', 'Away-Player23-y', 'Away-Player24-x', 'Away-Player24-y', "Ball-x", "Ball-y"]

new_data.columns = columns

In [None]:
new_data

In [None]:
vel_acc_new_data = game_1_pre.vel_acc(new_data)
vel_acc_new_data

In [None]:
new_data_ply_tra = vel_acc_new_data.iloc[:, 2:]
new_data_ply_tra

In [None]:
new_data_player_data = game_1_pre.get_frames(new_data_ply_tra, columns=new_data_ply_tra.columns[0:46:2], frame=1000000, frame_interval=1000000).fillna(0)
new_data_player_data

In [None]:
with open("data/Sample_Game_3/Sample_Game_3_events.json", "r") as f:
    data = json.load(f)

In [None]:
# Collect type, start frame and end frame of every event except "CARRY".
# The columns are built in one pass each: np.append copies the whole array on
# every call (quadratic in a loop) and routed the frame numbers through float64.
kept_events = [event for event in data["data"] if event["type"]["name"] != "CARRY"]

type_ = np.array([event["type"]["name"] for event in kept_events])
start_frame = np.array([event["start"]["frame"] for event in kept_events], dtype=np.int64)
end_frame = np.array([event["end"]["frame"] for event in kept_events], dtype=np.int64)

new_data_event_data = pd.DataFrame({"Type": type_, "Start Frame": start_frame, "End Frame": end_frame})

In [None]:
new_data_event_data

In [None]:
new_data_event_data = fix_events(new_data_event_data)

In [None]:
# Build a per-frame event label table for the new dataset (index = frame, column "Type").
# Same procedure as the game 1 / game 2 cells; the first event row is skipped by `.iloc[1:]`.
start_frames = new_data_event_data["Start Frame"].iloc[1:].to_numpy()
end_frames = new_data_event_data["End Frame"].iloc[1:].to_numpy()
event_types = new_data_event_data["Type"].iloc[1:].to_numpy()

assert start_frames.shape == end_frames.shape == event_types.shape

# Clip every event end to the last index value of the player data.
end = new_data_player_data.index[-1]
end_frames = np.minimum(end_frames, end)

# min/max keeps each range ascending even when a start lies after its end.
frame_ranges = [np.arange(min(i, j), max(i, j) + 1) for i, j in zip(start_frames, end_frames)]
unique_indices  = np.unique(np.concatenate(frame_ranges))
new_data_event = pd.DataFrame(index=unique_indices, columns=["Type"])

# Events are written in order, so a later event overwrites an earlier one on shared frames.
# NOTE(review): this slice is not reordered like frame_ranges; for s > e it
# presumably selects nothing — confirm intended.
for s, e, e_t in zip(start_frames, end_frames, event_types):
    new_data_event.loc[s:e, "Type"] = e_t

# Discard frames that never received a label.
new_data_event = new_data_event.dropna()

In [None]:
new_data_event

In [None]:
new_data_event_data[new_data_event_data["Type"] == "SET PIECE"]

In [None]:
# with open("model.pickle", "rb") as f:
#     loaded_model = pickle.load(f)

In [None]:
valid_indices = new_data_event.index.intersection(new_data_player_data.index)
X_new = new_data_player_data.loc[valid_indices]
y_new = new_data_event.loc[valid_indices, "Type"]

In [None]:
X_new = game_1_pre.expand_dataset(dataset=X_new, look_back=50)
y_new = game_1_pre.expand_dataset(dataset=y_new, look_back=50)

In [None]:
np.unique(y, return_counts=True)

In [None]:
np.unique(y_2, return_counts=True)

In [None]:
np.unique(y_new, return_counts=True)

In [None]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# y_new = label_encoder.transform(y_new)
y_2 = label_encoder.transform(y_2)

In [None]:
print(X.shape)
print(X_2.shape)
# print(X_new.shape)

In [None]:
# TEMP SOLUTIONS
# Truncate both datasets to hard-coded row counts. Both numbers are exact
# multiples of 50 (42650 * 50 and 44122 * 50), presumably so the later reshape
# into 50-step sequences divides evenly — TODO derive these from the data
# length instead of hard-coding them.
X = X[:2132500]
y = y[:2132500]

X_2 = X_2[:2206100]
y_2 = y_2[:2206100]

# X_new = X_new[:57350]
# y_new = y_new[:57350]

In [None]:
print(X.shape)
print(X_2.shape)
# print(X_new.shape)

In [None]:
np.unique(y_new, return_counts=True)

In [None]:
assert X_new.shape[0] == y_new.shape[0]

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
# X_new = scaler.transform(X_new)
X_2 = scaler.transform(X_2)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
np.unique(y).shape, np.unique(y_2).shape, np.unique(y_new).shape

In [None]:
assert X.shape[1] == X_2.shape[1]# == X_new.shape[1]
# assert np.unique(y).shape == np.unique(y_2).shape == np.unique(y_new).shape

input_size = X.shape[1]
hidden_size = 256
num_layers = 2
num_classes = np.unique(y).shape[0]
sequence_length = 50
learning_rate = 0.001
batch_size = 256
num_epochs = 50

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear classifier over all time steps.

    The LSTM outputs of every time step are flattened into one vector per
    sample and passed to a single linear layer, so the input must always have
    exactly ``seq_len`` time steps.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers (dropout 0.2 between layers).
        num_classes: number of output logits.
        seq_len: time steps per sample. When omitted, the notebook-level
            ``sequence_length`` is used, which keeps existing calls working.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the notebook hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize the LSTM and classifier weights; zero the classifier bias."""
        # LSTM biases are left at their default initialization.
        for name, param in self.lstm.named_parameters():
            if "weight" in name:
                nn.init.xavier_uniform_(param)

        nn.init.xavier_uniform_(self.fc.weight)

        if self.fc.bias is not None:
            nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, num_classes)`` logits."""
        # With no initial state given, nn.LSTM starts from zeros created on the
        # input's own device and dtype, so the model no longer depends on a
        # global `device` matching where the batch actually lives.
        out, _ = self.lstm(x)
        # Flatten all time steps: (batch, seq_len, hidden) -> (batch, seq_len * hidden).
        out = out.reshape(out.shape[0], -1)
        return self.fc(out)

In [None]:
# Dataloader
class CustomDataset(Dataset):
    """In-memory dataset pairing a float32 feature tensor with an int64 label tensor."""

    def __init__(self, features, labels):
        # Objects exposing `.values` (e.g. pandas containers) contribute that
        # attribute; anything else is handed to torch.tensor as given.
        feature_source = getattr(features, "values", features)
        self.features = torch.tensor(feature_source, dtype=torch.float32)

        label_source = getattr(labels, "values", labels)
        self.labels = torch.tensor(label_source, dtype=torch.long)

    def __len__(self):
        # Dataset size is defined by the features' leading dimension.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Reshape the NumPy array: every `sequence_length` consecutive rows become one sample.
X_reshaped = X.reshape(-1, sequence_length, input_size)
X_2_reshaped = X_2.reshape(-1, sequence_length, input_size)
# X_new_reshaped = X_new.reshape(-1, sequence_length, input_size)

# y / y_2 still hold one label per ROW, while the features now hold one entry
# per SEQUENCE. Passing them through unchanged pairs sequence k with the label
# of row k and never uses most labels. Keep one label per sequence instead.
# NOTE(review): the label of the last row of each window is used as the
# sequence target — confirm this matches how expand_dataset lays out rows.
y_seq = y[sequence_length - 1::sequence_length]
y_2_seq = y_2[sequence_length - 1::sequence_length]

assert len(X_reshaped) == len(y_seq), "training features and labels are out of step"
assert len(X_2_reshaped) == len(y_2_seq), "validation features and labels are out of step"

# Create DataLoaders
training_dataset = CustomDataset(X_reshaped, y_seq)
validation_dataset = CustomDataset(X_2_reshaped, y_2_seq)
# testing_dataset = CustomDataset(X_new_reshaped, y_new)

training_loader = DataLoader(
    dataset=training_dataset,
    batch_size=batch_size,
    shuffle=False,
)

validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# testing_loader = DataLoader(
#     dataset=testing_dataset, 
#     batch_size=batch_size, 
#     shuffle=False,
# )

In [None]:
model = LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)
model

In [None]:
# Train for `num_epochs` epochs with cross-entropy loss and Adam, reporting
# training and validation accuracy after every epoch.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # Running counts of correct predictions; converted to fractions after each pass.
    train_acc = 0
    val_acc = 0
    
    model.train()
    
    # NOTE: the loop variable `data` rebinds the name an earlier cell used for
    # the loaded events JSON.
    for data, label in training_loader:
        data, label = data.to(device), label.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(data)
        
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        
        # Predicted class = index of the largest logit. Training accuracy is
        # accumulated while the weights are still being updated within the epoch.
        _, pred = torch.max(outputs, 1)
        train_acc += (pred == label).sum().item()

    train_acc /= len(training_loader.dataset)

    # Validation pass: evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        for data, label in validation_loader:
            data, label = data.to(device), label.to(device)
        
            outputs = model(data)
            
            _, pred = torch.max(outputs.data, 1)
            val_acc += (pred == label).sum().item()

    val_acc /= len(validation_loader.dataset)
    
    print(f"Epoch[{epoch + 1}] | training accuracy: {train_acc * 100:0.2f}% | validation accuracy: {val_acc * 100:0.2f}%")

In [None]:
# Accuracy of the trained model over `testing_loader`.
# NOTE(review): `testing_loader` is only created in commented-out code in this
# notebook (as are the scaling of X_new and the label encoding of y_new), so
# this cell raises NameError on a fresh Restart & Run All unless those lines
# are restored — confirm.
test_acc = 0

model.eval()

with torch.no_grad():
    for data, label in testing_loader:
        data, label = data.to(device), label.to(device)
        
        outputs = model(data)
        
        # Predicted class = index of the largest logit.
        _, pred = torch.max(outputs.data, 1)
        test_acc += (pred == label).sum().item()

    # Fraction of correctly classified samples.
    test_acc /= len(testing_loader.dataset)

print(f"Accuracy on test set: {test_acc * 100:.2f}%")