In [39]:
import os

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the cleaned fighter table from the processed-data folder
clean_csv_path = "../data/processed/fighters_clean.csv"
df = pd.read_csv(clean_csv_path)

In [19]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,SLpM,SApM,Str_Acc,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg,Height_in,Reach_in,Age,Win_Ratio,Total_Fights
0,2.83,3.77,0.5,0.5,1.2,0.45,0.71,1.0,74.0,76.0,41.041096,0.666667,9.0
1,6.23,6.7,0.63,0.53,1.5,0.3,0.92,0.0,70.0,70.0,36.106849,0.85,20.0
2,5.67,3.77,0.5,0.6,0.51,0.44,0.58,0.3,72.0,73.0,44.139726,0.708333,24.0
3,3.88,2.93,0.36,0.59,1.54,0.45,0.72,0.1,70.0,76.0,31.712329,0.75,28.0
4,3.49,3.99,0.45,0.53,3.83,0.46,0.63,0.3,69.0,69.0,42.342466,0.692308,26.0


In [12]:
# Count missing values per column
df.isna().sum(axis=0)

SLpM               0
SApM               0
Str_Acc            0
Str_Def            0
TD_Avg             0
TD_Acc             0
TD_Def             0
Sub_Avg            0
Height_in        335
Reach_in        1957
Age              550
Win_Ratio         19
Total_Fights       0
dtype: int64

In [24]:
# Final feature set.
# The order of this list is the column order of the feature matrix built
# from df[features], and of the feature-name file written from it.

features = [
    "SLpM",
    "SApM",
    "Str_Acc",
    "Str_Def",
    "TD_Avg",
    "TD_Acc",
    "TD_Def",
    "Sub_Avg",
    "Height_in",
    "Reach_in",
    "Age",
    "Win_Ratio",
    "Total_Fights",
]

In [41]:
# Save Feature Names

# One feature name per line, in the same order as the `features` list
with open("../data/processed/feature_names.txt", "w") as f:
    f.writelines(feat + "\n" for feat in features)

In [25]:
# Split the feature columns into two groups

# Columns that showed zero missing values in the null count above.
performance_features = [
    "SLpM",
    "SApM",
    "Str_Acc",
    "Str_Def",
    "TD_Avg",
    "TD_Acc",
    "TD_Def",
    "Sub_Avg",
    "Total_Fights",
]

# Columns that contain missing values and are median-imputed in the next step.
# Win_Ratio sits in this list alongside height/reach/age because it also has
# missing values and receives the same treatment.
physical_features = [
    "Height_in",
    "Reach_in",
    "Age",
    "Win_Ratio",
]

In [26]:
# Fill missing values in the `physical_features` columns with each
# column's own median (computed before any filling takes place)

physical_medians = df[physical_features].median()
df[physical_features] = df[physical_features].fillna(physical_medians)

In [27]:
# Re-count missing values, restricted to the selected feature columns
df.loc[:, features].isna().sum()

SLpM            0
SApM            0
Str_Acc         0
Str_Def         0
TD_Avg          0
TD_Acc          0
TD_Def          0
Sub_Avg         0
Height_in       0
Reach_in        0
Age             0
Win_Ratio       0
Total_Fights    0
dtype: int64

In [28]:
# Feature matrix as a NumPy array; columns follow the order of `features`
X = df[features].to_numpy()

In [29]:
# Standardize Features

# Fit the scaler on X, then apply the fitted scaler to that same X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [30]:
# Per-column mean of the scaled matrix
np.mean(X_scaled, axis=0)

array([ 1.07754740e-16, -2.87345973e-17,  2.87345973e-17, -3.99091629e-17,
       -2.71382308e-17,  3.99091629e-18, -2.95327805e-17, -6.42537522e-17,
        1.97470538e-15, -1.49818997e-15, -1.59636651e-17, -2.10720380e-16,
       -1.22920222e-16])

In [31]:
# Per-column standard deviation of the scaled matrix
np.std(X_scaled, axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# Save Scaled Features

# Persist the standardized matrix next to the cleaned CSV
scaled_features_path = "../data/processed/X_scaled.npy"
np.save(scaled_features_path, X_scaled)

In [None]:
# Save the Scaler Object

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout without ../models raises
# FileNotFoundError here and Restart & Run All cannot complete.
scaler_path = "../models/scaler.pkl"
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

# Kept as the last expression so the cell still displays the written path(s)
joblib.dump(scaler, scaler_path)

['../models/scaler.pkl']

In [32]:
# Convert to PyTorch Tensors

# Copy the scaled NumPy matrix into a float32 tensor
X_tensor = torch.tensor(data=X_scaled, dtype=torch.float32)

In [33]:
# Show the tensor dimensions
X_tensor.size()

torch.Size([4451, 13])

In [34]:
# Create Dataset & DataLoader

# Each sample is paired with itself: the same tensor is supplied as both
# elements of every pair the dataset yields.
# NOTE(review): presumably for reconstruction-style (autoencoder) training — confirm.
inputs = targets = X_tensor
dataset = TensorDataset(inputs, targets)

In [35]:
# DataLoader

batch_size = 32

# Shuffle with a dedicated, seeded generator so the batch order is
# reproducible after Restart Kernel -> Run All instead of changing on every
# run. The generator's state advances as it is used, so successive passes
# over the loader still see different permutations.
shuffle_seed = 42
shuffle_generator = torch.Generator().manual_seed(shuffle_seed)

dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)

In [36]:
# Sanity Check DataLoader

# Pull a single batch (if the loader yields any) and show its shape
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    batch_x, batch_y = first_batch
    print(batch_x.shape)

torch.Size([32, 13])


In [43]:
# List the processed-data folder to confirm the saved files are present.
# (`os` is imported in the notebook's top import cell rather than here, so
# all dependencies are declared in one place.)
os.listdir("../data/processed")

['feature_names.txt', 'fighters_clean.csv', 'X_scaled.npy']

In [44]:
# List the models folder to confirm the saved scaler is present
os.listdir(path="../models")


['scaler.pkl']