# Data Preprocessing

# 1. Load Required Libraries

In [17]:
import src.util as utils
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

# 2. Load Configuration File

In [64]:
# Load the project configuration once; the cells below read the dataset paths,
# the predictor/target column names and the output paths from it.
config = utils.load_config()

# 3. Load Dataset

In [4]:
def load_dataset(config_data: dict):
    """Load the train, validation and test splits as combined DataFrames.

    For each of the config keys ``train_set_path``, ``valid_set_path`` and
    ``test_set_path``, entry ``[0]`` is unpickled as the features and entry
    ``[1]`` as the label; the two are then joined column-wise.

    Parameters
    ----------
    config_data : dict
        Configuration mapping holding the three path entries above.

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``, each the column-wise
        concatenation of that split's features and label.
    """
    combined_sets = []
    for path_key in ("train_set_path", "valid_set_path", "test_set_path"):
        split_paths = config_data[path_key]
        features = utils.pickle_load(split_paths[0])
        label = utils.pickle_load(split_paths[1])
        # axis = 1 places the label next to the feature columns
        combined_sets.append(pd.concat([features, label], axis = 1))

    train_set, valid_set, test_set = combined_sets
    return train_set, valid_set, test_set

In [5]:
# Read the three splits from the paths listed in the config
train_set, valid_set, test_set = load_dataset(config)

In [6]:
# Column selectors taken from the config: predictor columns and target column
preds = config['predictors']
target = config['target']

In [9]:
# Preview the first rows of the training split
train_set.head()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
3314,0,-1,-1,-1,0,1,0,-1,0,-5.08,-2.54,6,1
948,1,0,-1,-3,-2,-12,0,0,-3,2.54,0.0,-1,0
2342,1,-1,0,-1,-3,-1,0,-4,1,0.0,5.08,-2,0
3306,0,0,0,0,-1,-4,0,0,1,5.08,10.16,-5,1
306,1,-1,-1,-1,1,0,0,0,0,-5.08,0.0,3,0


# 4. Balancing Train Label

In [8]:
# Class counts of the target in the training split
train_set[target].value_counts()

1    1999
0    1426
Name: Winner, dtype: int64

In [10]:
# The EDA showed that the training label is imbalanced, so the training split
# is rebalanced with random undersampling (seeded for reproducibility).
undersampler = RandomUnderSampler(random_state = 99)
x_rus, y_rus = undersampler.fit_resample(train_set[preds], train_set[target])

# Reassemble the resampled predictors and label into a single frame
train_set_bal = pd.concat([x_rus, y_rus], axis = 1)

In [11]:
# Class counts of the target after undersampling
train_set_bal[target].value_counts()

0    1426
1    1426
Name: Winner, dtype: int64

# 5. Scale

In [22]:
# Fit the scaler on the balanced training predictors only
scaler = StandardScaler().fit(train_set_bal[preds])

# Tabulate the fitted per-feature mean and variance for inspection
scaler_desc_df = pd.DataFrame(
    np.vstack([scaler.mean_, scaler.var_]),
    index = ['mean', 'var'],
    columns = scaler.get_feature_names_out()
)
scaler_desc_df

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif
mean,0.179523,0.100281,0.706872,1.401823,0.74439,5.181627,0.297335,0.496844,0.279102,-0.010288,0.172616,0.878682
var,0.996523,2.625287,3.858676,16.232647,7.992518,252.187909,2.353387,4.026287,2.96558,40.664362,70.000055,26.881495


In [52]:
def scale_transform(df: pd.DataFrame, scaler: StandardScaler, preds: list = preds, target: str = target) -> pd.DataFrame:
    """Scale the predictor columns of ``df`` and re-attach the target column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the predictor columns and the target column.
    scaler : StandardScaler
        Scaler used to transform the predictors. Only ``transform`` is called
        here, so it must already be fitted and no statistics are re-estimated
        from ``df``.
    preds : list
        Predictor column names. Defaults to the notebook-level ``preds`` as it
        was when this cell was executed.
    target : str
        Target column label (a single column name in this notebook). Defaults
        to the notebook-level ``target`` as it was when this cell was executed.

    Returns
    -------
    pd.DataFrame
        The scaled predictors followed by the unscaled target, carrying the
        index and predictor column names of ``df``.
    """
    # Select the predictors once and reuse the selection for the column labels
    features = df[preds]
    df_scaled = pd.DataFrame(
        scaler.transform(features),
        index = df.index,
        columns = features.columns
    )

    # The target is appended as-is; it is never passed through the scaler
    return pd.concat([df_scaled, df[target]], axis=1)


In [53]:
# Scale the balanced training split with the scaler that was fitted on it
train_set_scaled = scale_transform(train_set_bal, scaler)
train_set_scaled

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
0,0.821907,-0.061891,-0.868924,-1.092541,-0.970742,-1.081937,-0.193820,-0.247610,-1.904146,0.399928,-0.020632,-0.362349,0
1,0.821907,-0.679071,-0.359850,-0.596137,-1.324461,-0.389261,-0.193820,-2.241070,0.418619,0.001613,0.586544,-0.555222,0
2,0.821907,-0.679071,-0.868924,-0.596137,0.090414,-0.326290,-0.193820,-0.247610,-0.162072,-0.795017,-0.020632,0.409147,0
3,0.821907,-0.061891,-0.359850,-0.347935,0.090414,-0.137378,-0.193820,-0.247610,-0.162072,-0.795017,0.586544,0.216273,0
4,-0.179836,0.555289,0.149224,-0.347935,-1.324461,-0.767084,-0.193820,-0.247610,-0.162072,0.399928,0.282956,-0.169475,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2847,-0.179836,-0.061891,-0.359850,-0.099733,-0.263305,-0.389261,-0.193820,0.749120,-0.162072,-0.795017,-0.931395,0.216273,1
2848,-0.179836,1.172468,0.658298,0.148469,-0.617024,-0.578173,-0.193820,0.250755,0.999310,-0.396702,0.890132,-0.555222,1
2849,2.825393,-0.061891,-0.868924,-1.588946,-0.263305,-1.081937,-0.845679,-0.745975,-0.742763,0.001613,-0.324219,0.794894,1
2850,-0.179836,-0.061891,-0.359850,-0.347935,-0.263305,-0.326290,-0.193820,-0.247610,-0.162072,0.001613,-1.234983,0.409147,1


In [56]:
# Sanity check: undo the scaling and confirm the original predictor values
# come back. The frame is built without column names, hence the positional
# column labels in the preview.
dummy_df = pd.DataFrame(scaler.inverse_transform(train_set_scaled[preds]))

# Verify every row programmatically instead of comparing head() output by eye;
# atol absorbs floating-point round-off on values that are exactly 0.
np.testing.assert_allclose(
    dummy_df.to_numpy(),
    train_set_bal[preds].to_numpy(),
    atol = 1e-8
)
dummy_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,-1.0,-3.0,-2.0,-12.0,0.0,0.0,-3.0,2.54,0.0,-1.0
1,1.0,-1.0,0.0,-1.0,-3.0,-1.0,0.0,-4.0,1.0,0.0,5.08,-2.0
2,1.0,-1.0,-1.0,-1.0,1.0,0.0,0.0,0.0,0.0,-5.08,0.0,3.0
3,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,-5.08,5.08,2.0
4,0.0,1.0,1.0,0.0,-3.0,-7.0,0.0,0.0,0.0,2.54,2.54,0.0


In [57]:
# Unscaled balanced rows, for comparison with the inverse-transformed values
train_set_bal.head()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
0,1,0,-1,-3,-2,-12,0,0,-3,2.54,0.0,-1,0
1,1,-1,0,-1,-3,-1,0,-4,1,0.0,5.08,-2,0
2,1,-1,-1,-1,1,0,0,0,0,-5.08,0.0,3,0
3,1,0,0,0,1,3,0,0,0,-5.08,5.08,2,0
4,0,1,1,0,-3,-7,0,0,0,2.54,2.54,0,0


In [62]:
# The inverse transform reproduces the original values, so the scaler works as
# expected. Apply the same fitted scaler to the validation and test sets.

valid_set_scaled = scale_transform(valid_set, scaler)
valid_set_scaled.head()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
1351,-0.179836,0.555289,0.149224,-0.099733,0.090414,-0.074408,-0.19382,0.250755,-0.162072,-1.193332,-0.324219,-0.362349,1
820,-0.179836,-0.061891,-0.35985,-0.347935,-0.970742,-0.704114,-0.19382,-0.24761,-0.162072,0.399928,-0.020632,-0.362349,0
663,-0.179836,-0.679071,-0.35985,-0.347935,1.505289,-0.578173,0.458038,0.250755,-1.904146,0.798243,0.586544,0.023399,0
798,-0.179836,6.727085,5.239962,0.893075,-1.67818,1.625799,8.28034,-3.2378,2.741383,0.798243,2.408072,-0.94097,1
135,-2.183322,-0.061891,-0.35985,0.396671,0.444133,0.807181,-0.19382,-0.745975,0.99931,0.798243,0.282956,0.409147,1


In [61]:
# Scale the test split with the same training-fitted scaler
test_set_scaled = scale_transform(test_set, scaler)
test_set_scaled.head()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
2827,-0.179836,-0.061891,-0.35985,-0.347935,-0.263305,-0.452231,-0.19382,-0.24761,-0.162072,-0.396702,0.282956,-0.748096,0
3628,0.821907,-0.061891,0.149224,0.396671,0.797852,0.240445,-0.19382,0.250755,0.99931,0.798243,0.890132,0.023399,1
3018,0.821907,-1.29625,0.658298,1.389479,1.505289,1.122034,-0.19382,1.247485,0.418619,0.399928,0.586544,2.145011,0
1481,-0.179836,-1.29625,-1.887072,-3.078158,-3.093056,-2.656203,-0.845679,-1.24434,-1.323454,-0.795017,-0.020632,-2.098213,1
4460,-0.179836,0.555289,1.676445,1.637681,1.505289,1.81471,-0.19382,0.250755,0.418619,-1.193332,-0.627807,-0.362349,0


# 6. Dump

In [66]:
# Persist each scaled split as two pickles: the predictors go to the first
# path of its config entry and the target to the second.
for scaled_set, path_key in (
    (train_set_scaled, "train_set_modelready_path"),
    (valid_set_scaled, "valid_set_modelready_path"),
    (test_set_scaled, "test_set_modelready_path"),
):
    utils.pickle_dump(scaled_set[preds], config[path_key][0])
    utils.pickle_dump(scaled_set[target], config[path_key][1])

# Also store the fitted scaler itself
utils.pickle_dump(scaler, config["scaler_path"])