# Data Preparation

In this notebook, we load, validate, defend, and split our data.


# 1. Load Required Libraries

In [436]:
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

# 2. Load Configuration File

In [437]:
config = utils.load_config()

# 3. Load Dataset

In [558]:
dataset = pd.read_csv(config["dataset_raw_path"])

In [439]:
dataset

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0
1,Alex Oliveira,Niko Price,170.0,-200,170.000000,50.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.000000,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.000000,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Duane Ludwig,Darren Elkins,-155.0,135,64.516129,135.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Blue,...,,1.0,0:44,44.0,,,,,,
4892,John Howard,Daniel Roberts,-210.0,175,47.619048,175.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,Punch,1.0,2:01,121.0,,,,,,
4893,Brendan Schaub,Chase Gormley,-260.0,220,38.461538,220.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,Punches,1.0,0:47,47.0,,,,,,
4894,Mike Pierce,Julio Paulino,-420.0,335,23.809524,335.000000,3/21/2010,"Broomfield, Colorado, USA",USA,Red,...,,3.0,5:00,900.0,,,,,,


# 4. Data Definition

In [440]:
# the target for our project would be 'Winner', let's assign it on our config file
# and check our target field

target = ['Winner']
dataset[target].value_counts()

Winner
Red       2859
Blue      2037
dtype: int64

In [441]:
# it only has two values, so we would make the classification binary
# let's assign 1 for Red, and 0 for Blue

winner_series = dataset[target].copy()
winner_series.replace('Red', 1, inplace=True)
winner_series.replace('Blue', 0, inplace=True)
winner_series.value_counts()

Winner
1         2859
0         2037
dtype: int64

In [442]:
dataset[target] = winner_series
dataset[target]

Unnamed: 0,Winner
0,1
1,0
2,0
3,1
4,0
...,...
4891,0
4892,1
4893,1
4894,1


In [443]:
# also notice that our dataset contains many fields
# let's check if we can drop some of the fields that wouldn't be need for our project

In [444]:
# counting number of field that has a nan value
dataset.isnull().sum().loc[lambda x : x >= 1].count()

52

In [445]:
dataset.isnull().sum().loc[lambda x : x >= 1].sort_values(ascending=False)

B_Women's Featherweight_rank    4896
R_Women's Featherweight_rank    4889
B_Pound-for-Pound_rank          4861
B_Women's Flyweight_rank        4852
R_Women's Flyweight_rank        4837
B_Women's Strawweight_rank      4835
B_Women's Bantamweight_rank     4818
B_Bantamweight_rank             4811
B_Lightweight_rank              4809
B_Welterweight_rank             4807
B_Featherweight_rank            4806
B_Light Heavyweight_rank        4803
B_Flyweight_rank                4801
B_Middleweight_rank             4794
R_Women's Strawweight_rank      4792
B_Heavyweight_rank              4786
R_Women's Bantamweight_rank     4778
R_Featherweight_rank            4763
R_Middleweight_rank             4762
R_Bantamweight_rank             4759
R_Lightweight_rank              4757
R_Welterweight_rank             4756
R_Light Heavyweight_rank        4755
R_Flyweight_rank                4754
R_Heavyweight_rank              4754
R_Pound-for-Pound_rank          4730
B_match_weightclass_rank        4019
R

In [446]:
# many '*_rank' field don't have values because the fighter didn't fight in that weight class
# we will omit them, but first let's check every field that have 'rank' in them

In [447]:
rank_field_ind = dataset.columns.str.contains('rank')
dataset.columns[rank_field_ind]

Index(['B_match_weightclass_rank', 'R_match_weightclass_rank',
       'R_Women's Flyweight_rank', 'R_Women's Featherweight_rank',
       'R_Women's Strawweight_rank', 'R_Women's Bantamweight_rank',
       'R_Heavyweight_rank', 'R_Light Heavyweight_rank', 'R_Middleweight_rank',
       'R_Welterweight_rank', 'R_Lightweight_rank', 'R_Featherweight_rank',
       'R_Bantamweight_rank', 'R_Flyweight_rank', 'R_Pound-for-Pound_rank',
       'B_Women's Flyweight_rank', 'B_Women's Featherweight_rank',
       'B_Women's Strawweight_rank', 'B_Women's Bantamweight_rank',
       'B_Heavyweight_rank', 'B_Light Heavyweight_rank', 'B_Middleweight_rank',
       'B_Welterweight_rank', 'B_Lightweight_rank', 'B_Featherweight_rank',
       'B_Bantamweight_rank', 'B_Flyweight_rank', 'B_Pound-for-Pound_rank',
       'better_rank'],
      dtype='object')

In [448]:
# we want to omit all above except 'better_rank'

In [449]:
field_to_omit = dataset.columns[rank_field_ind][:-1]
dataset_omitted_1 = dataset.drop(columns=field_to_omit)
dataset_omitted_1.columns

Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'date',
       'location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
  

In [450]:
dataset_omitted_1.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,1,...,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,0,...,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,0,...,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,1,...,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,0,...,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0


In [451]:
# let's check another fields that has many nan values
dataset_omitted_1.isnull().sum().loc[lambda x : x >= 1].sort_values(ascending=False)

finish_details           2794
b_ko_odds                1062
b_sub_odds               1061
r_ko_odds                1049
r_sub_odds               1049
B_avg_SIG_STR_landed      930
B_avg_TD_pct              842
B_avg_TD_landed           833
B_avg_SUB_ATT             832
b_dec_odds                819
r_dec_odds                803
B_avg_SIG_STR_pct         765
finish_round_time         622
total_fight_time_secs     622
finish_round              622
R_avg_SIG_STR_landed      455
R_avg_TD_pct              367
R_avg_TD_landed           357
R_avg_SUB_ATT             357
R_avg_SIG_STR_pct         357
finish                    238
B_Stance                    2
R_ev                        1
R_odds                      1
dtype: int64

In [452]:
# there are still several fields contains many nan values
# let's check some of them

In [453]:
dataset_omitted_1['finish_details'].value_counts()

Punch                 507
Punches               467
Rear Naked Choke      331
Guillotine Choke      146
Kick                  119
Armbar                 95
Knee                   67
Elbows                 60
Arm Triangle           58
Triangle Choke         46
Elbow                  30
D'Arce Choke           26
Kimura                 21
Flying Knee            20
Knees                  16
Other - Choke          14
Anaconda Choke         14
Kneebar                12
Heel Hook              11
Spinning Back Fist      6
Slam                    6
Spinning Back Kick      6
Neck Crank              5
Other - Lock            5
North-South Choke       4
Injury                  3
Omoplata                2
Kicks                   2
Ankle Lock              1
Peruvian Necktie        1
Keylock                 1
Name: finish_details, dtype: int64

In [454]:
# 'finish*', '*round*', and '*time*' fields may not be out predictor because they have too many categories and doesn't directly say anything about our fighters
# also '*_odds' and '*_ev' fields are out of the scope of our project so let's omit them

In [455]:
frt_field_ind = dataset_omitted_1.columns.str.contains('finish|round|time')
dataset_omitted_1.columns[frt_field_ind]

Index(['no_of_rounds', 'B_total_rounds_fought', 'R_total_rounds_fought',
       'total_round_dif', 'finish', 'finish_details', 'finish_round',
       'finish_round_time', 'total_fight_time_secs'],
      dtype='object')

In [456]:
# let's not select out 'B_total_rounds_fought', 'R_total_rounds_fought', and 'total_round_dif'
frt_field_ind = dataset_omitted_1.columns.str.contains(r'^[^BR]*(finish|round|time){1}(.*[^(dif)]$|$)')
dataset_omitted_1.columns[frt_field_ind]

  frt_field_ind = dataset_omitted_1.columns.str.contains(r'^[^BR]*(finish|round|time){1}(.*[^(dif)]$|$)')


Index(['no_of_rounds', 'finish', 'finish_details', 'finish_round',
       'finish_round_time', 'total_fight_time_secs'],
      dtype='object')

In [457]:
odds_field_ind = dataset_omitted_1.columns.str.contains('odds')
dataset_omitted_1.columns[odds_field_ind]

Index(['R_odds', 'B_odds', 'r_dec_odds', 'b_dec_odds', 'r_sub_odds',
       'b_sub_odds', 'r_ko_odds', 'b_ko_odds'],
      dtype='object')

In [458]:
ev_field_ind = dataset_omitted_1.columns.str.contains('ev')
dataset_omitted_1.columns[ev_field_ind]

Index(['R_ev', 'B_ev'], dtype='object')

In [459]:
field_to_omit = dataset_omitted_1.columns[frt_field_ind | odds_field_ind | ev_field_ind]
dataset_omitted_2 = dataset_omitted_1.drop(columns=field_to_omit)
dataset_omitted_2.columns

Index(['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner',
       'title_bout', 'weight_class', 'gender', 'B_current_lose_streak',
       'B_current_win_streak', 'B_draw', 'B_avg_SIG_STR_landed',
       'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct',
       'B_longest_win_streak', 'B_losses', 'B_total_rounds_fought',
       'B_total_title_bouts', 'B_win_by_Decision_Majority',
       'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous',
       'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
       'R_win_by_Decision_Split', 'R_win_by_Decision_U

In [460]:
dataset_omitted_2.head()

Unnamed: 0,R_fighter,B_fighter,date,location,country,Winner,title_bout,weight_class,gender,B_current_lose_streak,...,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,better_rank
0,Thiago Santos,Johnny Walker,2021-10-02,"Las Vegas, Nevada, USA",USA,1,False,Light Heavyweight,MALE,0,...,0,10.16,15.24,-8,-0.53,0.6,-0.37,1,1,Red
1,Alex Oliveira,Niko Price,2021-10-02,"Las Vegas, Nevada, USA",USA,0,False,Welterweight,MALE,2,...,-1,2.54,0.0,-1,2.19,0.3,-1.48,1,1,neither
2,Misha Cirkunov,Krzysztof Jotko,2021-10-02,"Las Vegas, Nevada, USA",USA,0,False,Middleweight,MALE,1,...,-5,-5.08,0.0,-2,-0.85,-1.6,-3.33,1,1,neither
3,Alexander Hernandez,Mike Breeden,2021-10-02,"Las Vegas, Nevada, USA",USA,1,False,Lightweight,MALE,1,...,0,2.54,-5.08,3,0.25,0.0,-1.57,1,1,neither
4,Joe Solecki,Jared Gordon,2021-10-02,"Las Vegas, Nevada, USA",USA,0,False,Lightweight,MALE,0,...,-2,0.0,-5.08,5,2.58,-0.6,-0.31,1,1,neither


In [461]:
dataset_omitted_2.isnull().sum().loc[lambda x : x >= 1].sort_values(ascending=False)

B_avg_SIG_STR_landed    930
B_avg_TD_pct            842
B_avg_TD_landed         833
B_avg_SUB_ATT           832
B_avg_SIG_STR_pct       765
R_avg_SIG_STR_landed    455
R_avg_TD_pct            367
R_avg_SIG_STR_pct       357
R_avg_SUB_ATT           357
R_avg_TD_landed         357
B_Stance                  2
dtype: int64

In [462]:
dataset_omitted_2.columns

Index(['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner',
       'title_bout', 'weight_class', 'gender', 'B_current_lose_streak',
       'B_current_win_streak', 'B_draw', 'B_avg_SIG_STR_landed',
       'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT', 'B_avg_TD_landed', 'B_avg_TD_pct',
       'B_longest_win_streak', 'B_losses', 'B_total_rounds_fought',
       'B_total_title_bouts', 'B_win_by_Decision_Majority',
       'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous',
       'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
       'R_win_by_Decision_Split', 'R_win_by_Decision_U

In [463]:
# we are still left with many features we can choose from
# for the sake of efficiency and interpretability, we are only going to
# take the 'dif' fields as predictor candidates

dif_ind = dataset_omitted_2.columns.str.contains('dif', case=False)
dif = dataset_omitted_2.columns[dif_ind].to_list()
dif

['lose_streak_dif',
 'win_streak_dif',
 'longest_win_streak_dif',
 'win_dif',
 'loss_dif',
 'total_round_dif',
 'total_title_bout_dif',
 'ko_dif',
 'sub_dif',
 'height_dif',
 'reach_dif',
 'age_dif',
 'sig_str_dif',
 'avg_sub_att_dif',
 'avg_td_dif']

# 5. Data Validation

In [464]:
# from this predictors we're going to validate using our fighters data

dataset_defined = dataset_omitted_2 #renaming

def check_fields(df:pd.DataFrame, keyword:str) -> list:
    # checks if given dataframe has fields with keywords
    field_ind = df.columns.str.contains(keyword, case=False)
    fields = df.columns[field_ind].to_list()

    return fields

def generate_dif(red:pd.Series, blue:pd.Series) -> pd.Series:
    diff = list()
    for i in range(len(red)):
        diff.append(red[i] - blue[i])
    
    diff = pd.Series(diff)
    
    return diff



In [465]:
dif[0]

'lose_streak_dif'

In [466]:
check_fields(dataset_defined, 'lose')

['B_current_lose_streak', 'R_current_lose_streak', 'lose_streak_dif']

In [467]:
lose_streak_dif = generate_dif(red = dataset_defined['R_current_lose_streak'],
                               blue = dataset_defined['B_current_lose_streak'])
lose_streak_dif

0       3
1       0
2       0
3       0
4       0
       ..
4891    1
4892    0
4893    0
4894    1
4895    0
Length: 4896, dtype: int64

In [468]:
dataset_defined['lose_streak_dif']

0      -3
1       0
2       0
3       0
4       0
       ..
4891    1
4892    0
4893    0
4894    1
4895    0
Name: lose_streak_dif, Length: 4896, dtype: int64

In [469]:
dataset_defined['lose_streak_dif'].equals(lose_streak_dif)

False

In [470]:
# it seems there is an inconsistency of the difference value
# i.e. some calculation it's blue minus red, and some calculation
# do the other way.

# we are going to update the all the dif field

In [471]:
dataset_defined['lose_streak_dif'] = lose_streak_dif

In [472]:
dataset_defined['lose_streak_dif']

0       3
1       0
2       0
3       0
4       0
       ..
4891    1
4892    0
4893    0
4894    1
4895    0
Name: lose_streak_dif, Length: 4896, dtype: int64

In [473]:
dif[1]

'win_streak_dif'

In [474]:
check_fields(dataset_defined, "win_streak")

['B_current_win_streak',
 'B_longest_win_streak',
 'R_current_win_streak',
 'R_longest_win_streak',
 'win_streak_dif',
 'longest_win_streak_dif']

In [475]:
win_streak_dif = generate_dif(
    red = dataset_defined['R_current_win_streak'],
    blue = dataset_defined['B_current_win_streak']
)
win_streak_dif

0      -1
1       0
2       0
3       0
4       2
       ..
4891    0
4892    3
4893    0
4894    0
4895    0
Length: 4896, dtype: int64

In [476]:
dataset_defined['win_streak_dif'] = win_streak_dif

In [477]:
# automate some of our work
def update_dif(red_field:str, blue_field:str, dif_field:str):
    diff = generate_dif(
        red = dataset_defined[red_field],
        blue = dataset_defined[blue_field]
    )
    dataset_defined[dif_field] = diff

In [478]:
dif[2]

'longest_win_streak_dif'

In [479]:
check_fields(dataset_defined, "longest_win")

['B_longest_win_streak', 'R_longest_win_streak', 'longest_win_streak_dif']

In [480]:
update_dif('R_longest_win_streak','B_longest_win_streak','longest_win_streak_dif')

In [481]:
dif[3]

'win_dif'

In [482]:
check_fields(dataset_defined, "win")

['Winner',
 'B_current_win_streak',
 'B_longest_win_streak',
 'B_win_by_Decision_Majority',
 'B_win_by_Decision_Split',
 'B_win_by_Decision_Unanimous',
 'B_win_by_KO/TKO',
 'B_win_by_Submission',
 'B_win_by_TKO_Doctor_Stoppage',
 'B_wins',
 'R_current_win_streak',
 'R_longest_win_streak',
 'R_win_by_Decision_Majority',
 'R_win_by_Decision_Split',
 'R_win_by_Decision_Unanimous',
 'R_win_by_KO/TKO',
 'R_win_by_Submission',
 'R_win_by_TKO_Doctor_Stoppage',
 'R_wins',
 'win_streak_dif',
 'longest_win_streak_dif',
 'win_dif']

In [483]:
update_dif('R_wins','B_wins','win_dif')

In [484]:
dif[4]

'loss_dif'

In [485]:
check_fields(dataset_defined, "loss")

['B_losses', 'R_losses', 'loss_dif']

In [486]:
update_dif('R_losses','B_losses','loss_dif')

In [487]:
dif[5]

'total_round_dif'

In [488]:
check_fields(dataset_defined, "total_round")

['B_total_rounds_fought', 'R_total_rounds_fought', 'total_round_dif']

In [489]:
update_dif('R_total_rounds_fought','B_total_rounds_fought','total_round_dif')

In [490]:
dif[6]

'total_title_bout_dif'

In [491]:
check_fields(dataset_defined, "title_bout")

['title_bout',
 'B_total_title_bouts',
 'R_total_title_bouts',
 'total_title_bout_dif']

In [492]:
update_dif('R_total_title_bouts','B_total_title_bouts','total_title_bout_dif')

In [493]:
dif[7]

'ko_dif'

In [494]:
check_fields(dataset_defined, "ko")

['B_win_by_KO/TKO',
 'B_win_by_TKO_Doctor_Stoppage',
 'R_win_by_KO/TKO',
 'R_win_by_TKO_Doctor_Stoppage',
 'ko_dif']

In [495]:
update_dif('R_win_by_KO/TKO','B_win_by_KO/TKO','ko_dif')

In [496]:
dif[8]

'sub_dif'

In [497]:
check_fields(dataset_defined, "sub")

['B_avg_SUB_ATT',
 'B_win_by_Submission',
 'R_avg_SUB_ATT',
 'R_win_by_Submission',
 'sub_dif',
 'avg_sub_att_dif']

In [498]:
update_dif('R_win_by_Submission','B_win_by_Submission','sub_dif')

In [499]:
dif[9]

'height_dif'

In [500]:
check_fields(dataset_defined, "height")

['B_Height_cms', 'R_Height_cms', 'height_dif']

In [501]:
update_dif('R_Height_cms','B_Height_cms','height_dif')

In [502]:
dif[10]

'reach_dif'

In [503]:
check_fields(dataset_defined, "reach")

['B_Reach_cms', 'R_Reach_cms', 'reach_dif']

In [504]:
update_dif('R_Reach_cms','B_Reach_cms','reach_dif')

In [505]:
dif[11]

'age_dif'

In [506]:
check_fields(dataset_defined, "age")

['B_win_by_TKO_Doctor_Stoppage',
 'R_win_by_TKO_Doctor_Stoppage',
 'R_age',
 'B_age',
 'age_dif']

In [507]:
update_dif('R_age','B_age','age_dif')

In [508]:
dif[12]

'sig_str_dif'

In [509]:
check_fields(dataset_defined, "sig_str")

['B_avg_SIG_STR_landed',
 'B_avg_SIG_STR_pct',
 'R_avg_SIG_STR_landed',
 'R_avg_SIG_STR_pct',
 'sig_str_dif']

In [510]:
update_dif('R_avg_SIG_STR_landed','B_avg_SIG_STR_landed','sig_str_dif')

In [511]:
dif[13]

'avg_sub_att_dif'

In [512]:
check_fields(dataset_defined, "sub_att")

['B_avg_SUB_ATT', 'R_avg_SUB_ATT', 'avg_sub_att_dif']

In [513]:
update_dif('R_avg_SUB_ATT','B_avg_SUB_ATT','avg_sub_att_dif')

In [514]:
dif[14]

'avg_td_dif'

In [515]:
check_fields(dataset_defined, "td")

['B_avg_TD_landed',
 'B_avg_TD_pct',
 'R_avg_TD_landed',
 'R_avg_TD_pct',
 'avg_td_dif']

In [516]:
update_dif('R_avg_TD_landed','B_avg_TD_landed','avg_td_dif')

In [517]:
dataset_defined[dif]

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif
0,3,-1,0,8,6,32,1,7,0,-10.16,-15.24,8,0.53,-0.600000,0.370000
1,0,0,2,5,3,20,0,0,1,-2.54,0.00,1,-2.19,-0.300000,1.480000
2,0,0,-1,-3,-1,-25,0,0,5,5.08,0.00,2,0.85,1.600000,3.330000
3,0,0,2,4,2,12,0,2,0,-2.54,5.08,-3,-0.25,0.000000,1.570000
4,0,2,2,-1,-3,-11,0,-1,2,0.00,5.08,-5,-2.58,0.600000,0.310000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,1,0,2,2,1,5,0,1,0,0.00,-2.54,6,,,
4892,0,3,3,3,0,9,0,1,0,-7.62,-7.62,-2,,,
4893,0,0,0,0,0,0,1,0,0,2.54,2.12,0,4.00,-1.000000,-1.000000
4894,1,0,1,1,1,6,0,0,0,-10.16,-7.62,-5,,,


In [518]:
dataset_defined[dif].isnull().sum()

lose_streak_dif              0
win_streak_dif               0
longest_win_streak_dif       0
win_dif                      0
loss_dif                     0
total_round_dif              0
total_title_bout_dif         0
ko_dif                       0
sub_dif                      0
height_dif                   0
reach_dif                    0
age_dif                      0
sig_str_dif               1041
avg_sub_att_dif            943
avg_td_dif                 944
dtype: int64

In [519]:
# again, for the sake of interpretability, we would prefer to select fewer features
# instead of dropping some of our records. We would take 12 fields that have no nan values
# and make it our predictors.

preds = dif[:-3]
preds

['lose_streak_dif',
 'win_streak_dif',
 'longest_win_streak_dif',
 'win_dif',
 'loss_dif',
 'total_round_dif',
 'total_title_bout_dif',
 'ko_dif',
 'sub_dif',
 'height_dif',
 'reach_dif',
 'age_dif']

In [520]:
dataset_defined[preds].isnull().sum()

lose_streak_dif           0
win_streak_dif            0
longest_win_streak_dif    0
win_dif                   0
loss_dif                  0
total_round_dif           0
total_title_bout_dif      0
ko_dif                    0
sub_dif                   0
height_dif                0
reach_dif                 0
age_dif                   0
dtype: int64

In [521]:
# now let's extract some information for our configuration file

for pred in preds:
    print('-', pred)

- lose_streak_dif
- win_streak_dif
- longest_win_streak_dif
- win_dif
- loss_dif
- total_round_dif
- total_title_bout_dif
- ko_dif
- sub_dif
- height_dif
- reach_dif
- age_dif


In [522]:
dataset_defined[preds].dtypes

lose_streak_dif             int64
win_streak_dif              int64
longest_win_streak_dif      int64
win_dif                     int64
loss_dif                    int64
total_round_dif             int64
total_title_bout_dif        int64
ko_dif                      int64
sub_dif                     int64
height_dif                float64
reach_dif                 float64
age_dif                     int64
dtype: object

In [523]:
int_columns = dataset_defined[preds].columns[dataset_defined[preds].dtypes == 'int64']
for column in int_columns:
    print('-', column)

- lose_streak_dif
- win_streak_dif
- longest_win_streak_dif
- win_dif
- loss_dif
- total_round_dif
- total_title_bout_dif
- ko_dif
- sub_dif
- age_dif


In [524]:
float_columns = dataset_defined[preds].columns[dataset_defined[preds].dtypes == 'float64']
for column in float_columns:
    print('-', column)

- height_dif
- reach_dif


In [525]:
# finally, let's check overall description about our defined data
dataset_validated = dataset_defined[preds+target]
dataset_validated.describe()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
count,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0
mean,0.135212,0.175654,0.765931,1.412786,0.667075,5.208538,0.319444,0.488562,0.277574,-0.053721,0.183877,0.538603,0.583946
std,0.990505,1.72945,2.029394,4.030496,2.819143,16.989572,1.649489,2.031889,1.759247,6.428207,8.753398,5.156641,0.492953
min,-5.0,-9.0,-14.0,-23.0,-14.0,-80.0,-14.0,-14.0,-10.0,-30.48,-30.48,-16.0,0.0
25%,0.0,0.0,0.0,0.0,-1.0,-2.0,0.0,0.0,0.0,-5.08,-5.08,-3.0,0.0
50%,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,2.0,3.0,2.0,12.0,0.0,1.0,1.0,5.08,5.08,4.0,1.0
max,6.0,13.0,12.0,28.0,18.0,448.0,16.0,21.0,13.0,33.02,187.96,17.0,1.0


In [526]:
# notice there is some anomaly that:
# 1. reach_dif = 187.96
# 2. total_round_dif = 448
# we'll try to sort out those records

dataset_validated.loc[dataset_validated['reach_dif'] >= 187]

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
703,-1,3,3,3,-1,6,0,0,0,15.24,187.96,0,1


In [527]:
# first for the 'reach_dif'
outlier_ind = dataset_validated.loc[dataset_validated['reach_dif'] >= 187].index
temp_df = dataset_validated.drop(index=outlier_ind).copy()
temp_df.loc[dataset_validated['reach_dif'] >= 187]

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner


In [528]:
dataset_validated = temp_df

In [529]:
# now for the 'total_round_dif'
dataset_validated.loc[dataset_validated['total_round_dif'] == 448]

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
537,-1,1,2,6,3,448,0,0,0,-2.54,-12.7,2,1


In [530]:
outlier_ind = dataset_validated.loc[dataset_validated['total_round_dif'] == 448].index
temp_df = dataset_validated.drop(index=outlier_ind).copy()
temp_df.loc[dataset_validated['total_round_dif'] == 448]

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner


In [531]:
dataset_validated = temp_df

In [532]:
dataset_validated.describe()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
count,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0,4894.0
mean,0.135676,0.174908,0.765223,1.411524,0.666939,5.117899,0.319575,0.488762,0.277687,-0.056338,0.148141,0.538414,0.583776
std,0.990442,1.729292,2.02948,4.030722,2.819421,15.769745,1.649814,2.03228,1.759597,6.425703,8.331381,5.157647,0.492982
min,-5.0,-9.0,-14.0,-23.0,-14.0,-80.0,-14.0,-14.0,-10.0,-30.48,-30.48,-16.0,0.0
25%,0.0,0.0,0.0,0.0,-1.0,-2.0,0.0,0.0,0.0,-5.08,-5.08,-3.0,0.0
50%,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,2.0,3.0,2.0,12.0,0.0,1.0,1.0,5.08,5.08,4.0,1.0
max,6.0,13.0,12.0,28.0,18.0,86.0,16.0,21.0,13.0,33.02,33.02,17.0,1.0


In [533]:
# now our dataset looks more healthy :)

# 5. Data Defense

In [534]:
# now let's make our data defense
# for now we'll only going to check the data types
def check_data(df: pd.DataFrame, config: dict) -> None:
    # check data types
    assert df.select_dtypes('int').columns.to_list() == config['int_columns'], "an error occurs in int column(s)."
    assert df.select_dtypes('float').columns.to_list() == config['float_columns'], "an error occurs in float column(s)."

In [535]:
check_data(dataset_validated, config)

# 6. Data Splitting

In [544]:
x = dataset_validated[config["predictors"]].copy()
y = dataset_validated[config["target"]].copy()

In [545]:
x

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif
0,3,-1,0,8,6,32,1,7,0,-10.16,-15.24,8
1,0,0,2,5,3,20,0,0,1,-2.54,0.00,1
2,0,0,-1,-3,-1,-25,0,0,5,5.08,0.00,2
3,0,0,2,4,2,12,0,2,0,-2.54,5.08,-3
4,0,2,2,-1,-3,-11,0,-1,2,0.00,5.08,-5
...,...,...,...,...,...,...,...,...,...,...,...,...
4891,1,0,2,2,1,5,0,1,0,0.00,-2.54,6
4892,0,3,3,3,0,9,0,1,0,-7.62,-7.62,-2
4893,0,0,0,0,0,0,1,0,0,2.54,2.12,0
4894,1,0,1,1,1,6,0,0,0,-10.16,-7.62,-5


In [546]:
y

0       1
1       0
2       0
3       1
4       0
       ..
4891    0
4892    1
4893    1
4894    1
4895    0
Name: Winner, Length: 4894, dtype: int64

In [547]:
# let's split our dataset into train and test set
test_size = config['test_size']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = 99, stratify = y)

In [548]:
# then split our test set into valid and test set
valid_size = config['valid_size']
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = valid_size, random_state = 99, stratify = y_test)

In [549]:
utils.pickle_dump(dataset_validated, config["dataset_cleaned_path"])

utils.pickle_dump(x_train, config["train_set_path"][0])
utils.pickle_dump(y_train, config["train_set_path"][1])

utils.pickle_dump(x_valid, config["valid_set_path"][0])
utils.pickle_dump(y_valid, config["valid_set_path"][1])

utils.pickle_dump(x_test, config["test_set_path"][0])
utils.pickle_dump(y_test, config["test_set_path"][1])

In [552]:
train_set = pd.concat([x_train, y_train], axis = 1)

In [553]:
train_set.describe()

Unnamed: 0,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,Winner
count,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0,3425.0
mean,0.155912,0.16438,0.745401,1.390657,0.680292,4.987153,0.284672,0.474745,0.290219,0.007007,0.197629,0.630365,0.58365
std,0.991309,1.658881,2.000437,4.061275,2.854274,15.951257,1.612003,2.030007,1.765248,6.36854,8.400836,5.17808,0.493025
min,-5.0,-8.0,-14.0,-23.0,-14.0,-80.0,-14.0,-14.0,-10.0,-30.48,-30.48,-16.0,0.0
25%,0.0,0.0,0.0,0.0,-1.0,-2.0,0.0,0.0,0.0,-5.08,-5.08,-3.0,0.0
50%,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,1.0,1.0,2.0,3.0,2.0,12.0,0.0,1.0,1.0,5.08,5.08,4.0,1.0
max,6.0,12.0,12.0,28.0,18.0,84.0,16.0,21.0,13.0,25.4,30.48,17.0,1.0


In [554]:
train_set.to_csv(config['csv_for_eda_path'], index=False)

# \#2 Attempt

After our failed first attempt to find a nice set of features, let's take a second look at our original dataset.

In [556]:
dataset.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,800.0,900.0,2000.0,1600.0,-110.0,175.0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,350.0,700.0,1100.0,550.0,120.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,550.0,275.0,275.0,1400.0,600.0,185.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,...,Punch,1.0,1:20,80.0,175.0,900.0,500.0,3500.0,110.0,1100.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,...,,3.0,5:00,900.0,165.0,200.0,400.0,1200.0,900.0,600.0


In [557]:
dataset.columns

Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'date',
       'location', 'country', 'Winner',
       ...
       'finish_details', 'finish_round', 'finish_round_time',
       'total_fight_time_secs', 'r_dec_odds', 'b_dec_odds', 'r_sub_odds',
       'b_sub_odds', 'r_ko_odds', 'b_ko_odds'],
      dtype='object', length=119)

## Data Definition

In [606]:
dataset_len = len(dataset)
dataset_len

4896

In [595]:
# show all data types
dataset.dtypes.value_counts()

float64    58
int64      46
object     14
bool        1
dtype: int64

In [620]:
# check boolean data type
dataset.select_dtypes(bool).columns

Index(['title_bout'], dtype='object')

In [607]:
dataset.select_dtypes(bool).value_counts().sum() == dataset_len

True

In [611]:
# check columns with object data type
dataset.select_dtypes(object).columns

Index(['R_fighter', 'B_fighter', 'date', 'location', 'country', 'Winner',
       'weight_class', 'gender', 'B_Stance', 'R_Stance', 'better_rank',
       'finish', 'finish_details', 'finish_round_time'],
      dtype='object')

In [615]:
# show value counts for every columns
obj_cols = dataset.select_dtypes(object).columns
for col in obj_cols:
    display(dataset[col].value_counts())


Donald Cerrone      24
Jim Miller          22
Dustin Poirier      19
Demian Maia         19
Joseph Benavidez    18
                    ..
Syuri Kondo          1
Daichi Abe           1
Alvaro Herrera       1
Damian Grabowski     1
Eric Schafer         1
Name: R_fighter, Length: 1348, dtype: int64

Charles Oliveira    18
Jeremy Stephens     16
Nik Lentz           14
Angela Hill         14
Kevin Lee           12
                    ..
Danny Henry          1
Larissa Pacheco      1
Wendell Oliveira     1
Roman Bogatov        1
Chase Gormley        1
Name: B_fighter, Length: 1591, dtype: int64

11/19/2016    24
10/4/2014     22
5/31/2014     22
8/23/2014     21
6/28/2014     21
              ..
2/6/2021       8
8/30/2014      8
9/5/2020       7
8/1/2020       7
4/11/2014      7
Name: date, Length: 428, dtype: int64

Las Vegas, Nevada, USA                        1251
Abu Dhabi, Abu Dhabi, United Arab Emirates     153
Houston, Texas, USA                             82
Chicago, Illinois, USA                          80
Newark, New Jersey, USA                         79
                                              ... 
Bangor, Maine, USA                              10
Hollywood, Florida, USA                          9
Gold Coast, Queensland, Australia                9
Omaha, Nebraska, USA                             9
Ledyard, Connecticut, USA                        9
Name: location, Length: 145, dtype: int64

 USA                     2450
USA                       599
 Brazil                   400
 Canada                   337
 United Kingdom           165
 Australia                160
United Arab Emirates      141
 Sweden                    72
 Mexico                    70
 China                     61
 Germany                   54
 Japan                     53
 Singapore                 45
 Russia                    36
 New Zealand               33
 United Arab Emirates      29
 Netherlands               25
 South Korea               24
 Poland                    23
 Ireland                   19
 Croatia                   13
 Czech Republic            13
 Denmark                   13
 Chile                     13
 Uruguay                   13
 Philippines               12
 Argentina                 12
Brazil                     11
Name: country, dtype: int64

Red     2859
Blue    2037
Name: Winner, dtype: int64

Lightweight              855
Welterweight             830
Middleweight             593
Featherweight            566
Bantamweight             493
Light Heavyweight        394
Heavyweight              385
Flyweight                235
Women's Strawweight      203
Women's Bantamweight     156
Women's Flyweight        136
Catch Weight              33
Women's Featherweight     17
Name: weight_class, dtype: int64

MALE      4384
FEMALE     512
Name: gender, dtype: int64

Orthodox       3680
Southpaw        980
Switch          232
Switch            1
Open Stance       1
Name: B_Stance, dtype: int64

Orthodox       3712
Southpaw        991
Switch          189
Open Stance       4
Name: R_Stance, dtype: int64

neither    3522
Red        1300
Blue         74
Name: better_rank, dtype: int64

U-DEC         1778
KO/TKO        1495
SUB            854
S-DEC          483
M-DEC           32
DQ              14
Overturned       2
Name: finish, dtype: int64

Punch                 507
Punches               467
Rear Naked Choke      331
Guillotine Choke      146
Kick                  119
Armbar                 95
Knee                   67
Elbows                 60
Arm Triangle           58
Triangle Choke         46
Elbow                  30
D'Arce Choke           26
Kimura                 21
Flying Knee            20
Knees                  16
Other - Choke          14
Anaconda Choke         14
Kneebar                12
Heel Hook              11
Spinning Back Fist      6
Slam                    6
Spinning Back Kick      6
Neck Crank              5
Other - Lock            5
North-South Choke       4
Injury                  3
Omoplata                2
Kicks                   2
Ankle Lock              1
Peruvian Necktie        1
Keylock                 1
Name: finish_details, dtype: int64

5:00    2141
4:59      29
2:38      23
1:54      18
3:38      15
        ... 
0:09       1
4:04       1
3:20       1
0:05       1
3:55       1
Name: finish_round_time, Length: 293, dtype: int64

In [622]:
# In B_Stance we found 'switch' has two different value, let's fix it

dataset["B_Stance"].unique()

array(['Orthodox', 'Southpaw', 'Switch', nan, 'Switch ', 'Open Stance'],
      dtype=object)

In [634]:
# Notice that one 'Switch' has a space in it
B_Stance_err = dataset["B_Stance"].loc[lambda x : x == "Switch "]
B_Stance_err

4088    Switch 
Name: B_Stance, dtype: object

In [636]:
# assign new value
dataset["B_Stance"][B_Stance_err.index] = "Switch"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["B_Stance"][B_Stance_err.index] = "Switch"


In [637]:
dataset["B_Stance"].value_counts()

Orthodox       3680
Southpaw        980
Switch          233
Open Stance       1
Name: B_Stance, dtype: int64

In [638]:
# check columns with numeric data type
dataset.select_dtypes([int, float]).columns

Index(['R_odds', 'B_odds', 'R_ev', 'B_ev', 'no_of_rounds',
       'B_current_lose_streak', 'B_current_win_streak', 'B_draw',
       'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct',
       ...
       'B_Flyweight_rank', 'B_Pound-for-Pound_rank', 'finish_round',
       'total_fight_time_secs', 'r_dec_odds', 'b_dec_odds', 'r_sub_odds',
       'b_sub_odds', 'r_ko_odds', 'b_ko_odds'],
      dtype='object', length=104)

We still have 104 numeric features so we can't validate each feature for now. We would engineer our dataset first so we can perform EDA and do selection of our features.

For now we'll dump our dataset then preprocess it.

## Data Spliting and Dumping

In [655]:
x = dataset.drop(config["target"], axis=1).copy()
y = dataset[config["target"]].copy()

# split our dataset into train and test set
test_size = config['test_size']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = 99, stratify = y)

# then split our test set into valid and test set
valid_size = config['valid_size']
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = valid_size, random_state = 99, stratify = y_test)

utils.pickle_dump(dataset_validated, config["dataset_cleaned_path"])

utils.pickle_dump(x_train, config["train_set_path"][0])
utils.pickle_dump(y_train, config["train_set_path"][1])

utils.pickle_dump(x_valid, config["valid_set_path"][0])
utils.pickle_dump(y_valid, config["valid_set_path"][1])

utils.pickle_dump(x_test, config["test_set_path"][0])
utils.pickle_dump(y_test, config["test_set_path"][1])