In [1]:
import numpy as np
import pandas as pd

# if csv files are in drive
#from google.colab import drive
#drive.mount(path := '/drive'); path += '/MyDrive/MMA/'
# else
path = './'

In [2]:
df_fights = pd.read_csv(path + 'ufc_fight_data.csv')
print(df_fights.shape)
df_fights.head()

(7218, 15)


Unnamed: 0,fight_id,event_id,referee,f_1,f_2,winner,num_rounds,title_fight,weight_class,gender,result,result_details,finish_round,finish_time,fight_url
0,7218,664,Herb Dean,2976.0,2884.0,2884.0,5,F,Lightweight,M,KO/TKO,to \n Leg Injury,2,2:03,http://ufcstats.com/fight-details/23a604f46028...
1,7217,664,Mark Smith,1662.0,2464.0,1662.0,3,F,Featherweight,M,Decision,Unanimous,3,5:00,http://ufcstats.com/fight-details/da1b37edb8cc...
2,7216,664,Kerry Hatley,981.0,179.0,981.0,3,F,Women's Strawweight,F,KO/TKO,Punches to Head From Mount,2,2:42,http://ufcstats.com/fight-details/d8335b728604...
3,7215,664,Dan Miragliotta,3831.0,2974.0,3831.0,3,F,Welterweight,M,Submission,Rear Naked Choke,2,4:32,http://ufcstats.com/fight-details/bf647be41de3...
4,7214,664,Herb Dean,1108.0,2320.0,2320.0,3,F,Featherweight,M,Submission,Guillotine Choke From Bottom Guard,1,3:12,http://ufcstats.com/fight-details/6e1bf1b163b3...


In [3]:
# removing useless columns
to_remove = {'referee', 'result_details'}
df_fights = df_fights.drop(to_remove, axis=1)

In [4]:
print('Number of undefined values in each column:')
df_fights.isna().sum()

Number of undefined values in each column:


fight_id         0
event_id         0
f_1             19
f_2             13
winner          15
num_rounds       0
title_fight      0
weight_class    13
gender           0
result           0
finish_round     0
finish_time      0
fight_url        0
dtype: int64

In [5]:
# removing every row where there is at least one NaN value
df_fights = df_fights.dropna()

In [6]:
# deleting all female fights
condition = df_fights['gender'] == 'F'
indices   = df_fights[condition].index
print(f'deleting {len(indices)} rows')
df_fights = df_fights.drop(indices)

deleting 694 rows


In [7]:
# deleting non 3 rounds fights
condition = df_fights['num_rounds'] != '3'
indices   = df_fights[condition].index
print(f'deleting {len(indices)} rows')
df_fights = df_fights.drop(indices)

deleting 750 rows


In [8]:
df_fights['weight_class'].value_counts()

weight_class
Lightweight          1152
Welterweight         1129
Middleweight          856
Featherweight         626
Light Heavyweight     560
Bantamweight          554
Heavyweight           521
Flyweight             273
Catch Weight           56
Open Weight             2
Name: count, dtype: int64

There are only 58 fights in the categories 'Catch Weight' & 'Open Weight'.

In [9]:
# deleting Open Weight and Catch Weights fights
condition = (df_fights['weight_class'] == 'Catch Weight') |  (df_fights['weight_class'] == 'Open Weight')
indices   = df_fights[condition].index
print(f'deleting {len(indices)} rows')
df_fights = df_fights.drop(indices)

deleting 58 rows


In [10]:
# removing all the now useless features
to_remove = {'num_rounds', 'title_fight', 'gender', 'fight_url', 'event_id'}
df_fights = df_fights.drop(to_remove, axis=1)

In [11]:
print(df_fights.shape)
df_fights.head()

(5671, 8)


Unnamed: 0,fight_id,f_1,f_2,winner,weight_class,result,finish_round,finish_time
1,7217,1662.0,2464.0,1662.0,Featherweight,Decision,3,5:00
3,7215,3831.0,2974.0,3831.0,Welterweight,Submission,2,4:32
4,7214,1108.0,2320.0,2320.0,Featherweight,Submission,1,3:12
5,7213,3945.0,2373.0,2373.0,Bantamweight,Decision,3,5:00
6,7212,1752.0,3002.0,1752.0,Welterweight,KO/TKO,3,1:15


In [12]:
# converting 'finish_time' to seconds

def to_seconds(time : str):
    split = time.split(':')
    minutes, seconds = int(split[0]), int(split[1])
    return minutes * 60 + seconds

df_fights['finish_time'] = df_fights['finish_time'].apply(to_seconds)
df_fights['finish_time'] *= df_fights['finish_round']
df_fights.head()

Unnamed: 0,fight_id,f_1,f_2,winner,weight_class,result,finish_round,finish_time
1,7217,1662.0,2464.0,1662.0,Featherweight,Decision,3,900
3,7215,3831.0,2974.0,3831.0,Welterweight,Submission,2,544
4,7214,1108.0,2320.0,2320.0,Featherweight,Submission,1,192
5,7213,3945.0,2373.0,2373.0,Bantamweight,Decision,3,900
6,7212,1752.0,3002.0,1752.0,Welterweight,KO/TKO,3,225


In [13]:
condition = df_fights['winner'] == df_fights['f_1']
f1_wins = len(df_fights[condition])
print(f'fighter 1 is the winning fighter ~{f1_wins / len(df_fights)*100:.0f}% of the time')

fighter 1 is the winning fighter ~65% of the time


We do not want the model to learn how to select the fighter 1 and get $65\%$ of accuracy. We want it to make its prediction based on the fighters in-fight statistics $\Rightarrow$ we must randomize the distribution of winning fighters.

In [14]:
# randomizing the winner distribution

from random import randint

modified = []

for i in range(len(df_fights)):
    if randint(0, 1): modified.append(False); continue
    a, b = df_fights.iloc[i]['f_1'], df_fights.iloc[i]['f_2']
    df_fights.iloc[i, 1], df_fights.iloc[i, 2] = b, a
    modified.append(True)

In [15]:
condition = df_fights['winner'] == df_fights['f_1']
f1_wins = len(df_fights[condition])
print(f'now, fighter 1 is the winning fighter ~{f1_wins / len(df_fights)*100:.0f}% of the time')

now, fighter 1 is the winning fighter ~51% of the time


In [16]:
df_fights['result'].value_counts()

result
Decision                   2607
KO/TKO                     1850
Submission                 1135
TKO - Doctor's Stoppage      60
DQ                           19
Name: count, dtype: int64

We will remove the disqualifications ("DQ"), since the represent only 19 fights, and merge "TKO - Doctor's Stoppage" with "KO/TKO".

In [17]:
condition = df_fights['result'] == 'DQ'
indices   = df_fights[condition].index
df_fights['result'] = df_fights['result'].replace("TKO - Doctor's Stoppage", 'KO/TKO')
df_fights = df_fights.drop(indices)

In [18]:
df_fights['result'].value_counts()

result
Decision      2607
KO/TKO        1910
Submission    1135
Name: count, dtype: int64

At this point we have finished preprocessing this file, we will only need to encode the non-numerical features.

# `ufc_fight_stat_data.csv`

In [19]:
df_stats = pd.read_csv(path + 'ufc_fight_stat_data.csv')
print(df_stats.shape)
df_stats.head()

(14436, 14)


Unnamed: 0,fight_stat_id,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time,fight_url
0,14436,7218,2976.0,0.0,34.0,19.0,32.0,18.0,0.0,0.0,0.0,0.0,0:00,http://ufcstats.com/fight-details/23a604f46028...
1,14435,7218,2884.0,0.0,42.0,17.0,40.0,16.0,6.0,1.0,0.0,0.0,1:28,http://ufcstats.com/fight-details/23a604f46028...
2,14434,7217,1662.0,0.0,59.0,37.0,40.0,23.0,15.0,5.0,1.0,0.0,7:33,http://ufcstats.com/fight-details/da1b37edb8cc...
3,14433,7217,2464.0,0.0,72.0,32.0,55.0,18.0,0.0,0.0,0.0,1.0,2:11,http://ufcstats.com/fight-details/da1b37edb8cc...
4,14432,7216,981.0,0.0,130.0,90.0,102.0,70.0,1.0,1.0,0.0,0.0,2:03,http://ufcstats.com/fight-details/d8335b728604...


In [20]:
# removing useless features
to_remove = {'fight_stat_id', 'fight_url'}
df_stats  = df_stats.drop(to_remove, axis=1)
df_stats.head()

Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time
0,7218,2976.0,0.0,34.0,19.0,32.0,18.0,0.0,0.0,0.0,0.0,0:00
1,7218,2884.0,0.0,42.0,17.0,40.0,16.0,6.0,1.0,0.0,0.0,1:28
2,7217,1662.0,0.0,59.0,37.0,40.0,23.0,15.0,5.0,1.0,0.0,7:33
3,7217,2464.0,0.0,72.0,32.0,55.0,18.0,0.0,0.0,0.0,1.0,2:11
4,7216,981.0,0.0,130.0,90.0,102.0,70.0,1.0,1.0,0.0,0.0,2:03


In [21]:
# removing all the data from fights that are not in the dataframe 'df_fights'

ids       = df_fights['fight_id']
condition = df_stats['fight_id'].isin(ids)
indices   = df_stats[~condition].index
df_stats  = df_stats.drop(indices)

df_stats.shape

(11304, 12)

In [22]:
df_stats = df_stats.dropna()

In [23]:
# there are some 'ctlr_time' values that are not defined,
# because they are set as '--', we will replace them with the mean that is ~ 2min

# first we count them
condition = df_stats['ctrl_time'] == '--'
print(f'number of values to replace: {len(df_stats[condition])}')
df_stats['ctrl_time'] = df_stats['ctrl_time'].replace('--', '2:0')
# and we convert them to seconds at the same time
df_stats['ctrl_time'] = df_stats['ctrl_time'].apply(to_seconds)

number of values to replace: 2


In [24]:
# example of getting all stats related to a certain fighter

fighter_id = 1662 # Bryce Mitchell

get_fighter_stats = lambda id: df_stats[df_stats['fighter_id']==id]

print(f'All stats related to fighter {fighter_id}:')
get_fighter_stats(fighter_id)

All stats related to fighter 1662:


Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time
2,7217,1662.0,0.0,59.0,37.0,40.0,23.0,15.0,5.0,1.0,0.0,453
842,6797,1662.0,0.0,94.0,46.0,82.0,36.0,9.0,1.0,0.0,0.0,89
1659,6389,1662.0,1.0,209.0,182.0,47.0,34.0,4.0,4.0,0.0,0.0,688
2958,5739,1662.0,0.0,135.0,100.0,69.0,46.0,13.0,7.0,1.0,0.0,612
3514,5461,1662.0,0.0,74.0,60.0,49.0,40.0,3.0,3.0,5.0,0.0,771
3768,5334,1662.0,0.0,28.0,24.0,7.0,6.0,1.0,1.0,1.0,0.0,240
4506,4965,1662.0,0.0,56.0,28.0,44.0,17.0,3.0,0.0,2.0,1.0,191
5227,4605,1662.0,0.0,78.0,45.0,69.0,38.0,10.0,3.0,1.0,0.0,135


In [25]:
# we divide all continuous stats by the duration of the fight (in sec)
# because 30 strikes landed in 900sec != 30 strikes landed in 100sec

columns = df_stats.columns[2:] # fight stats columns
df_stats = df_stats.merge(df_fights[['fight_id', 'finish_time']], on='fight_id', how='left')
df_stats[columns] = df_stats[columns].div(df_stats['finish_time'], axis=0)
df_stats = df_stats.drop('finish_time', axis=1)

print(f'All stats related to fighter {fighter_id}:')
get_fighter_stats(fighter_id)[:2]

All stats related to fighter 1662:


Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time
0,7217,1662.0,0.0,0.065556,0.041111,0.044444,0.025556,0.016667,0.005556,0.001111,0.0,0.503333
604,6797,1662.0,0.0,0.247368,0.121053,0.215789,0.094737,0.023684,0.002632,0.0,0.0,0.234211


In [26]:
# replacing each stats by the mean of the last k fights
# and calculating the number of wins over the last k fights

# => this code is very messy and not optmized, it can and must be vectorized
# in its current state it takes ~60sec to run..

k = 2

to_remove = [] # list of entries to remove

# temp dataframe to add the wins over the last k fights
wins = pd.DataFrame(columns=['fight_id', 'fighter_id', 'wins dec', 'wins sub', 'wins ko'])

def get_wins(fighter_id, fighter_df, index, columns=wins.columns):
    ndec = nsub = nko = 0

    fight_id = fighter_df.iloc[index]['fight_id'] # current fight id

    r1 = df_fights[df_fights['fight_id'] == fighter_df.iloc[index+1]['fight_id']].iloc[0]
    r2 = df_fights[df_fights['fight_id'] == fighter_df.iloc[index+2]['fight_id']].iloc[0]

    result1 = r1['result']
    result2 = r2['result']

    if r1['winner'] == fighter_id:
        match result1:
            case 'Decision': ndec += 1
            case 'Submission': nsub += 1
            case 'KO/TKO': nko += 1

    if r2['winner'] == fighter_id:
        match result2:
            case 'Decision': ndec += 1
            case 'Submission': nsub += 1
            case 'KO/TKO': nko += 1

    return pd.DataFrame([[fight_id, fighter_id, ndec, nsub, nko]], columns=columns)

# for all fighters in the dataset
for fighter_id in df_stats['fighter_id'].unique():

    # get the df of stats related to the fighter
    fighter_df = get_fighter_stats(fighter_id).copy()

    # if the fighter has less than k+1 fights in the UFC
    if len(fighter_df) < k + 1:
        to_remove += list(fighter_df['fight_id'])
        continue

    # the fighter has at least k+1 fights (we can compute the means)
    for i in range(len(fighter_df) - k):

        # get the number of wins
        wins_row = get_wins(fighter_id, fighter_df, i)
        wins = pd.concat([wins, wins_row], axis=0)

        # replace each stat by the mean of the last k fights
        for col in columns:
            mean = fighter_df.iloc[i+1:i+k+1][col].mean()
            fighter_df.iloc[i, fighter_df.columns.get_loc(col)] = mean

    # add the first k fights of fighter to the list of fights to remove
    for i in range(len(fighter_df)-1, len(fighter_df)-k-1, -1):
        row = fighter_df.iloc[i]
        fight_id = row['fight_id']
        to_remove.append(fight_id)

    # update the original dataset
    df_stats.update(fighter_df)

print(f'number of fights to remove: {len(to_remove)}')

  wins = pd.concat([wins, wins_row], axis=0)


number of fights to remove: 3752


In [27]:
print('now, our dataframe looks like this:')
print(len(df_stats))
df_stats.head()

now, our dataframe looks like this:
11304


Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time
0,7217,1662.0,0.000556,0.239795,0.161637,0.134006,0.066257,0.014064,0.003538,0.0,0.0,0.499327
1,7217,2464.0,0.001544,0.182262,0.088415,0.175345,0.082486,0.000988,0.000988,0.0,0.0,0.040564
2,7215,3831.0,0.035714,0.344762,0.121032,0.330317,0.10881,0.000556,0.0,0.0,0.0,0.107143
3,7215,2974.0,0.0,0.201873,0.116501,0.14437,0.066375,0.002222,0.0,0.002577,0.002577,0.538041
4,7214,1108.0,0.006944,0.223333,0.067222,0.221667,0.065556,0.0,0.0,0.0,0.0,0.006944


In [28]:
new_index = (wins.index == 0).cumsum()
wins = wins.sort_values(['fight_id', 'fighter_id'], ascending=False)
wins = wins.set_index(pd.Series([i-1 for i in new_index]))

df_stats = df_stats.sort_values(['fight_id', 'fighter_id'], ascending=False)

In [29]:
# removing all the first k = 2 fights of each fighter
# and all the fights of fighters that have less than 3 fights in total

# removing all the fights where at least one fighter does not have 2 fights in the UFC

condition = df_fights['fight_id'].isin(to_remove)
indices   = df_fights[condition].index

print(f'removing {len(indices)} from df_fights')

df_fights = df_fights.drop(indices)

# removing all the data from fights that are not in the dataframe 'df_fights'

ids       = df_fights['fight_id']
condition = df_stats['fight_id'].isin(ids)
indices   = df_stats[~condition].index
print(f'removing {len(indices)} from wins')
df_stats  = df_stats.drop(indices)

# same for wins

condition = wins['fight_id'].isin(to_remove)
indices   = wins[condition].index

print(f'removing {len(indices)} from wins')

wins = wins.drop(indices)

removing 2783 from df_fights
removing 5566 from wins
removing 1814 from wins


In [30]:
print('should be equal:')
print(len(df_fights)*2, len(df_stats), len(wins))

should be equal:
5738 5738 5738


In [31]:
df_stats = df_stats.set_index(wins.index)
df_stats = pd.concat([df_stats, wins[['wins dec', 'wins sub', 'wins ko']]], axis=1)
df_stats.head()

Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time,wins dec,wins sub,wins ko
0,7217,2464.0,0.001544,0.182262,0.088415,0.175345,0.082486,0.000988,0.000988,0.0,0.0,0.040564,1,0,1
1,7217,1662.0,0.000556,0.239795,0.161637,0.134006,0.066257,0.014064,0.003538,0.0,0.0,0.499327,1,0,0
2,7215,3831.0,0.035714,0.344762,0.121032,0.330317,0.10881,0.000556,0.0,0.0,0.0,0.107143,0,0,1
3,7215,2974.0,0.0,0.201873,0.116501,0.14437,0.066375,0.002222,0.0,0.002577,0.002577,0.538041,0,1,0
4,7214,2320.0,0.0,0.276667,0.161111,0.206111,0.092222,0.001111,0.000556,0.0,0.0,0.24,1,0,0


Let's compare the two dataframes with Bryce Mitchell fights to see if didn't do any mistakes.

In [32]:
id = 1662
c1 = df_fights['f_1'] == id
c2 = df_fights['f_2'] == id
df_fights[c1 | c2]

Unnamed: 0,fight_id,f_1,f_2,winner,weight_class,result,finish_round,finish_time
1,7217,1662.0,2464.0,1662.0,Featherweight,Decision,3,900
421,6797,1662.0,380.0,380.0,Featherweight,Submission,2,380
829,6389,3863.0,1662.0,1662.0,Featherweight,Decision,3,900
1479,5739,1662.0,2991.0,1662.0,Featherweight,Decision,3,900
1757,5461,1662.0,949.0,1662.0,Featherweight,Decision,3,900
1884,5334,799.0,1662.0,1662.0,Featherweight,Submission,1,260


In [33]:
df_stats[df_stats['fighter_id'] == id]

Unnamed: 0,fight_id,fighter_id,knockdowns,total_strikes_att,total_strikes_succ,sig_strikes_att,sig_strikes_succ,takedown_att,takedown_succ,submission_att,reversals,ctrl_time,wins dec,wins sub,wins ko
1,7217,1662.0,0.000556,0.239795,0.161637,0.134006,0.066257,0.014064,0.003538,0.0,0.0,0.499327,1,0,0
434,6797,1662.0,0.000556,0.191111,0.156667,0.064444,0.044444,0.009444,0.006111,0.000556,0.0,0.722222,2,0,0
885,6389,1662.0,0.0,0.116111,0.088889,0.065556,0.047778,0.008889,0.005556,0.003333,0.0,0.768333,2,0,0
1565,5739,1662.0,0.0,0.094957,0.079487,0.040684,0.033761,0.00359,0.00359,0.004701,0.0,0.889872,1,1,0
1828,5461,1662.0,0.0,0.084957,0.061709,0.037906,0.020983,0.00359,0.001923,0.003034,0.000556,0.56765,1,1,0
1959,5334,1662.0,0.0,0.074444,0.040556,0.062778,0.030556,0.007222,0.001667,0.001667,0.000556,0.181111,2,0,0


# Merging both DataFrames

In [34]:
columns   = [s + 'A' for s in df_stats.columns] + [s + 'B' for s in df_stats.columns]
df_merged = pd.DataFrame(columns=columns)

i = 0
for index in range(0, len(df_stats)-1, 2):
    rowA = np.array(df_stats.iloc[index])
    rowB = np.array(df_stats.iloc[index+1])

    f1 = df_fights.iloc[i]['f_1']

    concat = np.append(rowA, rowB) if f1 == df_stats.iloc[index]['fighter_id'] else np.append(rowB, rowA)
    concat = pd.DataFrame([concat], columns=columns)

    df_merged = pd.concat([df_merged, concat])

    i += 1

  df_merged = pd.concat([df_merged, concat])


In [35]:
df_merged = df_merged.rename(columns={'fight_idA': 'fight_id'}).drop('fight_idB', axis=1)
df_merged.head()

Unnamed: 0,fight_id,fighter_idA,knockdownsA,total_strikes_attA,total_strikes_succA,sig_strikes_attA,sig_strikes_succA,takedown_attA,takedown_succA,submission_attA,...,sig_strikes_attB,sig_strikes_succB,takedown_attB,takedown_succB,submission_attB,reversalsB,ctrl_timeB,wins decB,wins subB,wins koB
0,7217,1662.0,0.000556,0.239795,0.161637,0.134006,0.066257,0.014064,0.003538,0.0,...,0.175345,0.082486,0.000988,0.000988,0.0,0.0,0.040564,1,0,1
0,7215,2974.0,0.0,0.201873,0.116501,0.14437,0.066375,0.002222,0.0,0.002577,...,0.330317,0.10881,0.000556,0.0,0.0,0.0,0.107143,0,0,1
0,7214,1108.0,0.006944,0.223333,0.067222,0.221667,0.065556,0.0,0.0,0.0,...,0.206111,0.092222,0.001111,0.000556,0.0,0.0,0.24,1,0,0
0,7213,3945.0,0.0,0.158333,0.108889,0.08,0.041111,0.005,0.002778,0.001111,...,0.20178,0.078934,0.008367,0.000556,0.0,0.0,0.071111,1,0,0
0,7212,1752.0,0.0,0.297739,0.166654,0.232494,0.10363,0.006654,0.005543,0.0,...,0.224603,0.066542,0.0,0.0,0.0,0.0,0.059773,0,0,0


In [36]:
print(df_merged.shape)
print(df_fights.shape)

(2869, 29)
(2869, 8)


In [37]:
df_merged['fight_id'] = df_merged['fight_id'].astype(int)

df_merged = df_merged.set_index('fight_id')
df_fights = df_fights.set_index('fight_id')

In [38]:
dataset = pd.concat([df_fights, df_merged.drop(['fighter_idA', 'fighter_idB'], axis=1)], axis=1)
print('our final dataset as the form:')
dataset[:10]

our final dataset as the form:


Unnamed: 0_level_0,f_1,f_2,winner,weight_class,result,finish_round,finish_time,knockdownsA,total_strikes_attA,total_strikes_succA,...,sig_strikes_attB,sig_strikes_succB,takedown_attB,takedown_succB,submission_attB,reversalsB,ctrl_timeB,wins decB,wins subB,wins koB
fight_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7217,1662.0,2464.0,1662.0,Featherweight,Decision,3,900,0.000556,0.239795,0.161637,...,0.175345,0.082486,0.000988,0.000988,0.0,0.0,0.040564,1,0,1
7215,2974.0,3831.0,3831.0,Welterweight,Submission,2,544,0.0,0.201873,0.116501,...,0.330317,0.10881,0.000556,0.0,0.0,0.0,0.107143,0,0,1
7214,1108.0,2320.0,2320.0,Featherweight,Submission,1,192,0.006944,0.223333,0.067222,...,0.206111,0.092222,0.001111,0.000556,0.0,0.0,0.24,1,0,0
7213,3945.0,2373.0,2373.0,Bantamweight,Decision,3,900,0.0,0.158333,0.108889,...,0.20178,0.078934,0.008367,0.000556,0.0,0.0,0.071111,1,0,0
7212,1752.0,3002.0,1752.0,Welterweight,KO/TKO,3,225,0.0,0.297739,0.166654,...,0.224603,0.066542,0.0,0.0,0.0,0.0,0.059773,0,0,0
7210,309.0,3421.0,309.0,Heavyweight,Decision,3,900,0.006944,0.358611,0.134167,...,0.285139,0.140764,0.004028,0.001736,0.001736,0.0,0.299861,0,0,0
7207,3250.0,2534.0,3250.0,Welterweight,Decision,3,900,0.003759,0.249357,0.150447,...,0.217831,0.084424,0.0,0.0,0.003145,0.0,0.128337,0,1,1
7205,23.0,2846.0,23.0,Lightweight,Submission,2,412,0.000556,0.223889,0.075556,...,0.097724,0.052774,0.0,0.0,0.0,0.0,0.0,0,0,1
7202,2181.0,2929.0,2181.0,Middleweight,KO/TKO,2,568,0.007576,0.478986,0.345439,...,0.129167,0.063056,0.004444,0.002778,0.003889,0.001944,0.6125,1,1,0
7197,342.0,225.0,225.0,Heavyweight,Submission,2,554,0.0025,0.210093,0.106019,...,0.132369,0.085784,0.0,0.0,0.0,0.0,0.298284,1,0,1


In [39]:
dataset['result'].value_counts()

result
Decision      1407
KO/TKO         942
Submission     520
Name: count, dtype: int64

In [40]:
f = lambda _: 0 if _['winner'] == _['f_1'] else 1

dataset['winner'] = dataset.apply(f, axis=1)

In [41]:
weight_classes = [
    'Flyweight',
    'Bantamweight',
    'Featherweight',
    'Lightweight',
    'Welterweight',
    'Middleweight',
    'Light Heavyweight',
    'Heavyweight',
] # ranked from lightest to heaviest

ordinal = {weight_classes[i]: i for i in range(len(weight_classes))}

dataset['weight_class'] = dataset['weight_class'].map(ordinal)

In [42]:
map_result = {
    'Decision': 0,
    'Submission': 1,
    'KO/TKO': 2,
}

dataset['result'] = dataset['result'].map(map_result)


In [43]:
dataset['finish_round'] = dataset['finish_round'].map(lambda _: _-1)

In [44]:
print(dataset.shape)
dataset.head()

(2869, 33)


Unnamed: 0_level_0,f_1,f_2,winner,weight_class,result,finish_round,finish_time,knockdownsA,total_strikes_attA,total_strikes_succA,...,sig_strikes_attB,sig_strikes_succB,takedown_attB,takedown_succB,submission_attB,reversalsB,ctrl_timeB,wins decB,wins subB,wins koB
fight_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7217,1662.0,2464.0,0,2,0,2,900,0.000556,0.239795,0.161637,...,0.175345,0.082486,0.000988,0.000988,0.0,0.0,0.040564,1,0,1
7215,2974.0,3831.0,1,4,1,1,544,0.0,0.201873,0.116501,...,0.330317,0.10881,0.000556,0.0,0.0,0.0,0.107143,0,0,1
7214,1108.0,2320.0,1,2,1,0,192,0.006944,0.223333,0.067222,...,0.206111,0.092222,0.001111,0.000556,0.0,0.0,0.24,1,0,0
7213,3945.0,2373.0,1,1,0,2,900,0.0,0.158333,0.108889,...,0.20178,0.078934,0.008367,0.000556,0.0,0.0,0.071111,1,0,0
7212,1752.0,3002.0,0,4,2,2,225,0.0,0.297739,0.166654,...,0.224603,0.066542,0.0,0.0,0.0,0.0,0.059773,0,0,0


In [45]:
dataset.to_csv('dataset.csv')