In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/raw/ufc-master.csv')

df.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0
2,Tai Tuivasa,Harry Hunsucker,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Heavyweight,,,...,1,3,0,0,Southpaw,187.96,190.5,264.0,32.0,28.0
3,Cheyanne Buys,Montserrat Conejo,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenStrawweight,,,...,0,0,0,0,Switch,160.02,160.02,115.0,28.0,25.0
4,Marion Reneau,Macy Chiasson,Mark Smith,2021-03-20,"Las Vegas, Nevada, USA",Blue,False,WomenBantamweight,0.125,0.0,...,1,2,2,0,Orthodox,167.64,172.72,135.0,29.0,43.0


In [2]:
# Convert 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# For the first model, we'll focus on fights with a clear winner. 
# Let's filter out draws for simplicity.
df_clean = df[df['Winner'].isin(['Red', 'Blue'])].copy()

# Define our target variable 'B_win'. 1 if Blue wins, 0 if Red wins (Blue loses).
df_clean['B_win'] = (df_clean['Winner'] == 'Blue').astype(int)

In [3]:
# Calculate the percentage of missing values for each column
missing_percentage = df_clean.isnull().sum() / len(df_clean)

# Identify columns to drop
columns_to_drop = missing_percentage[missing_percentage > 0.3].index

# Drop these columns from our dataframe
df_clean = df_clean.drop(columns=columns_to_drop)

print("Columns dropped:", columns_to_drop.tolist())

Columns dropped: []


In [4]:
# Select only numeric columns to iterate over for imputation
numeric_cols = df_clean.select_dtypes(include=np.number).columns.tolist()

for col in numeric_cols:
    median_val = df_clean[col].median()
    df_clean[col] = df_clean[col].fillna(median_val)

# Verify that there are no more missing values in our numeric columns
print(df_clean[numeric_cols].isnull().sum().sum())

0


In [5]:
# Fighter attribute differences
df_clean['height_diff'] = df_clean['B_Height_cms'] - df_clean['R_Height_cms']
df_clean['reach_diff'] = df_clean['B_Reach_cms'] - df_clean['R_Reach_cms']
df_clean['age_diff'] = df_clean['B_age'] - df_clean['R_age']

# Striking performance differences
df_clean['sig_str_landed_diff'] = df_clean['B_avg_SIG_STR_landed'] - df_clean['R_avg_SIG_STR_landed']
df_clean['sig_str_accuracy_diff'] = (df_clean['B_avg_SIG_STR_landed'] / df_clean['B_avg_SIG_STR_att']) - \
                                   (df_clean['R_avg_SIG_STR_landed'] / df_clean['R_avg_SIG_STR_att'])

# Grappling performance differences
df_clean['takedown_accuracy_diff'] = df_clean['B_avg_TD_pct'] - df_clean['R_avg_TD_pct']
df_clean['sub_avg_diff'] = df_clean['B_avg_SUB_ATT'] - df_clean['R_avg_SUB_ATT']

# Replace NaN values in our new features (resulting from division by zero) with 0
df_clean.fillna(0, inplace=True)

  df_clean['height_diff'] = df_clean['B_Height_cms'] - df_clean['R_Height_cms']
  df_clean['reach_diff'] = df_clean['B_Reach_cms'] - df_clean['R_Reach_cms']
  df_clean['age_diff'] = df_clean['B_age'] - df_clean['R_age']
  df_clean['sig_str_landed_diff'] = df_clean['B_avg_SIG_STR_landed'] - df_clean['R_avg_SIG_STR_landed']
  df_clean['sig_str_accuracy_diff'] = (df_clean['B_avg_SIG_STR_landed'] / df_clean['B_avg_SIG_STR_att']) - \
  df_clean['takedown_accuracy_diff'] = df_clean['B_avg_TD_pct'] - df_clean['R_avg_TD_pct']
  df_clean['sub_avg_diff'] = df_clean['B_avg_SUB_ATT'] - df_clean['R_avg_SUB_ATT']


In [7]:
# List of columns we want to keep for modeling
# We include our engineered features and the original betting odds
features = [
    'height_diff', 'reach_diff', 'age_diff',
    'sig_str_landed_diff', 'sig_str_accuracy_diff',
    'takedown_accuracy_diff', 'sub_avg_diff'
]

# Our target variable
target = 'B_win'

# Create our final DataFrame for modeling
df_model = df_clean[features + [target]].copy()

# Create a new directory for our processed data
import os
os.makedirs('../data/processed', exist_ok=True)

# Save the processed data to a new CSV file
df_model.to_csv('../data/processed/ufc_data_processed.csv', index=False)

print("Processed data saved successfully!")
df_model.head()

Processed data saved successfully!


Unnamed: 0,height_diff,reach_diff,age_diff,sig_str_landed_diff,sig_str_accuracy_diff,takedown_accuracy_diff,sub_avg_diff,B_win
0,-5.08,-7.62,4.0,3.0,-0.1,0.33,0.5,0
1,5.08,5.08,4.0,-8.15625,-0.022055,-0.10625,1.25,0
2,0.0,0.0,4.0,-0.843872,-0.068089,0.25,0.149902,0
3,-7.62,-5.08,3.0,-0.968872,0.005834,0.0,-0.037598,1
4,12.7,10.16,-14.0,13.575195,0.124863,-0.326719,-0.106445,1
