# 🔹UFC Fight Predictor ETL

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Import Libraries and Setup Environment

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Emit INFO-and-above records, each prefixed with its level name in brackets
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
# Logger used by the load and save cells of this notebook
logger = logging.getLogger(__name__)

# Resolve the project root as the parent of the current working directory
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

# Put the project root on the import path so the `src` package can be found
sys.path.append(project_root)
from src.helpers import *

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Load Data

In [2]:
# Define the path to the raw CSV file
file_path = os.path.join(project_root, 'data', 'raw', 'ufc_raw.csv')

# Load the CSV into a DataFrame.
# On failure the error is logged and then re-raised: every later cell needs
# `ufc_raw`, so swallowing the exception here would only resurface as an
# unrelated NameError further down the notebook.
try:
    ufc_raw = pd.read_csv(file_path)
except Exception as e:
    logger.error(f"❌ Error loading raw data from '{file_path}': {e}")
    raise
else:
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")

[INFO] ✅ Data successfully loaded: 6541 rows, 118 columns.


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Preview

In [3]:
# Preview the first few records (rich HTML rendering via display)
display(ufc_raw.head())

# General dataset information: index range, column count, dtype summary and memory usage
ufc_raw.info()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,4:42,882.0,300.0,175.0,1800.0,2000.0,1100.0,150.0
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punch,3.0,1:36,696.0,250.0,,1800.0,,450.0,
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punches,3.0,1:57,717.0,-105.0,550.0,900.0,1800.0,225.0,1100.0
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,Punch,3.0,3:44,824.0,240.0,500.0,550.0,3000.0,110.0,800.0
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,150.0,2200.0,2200.0,450.0,200.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6541 entries, 0 to 6540
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(60), int64(43), object(14)
memory usage: 5.8+ MB


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Check Raw Data

In [4]:
# Missing values: count per column, then report only the columns that have any
nulls = ufc_raw.isna().sum()
print("\nNull values per column:\n", nulls.loc[nulls.gt(0)])

# Fully duplicated rows: count rows identical to an earlier row
duplicates = ufc_raw.duplicated(keep='first').sum()
print(f"\nDuplicate rows: {duplicates}")

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Data Cleaning

## Null Values

In [5]:
# Build the deploy dataset from an independent copy of the raw data,
# dropping every column with more than `threshold` (1000) missing values.
ufc_deploy = ufc_raw.copy()
threshold = 1000
null_counts = ufc_deploy.isnull().sum()
cols_to_drop = null_counts[null_counts > threshold].index.tolist()
for col in cols_to_drop:
    print('Dropping:', col)
ufc_deploy.drop(cols_to_drop, axis=1, inplace=True)

# Then discard every row that still contains a missing value,
# reporting the shape before and after.
print(f"➡️ Before dropna: {ufc_deploy.shape}")
ufc_deploy.dropna(axis=0, how='any', inplace=True)
print(f"✅ After dropna: {ufc_deploy.shape}")

In [6]:
# Stricter pruning for the training data: drop every column of `ufc_raw`
# with more than `threshold` (300) missing values. This mutates `ufc_raw` in place.
threshold = 300
null_counts = ufc_raw.isnull().sum()
cols_to_drop = null_counts[null_counts > threshold].index.tolist()
for col in cols_to_drop:
    print('Dropping:', col)
ufc_raw.drop(cols_to_drop, axis=1, inplace=True)

# Then discard every row that still contains a missing value,
# reporting the shape before and after.
print(f"➡️ Before dropna: {ufc_raw.shape}")
ufc_raw.dropna(axis=0, how='any', inplace=True)
print(f"✅ After dropna: {ufc_raw.shape}")

In [7]:
# Re-run the missing-value count after pruning; only columns with nulls are printed
nulls = ufc_raw.isna().sum()
print("\nNull values per column:\n", nulls.loc[nulls.gt(0)])

In [8]:
# List the columns still present in ufc_raw at this point
ufc_raw.columns

Index(['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue',
       'BlueExpectedValue', 'Date', 'Location', 'Country', 'Winner',
       'TitleBout', 'WeightClass', 'Gender', 'NumberOfRounds',
       'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws',
       'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought',
       'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority',
       'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous',
       'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage',
       'BlueWins', 'BlueStance', 'BlueHeightCms', 'BlueReachCms',
       'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak',
       'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought',
       'RedTotalTitleBouts', 'RedWinsByDecisionMajority',
       'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO',
       'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins',
       'RedStance', 'Red

## Inconsistent Data: KNN Imputation

In [9]:
# Summary statistics of reach and height before imputation
print(
    ufc_raw.loc[
        :, ['RedReachCms', 'BlueReachCms', 'RedHeightCms', 'BlueHeightCms']
    ].describe()
)

In [10]:
from sklearn.impute import KNNImputer

# Step 1: treat a reach of 0 as missing so the imputer can fill it in
for reach_col in ('RedReachCms', 'BlueReachCms'):
    ufc_raw[reach_col] = ufc_raw[reach_col].replace(0, np.nan)

# Step 2: physical attributes used both as imputation targets and as
# the feature space in which neighbours are searched
cols = [
    'BlueReachCms', 'BlueHeightCms', 'BlueWeightLbs', 'BlueAge',
    'RedReachCms', 'RedHeightCms', 'RedWeightLbs', 'RedAge',
]

# Step 3: force a numeric dtype; anything unparseable becomes np.nan
ufc_raw[cols] = ufc_raw[cols].apply(pd.to_numeric, errors='coerce')

# Step 4: fill each missing value from the 5 nearest rows
# NOTE(review): the columns are not scaled before the neighbour search, so
# larger-magnitude columns may dominate the distance. Confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(ufc_raw[cols])
ufc_raw[cols] = imputed_values

In [11]:
# Summary statistics of reach and height after imputation
print(
    ufc_raw.loc[
        :, ['RedReachCms', 'BlueReachCms', 'RedHeightCms', 'BlueHeightCms']
    ].describe()
)

In [12]:
# 'Open Stance' is not a valid value for an individual fighter's stance:
# show the rows where the Red fighter carries it
ufc_raw.loc[ufc_raw['RedStance'].eq('Open Stance')]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish
6051,Krzysztof Soszynski,Igor Pokrajac,-255.0,235.0,39.2157,235.0,2011-12-10,"Toronto, Ontario, Canada",Canada,Blue,...,-1,-1,-2.54,-2.54,2,-15.5,-0.55,0.625,neither,KO/TKO
6366,Krzysztof Soszynski,Goran Reljic,-108.0,-102.0,92.5926,98.0392,2010-11-13,"Oberhausen, North Rhine-Westphalia, Germany",Germany,Red,...,-1,-2,5.08,10.16,7,-4.5,0.8333,0.8333,neither,U-DEC
6448,Krzysztof Soszynski,Stephan Bonnar,-190.0,175.0,52.6316,175.0,2010-07-03,"Las Vegas, Nevada, USA",USA,Blue,...,-1,0,7.62,2.54,-1,3.2,0.4,0.1273,neither,KO/TKO
6511,Nate Quarry,Jorge Rivera,-230.0,190.0,43.4783,190.0,2010-03-31,"Charlotte, North Carolina, USA",USA,Blue,...,-2,0,2.54,2.54,0,-2.8182,0.0909,0.4343,neither,KO/TKO


In [13]:
# Keep only the fights whose Red fighter stance is not 'Open Stance'
ufc_raw = ufc_raw.loc[ufc_raw['RedStance'].ne('Open Stance')]

In [14]:
# Verification: no row should have 'Open Stance' as the Red fighter stance any more
ufc_raw.loc[ufc_raw['RedStance'].eq('Open Stance')]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


In [15]:
# 'Open Stance' is not a valid value for an individual fighter's stance:
# show the rows where the Blue fighter carries it
ufc_raw.loc[ufc_raw['BlueStance'].eq('Open Stance')]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish
6216,Mike Massenzio,Krzysztof Soszynski,265.0,-325.0,265.0,30.7692,2011-06-11,"Vancouver, British Columbia, Canada",Canada,Blue,...,2,1,-2.54,10.16,-5,28.4762,-0.8095,-1.5714,neither,U-DEC


In [16]:
# Keep only the fights whose Blue fighter stance is not 'Open Stance'
ufc_raw = ufc_raw.loc[ufc_raw['BlueStance'].ne('Open Stance')]

In [17]:
# Verification: no row should have 'Open Stance' as the Blue fighter stance any more
ufc_raw.loc[ufc_raw['BlueStance'].eq('Open Stance')]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


In [18]:
# Final verification for the Red corner after both filters have been applied
ufc_raw.loc[ufc_raw['RedStance'].eq('Open Stance')]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


## Create Fight Stance Column
- If both fighters have the same fighting stance, the bout is considered a Closed Stance matchup. If their stances differ, it is classified as an Open Stance matchup.

In [19]:
# Create the FightStance column from the two fighters' stances:
# identical stances -> 'Closed Stance', different stances -> 'Open Stance'
same_stance = ufc_raw['RedStance'] == ufc_raw['BlueStance']
ufc_raw['FightStance'] = np.where(same_stance, 'Closed Stance', 'Open Stance')

In [20]:
# Subset of fights classified as Open Stance matchups, kept for inspection
ufc_preview = ufc_raw.loc[ufc_raw['FightStance'].eq('Open Stance')]

In [21]:
# Subset of fights classified as Closed Stance matchups, kept for inspection
ufc_preview2 = ufc_raw.loc[ufc_raw['FightStance'].eq('Closed Stance')]

In [22]:
# Open Stance matchups: the two individual stances should differ on every row
ufc_preview.loc[:, ['FightStance', 'BlueStance', 'RedStance']]

Unnamed: 0,FightStance,BlueStance,RedStance
0,Open Stance,Southpaw,Orthodox
2,Open Stance,Orthodox,Southpaw
6,Open Stance,Switch,Southpaw
10,Open Stance,Orthodox,Southpaw
11,Open Stance,Southpaw,Orthodox
...,...,...,...
6524,Open Stance,Orthodox,Southpaw
6525,Open Stance,Orthodox,Southpaw
6528,Open Stance,Orthodox,Switch
6529,Open Stance,Orthodox,Southpaw


In [23]:
# Closed Stance matchups: the two individual stances should match on every row
ufc_preview2.loc[:, ['FightStance', 'BlueStance', 'RedStance']]

Unnamed: 0,FightStance,BlueStance,RedStance
1,Closed Stance,Orthodox,Orthodox
3,Closed Stance,Orthodox,Orthodox
4,Closed Stance,Orthodox,Orthodox
5,Closed Stance,Orthodox,Orthodox
7,Closed Stance,Orthodox,Orthodox
...,...,...,...
6535,Closed Stance,Orthodox,Orthodox
6536,Closed Stance,Orthodox,Orthodox
6538,Closed Stance,Orthodox,Orthodox
6539,Closed Stance,Orthodox,Orthodox


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Check Clean Data

In [24]:
# Missing values in the cleaned data: only columns with at least one null are printed
nulls = ufc_raw.isna().sum()
print("\nNull values per column:\n", nulls.loc[nulls.gt(0)])

# Fully duplicated rows in the cleaned data
duplicates = ufc_raw.duplicated(keep='first').sum()
print(f"\nDuplicate rows: {duplicates}")

In [25]:
# Show, in order: the first few records, the remaining column names,
# and the data type of each column
display(
    ufc_raw.head(),
    ufc_raw.columns,
    ufc_raw.dtypes,
)

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish,FightStance
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,-2,-2.54,10.16,-6,0.25,-0.2,-1.83,Red,KO/TKO,Open Stance
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,-1,5.08,0.0,-5,2.69,0.7,0.2,neither,KO/TKO,Closed Stance
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,1,-2.54,-7.62,3,-1.12,-0.2,1.72,Red,KO/TKO,Open Stance
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,-1,2.54,-2.54,9,2.68,-0.8,-3.62,neither,KO/TKO,Closed Stance
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,0,0.0,-2.54,0,-0.57,0.0,0.25,neither,S-DEC,Closed Stance


Index(['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue',
       'BlueExpectedValue', 'Date', 'Location', 'Country', 'Winner',
       'TitleBout', 'WeightClass', 'Gender', 'NumberOfRounds',
       'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws',
       'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought',
       'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority',
       'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous',
       'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage',
       'BlueWins', 'BlueStance', 'BlueHeightCms', 'BlueReachCms',
       'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak',
       'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought',
       'RedTotalTitleBouts', 'RedWinsByDecisionMajority',
       'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO',
       'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins',
       'RedStance', 'Red

RedFighter           object
BlueFighter          object
RedOdds             float64
BlueOdds            float64
RedExpectedValue    float64
                     ...   
AvgSubAttDif        float64
AvgTDDif            float64
BetterRank           object
Finish               object
FightStance          object
Length: 70, dtype: object

# Create the target variable: **0** (Red fighter wins) or **1** (Blue fighter wins)

In [26]:
# Encode the target for the training dataset: 1 when Winner is 'Blue', 0 for
# any other value. The original Winner column is then removed.
ufc_raw['label'] = (ufc_raw['Winner'] == 'Blue').astype('int64')
ufc_raw = ufc_raw.drop(columns='Winner')

# Same encoding for the deploy dataset
ufc_deploy['label'] = (ufc_deploy['Winner'] == 'Blue').astype('int64')
ufc_deploy = ufc_deploy.drop(columns='Winner')

# Save

In [27]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/processed/ufc_etl.csv', index=False)
logger.info("✅ ETL file saved as 'ufc_etl.csv'.")

[INFO] ✅ ETL file saved as 'ufc_etl.csv'.


In [28]:
# Save the cleaned file
ufc_deploy.to_csv(f'{project_root}/data/processed/ufc_deploy.csv', index=False)
logger.info("✅ Deploy file saved as 'ufc_deploy.csv'.")

[INFO] ✅ Deploy file saved as 'ufc_deploy.csv'.


<div style="text-align: center;">
     <img src="../img/ufc_logo.png" width="800" /> 
</div>