# 🔹UFC Fight Predictor ETL

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Import Libraries and Setup Environment

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Get the current working directory
current_dir = os.getcwd()

# Navigate to the project root
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Import from /src
sys.path.append(os.path.join(project_root))
from src.helpers import *

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Load Data

In [2]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'raw', 'ufc_raw.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 8250 rows, 124 columns.


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Preview

In [3]:
# Preview the first few records
display(ufc_raw.head())

# General dataset information
ufc_raw.info()

Unnamed: 0,event_id,event_name,date,location,fight_id,division,title_fight,method,finish_round,match_time_sec,...,b_splm,b_str_acc,b_sapm,b_str_def,b_td_avg,b_td_avg_acc,b_td_def,b_sub_avg,winner,winner_id
0,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",36ec204f47e4d613,catch weight,0,Submission,1,275,...,3.11,48,3.08,50,5.82,45,40,0.7,Myktybek Orolbai,bf2c8e01b07d3eb1
1,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",a1afc16e21d1a807,lightweight,0,Decision - Unanimous,3,300,...,6.55,45,4.33,56,0.0,0,75,0.4,Rafael Fiziev,c814b4c899793af6
2,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",7513a00037094075,lightweight,0,KO/TKO,2,257,...,4.13,38,5.28,52,0.0,0,83,0.0,Nazim Sadykhov,ff62013d2fce6d13
3,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",e512b80bbaea36c2,welterweight,0,Decision - Unanimous,3,300,...,3.5,48,2.24,60,1.7,44,55,0.0,Seokhyeon Ko,4a07b1988477502c
4,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",03bc32bdb5a33496,light heavyweight,0,Decision - Unanimous,5,300,...,3.79,40,4.24,49,0.0,0,58,0.1,Khalil Rountree Jr.,749f572d1d3161fb


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8250 entries, 0 to 8249
Columns: 124 entries, event_id to winner_id
dtypes: float64(87), int64(17), object(20)
memory usage: 7.8+ MB


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Check 

In [8]:
pd.set_option('display.max_rows', None)

In [9]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

# Duplicate analysis
duplicates = ufc_raw.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 total_rounds             31
referee                  26
r_kd                     21
r_sig_str_landed         21
r_sig_str_atmpted        21
r_sig_str_acc            59
r_total_str_landed       21
r_total_str_atmpted      21
r_total_str_acc          48
r_td_landed              21
r_td_atmpted             21
r_td_acc               2658
r_sub_att                21
r_ctrl                  202
r_head_landed            21
r_head_atmpted           21
r_head_acc               99
r_body_landed            21
r_body_atmpted           21
r_body_acc              851
r_leg_landed             21
r_leg_atmpted            21
r_leg_acc              1568
r_dist_landed            21
r_dist_atmpted           21
r_dist_acc              139
r_clinch_landed          21
r_clinch_atmpted         21
r_clinch_acc           1977
r_ground_landed          21
r_ground_atmpted         21
r_ground_acc           2916
r_landed_head_per        21
r_landed_body_per        21
r_landed_leg_per      

In [10]:
pd.reset_option('display.max_rows')

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Data Cleaning

## Null Values

In [6]:
# Drop columns with too many null values (threshold: 300)
threshold = 300
cols_to_drop = [col for col in ufc_raw.columns if ufc_raw[col].isnull().sum() > threshold]
for col in cols_to_drop:
    print('Dropping:', col)
ufc_raw.drop(columns=cols_to_drop, inplace=True)

# Drop rows with any remaining missing values
print(f"➡️ Before dropna: {ufc_raw.shape}")
ufc_raw.dropna(inplace=True)
print(f"✅ After dropna: {ufc_raw.shape}")

In [7]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

In [8]:
ufc_raw.columns

Index(['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue',
       'BlueExpectedValue', 'Date', 'Location', 'Country', 'Winner',
       'TitleBout', 'WeightClass', 'Gender', 'NumberOfRounds',
       'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws',
       'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought',
       'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority',
       'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous',
       'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage',
       'BlueWins', 'BlueStance', 'BlueHeightCms', 'BlueReachCms',
       'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak',
       'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought',
       'RedTotalTitleBouts', 'RedWinsByDecisionMajority',
       'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO',
       'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins',
       'RedStance', 'Red

## Incongruent Data, KNN Imputer

In [9]:
print(ufc_raw[['RedReachCms', 'BlueReachCms', 'RedHeightCms', 'BlueHeightCms']].describe())

In [10]:
from sklearn.impute import KNNImputer

# 1️⃣ Replace zeros with np.nan
ufc_raw['RedReachCms'] = ufc_raw['RedReachCms'].replace(0, np.nan)
ufc_raw['BlueReachCms'] = ufc_raw['BlueReachCms'].replace(0, np.nan)

# 2️⃣ Define relevant columns
cols = [
    'BlueReachCms', 'BlueHeightCms', 'BlueWeightLbs', 'BlueAge',
    'RedReachCms', 'RedHeightCms', 'RedWeightLbs', 'RedAge'
]

# 3️⃣ Ensure all missing values are np.nan (replace pd.NA, NaT, etc.)
ufc_raw[cols] = ufc_raw[cols].apply(pd.to_numeric, errors='coerce')

# 4️⃣ Apply KNNImputer
imputer = KNNImputer(n_neighbors=5)
ufc_raw[cols] = imputer.fit_transform(ufc_raw[cols])

In [11]:
print(ufc_raw[['RedReachCms', 'BlueReachCms', 'RedHeightCms', 'BlueHeightCms']].describe())

In [12]:
# Open Stance is incorrect
ufc_raw[ufc_raw['RedStance'] == 'Open Stance']

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish
6051,Krzysztof Soszynski,Igor Pokrajac,-255.0,235.0,39.2157,235.0,2011-12-10,"Toronto, Ontario, Canada",Canada,Blue,...,-1,-1,-2.54,-2.54,2,-15.5,-0.55,0.625,neither,KO/TKO
6366,Krzysztof Soszynski,Goran Reljic,-108.0,-102.0,92.5926,98.0392,2010-11-13,"Oberhausen, North Rhine-Westphalia, Germany",Germany,Red,...,-1,-2,5.08,10.16,7,-4.5,0.8333,0.8333,neither,U-DEC
6448,Krzysztof Soszynski,Stephan Bonnar,-190.0,175.0,52.6316,175.0,2010-07-03,"Las Vegas, Nevada, USA",USA,Blue,...,-1,0,7.62,2.54,-1,3.2,0.4,0.1273,neither,KO/TKO
6511,Nate Quarry,Jorge Rivera,-230.0,190.0,43.4783,190.0,2010-03-31,"Charlotte, North Carolina, USA",USA,Blue,...,-2,0,2.54,2.54,0,-2.8182,0.0909,0.4343,neither,KO/TKO


In [13]:
ufc_raw = ufc_raw[ufc_raw['RedStance'] != 'Open Stance']

In [14]:
# Open Stance is incorrect
ufc_raw[ufc_raw['RedStance'] == 'Open Stance']

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


In [15]:
# Open Stance is incorrect
ufc_raw[ufc_raw['BlueStance'] == 'Open Stance']

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish
6216,Mike Massenzio,Krzysztof Soszynski,265.0,-325.0,265.0,30.7692,2011-06-11,"Vancouver, British Columbia, Canada",Canada,Blue,...,2,1,-2.54,10.16,-5,28.4762,-0.8095,-1.5714,neither,U-DEC


In [16]:
ufc_raw = ufc_raw[ufc_raw['BlueStance'] != 'Open Stance']

In [17]:
# Open Stance is incorrect
ufc_raw[ufc_raw['BlueStance'] == 'Open Stance']

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


In [18]:
# Open Stance is incorrect
ufc_raw[ufc_raw['RedStance'] == 'Open Stance']

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,KODif,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish


## Create Fight Stance Columns
- If both fighters have the same fighting stance, the bout is considered a Closed Stance matchup. If their stances differ, it is classified as an Open Stance matchup.

In [19]:
# Create column FightStance according Stances matches.
ufc_raw['FightStance'] = np.where(
    ufc_raw['BlueStance'] == ufc_raw['RedStance'],
    'Closed Stance',
    'Open Stance'
)

In [20]:
ufc_preview = ufc_raw[ufc_raw['FightStance'] == 'Open Stance']

In [21]:
ufc_preview2 = ufc_raw[ufc_raw['FightStance'] == 'Closed Stance']

In [22]:
ufc_preview[['FightStance', 'BlueStance', 'RedStance']]

Unnamed: 0,FightStance,BlueStance,RedStance
0,Open Stance,Southpaw,Orthodox
2,Open Stance,Orthodox,Southpaw
6,Open Stance,Switch,Southpaw
10,Open Stance,Orthodox,Southpaw
11,Open Stance,Southpaw,Orthodox
...,...,...,...
6524,Open Stance,Orthodox,Southpaw
6525,Open Stance,Orthodox,Southpaw
6528,Open Stance,Orthodox,Switch
6529,Open Stance,Orthodox,Southpaw


In [23]:
ufc_preview2[['FightStance', 'BlueStance', 'RedStance']]

Unnamed: 0,FightStance,BlueStance,RedStance
1,Closed Stance,Orthodox,Orthodox
3,Closed Stance,Orthodox,Orthodox
4,Closed Stance,Orthodox,Orthodox
5,Closed Stance,Orthodox,Orthodox
7,Closed Stance,Orthodox,Orthodox
...,...,...,...
6535,Closed Stance,Orthodox,Orthodox
6536,Closed Stance,Orthodox,Orthodox
6538,Closed Stance,Orthodox,Orthodox
6539,Closed Stance,Orthodox,Orthodox


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Check Clean Data

In [24]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

# Duplicate analysis
duplicates = ufc_raw.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

In [25]:
# Preview the first few records
display(ufc_raw.head())
display(ufc_raw.columns)
# Para ver los tipos de dato de cada columna:
display(ufc_raw.dtypes)

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,SubDif,HeightDif,ReachDif,AgeDif,SigStrDif,AvgSubAttDif,AvgTDDif,BetterRank,Finish,FightStance
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,-2,-2.54,10.16,-6,0.25,-0.2,-1.83,Red,KO/TKO,Open Stance
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,-1,5.08,0.0,-5,2.69,0.7,0.2,neither,KO/TKO,Closed Stance
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,1,-2.54,-7.62,3,-1.12,-0.2,1.72,Red,KO/TKO,Open Stance
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,-1,2.54,-2.54,9,2.68,-0.8,-3.62,neither,KO/TKO,Closed Stance
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,0,0.0,-2.54,0,-0.57,0.0,0.25,neither,S-DEC,Closed Stance


Index(['RedFighter', 'BlueFighter', 'RedOdds', 'BlueOdds', 'RedExpectedValue',
       'BlueExpectedValue', 'Date', 'Location', 'Country', 'Winner',
       'TitleBout', 'WeightClass', 'Gender', 'NumberOfRounds',
       'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws',
       'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought',
       'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority',
       'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous',
       'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage',
       'BlueWins', 'BlueStance', 'BlueHeightCms', 'BlueReachCms',
       'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak',
       'RedDraws', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought',
       'RedTotalTitleBouts', 'RedWinsByDecisionMajority',
       'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO',
       'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins',
       'RedStance', 'Red

RedFighter           object
BlueFighter          object
RedOdds             float64
BlueOdds            float64
RedExpectedValue    float64
                     ...   
AvgSubAttDif        float64
AvgTDDif            float64
BetterRank           object
Finish               object
FightStance          object
Length: 70, dtype: object

# Create the target value: **0** (Fighter Red wins) or **1** (Fighter Blue wins)

In [26]:
ufc_raw['label'] = ufc_raw['Winner'].apply(lambda x: 1 if x == 'Blue' else 0)
ufc_raw=ufc_raw.drop('Winner', axis=1)

ufc_deploy['label'] = ufc_deploy['Winner'].apply(lambda x: 1 if x == 'Blue' else 0)
ufc_deploy=ufc_deploy.drop('Winner', axis=1)

# Save

In [27]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/processed/ufc_etl.csv', index=False)
logger.info("✅ ETL file saved as 'ufc_etl.csv'.")

[INFO] ✅ ETL file saved as 'ufc_etl.csv'.


In [28]:
# Save the cleaned file
ufc_deploy.to_csv(f'{project_root}/data/processed/ufc_deploy.csv', index=False)
logger.info("✅ Deploy file saved as 'ufc_deploy.csv'.")

[INFO] ✅ Deploy file saved as 'ufc_deploy.csv'.


# Create column from Dates and Fighter names for the Dataset

# Load Data

In [29]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'raw', 'ufc_raw.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 6541 rows, 118 columns.


## Transform Date to datetime64

In [30]:
ufc_raw['Date']

0       2024-12-14
1       2024-12-14
2       2024-12-14
3       2024-12-14
4       2024-12-14
           ...    
6536    2010-03-21
6537    2010-03-21
6538    2010-03-21
6539    2010-03-21
6540    2010-03-21
Name: Date, Length: 6541, dtype: object

In [31]:
ufc_raw['Date'] = pd.to_datetime(ufc_raw['Date'], errors='coerce')

In [32]:
n_invalid = ufc_raw['Date'].isna().sum()
print(f" ✅ Conversion completed. Invalid dates: {n_invalid}")

In [33]:
ufc_raw['Date']

0      2024-12-14
1      2024-12-14
2      2024-12-14
3      2024-12-14
4      2024-12-14
          ...    
6536   2010-03-21
6537   2010-03-21
6538   2010-03-21
6539   2010-03-21
6540   2010-03-21
Name: Date, Length: 6541, dtype: datetime64[ns]

In [34]:
# Ordenar por fecha ascendente
ufc_raw = ufc_raw.sort_values(by='Date').reset_index(drop=True)

In [35]:
ufc_raw['Date']

0      2010-03-21
1      2010-03-21
2      2010-03-21
3      2010-03-21
4      2010-03-21
          ...    
6536   2024-12-14
6537   2024-12-14
6538   2024-12-14
6539   2024-12-14
6540   2024-12-14
Name: Date, Length: 6541, dtype: datetime64[ns]

In [36]:
ufc_raw['RedFighter'] = ufc_raw['RedFighter'].str.strip()
ufc_raw['BlueFighter'] = ufc_raw['BlueFighter'].str.strip()
ufc_raw

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Eric Schafer,Jason Brilz,140.0,-160.0,140.0000,62.5000,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,...,,3.0,5:00,900.0,,,,,,
1,Brandon Vera,Jon Jones,215.0,-235.0,215.0000,42.5532,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,...,Elbow,1.0,3:19,199.0,,,,,,
2,Junior Dos Santos,Gabriel Gonzaga,-250.0,230.0,40.0000,230.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Punches,1.0,3:53,233.0,,,,,,
3,Cheick Kongo,Paul Buentello,-345.0,315.0,28.9855,315.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Elbows,3.0,1:16,676.0,,,,,,
4,Alessio Sakara,James Irvin,-120.0,100.0,83.3333,100.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,,1.0,3:01,181.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6536,Miranda Maverick,Jamey-Lyn Horth,-625.0,455.0,16.0000,455.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,,3.0,5:00,900.0,-295.0,650.0,500.0,2500.0,1400.0,2200.0
6537,Davey Grant,Ramon Taveras,-122.0,102.0,81.9672,102.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,,3.0,5:00,900.0,225.0,350.0,750.0,800.0,350.0,300.0
6538,Josefine Knutsson,Piera Rodriguez,-245.0,200.0,40.8163,200.0000,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,5:00,900.0,-175.0,300.0,1800.0,1400.0,800.0,1200.0
6539,Michael Johnson,Ottman Azaitar,-230.0,190.0,43.4783,190.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punch,2.0,2:03,423.0,300.0,650.0,1000.0,2500.0,120.0,350.0


## Create DaysSinceLastFight: Days since his last UFC fight

In [37]:
# Sort the DataFrame by ascending fight date
ufc_raw = ufc_raw.sort_values(by='Date').reset_index(drop=True)

# Dictionary to store each fighter's last fight date
last_fight_date_by_fighter = {}

# Lists to store the computed days since last fight
red_days_since_last_fight = []
blue_days_since_last_fight = []

# Iterate through each fight record
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Days since last fight for Red corner
    red_last_date = last_fight_date_by_fighter.get(red_fighter, None)
    if red_last_date is None:
        red_days_since_last_fight.append(np.nan)
    else:
        red_days_since_last_fight.append((current_date - red_last_date).days)

    # Days since last fight for Blue corner
    blue_last_date = last_fight_date_by_fighter.get(blue_fighter, None)
    if blue_last_date is None:
        blue_days_since_last_fight.append(np.nan)
    else:
        blue_days_since_last_fight.append((current_date - blue_last_date).days)

    # Update last seen date for both fighters
    last_fight_date_by_fighter[red_fighter] = current_date
    last_fight_date_by_fighter[blue_fighter] = current_date

# Add the results as new columns in the dataframe
ufc_raw['RedDaysSinceLastFight'] = red_days_since_last_fight
ufc_raw['BlueDaysSinceLastFight'] = blue_days_since_last_fight

In [38]:
ufc_raw[['RedFighter', 'Date','RedDaysSinceLastFight']].sample(5)

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight
4129,Karolina Kowalkiewicz,2020-02-22,259.0
4777,Carlos Condit,2021-07-10,175.0
4221,Aljamain Sterling,2020-06-06,364.0
3369,Alex Perez,2018-08-04,161.0
6000,Rob Font,2023-12-02,119.0


In [39]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight
39,Anderson Silva,2010-04-10,
109,Anderson Silva,2010-08-07,119.0
238,Anderson Silva,2011-02-05,182.0
386,Anderson Silva,2011-08-27,203.0
689,Anderson Silva,2012-07-07,315.0
773,Anderson Silva,2012-10-13,98.0
1019,Anderson Silva,2013-07-06,266.0
2245,Anderson Silva,2016-02-27,791.0
2696,Anderson Silva,2017-02-11,217.0


## Create DaysSinceDebut column: Days since his first UFC fight

In [40]:
# Step 1: Build debut date dictionary for all fighters
debut_date_by_fighter = {}

# Iterate through the dataset chronologically
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Store the first appearance date if not already present
    if red_fighter not in debut_date_by_fighter:
        debut_date_by_fighter[red_fighter] = current_date
    if blue_fighter not in debut_date_by_fighter:
        debut_date_by_fighter[blue_fighter] = current_date

# Step 2: Compute days since debut for Red and Blue corners
ufc_raw['RedDaysSinceDebut'] = ufc_raw.apply(
    lambda row: (row['Date'] - debut_date_by_fighter[row['RedFighter']]).days,
    axis=1
)

ufc_raw['BlueDaysSinceDebut'] = ufc_raw.apply(
    lambda row: (row['Date'] - debut_date_by_fighter[row['BlueFighter']]).days,
    axis=1
)

In [41]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut
39,Anderson Silva,2010-04-10,,0
109,Anderson Silva,2010-08-07,119.0,119
238,Anderson Silva,2011-02-05,182.0,301
386,Anderson Silva,2011-08-27,203.0,504
689,Anderson Silva,2012-07-07,315.0,819
773,Anderson Silva,2012-10-13,98.0,917
1019,Anderson Silva,2013-07-06,266.0,1183
2245,Anderson Silva,2016-02-27,791.0,2149
2696,Anderson Silva,2017-02-11,217.0,2499


In [42]:
# Step 1: Create debut flags
ufc_raw['RedIsDebut'] = ufc_raw['RedDaysSinceLastFight'].isna().map({True: 'Yes', False: 'No'})
ufc_raw['BlueIsDebut'] = ufc_raw['BlueDaysSinceLastFight'].isna().map({True: 'Yes', False: 'No'})

# Step 2: Impute NaN with median value in the debut.
ufc_raw['RedDaysSinceLastFight'].fillna(180, inplace=True) 
ufc_raw['BlueDaysSinceLastFight'].fillna(180, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['RedDaysSinceLastFight'].fillna(180, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['BlueDaysSinceLastFight'].fillna(180, inplace=True)


In [43]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut
39,Anderson Silva,2010-04-10,180.0,0,Yes
109,Anderson Silva,2010-08-07,119.0,119,No
238,Anderson Silva,2011-02-05,182.0,301,No
386,Anderson Silva,2011-08-27,203.0,504,No
689,Anderson Silva,2012-07-07,315.0,819,No
773,Anderson Silva,2012-10-13,98.0,917,No
1019,Anderson Silva,2013-07-06,266.0,1183,No
2245,Anderson Silva,2016-02-27,791.0,2149,No
2696,Anderson Silva,2017-02-11,217.0,2499,No


## Create FightsInLastYear/FightsInLast6Months : Number of fights in the last year/6 months

In [44]:
from collections import defaultdict

# Dictionary of past fight dates for each fighter
fight_history = defaultdict(list)

# Output lists
red_fights_last_year = []
blue_fights_last_year = []

# Iterate over each row in chronological order
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Count red's fights in the past 365 days
    red_past_fights = [d for d in fight_history[red_fighter] if 0 < (current_date - d).days <= 365]
    red_fights_last_year.append(len(red_past_fights))

    # Count blue's fights in the past 365 days
    blue_past_fights = [d for d in fight_history[blue_fighter] if 0 < (current_date - d).days <= 365]
    blue_fights_last_year.append(len(blue_past_fights))

    # Update fight history with current fight date
    fight_history[red_fighter].append(current_date)
    fight_history[blue_fighter].append(current_date)

# Add the new columns to the dataframe
ufc_raw['RedFightsInLastYear'] = red_fights_last_year
ufc_raw['BlueFightsInLastYear'] = blue_fights_last_year

In [45]:
from collections import defaultdict

# Dictionary to store past fight dates for each fighter
fight_history_6m = defaultdict(list)

# Output columns
red_fights_last_6m = []
blue_fights_last_6m = []

# Iterate through the dataset chronologically
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Count red's fights in the past 183 days (6 months)
    red_past_fights = [d for d in fight_history_6m[red_fighter] if 0 < (current_date - d).days <= 183]
    red_fights_last_6m.append(len(red_past_fights))

    # Count blue's fights in the past 183 days (6 months)
    blue_past_fights = [d for d in fight_history_6m[blue_fighter] if 0 < (current_date - d).days <= 183]
    blue_fights_last_6m.append(len(blue_past_fights))

    # Update fight history with current fight
    fight_history_6m[red_fighter].append(current_date)
    fight_history_6m[blue_fighter].append(current_date)

# Assign to new columns
ufc_raw['RedFightsInLast6Months'] = red_fights_last_6m
ufc_raw['BlueFightsInLast6Months'] = blue_fights_last_6m

In [46]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0
109,Anderson Silva,2010-08-07,119.0,119,No,1,1
238,Anderson Silva,2011-02-05,182.0,301,No,2,1
386,Anderson Silva,2011-08-27,203.0,504,No,1,0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0


## Create DaysSinceLastWin : Number of fights in the last year/6 months

In [47]:
from collections import defaultdict

# Initialize win history tracker
last_win_date_by_fighter = {}

# Output columns
red_days_since_last_win = []
blue_days_since_last_win = []

# Iterate over ufc_raw in chronological order
for idx, row in ufc_raw.iterrows():
    date = row['Date']
    red = row['RedFighter']
    blue = row['BlueFighter']
    winner = row['Winner']  # Should be 'Red', 'Blue' or 'Draw'

    # Red corner
    red_last_win_date = last_win_date_by_fighter.get(red, None)
    if red_last_win_date is None:
        red_days_since_last_win.append(np.nan)
    else:
        red_days_since_last_win.append((date - red_last_win_date).days)

    # Blue corner
    blue_last_win_date = last_win_date_by_fighter.get(blue, None)
    if blue_last_win_date is None:
        blue_days_since_last_win.append(np.nan)
    else:
        blue_days_since_last_win.append((date - blue_last_win_date).days)

    # Update win dates
    if winner == 'Red':
        last_win_date_by_fighter[red] = date
    elif winner == 'Blue':
        last_win_date_by_fighter[blue] = date
    # No update for draw

# Add to DataFrame
ufc_raw['RedDaysSinceLastWin'] = red_days_since_last_win
ufc_raw['BlueDaysSinceLastWin'] = blue_days_since_last_win

In [48]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months', 'RedDaysSinceLastWin']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months,RedDaysSinceLastWin
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0,
109,Anderson Silva,2010-08-07,119.0,119,No,1,1,119.0
238,Anderson Silva,2011-02-05,182.0,301,No,2,1,182.0
386,Anderson Silva,2011-08-27,203.0,504,No,1,0,203.0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0,315.0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1,98.0
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0,266.0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0,1232.0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0,1582.0


In [49]:
ufc_raw['RedDaysSinceLastWin'].median()

246.0

In [50]:
# FIll with median, moodel already know if it is the first fight of a fighter
ufc_raw['RedDaysSinceLastWin'].fillna(246, inplace=True)
ufc_raw['BlueDaysSinceLastWin'].fillna(246, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['RedDaysSinceLastWin'].fillna(246, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['BlueDaysSinceLastWin'].fillna(246, inplace=True)


In [51]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months', 'RedDaysSinceLastWin']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months,RedDaysSinceLastWin
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0,246.0
109,Anderson Silva,2010-08-07,119.0,119,No,1,1,119.0
238,Anderson Silva,2011-02-05,182.0,301,No,2,1,182.0
386,Anderson Silva,2011-08-27,203.0,504,No,1,0,203.0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0,315.0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1,98.0
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0,266.0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0,1232.0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0,1582.0


In [52]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/raw/ufc_raw_dates_ft.csv', index=False)
logger.info("✅ Raw with Date Features file saved as 'ufc_raw_dates_ft.csv'.")

[INFO] ✅ Raw with Date Features file saved as 'ufc_raw_dates_ft.csv'.


<div style="text-align: center;">
     <img src="../img/ufc_logo.png" width="800" /> 
</div>