# 🔹UFC Fight Predictor ETL

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Import Libraries and Setup Environment

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Get the current working directory
current_dir = os.getcwd()

# Navigate to the project root
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Import from /src
sys.path.append(os.path.join(project_root))
from src.helpers import *
from src.data import UFCData

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Load Data

In [2]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'raw', 'ufc_raw.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 8250 rows, 124 columns.


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Preview

In [3]:
# Preview the first few records
display(ufc_raw.head())

# General dataset information
ufc_raw.info()

Unnamed: 0,event_id,event_name,date,location,fight_id,division,title_fight,method,finish_round,match_time_sec,...,b_splm,b_str_acc,b_sapm,b_str_def,b_td_avg,b_td_avg_acc,b_td_def,b_sub_avg,winner,winner_id
0,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",36ec204f47e4d613,catch weight,0,Submission,1,275,...,3.11,48,3.08,50,5.82,45,40,0.7,Myktybek Orolbai,bf2c8e01b07d3eb1
1,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",a1afc16e21d1a807,lightweight,0,Decision - Unanimous,3,300,...,6.55,45,4.33,56,0.0,0,75,0.4,Rafael Fiziev,c814b4c899793af6
2,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",7513a00037094075,lightweight,0,KO/TKO,2,257,...,4.13,38,5.28,52,0.0,0,83,0.0,Nazim Sadykhov,ff62013d2fce6d13
3,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",e512b80bbaea36c2,welterweight,0,Decision - Unanimous,3,300,...,3.5,48,2.24,60,1.7,44,55,0.0,Seokhyeon Ko,4a07b1988477502c
4,400c7b43c86d27d3,UFC Fight Night: Hill vs. Rountree Jr.,2025/06/21,"Baku, Azerbaijan",03bc32bdb5a33496,light heavyweight,0,Decision - Unanimous,5,300,...,3.79,40,4.24,49,0.0,0,58,0.1,Khalil Rountree Jr.,749f572d1d3161fb


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8250 entries, 0 to 8249
Columns: 124 entries, event_id to winner_id
dtypes: float64(87), int64(17), object(20)
memory usage: 7.8+ MB


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Check 

In [4]:
pd.set_option('display.max_rows', None)

In [5]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

# Duplicate analysis
duplicates = ufc_raw.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 total_rounds             31
referee                  26
r_kd                     21
r_sig_str_landed         21
r_sig_str_atmpted        21
r_sig_str_acc            59
r_total_str_landed       21
r_total_str_atmpted      21
r_total_str_acc          48
r_td_landed              21
r_td_atmpted             21
r_td_acc               2658
r_sub_att                21
r_ctrl                  202
r_head_landed            21
r_head_atmpted           21
r_head_acc               99
r_body_landed            21
r_body_atmpted           21
r_body_acc              851
r_leg_landed             21
r_leg_atmpted            21
r_leg_acc              1568
r_dist_landed            21
r_dist_atmpted           21
r_dist_acc              139
r_clinch_landed          21
r_clinch_atmpted         21
r_clinch_acc           1977
r_ground_landed          21
r_ground_atmpted         21
r_ground_acc           2916
r_landed_head_per        21
r_landed_body_per        21
r_landed_leg_per      

In [6]:
for col in ufc_raw.columns:
    print(col)

event_id
event_name
date
location
fight_id
division
title_fight
method
finish_round
match_time_sec
total_rounds
referee
r_name
r_id
r_kd
r_sig_str_landed
r_sig_str_atmpted
r_sig_str_acc
r_total_str_landed
r_total_str_atmpted
r_total_str_acc
r_td_landed
r_td_atmpted
r_td_acc
r_sub_att
r_ctrl
r_head_landed
r_head_atmpted
r_head_acc
r_body_landed
r_body_atmpted
r_body_acc
r_leg_landed
r_leg_atmpted
r_leg_acc
r_dist_landed
r_dist_atmpted
r_dist_acc
r_clinch_landed
r_clinch_atmpted
r_clinch_acc
r_ground_landed
r_ground_atmpted
r_ground_acc
r_landed_head_per
r_landed_body_per
r_landed_leg_per
r_landed_dist_per
r_landed_clinch_per
r_landed_ground_per
r_nick_name
r_wins
r_losses
r_draws
r_height
r_weight
r_reach
r_stance
r_dob
r_splm
r_str_acc
r_sapm
r_str_def
r_td_avg
r_td_avg_acc
r_td_def
r_sub_avg
b_name
b_id
b_kd
b_sig_str_landed
b_sig_str_atmpted
b_sig_str_acc
b_total_str_landed
b_total_str_atmpted
b_total_str_acc
b_td_landed
b_td_atmpted
b_td_acc
b_sub_att
b_ctrl
b_head_landed
b_head_atm

In [7]:
pd.reset_option('display.max_rows')

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Create Temporal Features

## Data Leakage

In [8]:
ufc_raw[ufc_raw['b_name'] == 'Rafael Fiziev'][['b_name', 'event_name','b_wins', 'b_losses', 'b_draws','b_splm','b_str_acc', 'b_sapm', 'b_str_def', 'b_td_avg', 'b_td_acc', 'b_td_def', 'b_sub_avg']]

Unnamed: 0,b_name,event_name,b_wins,b_losses,b_draws,b_splm,b_str_acc,b_sapm,b_str_def,b_td_avg,b_td_acc,b_td_def,b_sub_avg
169,Rafael Fiziev,UFC 313: Pereira vs. Ankalaev,13,4,0,4.77,52,4.77,50,0.89,100.0,90,0.0
1193,Rafael Fiziev,UFC 286: Edwards vs. Usman 3,13,4,0,4.77,52,4.77,50,0.89,,90,0.0
1536,Rafael Fiziev,UFC Fight Night: Dos Anjos vs. Fiziev,13,4,0,4.77,52,4.77,50,0.89,0.0,90,0.0
1853,Rafael Fiziev,UFC Fight Night: Font vs. Aldo,13,4,0,4.77,52,4.77,50,0.89,,90,0.0
2034,Rafael Fiziev,UFC 265: Lewis vs. Gane,13,4,0,4.77,52,4.77,50,0.89,,90,0.0
2341,Rafael Fiziev,UFC 256: Figueiredo vs. Moreno,13,4,0,4.77,52,4.77,50,0.89,0.0,90,0.0
2570,Rafael Fiziev,UFC Fight Night: Figueiredo vs. Benavidez 2,13,4,0,4.77,52,4.77,50,0.89,50.0,90,0.0
2861,Rafael Fiziev,UFC Fight Night: Maia vs. Askren,13,4,0,4.77,52,4.77,50,0.89,100.0,90,0.0
3153,Rafael Fiziev,UFC Fight Night: Overeem vs. Oleinik,13,4,0,4.77,52,4.77,50,0.89,,90,0.0


In [9]:
ufc_raw = ufc_raw.drop(['b_wins', 'b_losses', 'b_draws','b_splm','b_str_acc', 'b_sapm', 'b_str_def', 'b_td_avg', 'b_td_acc', 'b_td_def', 'b_sub_avg',
                       'r_wins', 'r_losses', 'r_draws','r_splm','r_str_acc', 'r_sapm', 'r_str_def', 'r_td_avg', 'r_td_acc', 'r_td_def', 'r_sub_avg'], axis=1)

## Create Winner Corner

In [10]:
ufc_raw['winner_corner'] = ufc_raw.apply(
    lambda row: 'Red' if row['winner'] == row['r_name'] 
    else ('Blue' if row['winner'] == row['b_name'] else None), 
    axis=1
)
ufc_raw['winner_corner_bin'] = ufc_raw['winner_corner'].map({'Red': 0, 'Blue': 1, None: 2})
ufc_raw[['r_name', 'b_name', 'winner', 'winner_corner', 'winner_corner_bin']].head()

Unnamed: 0,r_name,b_name,winner,winner_corner,winner_corner_bin
0,Tofiq Musayev,Myktybek Orolbai,Myktybek Orolbai,Blue,1
1,Rafael Fiziev,Ignacio Bahamondes,Rafael Fiziev,Red,0
2,Nazim Sadykhov,Nikolas Motta,Nazim Sadykhov,Red,0
3,Seokhyeon Ko,Oban Elliott,Seokhyeon Ko,Red,0
4,Jamahal Hill,Khalil Rountree Jr.,Khalil Rountree Jr.,Blue,1


## Check Fights with no winners

In [11]:
ufc_raw['winner_corner'].unique()

array(['Blue', 'Red', None], dtype=object)

In [12]:
ufc_raw[ufc_raw['winner_corner'] == None]

Unnamed: 0,event_id,event_name,date,location,fight_id,division,title_fight,method,finish_round,match_time_sec,...,b_height,b_weight,b_reach,b_stance,b_dob,b_td_avg_acc,winner,winner_id,winner_corner,winner_corner_bin


In [13]:
ufc_raw['winner_corner_bin'].unique()

array([1, 0, 2])

In [14]:
ufc_raw[ufc_raw['winner_corner_bin'] == 2].tail()

Unnamed: 0,event_id,event_name,date,location,fight_id,division,title_fight,method,finish_round,match_time_sec,...,b_height,b_weight,b_reach,b_stance,b_dob,b_td_avg_acc,winner,winner_id,winner_corner,winner_corner_bin
8033,afaad7d6a581e307,UFC 22: Only One Can be Champion,1999/09/24,"Lake Charles, Louisiana, USA",96c247ba0eb1ac65,heavyweight,0,Decision - Unanimous,3,300,...,187.96,127.01,,Orthodox,1965/11/23,69,,,,2
8041,afaad7d6a581e307,UFC 22: Only One Can be Champion,1999/09/24,"Lake Charles, Louisiana, USA",a1c1e3c1e9c6cf1e,lightweight,0,Decision - Majority,2,300,...,160.02,65.77,,,1970/06/10,0,,,,2
8110,29f935654825331b,UFC - Ultimate Japan,1997/12/21,"Yokohama, Kanagawa, Japan",2750ac5854e8b28b,heavyweight,0,Overturned,1,111,...,190.5,109.77,,Orthodox,,0,,,,2
8213,5af480a3b2e1726b,UFC 7: The Brawl in Buffalo,1995/09/08,"Buffalo, New York, USA",3932f8e9a74f3d11,superfight championship,0,Other,2,180,...,182.88,95.25,,Orthodox,1967/08/26,0,,,,2
8240,dedc3bb440d09554,UFC 5: The Return of the Beast,1995/04/07,"Charlotte, North Carolina, USA",db8df615610f3632,superfight championship,0,Other,2,300,...,185.42,79.38,,Southpaw,1966/12/12,0,,,,2


In [15]:
draw_methods = ['Decision - Majority', 'Decision - Split', 'Decision - Unanimous']

def determine_winner(row):
    if row['winner'] == row['r_name']:
        return 'Red'
    elif row['winner'] == row['b_name']:
        return 'Blue'
    elif row['method'] in draw_methods:
        return 'Draw'
    else:
        return None

ufc_raw['winner_corner'] = ufc_raw.apply(determine_winner, axis=1)

ufc_raw['winner_corner_bin'] = ufc_raw['winner_corner'].map({
    'Red': 0,
    'Blue': 1,
    'Draw': 2,
    None: 3
})

ufc_raw[ufc_raw['winner_corner_bin']==3]

Unnamed: 0,event_id,event_name,date,location,fight_id,division,title_fight,method,finish_round,match_time_sec,...,b_height,b_weight,b_reach,b_stance,b_dob,b_td_avg_acc,winner,winner_id,winner_corner,winner_corner_bin
66,de277a4abcfeea46,UFC Fight Night: Usman vs. Buckley,2025/06/14,"Atlanta, Georgia, USA",13e2ff8b3a122094,light heavyweight,0,Could Not Continue,1,299,...,190.50,92.99,190.50,Orthodox,1996/02/05,36,,,,3
511,66e0a70352fef46a,UFC Fight Night: Namajunas vs. Cortez,2024/07/13,"Denver, Colorado, USA",46fa8e79fe8a9539,middleweight,0,Could Not Continue,1,37,...,182.88,83.91,182.88,Orthodox,1994/05/16,47,,,,3
672,c398235fcaf8d71d,UFC Fight Night: Tuivasa vs. Tybura,2024/03/16,"Las Vegas, Nevada, USA",a7fd9da657bbce8e,welterweight,0,Could Not Continue,2,60,...,177.80,77.11,187.96,Orthodox,1993/03/18,39,,,,3
729,eaea0fc7b76525a8,UFC Fight Night: Hermansson vs. Pyfer,2024/02/10,"Las Vegas, Nevada, USA",6cd542650d886c22,bantamweight,0,Could Not Continue,2,208,...,170.18,61.23,175.26,Orthodox,1993/06/25,46,,,,3
751,cce79e827569f26e,UFC Fight Night: Dolidze vs. Imavov,2024/02/03,"Las Vegas, Nevada, USA",415ea2520b7ba720,middleweight,0,Could Not Continue,1,11,...,187.96,83.91,190.50,Orthodox,1990/02/08,66,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7925,9fd1f08dd4aec14a,UFC 37: High Impact,2002/05/10,"Bossier City, Louisiana, USA",fe60fd0ee67902dc,welterweight,0,Overturned,1,27,...,180.34,77.11,,Orthodox,1973/05/20,40,,,,3
7973,2b1587a3376ab743,UFC 30: Battle on the Boardwalk,2001/02/23,"Atlantic City, New Jersey, USA",651da45cc83ce011,heavyweight,0,Overturned,1,207,...,182.88,120.20,,Orthodox,,0,,,,3
8110,29f935654825331b,UFC - Ultimate Japan,1997/12/21,"Yokohama, Kanagawa, Japan",2750ac5854e8b28b,heavyweight,0,Overturned,1,111,...,190.50,109.77,,Orthodox,,0,,,,3
8213,5af480a3b2e1726b,UFC 7: The Brawl in Buffalo,1995/09/08,"Buffalo, New York, USA",3932f8e9a74f3d11,superfight championship,0,Other,2,180,...,182.88,95.25,,Orthodox,1967/08/26,0,,,,3


In [16]:
ufc_raw = ufc_raw[ufc_raw['winner_corner_bin'] != 3].copy()

In [17]:
ufc_raw['winner_corner_bin'].unique()

array([1, 0, 2])

## Create Historical Wins-Losses, Win-Lose streaks(current/longest) columns

In [18]:
# Convert to datetime
ufc_raw['date'] = pd.to_datetime(ufc_raw['date'], errors='coerce')
ufc_raw = ufc_raw.sort_values('date').reset_index(drop=True)
ufc_raw['date']

0      1994-03-11
1      1994-03-11
2      1994-03-11
3      1994-03-11
4      1994-03-11
          ...    
8156   2025-07-12
8157   2025-07-12
8158   2025-07-12
8159   2025-07-12
8160   2025-07-12
Name: date, Length: 8161, dtype: datetime64[ns]

In [19]:
# Initialize columns
for corner in ['r', 'b']:
    ufc_raw[f'{corner}_wins'] = 0
    ufc_raw[f'{corner}_losses'] = 0
    ufc_raw[f'{corner}_draws'] = 0
    ufc_raw[f'{corner}_total_fights'] = 0
    ufc_raw[f'{corner}_current_win_streak'] = 0
    ufc_raw[f'{corner}_current_lose_streak'] = 0
    ufc_raw[f'{corner}_longest_win_streak'] = 0
    ufc_raw[f'{corner}_longest_lose_streak'] = 0

# Dictionary to keep track of fighter history
history = {}

# Loop through all fights
for idx, row in ufc_raw.iterrows():
    for corner in ['r', 'b']:
        fighter = row[f"{corner}_name"]

        # Initialize history if fighter is new
        if fighter not in history:
            history[fighter] = {
                'wins': 0,
                'losses': 0,
                'draws': 0,
                'win_streak': 0,
                'lose_streak': 0,
                'longest_win_streak': 0,
                'longest_lose_streak': 0
            }

        # Save prior stats before the fight
        stats = history[fighter]
        total = stats['wins'] + stats['losses'] + stats['draws']

        ufc_raw.at[idx, f"{corner}_wins"] = stats['wins']
        ufc_raw.at[idx, f"{corner}_losses"] = stats['losses']
        ufc_raw.at[idx, f"{corner}_draws"] = stats['draws']
        ufc_raw.at[idx, f"{corner}_total_fights"] = total
        ufc_raw.at[idx, f"{corner}_current_win_streak"] = stats['win_streak']
        ufc_raw.at[idx, f"{corner}_current_lose_streak"] = stats['lose_streak']
        ufc_raw.at[idx, f"{corner}_longest_win_streak"] = stats['longest_win_streak']
        ufc_raw.at[idx, f"{corner}_longest_lose_streak"] = stats['longest_lose_streak']

        # Update history with the result of the current fight
        if row['winner_corner'] == 'Red' and corner == 'r':
            stats['wins'] += 1
            stats['win_streak'] += 1
            stats['lose_streak'] = 0
            stats['longest_win_streak'] = max(stats['longest_win_streak'], stats['win_streak'])

        elif row['winner_corner'] == 'Blue' and corner == 'b':
            stats['wins'] += 1
            stats['win_streak'] += 1
            stats['lose_streak'] = 0
            stats['longest_win_streak'] = max(stats['longest_win_streak'], stats['win_streak'])

        elif row['winner_corner'] in ['Red', 'Blue']:
            stats['losses'] += 1
            stats['lose_streak'] += 1
            stats['win_streak'] = 0
            stats['longest_lose_streak'] = max(stats['longest_lose_streak'], stats['lose_streak'])

        elif row['winner_corner'] == 'Draw':
            stats['draws'] += 1
            # reset streaks because draw doesn't extend win/lose streak
            stats['win_streak'] = 0
            stats['lose_streak'] = 0
            
# General record differences
ufc_raw['wins_dif'] = ufc_raw['b_wins'] - ufc_raw['r_wins']
ufc_raw['losses_dif'] = ufc_raw['b_losses'] - ufc_raw['r_losses']
ufc_raw['draws_dif'] = ufc_raw['b_draws'] - ufc_raw['r_draws']
ufc_raw['total_fights_dif'] = ufc_raw['b_total_fights'] - ufc_raw['r_total_fights']

# Streak differences
ufc_raw['current_win_streak_dif'] = ufc_raw['b_current_win_streak'] - ufc_raw['r_current_win_streak']
ufc_raw['current_lose_streak_dif'] = ufc_raw['b_current_lose_streak'] - ufc_raw['r_current_lose_streak']
ufc_raw['longest_win_streak_dif'] = ufc_raw['b_longest_win_streak'] - ufc_raw['r_longest_win_streak']
ufc_raw['longest_lose_streak_dif'] = ufc_raw['b_longest_lose_streak'] - ufc_raw['r_longest_lose_streak']

# Check results
ufc_raw[['date','r_name','b_name','winner_corner',
         'r_wins','r_losses','r_draws','r_total_fights',
         'b_wins','b_losses','b_draws','b_total_fights']].tail(15)

Unnamed: 0,date,r_name,b_name,winner_corner,r_wins,r_losses,r_draws,r_total_fights,b_wins,b_losses,b_draws,b_total_fights
8146,2025-06-28,Payton Talbott,Felipe Lima,Red,3,1,0,4,2,0,0,2
8147,2025-06-28,Beneil Dariush,Renato Moicano,Red,16,6,1,23,12,6,0,18
8148,2025-06-28,Hyder Amil,Jose Delgado,Blue,3,0,0,3,1,0,0,1
8149,2025-07-12,Derrick Lewis,Tallison Teixeira,Red,19,10,0,29,1,0,0,1
8150,2025-07-12,Nate Landwehr,Morgan Charriere,Blue,5,4,0,9,2,2,0,4
8151,2025-07-12,Calvin Kattar,Steve Garcia,Blue,7,7,0,14,6,2,0,8
8152,2025-07-12,Vitor Petrino,Austen Lane,Red,4,2,0,6,1,3,0,4
8153,2025-07-12,Stephen Thompson,Gabriel Bonfim,Blue,12,8,1,21,4,1,0,5
8154,2025-07-12,Max Griffin,Chris Curtis,Blue,8,9,0,17,5,4,0,9
8155,2025-07-12,Mitch Ramirez,Mike Davis,Blue,0,1,0,1,4,2,0,6


## Checking

In [20]:
ufc_raw[
    (ufc_raw['r_name'] == 'Jon Jones') | (ufc_raw['b_name'] == 'Jon Jones')
][['date','r_name','b_name','winner_corner',
               'r_wins','r_losses','r_total_fights','r_current_win_streak',
               'r_current_lose_streak','r_longest_win_streak','r_longest_lose_streak',
               'b_wins','b_losses','b_total_fights','b_current_win_streak',
               'b_current_lose_streak','b_longest_win_streak','b_longest_lose_streak']].head(15)

Unnamed: 0,date,r_name,b_name,winner_corner,r_wins,r_losses,r_total_fights,r_current_win_streak,r_current_lose_streak,r_longest_win_streak,r_longest_lose_streak,b_wins,b_losses,b_total_fights,b_current_win_streak,b_current_lose_streak,b_longest_win_streak,b_longest_lose_streak
957,2008-08-09,Jon Jones,Andre Gusmao,Red,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1058,2009-01-31,Jon Jones,Stephan Bonnar,Red,1,0,1,1,0,1,0,5,3,8,2,0,3,2
1151,2009-07-11,Jon Jones,Jake O'Brien,Red,2,0,2,2,0,2,0,4,2,6,1,0,3,2
1236,2009-12-05,Matt Hamill,Jon Jones,Red,6,2,8,2,0,3,1,3,0,3,3,0,3,0
1299,2010-03-21,Brandon Vera,Jon Jones,Blue,7,4,11,0,1,4,2,3,1,4,0,1,3,1
1401,2010-08-01,Jon Jones,Vladimir Matyushenko,Red,4,1,5,1,0,3,1,5,2,7,2,0,2,1
1532,2011-02-05,Jon Jones,Ryan Bader,Red,5,1,6,2,0,3,1,5,0,5,5,0,5,0
1568,2011-03-19,Mauricio Rua,Jon Jones,Blue,3,2,5,1,0,2,1,6,1,7,3,0,3,1
1704,2011-09-24,Jon Jones,Quinton Jackson,Red,7,1,8,4,0,4,1,7,2,9,2,0,3,1
1788,2011-12-10,Jon Jones,Lyoto Machida,Red,8,1,9,5,0,5,1,9,2,11,1,0,8,2


## Remove Draws

In [21]:
ufc_raw = ufc_raw[ufc_raw['winner_corner_bin'] != 2].copy()

## Create Historical Wins By KO/TKO, Wins By Decision, Wins By Submission

In [22]:
ufc_raw['method'].unique()

array(['Submission', 'KO/TKO', "TKO - Doctor's Stoppage",
       'Decision - Unanimous', 'Decision - Split', 'Decision - Majority',
       'DQ'], dtype=object)

In [23]:
# Drop disqualification fights before tracking (23 rows)
ufc_raw = ufc_raw[ufc_raw['method'] != 'DQ'].reset_index(drop=True)

In [24]:
# Initialize new columns for method-specific outcomes
for corner in ['r', 'b']:
    for result in ['wins', 'losses']:
        for method in ['ko', 'sub', 'dec']:
            ufc_raw[f"{corner}_{result}_{method}"] = 0

# Dictionary to keep track of method-specific history
method_history = {}

# Loop through fights
for idx, row in ufc_raw.iterrows():
    for corner in ['r', 'b']:
        fighter = row[f"{corner}_name"]

        # Initialize fighter history if not present
        if fighter not in method_history:
            method_history[fighter] = {
                'wins_ko': 0, 'wins_sub': 0, 'wins_dec': 0,
                'losses_ko': 0, 'losses_sub': 0, 'losses_dec': 0
            }

        # Save prior stats
        for result in ['wins', 'losses']:
            for method in ['ko', 'sub', 'dec']:
                ufc_raw.at[idx, f"{corner}_{result}_{method}"] = method_history[fighter][f"{result}_{method}"]

        # Update history with the result of the current fight
        # Determine if fighter won
        fighter_won = ((row['winner_corner'] == 'Red' and corner == 'r') or
                       (row['winner_corner'] == 'Blue' and corner == 'b'))

        if fighter_won:
            if row['method'] in ['KO/TKO', "TKO - Doctor's Stoppage"]:
                method_history[fighter]['wins_ko'] += 1
            elif row['method'] == 'Submission':
                method_history[fighter]['wins_sub'] += 1
            elif row['method'] in ['Decision - Unanimous', 'Decision - Split', 'Decision - Majority']:
                method_history[fighter]['wins_dec'] += 1

        elif row['winner_corner'] in ['Red', 'Blue']:  # lost
            if row['method'] in ['KO/TKO', "TKO - Doctor's Stoppage"]:
                method_history[fighter]['losses_ko'] += 1
            elif row['method'] == 'Submission':
                method_history[fighter]['losses_sub'] += 1
            elif row['method'] in ['Decision - Unanimous', 'Decision - Split', 'Decision - Majority']:
                method_history[fighter]['losses_dec'] += 1

# Wins by method differences
ufc_raw['ko_wins_dif'] = ufc_raw['b_wins_ko'] - ufc_raw['r_wins_ko']
ufc_raw['sub_wins_dif'] = ufc_raw['b_wins_sub'] - ufc_raw['r_wins_sub']
ufc_raw['dec_wins_dif'] = ufc_raw['b_wins_dec'] - ufc_raw['r_wins_dec']

# Losses by method differences
ufc_raw['ko_losses_dif'] = ufc_raw['b_losses_ko'] - ufc_raw['r_losses_ko']
ufc_raw['sub_losses_dif'] = ufc_raw['b_losses_sub'] - ufc_raw['r_losses_sub']
ufc_raw['dec_losses_dif'] = ufc_raw['b_losses_dec'] - ufc_raw['r_losses_dec']

# Quick check
ufc_raw[['date','r_name','b_name','winner_corner','method',
               'r_wins_ko','r_wins_sub','r_wins_dec','r_losses_ko','r_losses_sub','r_losses_dec',
               'b_wins_ko','b_wins_sub','b_wins_dec','b_losses_ko','b_losses_sub','b_losses_dec']].tail(15)


Unnamed: 0,date,r_name,b_name,winner_corner,method,r_wins_ko,r_wins_sub,r_wins_dec,r_losses_ko,r_losses_sub,r_losses_dec,b_wins_ko,b_wins_sub,b_wins_dec,b_losses_ko,b_losses_sub,b_losses_dec
8066,2025-06-28,Payton Talbott,Felipe Lima,Red,Decision - Unanimous,2,1,0,0,0,1,0,1,1,0,0,0
8067,2025-06-28,Beneil Dariush,Renato Moicano,Red,Decision - Unanimous,3,5,8,5,1,0,2,6,4,3,2,1
8068,2025-06-28,Hyder Amil,Jose Delgado,Blue,KO/TKO,2,0,1,0,0,0,1,0,0,0,0,0
8069,2025-07-12,Derrick Lewis,Tallison Teixeira,Red,KO/TKO,15,0,4,7,2,1,1,0,0,0,0,0
8070,2025-07-12,Nate Landwehr,Morgan Charriere,Blue,KO/TKO,1,2,2,3,0,1,2,0,0,0,0,2
8071,2025-07-12,Calvin Kattar,Steve Garcia,Blue,Decision - Unanimous,4,0,3,1,0,6,6,0,0,1,0,1
8072,2025-07-12,Vitor Petrino,Austen Lane,Red,Submission,1,1,2,1,1,0,0,0,1,3,0,0
8073,2025-07-12,Stephen Thompson,Gabriel Bonfim,Blue,Decision - Split,6,0,6,2,1,5,0,3,1,1,0,0
8074,2025-07-12,Max Griffin,Chris Curtis,Blue,Decision - Split,3,0,5,1,1,7,3,0,2,1,0,3
8075,2025-07-12,Mitch Ramirez,Mike Davis,Blue,KO/TKO,0,0,0,1,0,0,1,1,2,0,1,1


In [25]:
ufc_raw[
    (ufc_raw['r_name'] == 'Jon Jones') | (ufc_raw['b_name'] == 'Jon Jones')
][['date','r_name','b_name','winner_corner',
               'r_wins','r_losses','r_total_fights', 'r_wins_ko','r_wins_sub','r_wins_dec',
               'b_wins','b_losses','b_total_fights', 'b_wins_ko','b_wins_sub','b_wins_dec']].head(15)

Unnamed: 0,date,r_name,b_name,winner_corner,r_wins,r_losses,r_total_fights,r_wins_ko,r_wins_sub,r_wins_dec,b_wins,b_losses,b_total_fights,b_wins_ko,b_wins_sub,b_wins_dec
947,2008-08-09,Jon Jones,Andre Gusmao,Red,0,0,0,0,0,0,0,0,0,0,0,0
1048,2009-01-31,Jon Jones,Stephan Bonnar,Red,1,0,1,0,0,1,5,3,8,1,2,2
1141,2009-07-11,Jon Jones,Jake O'Brien,Red,2,0,2,0,0,2,4,2,6,1,0,3
1286,2010-03-21,Brandon Vera,Jon Jones,Blue,7,4,11,4,1,2,3,1,4,0,1,2
1387,2010-08-01,Jon Jones,Vladimir Matyushenko,Red,4,1,5,1,1,2,5,2,7,1,0,4
1515,2011-02-05,Jon Jones,Ryan Bader,Red,5,1,6,2,1,2,5,0,5,2,0,3
1550,2011-03-19,Mauricio Rua,Jon Jones,Blue,3,2,5,3,0,0,6,1,7,2,2,2
1686,2011-09-24,Jon Jones,Quinton Jackson,Red,7,1,8,3,2,2,7,2,9,3,0,4
1770,2011-12-10,Jon Jones,Lyoto Machida,Red,8,1,9,3,3,2,9,2,11,3,1,5
1874,2012-04-21,Jon Jones,Rashad Evans,Red,9,1,10,3,4,2,12,1,14,5,0,7


In [26]:
ufc_raw[
    (ufc_raw['r_name'] == 'Jon Jones') | (ufc_raw['b_name'] == 'Jon Jones')
][['date','r_name','b_name','winner_corner',
               'r_wins','r_losses','r_total_fights','r_losses_ko','r_losses_sub','r_losses_dec',
               'b_wins','b_losses','b_total_fights','b_losses_ko','b_losses_sub','b_losses_dec']].head(15)

Unnamed: 0,date,r_name,b_name,winner_corner,r_wins,r_losses,r_total_fights,r_losses_ko,r_losses_sub,r_losses_dec,b_wins,b_losses,b_total_fights,b_losses_ko,b_losses_sub,b_losses_dec
947,2008-08-09,Jon Jones,Andre Gusmao,Red,0,0,0,0,0,0,0,0,0,0,0,0
1048,2009-01-31,Jon Jones,Stephan Bonnar,Red,1,0,1,0,0,0,5,3,8,0,0,3
1141,2009-07-11,Jon Jones,Jake O'Brien,Red,2,0,2,0,0,0,4,2,6,2,0,0
1286,2010-03-21,Brandon Vera,Jon Jones,Blue,7,4,11,1,0,3,3,1,4,0,0,0
1387,2010-08-01,Jon Jones,Vladimir Matyushenko,Red,4,1,5,0,0,0,5,2,7,1,0,1
1515,2011-02-05,Jon Jones,Ryan Bader,Red,5,1,6,0,0,0,5,0,5,0,0,0
1550,2011-03-19,Mauricio Rua,Jon Jones,Blue,3,2,5,0,1,1,6,1,7,0,0,0
1686,2011-09-24,Jon Jones,Quinton Jackson,Red,7,1,8,0,0,0,7,2,9,0,0,2
1770,2011-12-10,Jon Jones,Lyoto Machida,Red,8,1,9,0,0,0,9,2,11,1,0,1
1874,2012-04-21,Jon Jones,Rashad Evans,Red,9,1,10,0,0,0,12,1,14,1,0,0


## Create Historical Age columns

In [27]:
ufc_raw['r_dob'] = pd.to_datetime(ufc_raw['r_dob'], errors='coerce')
ufc_raw['b_dob'] = pd.to_datetime(ufc_raw['b_dob'], errors='coerce')
ufc_raw['r_dob']

0             NaT
1      1963-08-28
2      1966-12-12
3      1969-07-24
4             NaT
          ...    
8076   1994-08-19
8077   1992-06-13
8078   2000-07-12
8079   1996-09-21
8080   1983-07-27
Name: r_dob, Length: 8081, dtype: datetime64[ns]

In [28]:
# Calculate fighter ages at the time of the fight
ufc_raw['r_age'] = np.floor((ufc_raw['date'] - ufc_raw['r_dob']).dt.days / 365.25)
ufc_raw['b_age'] = np.floor((ufc_raw['date'] - ufc_raw['b_dob']).dt.days / 365.25)

# Add age difference (Red - Blue)
ufc_raw['age_dif'] = ufc_raw['b_age'] - ufc_raw['r_age']

# Check results
ufc_raw[['date','r_name','r_age','b_name','b_age','age_dif']].tail(15)

Unnamed: 0,date,r_name,r_age,b_name,b_age,age_dif
8066,2025-06-28,Payton Talbott,26.0,Felipe Lima,27.0,1.0
8067,2025-06-28,Beneil Dariush,36.0,Renato Moicano,36.0,0.0
8068,2025-06-28,Hyder Amil,35.0,Jose Delgado,27.0,-8.0
8069,2025-07-12,Derrick Lewis,40.0,Tallison Teixeira,25.0,-15.0
8070,2025-07-12,Nate Landwehr,37.0,Morgan Charriere,29.0,-8.0
8071,2025-07-12,Calvin Kattar,37.0,Steve Garcia,33.0,-4.0
8072,2025-07-12,Vitor Petrino,27.0,Austen Lane,37.0,10.0
8073,2025-07-12,Stephen Thompson,42.0,Gabriel Bonfim,27.0,-15.0
8074,2025-07-12,Max Griffin,39.0,Chris Curtis,37.0,-2.0
8075,2025-07-12,Mitch Ramirez,32.0,Mike Davis,32.0,0.0


## Checking

In [29]:
ufc_raw[
    (ufc_raw['r_name'] == 'Jon Jones') | (ufc_raw['b_name'] == 'Jon Jones')
][['date','r_name','b_name','winner_corner',
               'r_wins','r_losses', 'r_age', 'b_age', 'age_dif']]

Unnamed: 0,date,r_name,b_name,winner_corner,r_wins,r_losses,r_age,b_age,age_dif
947,2008-08-09,Jon Jones,Andre Gusmao,Red,0,0,21.0,31.0,10.0
1048,2009-01-31,Jon Jones,Stephan Bonnar,Red,1,0,21.0,31.0,10.0
1141,2009-07-11,Jon Jones,Jake O'Brien,Red,2,0,21.0,24.0,3.0
1286,2010-03-21,Brandon Vera,Jon Jones,Blue,7,4,32.0,22.0,-10.0
1387,2010-08-01,Jon Jones,Vladimir Matyushenko,Red,4,1,23.0,39.0,16.0
1515,2011-02-05,Jon Jones,Ryan Bader,Red,5,1,23.0,27.0,4.0
1550,2011-03-19,Mauricio Rua,Jon Jones,Blue,3,2,29.0,23.0,-6.0
1686,2011-09-24,Jon Jones,Quinton Jackson,Red,7,1,24.0,33.0,9.0
1770,2011-12-10,Jon Jones,Lyoto Machida,Red,8,1,24.0,33.0,9.0
1874,2012-04-21,Jon Jones,Rashad Evans,Red,9,1,24.0,32.0,8.0


## Clean _acc

In [30]:
ufc_raw[['r_sig_str_landed', 'r_sig_str_atmpted', 'r_sig_str_acc',  'r_clinch_landed', 'r_clinch_atmpted', 'r_clinch_acc']]

Unnamed: 0,r_sig_str_landed,r_sig_str_atmpted,r_sig_str_acc,r_clinch_landed,r_clinch_atmpted,r_clinch_acc
0,13.0,29.0,44.0,1.0,2.0,50.0
1,13.0,17.0,76.0,1.0,1.0,100.0
2,0.0,0.0,,0.0,0.0,
3,3.0,5.0,60.0,0.0,0.0,
4,4.0,6.0,66.0,0.0,0.0,
...,...,...,...,...,...,...
8076,3.0,7.0,42.0,1.0,2.0,50.0
8077,0.0,0.0,,0.0,0.0,
8078,92.0,181.0,50.0,12.0,14.0,86.0
8079,24.0,38.0,63.0,10.0,10.0,100.0


In [31]:
# Column categories que tienen landed/attempted/acc
categories = [
    "sig_str", "total_str", "td", 
    "head", "body", "leg", 
    "dist", "clinch", "ground"
]
for corner in ['r', 'b']:
    for cat in categories:
        landed_col = f"{corner}_{cat}_landed"
        att_col = f"{corner}_{cat}_atmpted"
        acc_col = f"{corner}_{cat}_acc"
        
        if landed_col in ufc_raw.columns and att_col in ufc_raw.columns:
            # accuracy = landed/attempted * 100, pero 0 si attempted == 0
            ufc_raw[acc_col] = np.where(
                ufc_raw[att_col] > 0,
                (ufc_raw[landed_col] / ufc_raw[att_col]) * 100,
                0
            ).round(3)


In [32]:
ufc_raw[['r_sig_str_landed', 'r_sig_str_atmpted', 'r_sig_str_acc', 'r_clinch_landed', 'r_clinch_atmpted', 'r_clinch_acc']]

Unnamed: 0,r_sig_str_landed,r_sig_str_atmpted,r_sig_str_acc,r_clinch_landed,r_clinch_atmpted,r_clinch_acc
0,13.0,29.0,44.828,1.0,2.0,50.000
1,13.0,17.0,76.471,1.0,1.0,100.000
2,0.0,0.0,0.000,0.0,0.0,0.000
3,3.0,5.0,60.000,0.0,0.0,0.000
4,4.0,6.0,66.667,0.0,0.0,0.000
...,...,...,...,...,...,...
8076,3.0,7.0,42.857,1.0,2.0,50.000
8077,0.0,0.0,0.000,0.0,0.0,0.000
8078,92.0,181.0,50.829,12.0,14.0,85.714
8079,24.0,38.0,63.158,10.0,10.0,100.000


## Checkpoint

In [33]:
ufc_raw.to_csv("../data/processed/ufc_etl.csv", index=False)

In [34]:
ufc_raw = pd.read_csv("../data/processed/ufc_etl.csv")

## Recalculate UFCStats Features

## 📌 UFC Stats Features

Let $T$ be the total fight duration in seconds (`match_time_sec`), then the time in minutes is:

$$
M = \frac{T}{60}
$$

---

### 1. Significant Strikes Landed per Minute (SLpM)

$$
SLpM = \frac{\text{Significant Strikes Landed}}{M}
$$

---

### 2. Significant Striking Accuracy (Str. Acc.)

$$
Str.\ Acc. = \frac{\text{Significant Strikes Landed}}{\text{Significant Strikes Attempted}} \times 100
$$

---

### 3. Significant Strikes Absorbed per Minute (SApM)

$$
SApM = \frac{\text{Opponent Significant Strikes Landed}}{M}
$$

---

### 4. Significant Strike Defense (Str. Def.)

$$
Str.\ Def. = \Big( 1 - \frac{\text{Opponent Significant Strikes Landed}}{\text{Opponent Significant Strikes Attempted}} \Big) \times 100
$$

---

### 5. Average Takedowns Landed per 15 Minutes (TD Avg.)

$$
TD\ Avg. = \frac{\text{Takedowns Landed}}{M} \times 15
$$

---

### 6. Takedown Accuracy (TD Acc.)

$$
TD\ Acc. = \frac{\text{Takedowns Landed}}{\text{Takedowns Attempted}} \times 100
$$

---

### 7. Takedown Defense (TD Def.)

$$
TD\ Def. = \Big( 1 - \frac{\text{Opponent Takedowns Landed}}{\text{Opponent Takedowns Attempted}} \Big) \times 100
$$

---

### 8. Average Submissions Attempted per 15 Minutes (Sub. Avg.)

$$
Sub.\ Avg. = \frac{\text{Submission Attempts}}{M} \times 15
$$


In [35]:
def add_ufcstats_metrics(df):
    for corner in ['r', 'b']:
        opp = 'b' if corner == 'r' else 'r'
        prefix = f"{corner}_"
        opp_prefix = f"{opp}_"
        
        minutes = df["match_time_sec"] / 60
        
        # SLpM - Significant Strikes Landed per Minute
        df[f"{prefix}SLpM"] = np.where(
            minutes > 0,
            df[f"{prefix}sig_str_landed"] / minutes,
            0
        ).round(3)

        # Str. Acc. - Significant Striking Accuracy
        df[f"{prefix}Str_Acc"] = np.where(
            df[f"{prefix}sig_str_atmpted"] > 0,
            (df[f"{prefix}sig_str_landed"] / df[f"{prefix}sig_str_atmpted"]) * 100,
            0
        ).round(3)

        # SApM - Significant Strikes Absorbed per Minute
        df[f"{prefix}SApM"] = np.where(
            minutes > 0,
            df[f"{opp_prefix}sig_str_landed"] / minutes,
            0
        ).round(3)

        # Str. Def. - Significant Strike Defence
        df[f"{prefix}Str_Def"] = np.where(
            df[f"{opp_prefix}sig_str_atmpted"] > 0,
            (1 - df[f"{opp_prefix}sig_str_landed"] / df[f"{opp_prefix}sig_str_atmpted"]) * 100,
            0
        ).round(3)

        # TD Avg. - Takedowns Landed per 15 minutes
        df[f"{prefix}TD_Avg"] = np.where(
            minutes > 0,
            (df[f"{prefix}td_landed"] / minutes) * 15,
            0
        ).round(3)

        # TD Acc. - Takedown Accuracy
        df[f"{prefix}TD_Acc"] = np.where(
            df[f"{prefix}td_atmpted"] > 0,
            (df[f"{prefix}td_landed"] / df[f"{prefix}td_atmpted"]) * 100,
            0
        ).round(3)

        # TD Def. - Takedown Defense
        df[f"{prefix}TD_Def"] = np.where(
            df[f"{opp_prefix}td_atmpted"] > 0,
            (1 - df[f"{opp_prefix}td_landed"] / df[f"{opp_prefix}td_atmpted"]) * 100,
            0
        ).round(3)

        # Sub. Avg. - Submissions Attempted per 15 minutes
        df[f"{prefix}Sub_Avg"] = np.where(
            minutes > 0,
            (df[f"{prefix}sub_att"] / minutes) * 15,
            0
        ).round(3)

    return df

# Apply to dataframe
ufc_raw = add_ufcstats_metrics(ufc_raw)

In [36]:
check_cols = [
    'r_SLpM', 'r_Str_Acc', 'r_SApM', 'r_Str_Def',
    'r_TD_Avg', 'r_TD_Acc', 'r_TD_Def', 'r_Sub_Avg',
    'b_SLpM', 'b_Str_Acc', 'b_SApM', 'b_Str_Def',
    'b_TD_Avg', 'b_TD_Acc', 'b_TD_Def', 'b_Sub_Avg'
]

df_check = ufc_raw[check_cols].head(20)
df_check

Unnamed: 0,r_SLpM,r_Str_Acc,r_SApM,r_Str_Def,r_TD_Avg,r_TD_Acc,r_TD_Def,r_Sub_Avg,b_SLpM,b_Str_Acc,b_SApM,b_Str_Def,b_TD_Avg,b_TD_Acc,b_TD_Def,b_Sub_Avg
0,4.171,44.828,1.283,42.857,0.0,0.0,0.0,4.813,1.283,57.143,4.171,55.172,0.0,0.0,0.0,0.0
1,26.0,76.471,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,26.0,23.529,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.433,0.0,0.0,0.0,0.0,13.433,100.0,0.0,0.0
3,0.449,60.0,0.0,100.0,0.0,0.0,0.0,11.222,0.0,0.0,0.449,40.0,2.244,100.0,100.0,0.0
4,0.406,66.667,0.102,66.667,1.523,100.0,0.0,1.523,0.102,33.333,0.406,33.333,0.0,0.0,0.0,0.0
5,0.9,64.706,0.327,20.0,1.228,100.0,0.0,0.0,0.327,80.0,0.9,35.294,0.0,0.0,0.0,0.0
6,2.824,66.667,0.706,66.667,0.0,0.0,0.0,0.0,0.706,33.333,2.824,33.333,5.294,100.0,0.0,5.294
7,4.478,41.667,3.582,55.556,0.0,0.0,0.0,13.433,3.582,44.444,4.478,58.333,0.0,0.0,0.0,0.0
8,0.39,50.0,0.584,57.143,2.922,100.0,0.0,5.844,0.584,42.857,0.39,50.0,0.0,0.0,0.0,0.0
9,1.034,100.0,1.034,0.0,0.0,0.0,0.0,15.517,1.034,100.0,1.034,0.0,0.0,0.0,100.0,0.0


In [37]:
# Ensure chronological order
ufc_raw['date'] = pd.to_datetime(ufc_raw['date'])
ufc_raw = ufc_raw.sort_values('date').reset_index(drop=True)

# UFCStats official metrics
metrics = ['SLpM', 'Str_Acc', 'SApM', 'Str_Def', 
           'TD_Avg', 'TD_Acc', 'TD_Def', 'Sub_Avg']

# Initialize new columns (without low_sample flags)
for corner in ['r', 'b']:
    for m in metrics:
        for w in ['last_3', 'last_5', 'career']:
            ufc_raw[f"{corner}_{m}_{w}"] = np.nan

# History dict to track fighters' past metrics
history = {}

# Iterate chronologically over fights
for idx, row in ufc_raw.iterrows():
    for corner in ['r', 'b']:
        fighter = row[f"{corner}_name"]
        
        # Initialize history if new fighter
        if fighter not in history:
            history[fighter] = {m: [] for m in metrics}
        
        # Compute historical averages BEFORE this fight
        for m in metrics:
            past_values = history[fighter][m]
            if past_values:
                # last_3
                ufc_raw.at[idx, f"{corner}_{m}_last_3"] = np.mean(past_values[-3:])
                # last_5
                ufc_raw.at[idx, f"{corner}_{m}_last_5"] = np.mean(past_values[-5:])
                # career
                ufc_raw.at[idx, f"{corner}_{m}_career"] = np.mean(past_values)
            else:
                # No previous fights → 0 values
                ufc_raw.at[idx, f"{corner}_{m}_last_3"] = 0
                ufc_raw.at[idx, f"{corner}_{m}_last_5"] = 0
                ufc_raw.at[idx, f"{corner}_{m}_career"] = 0
        
        # Add this fight's metrics to history
        for m in metrics:
            col = f"{corner}_{m}"
            if col in ufc_raw.columns and not pd.isna(row[col]):
                history[fighter][m].append(row[col])

# Quick check of the new columns
ufc_raw[[c for c in ufc_raw.columns if 'SLpM' in c]].tail(10)

Unnamed: 0,r_SLpM,b_SLpM,r_SLpM_last_3,r_SLpM_last_5,r_SLpM_career,b_SLpM_last_3,b_SLpM_last_5,b_SLpM_career
8071,0.0,4.444,12.88,13.3404,19.728692,2.73,2.73,2.73
8072,2.609,4.348,12.733333,13.506,10.662048,17.885333,13.6906,17.870625
8073,11.8,12.8,5.857333,7.8744,12.333824,27.098333,21.6152,19.356889
8074,5.434,2.491,13.829667,10.3206,10.3206,3.6035,3.6035,3.6035
8075,0.703,0.703,7.038,8.125,8.070833,20.519333,16.67,16.67
8076,5.2,17.0,38.066667,33.8,31.767143,14.474333,18.1338,14.865375
8077,140.0,148.889,10.036,12.7214,14.258,20.140667,16.79375,16.79375
8078,15.429,5.143,28.371,17.8954,12.904897,13.714,13.714,13.714
8079,8.4,8.0,19.101,12.9806,13.53255,11.939333,9.8608,9.8608
8080,7.6,12.8,14.383333,13.0812,12.622357,11.440667,11.440667,11.440667


In [38]:
# Ensure chronological order
ufc_raw['date'] = pd.to_datetime(ufc_raw['date'])
ufc_raw = ufc_raw.sort_values('date').reset_index(drop=True)

# Extra detailed metrics
extra_metrics = [
    'sig_str_landed', 'sig_str_atmpted', 
    'total_str_landed', 'total_str_atmpted', 'total_str_acc',
    'td_landed', 'td_atmpted',
    'sub_att', 'ctrl',
    'head_landed', 'head_atmpted', 'head_acc',
    'body_landed', 'body_atmpted', 'body_acc',
    'leg_landed', 'leg_atmpted', 'leg_acc',
    'dist_landed', 'dist_atmpted', 'dist_acc',
    'clinch_landed', 'clinch_atmpted', 'clinch_acc',
    'ground_landed', 'ground_atmpted', 'ground_acc',
    'landed_head_per', 'landed_body_per', 'landed_leg_per',
    'landed_dist_per', 'landed_clinch_per', 'landed_ground_per'
]

# === 1. Pre-create all new columns at once (without low_sample) ===
new_cols = {}
for corner in ['r', 'b']:
    for m in extra_metrics:
        for w in ['last_3', 'last_5']:
            new_cols[f"{corner}_{m}_{w}"] = np.nan

# Create auxiliary DataFrame with new columns
extra_df = pd.DataFrame(new_cols, index=ufc_raw.index)

# Concatenate once → avoids fragmentation
ufc_raw = pd.concat([ufc_raw, extra_df], axis=1)

# === 2. Build fight history ===
history_extra = {}

# Iterate chronologically over fights
for idx, row in ufc_raw.iterrows():
    for corner in ['r', 'b']:
        fighter = row[f"{corner}_name"]

        # Initialize history if this is the first fight
        if fighter not in history_extra:
            history_extra[fighter] = {m: [] for m in extra_metrics}

        # Compute historical averages BEFORE the fight
        for m in extra_metrics:
            past_values = history_extra[fighter][m]
            if past_values:
                # last_3
                ufc_raw.at[idx, f"{corner}_{m}_last_3"] = np.mean(past_values[-3:])
                # last_5
                ufc_raw.at[idx, f"{corner}_{m}_last_5"] = np.mean(past_values[-5:])
            else:
                # No previous fights → assign 0
                ufc_raw.at[idx, f"{corner}_{m}_last_3"] = 0
                ufc_raw.at[idx, f"{corner}_{m}_last_5"] = 0

        # Update history with current fight values
        for m in extra_metrics:
            col = f"{corner}_{m}"
            if col in ufc_raw.columns and not pd.isna(row[col]):
                history_extra[fighter][m].append(row[col])

# Quick check for clinch accuracy
ufc_raw[[c for c in ufc_raw.columns if 'clinch_acc_last_' in c]].tail(10)

Unnamed: 0,r_clinch_acc_last_3,r_clinch_acc_last_5,b_clinch_acc_last_3,b_clinch_acc_last_5
8071,82.777667,62.238,77.778,83.3335
8072,33.333333,40.0,29.166667,37.5
8073,33.333333,45.8334,20.0,20.0
8074,85.029333,71.0176,37.777667,22.6666
8075,76.666667,67.3334,77.027333,76.2164
8076,28.889,33.8958,13.333333,13.333333
8077,87.5,87.5,37.5,37.5
8078,100.0,100.0,66.666667,70.3174
8079,44.444333,53.3332,45.238,64.6428
8080,46.103667,57.5922,94.444333,94.444333


## Clean Stances

In [39]:
ufc_raw['r_stance'].unique()

array(['Orthodox', 'Southpaw', nan, 'Sideways', 'Switch', 'Open Stance'],
      dtype=object)

In [40]:
ufc_raw['b_stance'].unique()

array(['Southpaw', 'Orthodox', nan, 'Open Stance', 'Switch', 'Sideways'],
      dtype=object)

In [41]:
targets = ['Sideways', 'Open Stance']

print("R Stance problematic counts:")
print(ufc_raw['r_stance'].isin(targets).sum(), "Sideways/Open Stance")
print(ufc_raw['r_stance'].isna().sum(), "NaN")

print("\nB Stance problematic counts:")
print(ufc_raw['b_stance'].isin(targets).sum(), "Sideways/Open Stance")
print(ufc_raw['b_stance'].isna().sum(), "NaN")


R Stance problematic counts:
17 Sideways/Open Stance
26 NaN

B Stance problematic counts:
13 Sideways/Open Stance
67 NaN


In [42]:
# Mapear Sideways y Open Stance a Switch
ufc_raw['r_stance'] = ufc_raw['r_stance'].replace({'Sideways': 'Switch', 'Open Stance': 'Switch'})
ufc_raw['b_stance'] = ufc_raw['b_stance'].replace({'Sideways': 'Switch', 'Open Stance': 'Switch'})

# Opcional: eliminar filas con NaN en stances
ufc_raw = ufc_raw.dropna(subset=['r_stance', 'b_stance'])

print("R stance counts:")
print(ufc_raw['r_stance'].value_counts(dropna=False))
print("\nB stance counts:")
print(ufc_raw['b_stance'].value_counts(dropna=False))

R stance counts:
r_stance
Orthodox    5966
Southpaw    1582
Switch       452
Name: count, dtype: int64

B stance counts:
b_stance
Orthodox    6009
Southpaw    1530
Switch       461
Name: count, dtype: int64


# Save Data

In [43]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/processed/ufc_etl.csv', index=False)
logger.info("✅ ETL file saved as 'ufc_etl.csv'.")

[INFO] ✅ ETL file saved as 'ufc_etl.csv'.


# Load Data

In [44]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'processed', 'ufc_etl.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 8000 rows, 347 columns.


# Check 

In [45]:
pd.set_option('display.max_rows', None)

In [46]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

# Duplicate analysis
duplicates = ufc_raw.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 total_rounds             22
referee                  25
r_kd                      6
r_sig_str_landed          6
r_sig_str_atmpted         6
r_total_str_landed        6
r_total_str_atmpted       6
r_td_landed               6
r_td_atmpted              6
r_sub_att                 6
r_ctrl                  164
r_head_landed             6
r_head_atmpted            6
r_body_landed             6
r_body_atmpted            6
r_leg_landed              6
r_leg_atmpted             6
r_dist_landed             6
r_dist_atmpted            6
r_clinch_landed           6
r_clinch_atmpted          6
r_ground_landed           6
r_ground_atmpted          6
r_landed_head_per         6
r_landed_body_per         6
r_landed_leg_per          6
r_landed_dist_per         6
r_landed_clinch_per       6
r_landed_ground_per       6
r_nick_name            2231
r_height                  1
r_reach                 360
r_dob                    43
b_kd                      6
b_sig_str_landed      

# Drop Debut Fights

In [47]:
ufc_raw = ufc_raw[
    (ufc_raw['r_total_fights'] > 0) & 
    (ufc_raw['b_total_fights'] > 0)
]

print("Size after dropping debuts:", len(ufc_raw))


Size after dropping debuts: 5998


In [48]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])

# Duplicate analysis
duplicates = ufc_raw.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 total_rounds              7
referee                  19
r_kd                      3
r_sig_str_landed          3
r_sig_str_atmpted         3
r_total_str_landed        3
r_total_str_atmpted       3
r_td_landed               3
r_td_atmpted              3
r_sub_att                 3
r_ctrl                   69
r_head_landed             3
r_head_atmpted            3
r_body_landed             3
r_body_atmpted            3
r_leg_landed              3
r_leg_atmpted             3
r_dist_landed             3
r_dist_atmpted            3
r_clinch_landed           3
r_clinch_atmpted          3
r_ground_landed           3
r_ground_atmpted          3
r_landed_head_per         3
r_landed_body_per         3
r_landed_leg_per          3
r_landed_dist_per         3
r_landed_clinch_per       3
r_landed_ground_per       3
r_nick_name            1598
r_reach                 136
r_dob                     8
b_kd                      3
b_sig_str_landed          3
b_sig_str_atmpted     

# Data Cleaning

## Null Values

In [49]:
# Drop columns with too many null values (threshold: 300)
threshold = 400
cols_to_drop = [col for col in ufc_raw.columns if ufc_raw[col].isnull().sum() > threshold]
for col in cols_to_drop:
    print('Dropping:', col)
ufc_raw.drop(columns=cols_to_drop, inplace=True)

# Drop rows with any remaining missing values
print(f"➡️ Before dropna: {ufc_raw.shape}")
ufc_raw.dropna(inplace=True)
print(f"✅ After dropna: {ufc_raw.shape}")

Dropping: r_nick_name
Dropping: b_nick_name
➡️ Before dropna: (5998, 345)
✅ After dropna: (5594, 345)


In [50]:
# Null values check
nulls = ufc_raw.isnull().sum()
print("\nNull values per column:\n", nulls[nulls > 0])


Null values per column:
 Series([], dtype: int64)


In [51]:
ufc_raw = ufc_raw.rename(columns={"winner_corner_bin": "label"})

# Save Data

In [52]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/processed/ufc_etl.csv', index=False)
logger.info("✅ ETL file saved as 'ufc_etl.csv'.")

[INFO] ✅ ETL file saved as 'ufc_etl.csv'.


# Load Data

In [53]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'processed', 'ufc_etl.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 5594 rows, 345 columns.


# Initialize UFCData

In [54]:
ufc_data = UFCData(ufc_raw)

In [55]:
ufc_data

📊 UFC Dataset Summary
----------------------------------------
🧪 Total samples      : 5594
🧪 Train/Test split  : 4475 / 1119
🧪 Total features     : 344

🔢 Numerical features : 325
🔠 Categorical features: 19
    - Binary          : 1
    - Multiclass      : 18

🏷 Label distribution (raw):
   - Class 0: 3391 (60.6%)
   - Class 1: 2203 (39.4%)

✅ No missing values detected

📈 Feature summary statistics (train set):
                               mean      std     min       max
title_fight                   0.060    0.237    0.00     1.000
finish_round                  2.457    1.039    1.00     5.000
match_time_sec              229.610   91.478    5.00   300.000
total_rounds                  3.247    0.658    3.00     5.000
r_kd                          0.247    0.519    0.00     4.000
r_sig_str_landed             42.006   34.733    0.00   445.000
r_sig_str_atmpted            92.072   74.948    0.00   744.000
r_sig_str_acc                47.375   15.431    0.00   100.000
r_total_str_lande

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

# Create column from Dates and Fighter names for the Dataset

# Load Data

In [57]:
# Define the path to the CSV file
file_path = os.path.join(project_root, 'data', 'raw', 'ufc_raw.csv')

# Load the CSV into a DataFrame
try:
    ufc_raw = pd.read_csv(file_path)
    logger.info(f"✅ Data successfully loaded: {ufc_raw.shape[0]} rows, {ufc_raw.shape[1]} columns.")
except Exception as e:
    logger.error(f"❌ Error loading training data: {e}")

[INFO] ✅ Data successfully loaded: 8250 rows, 124 columns.


## Transform Date to datetime64

In [30]:
ufc_raw['Date']

0       2024-12-14
1       2024-12-14
2       2024-12-14
3       2024-12-14
4       2024-12-14
           ...    
6536    2010-03-21
6537    2010-03-21
6538    2010-03-21
6539    2010-03-21
6540    2010-03-21
Name: Date, Length: 6541, dtype: object

In [31]:
ufc_raw['Date'] = pd.to_datetime(ufc_raw['Date'], errors='coerce')

In [32]:
n_invalid = ufc_raw['Date'].isna().sum()
print(f" ✅ Conversion completed. Invalid dates: {n_invalid}")

In [33]:
ufc_raw['Date']

0      2024-12-14
1      2024-12-14
2      2024-12-14
3      2024-12-14
4      2024-12-14
          ...    
6536   2010-03-21
6537   2010-03-21
6538   2010-03-21
6539   2010-03-21
6540   2010-03-21
Name: Date, Length: 6541, dtype: datetime64[ns]

In [34]:
# Ordenar por fecha ascendente
ufc_raw = ufc_raw.sort_values(by='Date').reset_index(drop=True)

In [35]:
ufc_raw['Date']

0      2010-03-21
1      2010-03-21
2      2010-03-21
3      2010-03-21
4      2010-03-21
          ...    
6536   2024-12-14
6537   2024-12-14
6538   2024-12-14
6539   2024-12-14
6540   2024-12-14
Name: Date, Length: 6541, dtype: datetime64[ns]

In [36]:
ufc_raw['RedFighter'] = ufc_raw['RedFighter'].str.strip()
ufc_raw['BlueFighter'] = ufc_raw['BlueFighter'].str.strip()
ufc_raw

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Eric Schafer,Jason Brilz,140.0,-160.0,140.0000,62.5000,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,...,,3.0,5:00,900.0,,,,,,
1,Brandon Vera,Jon Jones,215.0,-235.0,215.0000,42.5532,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,...,Elbow,1.0,3:19,199.0,,,,,,
2,Junior Dos Santos,Gabriel Gonzaga,-250.0,230.0,40.0000,230.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Punches,1.0,3:53,233.0,,,,,,
3,Cheick Kongo,Paul Buentello,-345.0,315.0,28.9855,315.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,Elbows,3.0,1:16,676.0,,,,,,
4,Alessio Sakara,James Irvin,-120.0,100.0,83.3333,100.0000,2010-03-21,"Broomfield, Colorado, USA",USA,Red,...,,1.0,3:01,181.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6536,Miranda Maverick,Jamey-Lyn Horth,-625.0,455.0,16.0000,455.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,,3.0,5:00,900.0,-295.0,650.0,500.0,2500.0,1400.0,2200.0
6537,Davey Grant,Ramon Taveras,-122.0,102.0,81.9672,102.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,,3.0,5:00,900.0,225.0,350.0,750.0,800.0,350.0,300.0
6538,Josefine Knutsson,Piera Rodriguez,-245.0,200.0,40.8163,200.0000,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,5:00,900.0,-175.0,300.0,1800.0,1400.0,800.0,1200.0
6539,Michael Johnson,Ottman Azaitar,-230.0,190.0,43.4783,190.0000,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punch,2.0,2:03,423.0,300.0,650.0,1000.0,2500.0,120.0,350.0


## Create DaysSinceLastFight: Days since his last UFC fight

In [37]:
# Sort the DataFrame by ascending fight date
ufc_raw = ufc_raw.sort_values(by='Date').reset_index(drop=True)

# Dictionary to store each fighter's last fight date
last_fight_date_by_fighter = {}

# Lists to store the computed days since last fight
red_days_since_last_fight = []
blue_days_since_last_fight = []

# Iterate through each fight record
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Days since last fight for Red corner
    red_last_date = last_fight_date_by_fighter.get(red_fighter, None)
    if red_last_date is None:
        red_days_since_last_fight.append(np.nan)
    else:
        red_days_since_last_fight.append((current_date - red_last_date).days)

    # Days since last fight for Blue corner
    blue_last_date = last_fight_date_by_fighter.get(blue_fighter, None)
    if blue_last_date is None:
        blue_days_since_last_fight.append(np.nan)
    else:
        blue_days_since_last_fight.append((current_date - blue_last_date).days)

    # Update last seen date for both fighters
    last_fight_date_by_fighter[red_fighter] = current_date
    last_fight_date_by_fighter[blue_fighter] = current_date

# Add the results as new columns in the dataframe
ufc_raw['RedDaysSinceLastFight'] = red_days_since_last_fight
ufc_raw['BlueDaysSinceLastFight'] = blue_days_since_last_fight

In [38]:
ufc_raw[['RedFighter', 'Date','RedDaysSinceLastFight']].sample(5)

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight
4129,Karolina Kowalkiewicz,2020-02-22,259.0
4777,Carlos Condit,2021-07-10,175.0
4221,Aljamain Sterling,2020-06-06,364.0
3369,Alex Perez,2018-08-04,161.0
6000,Rob Font,2023-12-02,119.0


In [39]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight
39,Anderson Silva,2010-04-10,
109,Anderson Silva,2010-08-07,119.0
238,Anderson Silva,2011-02-05,182.0
386,Anderson Silva,2011-08-27,203.0
689,Anderson Silva,2012-07-07,315.0
773,Anderson Silva,2012-10-13,98.0
1019,Anderson Silva,2013-07-06,266.0
2245,Anderson Silva,2016-02-27,791.0
2696,Anderson Silva,2017-02-11,217.0


## Create DaysSinceDebut column: Days since his first UFC fight

In [40]:
# Step 1: Build debut date dictionary for all fighters
debut_date_by_fighter = {}

# Iterate through the dataset chronologically
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Store the first appearance date if not already present
    if red_fighter not in debut_date_by_fighter:
        debut_date_by_fighter[red_fighter] = current_date
    if blue_fighter not in debut_date_by_fighter:
        debut_date_by_fighter[blue_fighter] = current_date

# Step 2: Compute days since debut for Red and Blue corners
ufc_raw['RedDaysSinceDebut'] = ufc_raw.apply(
    lambda row: (row['Date'] - debut_date_by_fighter[row['RedFighter']]).days,
    axis=1
)

ufc_raw['BlueDaysSinceDebut'] = ufc_raw.apply(
    lambda row: (row['Date'] - debut_date_by_fighter[row['BlueFighter']]).days,
    axis=1
)

In [41]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut
39,Anderson Silva,2010-04-10,,0
109,Anderson Silva,2010-08-07,119.0,119
238,Anderson Silva,2011-02-05,182.0,301
386,Anderson Silva,2011-08-27,203.0,504
689,Anderson Silva,2012-07-07,315.0,819
773,Anderson Silva,2012-10-13,98.0,917
1019,Anderson Silva,2013-07-06,266.0,1183
2245,Anderson Silva,2016-02-27,791.0,2149
2696,Anderson Silva,2017-02-11,217.0,2499


In [42]:
# Step 1: Create debut flags
ufc_raw['RedIsDebut'] = ufc_raw['RedDaysSinceLastFight'].isna().map({True: 'Yes', False: 'No'})
ufc_raw['BlueIsDebut'] = ufc_raw['BlueDaysSinceLastFight'].isna().map({True: 'Yes', False: 'No'})

# Step 2: Impute NaN with median value in the debut.
ufc_raw['RedDaysSinceLastFight'].fillna(180, inplace=True) 
ufc_raw['BlueDaysSinceLastFight'].fillna(180, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['RedDaysSinceLastFight'].fillna(180, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['BlueDaysSinceLastFight'].fillna(180, inplace=True)


In [43]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut
39,Anderson Silva,2010-04-10,180.0,0,Yes
109,Anderson Silva,2010-08-07,119.0,119,No
238,Anderson Silva,2011-02-05,182.0,301,No
386,Anderson Silva,2011-08-27,203.0,504,No
689,Anderson Silva,2012-07-07,315.0,819,No
773,Anderson Silva,2012-10-13,98.0,917,No
1019,Anderson Silva,2013-07-06,266.0,1183,No
2245,Anderson Silva,2016-02-27,791.0,2149,No
2696,Anderson Silva,2017-02-11,217.0,2499,No


## Create FightsInLastYear/FightsInLast6Months : Number of fights in the last year/6 months

In [44]:
from collections import defaultdict

# Dictionary of past fight dates for each fighter
fight_history = defaultdict(list)

# Output lists
red_fights_last_year = []
blue_fights_last_year = []

# Iterate over each row in chronological order
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Count red's fights in the past 365 days
    red_past_fights = [d for d in fight_history[red_fighter] if 0 < (current_date - d).days <= 365]
    red_fights_last_year.append(len(red_past_fights))

    # Count blue's fights in the past 365 days
    blue_past_fights = [d for d in fight_history[blue_fighter] if 0 < (current_date - d).days <= 365]
    blue_fights_last_year.append(len(blue_past_fights))

    # Update fight history with current fight date
    fight_history[red_fighter].append(current_date)
    fight_history[blue_fighter].append(current_date)

# Add the new columns to the dataframe
ufc_raw['RedFightsInLastYear'] = red_fights_last_year
ufc_raw['BlueFightsInLastYear'] = blue_fights_last_year

In [45]:
from collections import defaultdict

# Dictionary to store past fight dates for each fighter
fight_history_6m = defaultdict(list)

# Output columns
red_fights_last_6m = []
blue_fights_last_6m = []

# Iterate through the dataset chronologically
for idx, row in ufc_raw.iterrows():
    current_date = row['Date']
    red_fighter = row['RedFighter']
    blue_fighter = row['BlueFighter']

    # Count red's fights in the past 183 days (6 months)
    red_past_fights = [d for d in fight_history_6m[red_fighter] if 0 < (current_date - d).days <= 183]
    red_fights_last_6m.append(len(red_past_fights))

    # Count blue's fights in the past 183 days (6 months)
    blue_past_fights = [d for d in fight_history_6m[blue_fighter] if 0 < (current_date - d).days <= 183]
    blue_fights_last_6m.append(len(blue_past_fights))

    # Update fight history with current fight
    fight_history_6m[red_fighter].append(current_date)
    fight_history_6m[blue_fighter].append(current_date)

# Assign to new columns
ufc_raw['RedFightsInLast6Months'] = red_fights_last_6m
ufc_raw['BlueFightsInLast6Months'] = blue_fights_last_6m

In [46]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0
109,Anderson Silva,2010-08-07,119.0,119,No,1,1
238,Anderson Silva,2011-02-05,182.0,301,No,2,1
386,Anderson Silva,2011-08-27,203.0,504,No,1,0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0


## Create DaysSinceLastWin : Number of fights in the last year/6 months

In [47]:
from collections import defaultdict

# Initialize win history tracker
last_win_date_by_fighter = {}

# Output columns
red_days_since_last_win = []
blue_days_since_last_win = []

# Iterate over ufc_raw in chronological order
for idx, row in ufc_raw.iterrows():
    date = row['Date']
    red = row['RedFighter']
    blue = row['BlueFighter']
    winner = row['Winner']  # Should be 'Red', 'Blue' or 'Draw'

    # Red corner
    red_last_win_date = last_win_date_by_fighter.get(red, None)
    if red_last_win_date is None:
        red_days_since_last_win.append(np.nan)
    else:
        red_days_since_last_win.append((date - red_last_win_date).days)

    # Blue corner
    blue_last_win_date = last_win_date_by_fighter.get(blue, None)
    if blue_last_win_date is None:
        blue_days_since_last_win.append(np.nan)
    else:
        blue_days_since_last_win.append((date - blue_last_win_date).days)

    # Update win dates
    if winner == 'Red':
        last_win_date_by_fighter[red] = date
    elif winner == 'Blue':
        last_win_date_by_fighter[blue] = date
    # No update for draw

# Add to DataFrame
ufc_raw['RedDaysSinceLastWin'] = red_days_since_last_win
ufc_raw['BlueDaysSinceLastWin'] = blue_days_since_last_win

In [48]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months', 'RedDaysSinceLastWin']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months,RedDaysSinceLastWin
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0,
109,Anderson Silva,2010-08-07,119.0,119,No,1,1,119.0
238,Anderson Silva,2011-02-05,182.0,301,No,2,1,182.0
386,Anderson Silva,2011-08-27,203.0,504,No,1,0,203.0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0,315.0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1,98.0
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0,266.0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0,1232.0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0,1582.0


In [49]:
ufc_raw['RedDaysSinceLastWin'].median()

246.0

In [50]:
# FIll with median, moodel already know if it is the first fight of a fighter
ufc_raw['RedDaysSinceLastWin'].fillna(246, inplace=True)
ufc_raw['BlueDaysSinceLastWin'].fillna(246, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['RedDaysSinceLastWin'].fillna(246, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufc_raw['BlueDaysSinceLastWin'].fillna(246, inplace=True)


In [51]:
ufc_raw[ufc_raw['RedFighter'] == 'Anderson Silva'][['RedFighter', 'Date', 'RedDaysSinceLastFight', 'RedDaysSinceDebut', 'RedIsDebut', 'RedFightsInLastYear', 'RedFightsInLast6Months', 'RedDaysSinceLastWin']]

Unnamed: 0,RedFighter,Date,RedDaysSinceLastFight,RedDaysSinceDebut,RedIsDebut,RedFightsInLastYear,RedFightsInLast6Months,RedDaysSinceLastWin
39,Anderson Silva,2010-04-10,180.0,0,Yes,0,0,246.0
109,Anderson Silva,2010-08-07,119.0,119,No,1,1,119.0
238,Anderson Silva,2011-02-05,182.0,301,No,2,1,182.0
386,Anderson Silva,2011-08-27,203.0,504,No,1,0,203.0
689,Anderson Silva,2012-07-07,315.0,819,No,1,0,315.0
773,Anderson Silva,2012-10-13,98.0,917,No,1,1,98.0
1019,Anderson Silva,2013-07-06,266.0,1183,No,2,0,266.0
2245,Anderson Silva,2016-02-27,791.0,2149,No,0,0,1232.0
2696,Anderson Silva,2017-02-11,217.0,2499,No,2,0,1582.0


In [52]:
# Save the cleaned file
ufc_raw.to_csv(f'{project_root}/data/raw/ufc_raw_dates_ft.csv', index=False)
logger.info("✅ Raw with Date Features file saved as 'ufc_raw_dates_ft.csv'.")

[INFO] ✅ Raw with Date Features file saved as 'ufc_raw_dates_ft.csv'.


<div style="text-align: center;">
     <img src="../img/ufc_logo.png" width="800" /> 
</div>