# 🔹UFC Feature Engineering

## 1. Import Libraries and Setup Environment

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# The project root is taken to be the parent of the directory the notebook
# is running from.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

# Expose <project_root>/src to the import system so the local `utils`
# package imported right after this can be resolved.
src_dir = os.path.join(project_root, 'src')
sys.path.append(src_dir)
from utils.helpers import *

## 2. Load Data

In [2]:
# Input file: <project_root>/data/processed/ufc_etl.csv
file_path = os.path.join(project_root, 'data', 'processed', 'ufc_etl.csv')

# Read it into a DataFrame and report its dimensions
ufc_data = pd.read_csv(file_path)
n_rows, n_cols = ufc_data.shape
print(f"Data successfully loaded: {n_rows} rows, {n_cols} columns.")

Data successfully loaded: 6541 rows, 55 columns.


## 3. Feature Engineering

- **Feature Vector Construction**: The feature vector for each fight is represented as:
  $$x = \text{features-fighter-blue} - \text{features-fighter-red}$$
- The task is framed as a binary classification problem, where the model predicts either **0** (Fighter Red wins) or **1** (Fighter Blue wins).

#### Create the target value: **0** (Fighter Red wins) or **1** (Fighter Blue wins)

In [None]:
# Binary target: 1 when 'Winner' is 'Blue', 0 for every other value
# (including missing ones). Vectorized comparison instead of a row-wise apply,
# matching the indicator columns built further down.
ufc_data['label'] = (ufc_data['Winner'] == 'Blue').astype(int)
ufc_data = ufc_data.drop(columns=['Winner'])

# One-hot encode the two categorical flags. drop_first=True omits the first
# level of each; later cells rely on the resulting 'TitleBout_True' and
# 'Gender_MALE' columns.
ufc_data = pd.get_dummies(ufc_data, columns=['TitleBout', 'Gender'], drop_first=True)

In [None]:
# Registry of numeric feature names; starts out empty.
numerical_columns = list()

## 4. Numerical Data

### Stance
- **Orthodox Stance:** A fighter in orthodox stance leads with their left foot and left hand, making it the natural stance for right-handed individuals. 
- **Southpaw Stance:** A fighter in southpaw stance leads with their right foot and right hand, making it the natural stance for left-handed individuals. 
- **Open Stance Matchup:** When one fighter is orthodox and the other is southpaw, it creates an "open stance" matchup. This differs from a "closed stance" matchup, where both fighters use the same stance (e.g., both orthodox or both southpaw).
- **Switch:** When a fighter switches from an orthodox to a southpaw stance or vice versa, it can disrupt their opponent's rhythm, causing them to miss or react incorrectly to strikes. 

In [None]:
# Distinct stance labels present in the BlueStance column
pd.unique(ufc_data['BlueStance'])

In [None]:
# Build 0/1 indicator columns for each corner's stance, named
# '<Corner>Stance_<Suffix>'. A stance value that is not listed below gets 0
# in all four indicators.
stance_suffix_by_value = {
    'Orthodox': 'Orthodox',
    'Southpaw': 'Southpaw',
    'Switch': 'Switch',
    'Open Stance': 'OpenStance',
}
for stance_col in ('BlueStance', 'RedStance'):
    for stance_value, suffix in stance_suffix_by_value.items():
        ufc_data[f'{stance_col}_{suffix}'] = (ufc_data[stance_col] == stance_value).astype(int)

# Remove the raw text columns now that they are encoded
ufc_data = ufc_data.drop(columns=['BlueStance', 'RedStance'])


### Better Rank

In [None]:
# Distinct labels present in the BetterRank column
display(pd.unique(ufc_data['BetterRank']))

In [None]:
# 0/1 indicator column for each BetterRank value: 'Red', 'Blue', 'neither'
for rank_value in ('Red', 'Blue', 'neither'):
    ufc_data[f'BetterRank_{rank_value}'] = (ufc_data['BetterRank'] == rank_value).astype(int)

# Remove the original text column now that it is encoded
ufc_data = ufc_data.drop(columns=['BetterRank'])

## Creación de features

In [None]:
# Finish rate per corner: (KO + submission + TKO doctor-stoppage wins) / wins.
# A win count of 0 is swapped for 1 so the division is always defined.
red_wins_denominator = ufc_data['RedWins'].replace(0, 1)
blue_wins_denominator = ufc_data['BlueWins'].replace(0, 1)

RedFinishRate = (
    ufc_data['RedWinsByKO']
    .add(ufc_data['RedWinsBySubmission'])
    .add(ufc_data['RedWinsByTKODoctorStoppage'])
    .div(red_wins_denominator)
)
BlueFinishRate = (
    ufc_data['BlueWinsByKO']
    .add(ufc_data['BlueWinsBySubmission'])
    .add(ufc_data['BlueWinsByTKODoctorStoppage'])
    .div(blue_wins_denominator)
)

# The stored feature is the Blue-minus-Red difference
ufc_data['FinishRate'] = BlueFinishRate.sub(RedFinishRate)
# Disabled: numerical_columns.append('FinishRate')

In [None]:
# Win ratio per corner: wins / (wins + losses). A record with 0 wins and
# 0 losses uses a denominator of 1 so the division is always defined.
red_bouts = (ufc_data['RedWins'] + ufc_data['RedLosses']).replace(0, 1)
blue_bouts = (ufc_data['BlueWins'] + ufc_data['BlueLosses']).replace(0, 1)

RedWinRatio = ufc_data['RedWins'].div(red_bouts)
BlueWinRatio = ufc_data['BlueWins'].div(blue_bouts)

# The stored feature is the Blue-minus-Red difference
ufc_data['WinRatio'] = BlueWinRatio.sub(RedWinRatio)

In [None]:
# Age vs. experience: total rounds fought divided by age, per corner
RedExpPerYear = ufc_data['RedTotalRoundsFought'].div(ufc_data['RedAge'])
BlueExpPerYear = ufc_data['BlueTotalRoundsFought'].div(ufc_data['BlueAge'])

# The stored feature is the Blue-minus-Red difference
ufc_data['ExpPerYear'] = BlueExpPerYear.sub(RedExpPerYear)

In [None]:
# Reach advantage ratio, oriented Blue over Red so it points the same way as
# every other engineered feature in this notebook (Blue relative to Red).
# A value above 1 means the Blue corner has the longer reach.
ufc_data['ReachAdvantageRatio'] = ufc_data['BlueReachCms'] / ufc_data['RedReachCms']

In [None]:
# Height-to-reach ratio for each fighter
RedHeightReachRatio = ufc_data['RedHeightCms'].div(ufc_data['RedReachCms'])
BlueHeightReachRatio = ufc_data['BlueHeightCms'].div(ufc_data['BlueReachCms'])

# The stored feature is the Blue-minus-Red difference
ufc_data['HeightReachRatio'] = BlueHeightReachRatio.sub(RedHeightReachRatio)

In [None]:
# Decision wins per corner = majority + split + unanimous decisions
decision_kinds = ('Majority', 'Split', 'Unanimous')
BlueWinsByDecision = ufc_data[[f'BlueWinsByDecision{kind}' for kind in decision_kinds]].sum(axis=1)
RedWinsByDecision = ufc_data[[f'RedWinsByDecision{kind}' for kind in decision_kinds]].sum(axis=1)

# The stored feature is the Blue-minus-Red difference
ufc_data['WinsByDecision'] = BlueWinsByDecision.sub(RedWinsByDecision)

In [None]:
# Decision wins divided by total wins, per corner. A win count of 0 is swapped
# for 1 to avoid dividing by zero.
blue_wins_denominator = ufc_data['BlueWins'].replace(0, 1)
red_wins_denominator = ufc_data['RedWins'].replace(0, 1)

BlueDecisionRate = BlueWinsByDecision.div(blue_wins_denominator)
RedDecisionRate = RedWinsByDecision.div(red_wins_denominator)

# The stored feature is the Blue-minus-Red difference
ufc_data['DecisionRate'] = BlueDecisionRate.sub(RedDecisionRate)



In [None]:
# Store the Gender_MALE indicator as integers (True -> 1, False -> 0)
ufc_data = ufc_data.astype({'Gender_MALE': int})

## Selección de features

Dado lo siguiente:
-  LoseStreakDif: BlueCurrentLoseStreak - RedCurrentLoseStreak
-  WinStreakDif: BlueCurrentWinStreak - RedCurrentWinStreak
-  LongestWinStreakDif: BlueLongestWinStreak - RedLongestWinStreak
-  WinDif: BlueWins - RedWins
-  LossDif: BlueLosses - RedLosses
-  TotalRoundDif: BlueTotalRoundsFought - RedTotalRoundsFought
-  TotalTitleBoutDif: BlueTotalTitleBouts - RedTotalTitleBouts
-  KODif: BlueWinsByKO - RedWinsByKO
-  SubDif: BlueWinsBySubmission - RedWinsBySubmission
-  HeightDif: BlueHeightCms - RedHeightCms
-  ReachDif: BlueReachCms - RedReachCms
-  AgeDif: BlueAge - RedAge
  
Se eliminan ciertas columnas que pueden ser redundantes.

In [None]:
# Per-corner raw stats to remove; each name is dropped for both Blue and Red.
paired_stats = (
    'CurrentLoseStreak', 'CurrentWinStreak', 'LongestWinStreak',
    'Wins', 'Losses', 'TotalRoundsFought', 'TotalTitleBouts',
    'WinsByKO', 'WinsBySubmission', 'HeightCms', 'ReachCms', 'Age',
)
redundant_columns = [
    f'{corner}{stat}' for stat in paired_stats for corner in ('Blue', 'Red')
]
ufc_data = ufc_data.drop(columns=redundant_columns)

También aquellas relacionadas con los features construidos:
- WinsByDecision
- DecisionRate
- FinishRate


In [None]:
# Win-method breakdown columns to remove, for both corners
breakdown_suffixes = (
    'WinsByDecisionMajority',
    'WinsByDecisionSplit',
    'WinsByDecisionUnanimous',
    'WinsByTKODoctorStoppage',
)
breakdown_columns = [
    corner + suffix for corner in ('Blue', 'Red') for suffix in breakdown_suffixes
]
ufc_data = ufc_data.drop(columns=breakdown_columns)

### Columnas de baja varianza
- BlueDraws
- RedDraws
- BlueWeightLbs
- RedWeightLbs
- TitleBout_True 

In [None]:
# Columns listed above as low variance
low_variance_columns = ['BlueDraws', 'RedDraws', 'BlueWeightLbs', 'RedWeightLbs', 'TitleBout_True']
ufc_data = ufc_data.drop(columns=low_variance_columns)

In [None]:
# Show, in order: the first records, the remaining column names, and the
# data type of each column.
display(ufc_data.head(), ufc_data.columns, ufc_data.dtypes)

In [None]:
# First five rows of the engineered dataset
display(ufc_data.head(5))

## 5. Split Dataset and Standardize

In [None]:
#ufc_train, ufc_test = split_and_standardize(ufc_data, numerical_columns)

## 6. Save

In [None]:
# Write the engineered dataset to <project_root>/data/processed/ufc_processed.csv.
# The path is built with os.path.join, as in the load cell, instead of
# hard-coding '/' separators in an f-string.
output_path = os.path.join(project_root, 'data', 'processed', 'ufc_processed.csv')
ufc_data.to_csv(output_path, index=False)
print("\nFeature Engineering file saved as 'ufc_processed.csv'.")