# 🔹UFC ETL

## 1. Import Libraries and Setup Environment

In [9]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Visualization config: apply seaborn's "whitegrid" style as the session-wide plot theme.
sns.set(style="whitegrid")

## 2. Load Data

In [10]:
# Project root directory. Defaults to the original development location, but can
# be overridden by setting the UFC_PROJECT_ROOT environment variable so the
# notebook also runs on machines where the repository lives elsewhere.
project_root = os.environ.get('UFC_PROJECT_ROOT', '/home/mfourier/ufc-predictor')

# Raw input file, resolved relative to the project root.
file_path = f'{project_root}/data/raw/ufc_raw.csv'

# Read the raw CSV into a DataFrame and report its size as a quick sanity check.
df = pd.read_csv(file_path)
print(f"Data loaded: {df.shape[0]} rows and {df.shape[1]} columns.")

Data loaded: 6541 rows and 118 columns.


## 3. Preview

In [11]:
# Render the leading rows of the frame as a rich table
leading_rows = df.head()
display(leading_rows)

# Print the structural summary (index range, column dtypes, memory usage)
df.info()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,4:42,882.0,300.0,175.0,1800.0,2000.0,1100.0,150.0
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punch,3.0,1:36,696.0,250.0,,1800.0,,450.0,
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,Punches,3.0,1:57,717.0,-105.0,550.0,900.0,1800.0,225.0,1100.0
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,Punch,3.0,3:44,824.0,240.0,500.0,550.0,3000.0,110.0,800.0
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,,3.0,5:00,900.0,450.0,150.0,2200.0,2200.0,450.0,200.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6541 entries, 0 to 6540
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(60), int64(43), object(14)
memory usage: 5.8+ MB


## 4. Check Nulls and Duplicates

In [12]:
# Count missing values per column; only columns with at least one null are reported
nulls = df.isna().sum()
columns_with_nulls = nulls.loc[nulls.gt(0)]
print("\nNull values per column:\n", columns_with_nulls)

# Count rows that are exact copies of an earlier row
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 RedOdds                   227
BlueOdds                  226
RedExpectedValue          227
BlueExpectedValue         226
BlueAvgSigStrLanded       930
BlueAvgSigStrPct          765
BlueAvgSubAtt             832
BlueAvgTDLanded           833
BlueAvgTDPct              842
BlueStance                  3
RedAvgSigStrLanded        455
RedAvgSigStrPct           357
RedAvgSubAtt              357
RedAvgTDLanded            357
RedAvgTDPct               367
EmptyArena               1499
BMatchWCRank             5339
RMatchWCRank             4760
RWFlyweightRank          6445
RWFeatherweightRank      6532
RWStrawweightRank        6395
RWBantamweightRank       6387
RHeavyweightRank         6355
RLightHeavyweightRank    6357
RMiddleweightRank        6359
RWelterweightRank        6349
RLightweightRank         6357
RFeatherweightRank       6364
RBantamweightRank        6360
RFlyweightRank           6352
RPFPRank                 6288
BWFlyweightRank          6468
BWFeatherweigh

## 5. Data Cleaning

In [13]:
# Remove exact duplicate rows. Re-assigning (rather than inplace=True) makes each
# step an explicit "new frame from old frame" transformation.
df = df.drop_duplicates()

# Drop columns with too many null values (threshold: 30% of the remaining rows)
threshold = 0.3 * len(df)
null_counts = df.isnull().sum()
cols_to_drop = null_counts[null_counts > threshold].index.tolist()
df = df.drop(columns=cols_to_drop)

# Fill remaining null values: mean for numeric columns, mode for everything else.
# Only columns that still contain nulls are visited.
cols_with_nulls = df.columns[df.isnull().any()]
for col in cols_with_nulls:
    # Use a dtype-kind check instead of comparing against 'object', so that
    # string/categorical columns are never sent to .mean() (which would raise).
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].mean()
    else:
        modes = df[col].mode()
        if modes.empty:
            # Column has no non-null values, so there is no mode to impute with.
            continue
        fill_value = modes.iloc[0]
    df[col] = df[col].fillna(fill_value)

## 6. Preview Clean Data

In [14]:
# Render the leading rows of the cleaned frame as a rich table
cleaned_head = df.head()
display(cleaned_head)

# Print the structural summary of the cleaned frame (columns, non-null counts, dtypes)
df.info()

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,Finish,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Colby Covington,Joaquin Buckley,205.0,-250.0,205.0,40.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,KO/TKO,3.0,4:42,882.0,300.0,175.0,1800.0,2000.0,1100.0,150.0
1,Cub Swanson,Billy Quarantillo,124.0,-148.0,124.0,67.5676,2024-12-14,"Tampa, Florida, USA",USA,Red,...,KO/TKO,3.0,1:36,696.0,250.0,425.889565,1800.0,1102.388149,450.0,636.803475
2,Manel Kape,Bruno Silva,-395.0,310.0,25.3165,310.0,2024-12-14,"Tampa, Florida, USA",USA,Red,...,KO/TKO,3.0,1:57,717.0,-105.0,550.0,900.0,1800.0,225.0,1100.0
3,Vitor Petrino,Dustin Jacoby,-340.0,270.0,29.4118,270.0,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,KO/TKO,3.0,3:44,824.0,240.0,500.0,550.0,3000.0,110.0,800.0
4,Adrian Yanez,Daniel Marcos,185.0,-225.0,185.0,44.4444,2024-12-14,"Tampa, Florida, USA",USA,Blue,...,S-DEC,3.0,5:00,900.0,450.0,150.0,2200.0,2200.0,450.0,200.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6541 entries, 0 to 6540
Data columns (total 89 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   RedFighter                   6541 non-null   object 
 1   BlueFighter                  6541 non-null   object 
 2   RedOdds                      6541 non-null   float64
 3   BlueOdds                     6541 non-null   float64
 4   RedExpectedValue             6541 non-null   float64
 5   BlueExpectedValue            6541 non-null   float64
 6   Date                         6541 non-null   object 
 7   Location                     6541 non-null   object 
 8   Country                      6541 non-null   object 
 9   Winner                       6541 non-null   object 
 10  TitleBout                    6541 non-null   bool   
 11  WeightClass                  6541 non-null   object 
 12  Gender                       6541 non-null   object 
 13  NumberOfRounds    

## 7. Check Clean Data

In [15]:
# Re-count missing values per column after cleaning; report only columns that still have nulls
nulls = df.isna().sum()
remaining_nulls = nulls.loc[nulls.gt(0)]
print("\nNull values per column:\n", remaining_nulls)

# Re-count rows that are exact copies of an earlier row
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 Series([], dtype: int64)

Duplicate rows: 0


## 8. Save

In [16]:
# Destination of the cleaned dataset, resolved relative to the project root.
output_path = Path(project_root) / 'data' / 'processed' / 'ufc_etl.csv'

# Create the output directory if it is missing; otherwise to_csv would fail with
# an OSError on a fresh checkout where data/processed/ does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the cleaned file (index=False: the row index is not written as a column)
df.to_csv(output_path, index=False)
print("\nETL file saved as 'ufc_etl.csv'.")


ETL file saved as 'ufc_etl.csv'.
