## Import and clean dataset

In [1]:
import pandas as pd
import numpy as np
import os


path = r"D:\0 - Data Analytics\6 - Advanced Analytics & Dashboard Design\UFC Analysis"

# import the dataframe
ufc_df = pd.read_csv(os.path.join(path, "02 Data", "Original Data", "ufc-master.csv"))

In [2]:
# check shape and head using viarables and print
ufc_df.info
# ufc_df.shape
# ufc_df.head()


<bound method DataFrame.info of              RedFighter        BlueFighter  RedOdds  BlueOdds  \
0     Alexandre Pantoja        Kai Asakura   -250.0     215.0   
1     Shavkat Rakhmonov  Ian Machado Garry   -210.0     295.0   
2            Ciryl Gane   Alexander Volkov   -380.0     300.0   
3        Bryce Mitchell        Kron Gracie   -950.0     625.0   
4         Nate Landwehr         Dooho Choi   -130.0     110.0   
...                 ...                ...      ...       ...   
6523       Duane Ludwig      Darren Elkins   -155.0     135.0   
6524        John Howard     Daniel Roberts   -210.0     175.0   
6525     Brendan Schaub      Chase Gormley   -260.0     220.0   
6526        Mike Pierce      Julio Paulino   -420.0     335.0   
6527       Eric Schafer        Jason Brilz    140.0    -160.0   

      RedExpectedValue  BlueExpectedValue        Date  \
0              40.0000              215.0  2024-12-07   
1              47.6190              295.0  2024-12-07   
2              2

In [3]:
# Check for missing values in the dataset
missing_values_summary = ufc_df.isnull().sum()

# Display columns with missing values
missing_values_summary = missing_values_summary[missing_values_summary > 0]

# Display missing values count for each column
missing_values_df = pd.DataFrame(missing_values_summary, columns=["Missing Values"])
missing_values_df

Unnamed: 0,Missing Values
RedOdds,227
BlueOdds,226
RedExpectedValue,227
BlueExpectedValue,226
BlueAvgSigStrLanded,930
BlueAvgSigStrPct,765
BlueAvgSubAtt,832
BlueAvgTDLanded,833
BlueAvgTDPct,842
BlueStance,3


## Handling missing values

In [4]:
# Fill missing values in betting odds columns with median values
betting_odds_columns = ["RedOdds", "BlueOdds", "RedExpectedValue", "BlueExpectedValue"]
ufc_df[betting_odds_columns] = ufc_df[betting_odds_columns].fillna(
    ufc_df[betting_odds_columns].median()
)

In [5]:
# Fill missing values using a fighter's personal fight history first, then weight-class average
def fill_missing_with_fighter_or_weight_class_avg(
    df, fighter_col, stat_cols, weight_class_col
):
    for col in stat_cols:
        # Step 1: Fill missing values with fighter's past fight average
        df[col] = df.groupby(fighter_col)[col].transform(lambda x: x.fillna(x.mean()))

        # Step 2: Fill any remaining missing values with weight class average
        df[col] = df.groupby(weight_class_col)[col].transform(
            lambda x: x.fillna(x.mean())
        )

    return df


# Columns to fill using this improved approach
fighter_stat_columns = [
    "BlueAvgSigStrLanded",
    "BlueAvgSigStrPct",
    "BlueAvgSubAtt",
    "BlueAvgTDLanded",
    "BlueAvgTDPct",
    "RedAvgSigStrLanded",
    "RedAvgSigStrPct",
    "RedAvgSubAtt",
    "RedAvgTDLanded",
    "RedAvgTDPct",
]

# Apply the function for both Blue and Red fighters using their weight class as fallback
ufc_df = fill_missing_with_fighter_or_weight_class_avg(
    ufc_df, "BlueFighter", fighter_stat_columns, "WeightClass"
)
ufc_df = fill_missing_with_fighter_or_weight_class_avg(
    ufc_df, "RedFighter", fighter_stat_columns, "WeightClass"
)


In [6]:
# Fill missing stance values with 'Unknown'.
# Fighters stance is a key attribute in UFC fights, so it is important to have a value for each fighter, but not all the fighters actually have a stance, they might be ambidextrous or switch stances during the fight.

# Fill missing values in the stance columns with 'Unknown'
ufc_df["RedStance"] = ufc_df["RedStance"].fillna("Unknown")
ufc_df["BlueStance"] = ufc_df["BlueStance"].fillna("Unknown")

In [7]:
# Leave ranking columns as they are (no action needed becasue not all fighter are ranked)

In [8]:
# I chose to fill the missing values in the EmptyArena column based on the fight date.
# # I assumed that all fights that took place between March 14, 2020, and December 18, 2021, were held in an empty arena.
# All other fights were held in a non-empty arena.
# # This approach is based on the COVID-19 pandemic, which led to the cancellation of live audiences at sporting events.

# Fill missing values in EmptyArena based on fight date
ufc_df.loc[
    (ufc_df["Date"] >= "2020-03-14")
    & (ufc_df["Date"] <= "2021-12-18")
    & (ufc_df["EmptyArena"].isnull()),
    "EmptyArena",
] = 1.0
ufc_df.loc[
    ((ufc_df["Date"] < "2020-03-14") | (ufc_df["Date"] > "2021-12-18"))
    & (ufc_df["EmptyArena"].isnull()),
    "EmptyArena",
] = 0.0

# Verify if there are still missing values in EmptyArena
missing_empty_arena = ufc_df["EmptyArena"].isnull().sum()

# Display the result
missing_empty_arena
missing_values_df

Unnamed: 0,Missing Values
RedOdds,227
BlueOdds,226
RedExpectedValue,227
BlueExpectedValue,226
BlueAvgSigStrLanded,930
BlueAvgSigStrPct,765
BlueAvgSubAtt,832
BlueAvgTDLanded,833
BlueAvgTDPct,842
BlueStance,3


In [9]:
# Fill missing ranking values with "Unranked"
all_ranking_columns = [
    "BMatchWCRank",
    "RMatchWCRank",
    "RWFlyweightRank",
    "RWFeatherweightRank",
    "RWStrawweightRank",
    "RWBantamweightRank",
    "RHeavyweightRank",
    "RLightHeavyweightRank",
    "RMiddleweightRank",
    "RWelterweightRank",
    "RLightweightRank",
    "RFeatherweightRank",
    "RBantamweightRank",
    "RFlyweightRank",
    "RPFPRank",
    "BWFlyweightRank",
    "BWFeatherweightRank",
    "BWStrawweightRank",
    "BWBantamweightRank",
    "BHeavyweightRank",
    "BLightHeavyweightRank",
    "BMiddleweightRank",
    "BWelterweightRank",
    "BLightweightRank",
    "BFeatherweightRank",
    "BBantamweightRank",
    "BFlyweightRank",
    "BPFPRank",
]
ufc_df[all_ranking_columns] = ufc_df[all_ranking_columns].fillna("Unranked")


In [10]:
# Verify dataset after cleaning
ufc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(32), int64(43), object(42)
memory usage: 5.8+ MB


In [11]:
# value count of finish
ufc_df["Finish"].value_counts()

Finish
U-DEC         2404
KO/TKO        2009
SUB           1157
S-DEC          654
M-DEC           46
DQ              18
Overturned       2
Name: count, dtype: int64

In [12]:
# Fill missing `Finish` and `FinishDetails` with "Unknown"
# Since we have a winnder and loser in each fight, we can assume that all fights have a finish type and details. But we will mark them as unknown.

ufc_df.loc[ufc_df["Finish"].isnull(), "Finish"] = "Unknown"
ufc_df.loc[ufc_df["FinishDetails"].isnull(), "FinishDetails"] = "Unknown"

In [13]:
# Check for missing values in the entire dataset
missing_values_summary = ufc_df.isnull().sum()

# Filter only columns with missing values
remaining_missing_values = missing_values_summary[missing_values_summary > 0]

# Display the missing values count for each column
if remaining_missing_values.empty:
    print("No missing values left in the dataset!")
else:
    print("Columns with missing values:")
    print(remaining_missing_values)

Columns with missing values:
FinishRound            622
FinishRoundTime        622
TotalFightTimeSecs     622
RedDecOdds            1087
BlueDecOdds           1116
RSubOdds              1336
BSubOdds              1359
RKOOdds               1334
BKOOdds               1360
dtype: int64


In [14]:
# value count of finish
ufc_df["Finish"].value_counts()

Finish
U-DEC         2404
KO/TKO        2009
SUB           1157
S-DEC          654
Unknown        238
M-DEC           46
DQ              18
Overturned       2
Name: count, dtype: int64

In [15]:
# Fill Any Remaining Missing Values with Estimated Medians
# Compute the median values based on available data
median_finish_round = ufc_df["FinishRound"].median()
median_total_fight_time = ufc_df["TotalFightTimeSecs"].median()

# Force-fill all remaining missing values with the computed medians
ufc_df.loc[ufc_df["FinishRound"].isnull(), "FinishRound"] = median_finish_round
ufc_df.loc[ufc_df["TotalFightTimeSecs"].isnull(), "TotalFightTimeSecs"] = (
    median_total_fight_time
)

# Convert `FinishRound` and `TotalFightTimeSecs` to integer format for consistency
ufc_df["FinishRound"] = ufc_df["FinishRound"].astype("Int64")
ufc_df["TotalFightTimeSecs"] = ufc_df["TotalFightTimeSecs"].astype("Int64")

In [16]:
# check value counts FinishRound, and FinishRoundTime TotalFightTimeSecs
ufc_df["FinishRound"].value_counts()
# ufc_df["FinishRoundTime"].value_counts()
# ufc_df["TotalFightTimeSecs"].value_counts()


FinishRound
3    3797
1    1492
2     953
5     250
4      36
Name: count, dtype: Int64

In [17]:
# Check for duplicates in the dataset
duplicates = ufc_df.duplicated()

# display duplicates
duplicates

0       False
1       False
2       False
3       False
4       False
        ...  
6523    False
6524    False
6525    False
6526    False
6527    False
Length: 6528, dtype: bool

In [18]:
# **Define Columns to Keep**
columns_to_keep = [
    # Fight Outcome Analysis
    "Finish",
    "FinishDetails",
    "TotalFightTimeSecs",
    "FinishRound",
    "FinishRoundTime",  # Fight results and round details
    # Fighter Information & Performance
    "RedFighter",
    "BlueFighter",  # Fighter names for identification
    "Gender",  # Gender of fighters to analyze gender-based trends
    "WeightClass",
    "RedReachCms",
    "BlueReachCms",
    "RedHeightCms",
    "BlueHeightCms",
    "RedWeightLbs",
    "BlueWeightLbs",  # Physical attributes
    "RedTotalTitleBouts",
    "BlueTotalTitleBouts",
    "RedStance",
    "BlueStance",
    "RedAge",
    "BlueAge",  # Experience & age for performance analysis
    "RedCurrentWinStreak",
    "BlueCurrentWinStreak",
    "RedCurrentLoseStreak",
    "BlueCurrentLoseStreak",  # Win/Loss streaks
    "RedWins",
    "BlueWins",
    "RedLosses",
    "BlueLosses",
    "RedDraws",
    "BlueDraws",  # Fighter records
    # Fighter Finish Methods
    "RedWinsByKO",
    "BlueWinsByKO",
    "RedWinsBySubmission",
    "BlueWinsBySubmission",  # KO/Sub wins
    # Fighter Performance Statistics
    "RedAvgSigStrLanded",
    "BlueAvgSigStrLanded",  # Significant strikes landed
    "RedAvgSigStrPct",
    "BlueAvgSigStrPct",  # Significant strike accuracy percentage
    "RedAvgTDLanded",
    "BlueAvgTDLanded",  # Successful takedowns
    "RedAvgTDPct",
    "BlueAvgTDPct",  # Takedown accuracy
    "RedAvgSubAtt",
    "BlueAvgSubAtt",  # Submission attempts
    # **Betting & Fight Predictions**
    "RedOdds",
    "BlueOdds",
    "Winner",  # Betting odds and fight outcomes
    # Time-Based Trends & External Factors
    "Date",
    "Location",
    "Country",
    "TitleBout",
    "NumberOfRounds",
    "EmptyArena",  # Fight metadata
]

# Apply Column Selection to the Dataset
ufc_df = ufc_df[columns_to_keep]

# Display Dataset Structure After Selection
ufc_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Data columns (total 54 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Finish                 6528 non-null   object 
 1   FinishDetails          6528 non-null   object 
 2   TotalFightTimeSecs     6528 non-null   Int64  
 3   FinishRound            6528 non-null   Int64  
 4   FinishRoundTime        5906 non-null   object 
 5   RedFighter             6528 non-null   object 
 6   BlueFighter            6528 non-null   object 
 7   Gender                 6528 non-null   object 
 8   WeightClass            6528 non-null   object 
 9   RedReachCms            6528 non-null   float64
 10  BlueReachCms           6528 non-null   float64
 11  RedHeightCms           6528 non-null   float64
 12  BlueHeightCms          6528 non-null   float64
 13  RedWeightLbs           6528 non-null   int64  
 14  BlueWeightLbs          6528 non-null   int64  
 15  RedT

In [19]:
# convert finishRoundTime to seconds

# Function to convert `MM:SS` format to seconds
def convert_time_to_seconds(time_str):
    try:
        minutes, seconds = map(int, time_str.split(":"))
        return minutes * 60 + seconds
    except:
        return None  # Handle invalid or missing values


# Apply conversion to `FinishRoundTime`
ufc_df["FinishRoundTimeSecs"] = (
    ufc_df["FinishRoundTime"].astype(str).apply(convert_time_to_seconds)
)

# Drop the original `FinishRoundTime` column
ufc_df.drop(columns=["FinishRoundTime"], inplace=True)

# Display dataset structure after conversion
ufc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Data columns (total 54 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Finish                 6528 non-null   object 
 1   FinishDetails          6528 non-null   object 
 2   TotalFightTimeSecs     6528 non-null   Int64  
 3   FinishRound            6528 non-null   Int64  
 4   RedFighter             6528 non-null   object 
 5   BlueFighter            6528 non-null   object 
 6   Gender                 6528 non-null   object 
 7   WeightClass            6528 non-null   object 
 8   RedReachCms            6528 non-null   float64
 9   BlueReachCms           6528 non-null   float64
 10  RedHeightCms           6528 non-null   float64
 11  BlueHeightCms          6528 non-null   float64
 12  RedWeightLbs           6528 non-null   int64  
 13  BlueWeightLbs          6528 non-null   int64  
 14  RedTotalTitleBouts     6528 non-null   int64  
 15  Blue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ufc_df["FinishRoundTimeSecs"] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ufc_df.drop(columns=["FinishRoundTime"], inplace=True)


In [20]:
# Fill Missing `FinishRoundTimeSecs` Using Median of Similar Fights
ufc_df = ufc_df.copy()  # Avoid SettingWithCopyWarning
ufc_df.loc[:, "FinishRoundTimeSecs"] = ufc_df.groupby(
    ["WeightClass", "Finish"], group_keys=False
)["FinishRoundTimeSecs"].apply(lambda x: x.fillna(x.median()))

# Fill Remaining Missing Values with Overall Median
overall_median_time = ufc_df["FinishRoundTimeSecs"].median()
ufc_df.loc[:, "FinishRoundTimeSecs"] = ufc_df["FinishRoundTimeSecs"].fillna(
    overall_median_time
)


In [21]:
# check for mixed type columns

for col in ufc_df.columns.tolist():
    weird = (ufc_df[[col]].map(type) != ufc_df[[col]].iloc[0].apply(type)).any(axis=1)
    if len(ufc_df[weird]) > 0:
        print(col)

In [22]:
# check for missing values
missing_values_summary = ufc_df.isnull().sum()

# show missing values
missing_values_summary

Finish                   0
FinishDetails            0
TotalFightTimeSecs       0
FinishRound              0
RedFighter               0
BlueFighter              0
Gender                   0
WeightClass              0
RedReachCms              0
BlueReachCms             0
RedHeightCms             0
BlueHeightCms            0
RedWeightLbs             0
BlueWeightLbs            0
RedTotalTitleBouts       0
BlueTotalTitleBouts      0
RedStance                0
BlueStance               0
RedAge                   0
BlueAge                  0
RedCurrentWinStreak      0
BlueCurrentWinStreak     0
RedCurrentLoseStreak     0
BlueCurrentLoseStreak    0
RedWins                  0
BlueWins                 0
RedLosses                0
BlueLosses               0
RedDraws                 0
BlueDraws                0
RedWinsByKO              0
BlueWinsByKO             0
RedWinsBySubmission      0
BlueWinsBySubmission     0
RedAvgSigStrLanded       0
BlueAvgSigStrLanded      0
RedAvgSigStrPct          0
B

In [23]:
# export the cleaned data
ufc_df.to_csv(
    os.path.join(path, "02 Data", "Prepared Data", "ufc_cleaned.csv"), index=False
)