# UFC Fight Prediction


In [None]:
# Import dependencies
import pandas as pd
import plotly.express as px
import numpy as np


In [None]:
# Read the scraped fight data (Resources/scraped_data.csv) into a DataFrame
ufc_df = pd.read_csv("Resources/scraped_data.csv", low_memory=False)
ufc_df

# Why `low_memory=False` is passed above -- without it pandas reported:
# DtypeWarning: Columns (2,98,122,139,142,146,163,166,259,300,303,307,324,327) have mixed types.
# FIX: To ensure no mixed types either set low_memory=False in read_csv(),
# or specify the type with the dtype parameter. Specifying the datatype may result in memory improvements.


## Data Cleaning


In [None]:
# Show the rows that are exact duplicates of an earlier row.
duplicated_row_mask = ufc_df.duplicated()
display(ufc_df[duplicated_row_mask])

# Remove the exact duplicates; they can appear when a new event is scraped again.
ufc_df = ufc_df.drop_duplicates()
display(ufc_df)

# A fight is keyed by its date and both fighter names. If that key still repeats
# after the exact duplicates are gone, stop the notebook with SystemExit;
# historically, this has not occurred before.
fight_key_columns = ["Event_Date", "B_Name", "R_Name"]
if ufc_df.duplicated(subset=fight_key_columns).any():
    raise SystemExit("Duplicate events found!")


In [None]:
# Parse Event_Date into datetime64 so the older fights can be filtered out by date below
ufc_df = ufc_df.assign(Event_Date=pd.to_datetime(ufc_df["Event_Date"]))


In [None]:
# Keep only fights on or after the cutoff date; older fights are removed
# due to the lack of rule consistency.
# The cutoff string is parsed month-first by pandas' default settings.
rules_cutoff_date = pd.to_datetime("5/3/2001")
ufc_df = ufc_df.loc[ufc_df["Event_Date"] >= rules_cutoff_date]


In [None]:
# TODO: Fix handling of `--`, `---`, or `No Time Limit`; I was unsure on what each value means so I just set them to NaN.
# All three placeholder strings are mapped to a missing value in a single pass.
# `np.nan` is used instead of `np.NaN` because the `np.NaN` alias was removed in NumPy 2.0.
ufc_df = ufc_df.replace(["--", "---", "No Time Limit"], np.nan)


In [None]:
# The *_Draws columns are parsed with two patterns: a leading number (draws) and an
# optional "(<n> NC)" suffix (no contests).
# Raw strings are used for the patterns so that `\(`, `\d` and `\s` reach the regex
# engine untouched instead of being treated as (invalid) Python string escapes.

# Extract number inside the parenthesis using Regex matching; no contest(s) amount.
# This must run before the *_Draws columns are overwritten below.
ufc_df["R_No_Contest"] = ufc_df["R_Draws"].str.extract(r"\((\d+)\sNC\)", expand=False)
ufc_df["B_No_Contest"] = ufc_df["B_Draws"].str.extract(r"\((\d+)\sNC\)", expand=False)

# Extract first number at start of string using Regex matching; draw(s) amount
ufc_df["R_Draws"] = ufc_df["R_Draws"].str.extract(r"^(\d+)", expand=False)
ufc_df["B_Draws"] = ufc_df["B_Draws"].str.extract(r"^(\d+)", expand=False)


|   Weight Class    | Minimum Weight (lb) | Maximum Weight (lb) |
| :---------------: | :-----------------: | :-----------------: |
|    Heavyweight    |         205         |         265         |
| Light Heavyweight |         185         |         205         |
|   Middleweight    |         170         |         185         |
|   Welterweight    |         155         |         170         |
|    Lightweight    |         145         |         155         |
|   Featherweight   |         135         |         145         |
|   Bantamweight    |         125         |         135         |
|     Flyweight     |         115         |         125         |
|   Strawweight\*   |          0          |         115         |

- "The women’s UFC division is split into only 4 classes: strawweight, flyweight, bantamweight, and featherweight. The strawweight class is only used in the women’s division and men do not compete in it."


In [None]:
# View descriptive statistics (central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values)
# before removing rows that do not contain a standardized weight class
display(ufc_df[["R_Weight", "B_Weight"]].describe())
display(ufc_df.Weight_Class.value_counts())

# Weight classes that are kept; every other Weight_Class value is dropped.
# TODO: Should "Catch Weight" be excluded? (It is currently not in this list.)
# "Super Heavyweight" and "Open Weight" are no longer weight classes or part of the dataset.
# TODO: Create an `other` category for anything that does not fall in a standardized Weight_Class (?)
standard_weight_classes = [
    "Heavyweight",
    "Light Heavyweight",
    "Middleweight",
    "Welterweight",
    "Lightweight",
    "Featherweight",
    "Bantamweight",
    "Flyweight",
    "Strawweight",
    "Women's Strawweight",
    "Women's Flyweight",
    "Women's Bantamweight",
    "Women's Featherweight",
]

# Keep rows that contain a standardized weight class
ufc_df = ufc_df.loc[ufc_df["Weight_Class"].isin(standard_weight_classes)]

# View descriptive statistics (central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values)
# after removing rows that do not contain a standardized weight class
display("-" * 50)
display(ufc_df[["R_Weight", "B_Weight"]].describe())
display(ufc_df.Weight_Class.value_counts())


In [None]:
# Convert the per-corner columns to the correct dtype (categorical or numeric).
# Both corners are built from one shared list so they cannot drift apart: the
# previous hand-written red-corner mapping omitted
# "R_Round_Four_Significant_Strike_Perc" while the blue-corner one included it.
corner_float_column_suffixes = [
    "Reach",
    "Significant_Strike_Perc",
    "Takedown_Perc",
    "Round_One_Significant_Strike_Perc",
    "Round_One_Takedown_Perc",
    "Round_Two_Significant_Strike_Perc",
    "Round_Two_Takedown_Perc",
    "Round_Three_Significant_Strike_Perc",
    "Round_Three_Takedown_Perc",
    "Round_Four_Significant_Strike_Perc",
    "Round_Four_Takedown_Perc",
    "Round_Five_Significant_Strike_Perc",
    "Round_Five_Takedown_Perc",
    "Draws",
    "No_Contest",
]

corner_dtypes = {}
for corner_prefix in ("B", "R"):  # blue corner, then red corner
    corner_dtypes[corner_prefix + "_Name"] = "category"
    for column_suffix in corner_float_column_suffixes:
        corner_dtypes[corner_prefix + "_" + column_suffix] = "float"

ufc_df = ufc_df.astype(corner_dtypes)

# Convert Max_Rounds to Integer
# TODO: Why is `Max_Rounds` being inferred as a object and not `Ending_Round`?
# NOTE(review): possibly because the raw column held the "No Time Limit" string that is
# replaced with NaN earlier -- confirm. This cast raises if any NaN remains in the column.
ufc_df["Max_Rounds"] = ufc_df["Max_Rounds"].astype("int64")


In [None]:
# Reorder the columns alphabetically; both name lists are kept for a later sanity check.
original_column_list = list(ufc_df.columns)
sorted_column_list = sorted(original_column_list)
# TODO: Is there a benefit of using .reindex instead?
ufc_df = ufc_df.loc[:, sorted_column_list]


In [None]:
# Sanity check: names present in only one of the two column lists.
# An empty set means sorting neither dropped nor added a column.
set(original_column_list).symmetric_difference(sorted_column_list)


In [None]:
# Let pandas pick the best dtype for every column (using dtypes that support pd.NA
# for missing values), then show how many columns ended up with each dtype.
ufc_df = ufc_df.convert_dtypes()
ufc_df.dtypes.value_counts()


In [None]:
# Preview the first rows after cleaning and dtype conversion
ufc_df.head()


## Data Exploration


### Winner (Red vs. Blue)


In [None]:
# Count the fights per Winner value once and reuse it for slice sizes, names and colors.
winner_counts = ufc_df["Winner"].value_counts()

fig_piechart_winrate_color = px.pie(
    names=winner_counts.index,
    values=winner_counts.values,
    color=winner_counts.index,
    # color values from px.colors.qualitative.Plotly
    color_discrete_map={"Red": "#EF553B", "Blue": "#636EFA"},
    title="Win Rate by Fighter Color",
)
# Clear the hover template set by Plotly Express
fig_piechart_winrate_color.update_traces(hovertemplate=None)
fig_piechart_winrate_color.show()

### Age


In [None]:
# Summary statistics (count, mean, min, max, quartiles, ...) for both Age columns
age_columns = ["R_Age", "B_Age"]
ufc_df[age_columns].describe()


In [None]:
# Box & Whisker plot of both Age columns to make outliers visible.
# The two columns are melted to long form ("variable" = column name, "value" = age).
age_long_df = ufc_df[["R_Age", "B_Age"]].melt()

fig_boxplot_age = px.box(
    age_long_df,
    x="variable",
    y="value",
    color="variable",
    # color values from px.colors.qualitative.Plotly
    color_discrete_map={"R_Age": "#EF553B", "B_Age": "#636EFA"},
    # Override the axis and legend titles
    labels={"variable": "Fighter Color", "value": "Age (years)"},
)
fig_boxplot_age.show()

### Height


In [None]:
# Summary statistics (count, mean, min, max, quartiles, ...) for both Height columns
height_columns = ["R_Height", "B_Height"]
ufc_df[height_columns].describe()


In [None]:
# Box & Whisker plot of both Height columns to make outliers visible.
# The two columns are melted to long form ("variable" = column name, "value" = height).
height_long_df = ufc_df[["R_Height", "B_Height"]].melt()

fig_boxplot_height = px.box(
    height_long_df,
    x="variable",
    y="value",
    color="variable",
    # Color values from px.colors.qualitative.Plotly
    color_discrete_map={"R_Height": "#EF553B", "B_Height": "#636EFA"},
    # Override the axis and legend titles
    labels={"variable": "Fighter Color", "value": "Height (inches)"},
)
fig_boxplot_height.show()

### Weight


In [None]:
# Summary statistics (count, mean, min, max, quartiles, ...) for both Weight columns
weight_columns = ["R_Weight", "B_Weight"]
ufc_df[weight_columns].describe()


In [None]:
# Use Box & Whisker plot to visualize Height outliers
fig_boxplot_weight = px.box(
    pd.melt(ufc_df[["R_Weight", "B_Weight"]]),
    x="variable",
    y="value",
    color="variable",
    # Color values from px.colors.qualitative.Plotly
    color_discrete_map={
        "R_Weight": "#EF553B",
        "B_Weight": "#636EFA",
    },
    # Axis titles (and legend titles) can also be overridden using the labels argument of Plotly Express functions
    labels=dict(variable="Fighter Color", value="Weight (pounds)"),
)
fig_boxplot_weight.show()

## Feature Engineering


### Feature Creation


#### Age Bucket


In [None]:
# Smallest value in R_Age; used as a reference when choosing the age bucket labels
ufc_df["R_Age"].min()


In [None]:
# Largest value in R_Age; used as a reference when choosing the age bucket labels
ufc_df["R_Age"].max()


In [None]:
# Smallest value in B_Age; used as a reference when choosing the age bucket labels
ufc_df["B_Age"].min()


In [None]:
# Largest value in B_Age; used as a reference when choosing the age bucket labels
ufc_df["B_Age"].max()


In [None]:
# Split each Age column into four quantile-based buckets (quartiles).
# The labels are listed from the lowest to the highest quartile and are hard-coded,
# so they need revisiting whenever the underlying data changes.

# Red corner
r_age_bucket_labels = ["Under 27", "27 - 30", "30 - 32", "Over 32"]
ufc_df["R_Age_Bucket"] = pd.qcut(ufc_df["R_Age"], 4, labels=r_age_bucket_labels)

# Blue corner
b_age_bucket_labels = ["Under 26", "26 - 29", "29 - 32", "Over 32"]
ufc_df["B_Age_Bucket"] = pd.qcut(ufc_df["B_Age"], 4, labels=b_age_bucket_labels)

# Drop Age columns
# ufc_df = ufc_df.drop(columns=["R_Age", "B_Age"])


In [None]:
# A grouped bar graph comparing how many red and blue fighters fall into each age bucket.

# Name each count Series after its source column and clear the index name.
# pandas >= 2.0 names the result of value_counts() "count" and names its index after
# the column; merging two Series both called "count" would yield "count_x"/"count_y"
# columns and break the `y=[...]` lookup below. On older pandas these calls change nothing.
r_age_bucket_counts = (
    ufc_df["R_Age_Bucket"].value_counts().rename("R_Age_Bucket").rename_axis(None)
)
b_age_bucket_counts = (
    ufc_df["B_Age_Bucket"].value_counts().rename("B_Age_Bucket").rename_axis(None)
)

fig_barplot_age_bucket = px.bar(
    # outer join the count of red and blue bucket into a single dataframe
    # source: https://pandas.pydata.org/docs/user_guide/merging.html#joining-key-columns-on-an-index
    pd.merge(
        left=r_age_bucket_counts,
        right=b_age_bucket_counts,
        left_index=True,
        right_index=True,
        how="outer",
    ),
    y=["R_Age_Bucket", "B_Age_Bucket"],
    color="variable",
    # Color values from px.colors.qualitative.Plotly
    color_discrete_map={
        "R_Age_Bucket": "#EF553B",
        "B_Age_Bucket": "#636EFA",
    },
    # Axis titles (and legend titles) can also be overridden using the labels argument of Plotly Express functions
    labels=dict(variable="Fighter Color", value="count", index="Age Range (years)"),
    barmode="group",
    title="Age Bucket by Fighter Color",
)
fig_barplot_age_bucket.show()

#### Height Bucket


In [None]:
# Smallest value in R_Height; used as a reference when choosing the height bucket labels
ufc_df["R_Height"].min()


In [None]:
# Largest value in R_Height; used as a reference when choosing the height bucket labels
ufc_df["R_Height"].max()


In [None]:
# Smallest value in B_Height; used as a reference when choosing the height bucket labels
ufc_df["B_Height"].min()


In [None]:
# Largest value in B_Height; used as a reference when choosing the height bucket labels
ufc_df["B_Height"].max()


In [None]:
# Create four equal-sized buckets for the Height columns based on sample quantiles.
# pd.qcut assigns `labels` to the bins in ascending order (first label = lowest
# quartile), so the labels must be listed from the shortest to the tallest range.
# Previously they were out of order, which e.g. tagged the shortest red fighters "70 - 73".
# NOTE(review): the ranges are hard-coded; re-check them against the quartile edges
# whenever the underlying data changes.

# Red height buckets
r_height_bucket_labels = ["60 - 68", "68 - 70", "70 - 73", "73 - 83"]

ufc_df["R_Height_Bucket"] = pd.qcut(
    x=ufc_df["R_Height"], q=4, labels=r_height_bucket_labels, retbins=False, precision=3
)

# Blue height buckets
b_height_bucket_labels = ["60 - 68", "68 - 70", "70 - 73", "73 - 83"]
ufc_df["B_Height_Bucket"] = pd.qcut(
    x=ufc_df["B_Height"], q=4, labels=b_height_bucket_labels, retbins=False, precision=3
)

# Drop height columns
# ufc_df = ufc_df.drop(columns=["R_Height", "B_Height"])


In [None]:
# A grouped bar graph comparing how many red and blue fighters fall into each height bucket.

# Name each count Series after its source column and clear the index name.
# pandas >= 2.0 names the result of value_counts() "count" and names its index after
# the column; merging two Series both called "count" would yield "count_x"/"count_y"
# columns and break the `y=[...]` lookup below. On older pandas these calls change nothing.
r_height_bucket_counts = (
    ufc_df["R_Height_Bucket"]
    .value_counts()
    .rename("R_Height_Bucket")
    .rename_axis(None)
)
b_height_bucket_counts = (
    ufc_df["B_Height_Bucket"]
    .value_counts()
    .rename("B_Height_Bucket")
    .rename_axis(None)
)

fig_barplot_height_bucket = px.bar(
    # outer join the count of red and blue bucket into a single dataframe
    # source: https://pandas.pydata.org/docs/user_guide/merging.html#joining-key-columns-on-an-index
    pd.merge(
        left=r_height_bucket_counts,
        right=b_height_bucket_counts,
        left_index=True,
        right_index=True,
        how="outer",
    ),
    y=["R_Height_Bucket", "B_Height_Bucket"],
    color="variable",
    # Color values from px.colors.qualitative.Plotly
    color_discrete_map={
        "R_Height_Bucket": "#EF553B",
        "B_Height_Bucket": "#636EFA",
    },
    # Axis titles (and legend titles) can also be overridden using the labels argument of Plotly Express functions
    labels=dict(variable="Fighter Color", value="count", index="Height Range (inches)"),
    barmode="group",
    title="Height Bucket by Fighter Color",
)
fig_barplot_height_bucket.show()

#### Gender


In [None]:
# Derive a Gender column from the weight class name.

# TODO: There might be a *better* way of determining Gender, but this seems to work.
# Earlier string-valued variant, kept for reference:
# ufc_df["Gender"] = np.where(
#     ufc_df["Weight_Class"].str.contains("Women's"), "Female", "Male"
# )

# A weight class whose name contains "Women's" marks a women's fight.
is_womens_weight_class = ufc_df["Weight_Class"].str.contains("Women's")

# Gender (sex) is 0 if female and 1 if male.
ufc_df["Gender"] = np.where(is_womens_weight_class, 0, 1)


In [None]:
# Alternative cross-check (disabled): count the rows in the four women's weight classes
# directly; the result should match the count of Gender == 0 shown below.
# display(
#     ufc_df.loc[
#         (ufc_df.Weight_Class == "Women's Strawweight")
#         | (ufc_df.Weight_Class == "Women's Flyweight")
#         | (ufc_df.Weight_Class == "Women's Bantamweight")
#         | (ufc_df.Weight_Class == "Women's Featherweight")
#     ].shape[0]
# )

# Number of fights per Gender value (0 = female, 1 = male)
ufc_df["Gender"].value_counts()


#### Body Mass Index (BMI)


[Calculating BMI Using the English System](https://www.cdc.gov/nccdphp/dnpao/growthcharts/training/bmiage/page5_2.html)

Formula:

> **weight** (lb) / [**height** (in)]<sup>2</sup> \* **703**


> "The normal BMI scores may not be accurate if you're very muscular because muscle can add extra kilos, resulting in a high BMI when you're not an unhealthy weight." ([National Health Service](https://www.nhs.uk/conditions/obesity/diagnosis/))


In [None]:
# BMI = weight (lb) / height (in)^2 * 703, rounded to one decimal place

# Red fighter BMI
ufc_df["R_BMI"] = (
    ufc_df["R_Weight"].div(ufc_df["R_Height"].pow(2)).mul(703).round(1)
)

# Blue fighter BMI
ufc_df["B_BMI"] = (
    ufc_df["B_Weight"].div(ufc_df["B_Height"].pow(2)).mul(703).round(1)
)


#### Proposed new BMI


[Proposed formula](https://web.archive.org/web/20220321050055/https://people.maths.ox.ac.uk/trefethen/bmi.html) by Nick Trefethen, Professor of numerical analysis at University of Oxford.

New formula:

> **5734** \* **weight** (lb) / [**height** (in)]<sup>2.5</sup>


In [None]:
# Proposed BMI = 5734 * weight (lb) / height (in)^2.5, rounded to one decimal place

# Red fighter BMI under the proposed formula
ufc_df["R_BMI_proposed"] = (
    ufc_df["R_Weight"].mul(5734).div(ufc_df["R_Height"].pow(2.5)).round(1)
)

# Blue fighter BMI under the proposed formula
ufc_df["B_BMI_proposed"] = (
    ufc_df["B_Weight"].mul(5734).div(ufc_df["B_Height"].pow(2.5)).round(1)
)


#### Estimation of Body Fat


Adult Body Fat Percentage

Formula:

> Body fat % = ((1.39 \* BMI) + (0.16 \* Age) - (10.34 \* Gender) - 9)

- Where gender (sex) is 0 if female and 1 if male to account for the lower body fat percentage of men (International Journal of Obesity and Related Metabolic Disorders in 2002)


In [None]:
# Body fat % = (1.39 * BMI) + (0.16 * Age) - (10.34 * Gender) - 9
# Gender is shared by both fighters in a fight; BMI and Age are per corner.

# Estimate Red Fighter Body Fat Percentage
ufc_df["R_Body_Fat_Percentage"] = (1.39 * ufc_df["R_BMI"]) + (
    0.16 * ufc_df["R_Age"] - (10.34 * ufc_df["Gender"]) - 9
)

# Estimate Blue Fighter Body Fat Percentage.
# Uses the blue fighter's own BMI and age; this was previously computed from
# R_BMI / R_Age, which made the blue value a copy of the red one.
ufc_df["B_Body_Fat_Percentage"] = (1.39 * ufc_df["B_BMI"]) + (
    0.16 * ufc_df["B_Age"] - (10.34 * ufc_df["Gender"]) - 9
)


#### Lean Body Mass


"For calculating lean body mass, body fat percentage was subtracted from 100 to get the lean mass percentage and lean mass percentage was divided by 100 to calculate the decimal for lean mass percentage, and then lean mass decimal was multiplied with total body weight." [(Prakash KO, Choudhary R, Singh G. Lean body mass, body fat percentage, and handgrip strength as predictors of bone mineral density in postmenopausal women. J Mid-life Health 2021;12:299-303)](https://www.jmidlifehealth.org/text.asp?2021/12/4/299/336149)

Formula(s):

> Lean Mass Percentage = 100 - Body Fat Percentage

> Lean Mass Decimal = Lean Mass Percentage / 100

> Lean Body Mass = Lean Mass Decimal \* Total Body Weight

> Lean Body Mass = ((100 - Body Fat Percentage) / 100) \* Total Body Weight


In [None]:
# Lean Body Mass = ((100 - Body Fat Percentage) / 100) * Total Body Weight

# Red fighter: lean share of body weight, then lean mass
r_lean_mass_decimal = (100 - ufc_df["R_Body_Fat_Percentage"]) / 100
ufc_df["R_Lean_Body_Mass"] = r_lean_mass_decimal * ufc_df["R_Weight"]

# Blue fighter: lean share of body weight, then lean mass
b_lean_mass_decimal = (100 - ufc_df["B_Body_Fat_Percentage"]) / 100
ufc_df["B_Lean_Body_Mass"] = b_lean_mass_decimal * ufc_df["B_Weight"]


### Set Categories


In [None]:
# Convert the win method, both stances, the weight class and Gender to category dtype
for category_column in ("Win_By", "R_Stance", "B_Stance", "Weight_Class", "Gender"):
    ufc_df[category_column] = ufc_df[category_column].astype("category")

# Give the numeric Gender categories readable names
gender_category_names = {0: "Female", 1: "Male"}
ufc_df["Gender"] = ufc_df["Gender"].cat.rename_categories(gender_category_names)


### View Features


In [None]:
# List the names of all columns that have the category dtype
ufc_df.select_dtypes(include=["category"]).columns.tolist()


In [None]:
# List the names of all columns that have a numeric dtype
ufc_df.select_dtypes(include=["number"]).columns.tolist()


In [None]:
# TODO: Convert time to correct datatype
# display(ufc_df.select_dtypes(include=["string"]).columns.tolist())


In [None]:
# Preview the first rows including the newly engineered feature columns
ufc_df.head()


In [None]:
# Display the data types of all columns.
# The row limit is lifted only inside this block so the long dtype listing is not truncated.
with pd.option_context("display.max_rows", None):
    print(ufc_df.dtypes)


### Export Dataset


In [None]:
# Write the cleaned dataset to CSV. With pandas' defaults the DataFrame index is
# written as the first (unnamed) column.
ufc_df.to_csv("Resources/clean_scraped_data.csv")

# Optional: uncomment to also save the cleaned DataFrame as a joblib file.
# Do not upload that file to the repo.
# from joblib import dump
# dump(ufc_df, "Resources/clean_scraped_data.joblib")
