## UFC Fight Model


In [None]:
# Print the interpreter and library versions so they can be compared with the
# reference versions listed below this cell.
import sys

print("Python", sys.version)

import numpy
import scipy
import sklearn
import xgboost

# Each library exposes its version string as `__version__`.
for _label, _module in (
    ("NumPy", numpy),
    ("SciPy", scipy),
    ("SciKit", sklearn),
    ("XGBoost", xgboost),
):
    print(_label, _module.__version__)



- Python 3.9.13
- NumPy 1.22.4
- SciPy 1.8.1
- SciKit 1.1.1
- XGBoost 1.6.1

In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn import set_config
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


In [None]:
# The CSV import below is intentionally left disabled: reading from CSV would
# change the column dtypes. Keep it commented out.
# ufc_df = pd.read_csv("Resources/train_preprocessed.csv", low_memory=False, index_col=0)
# ufc_df

# Load the cleaned dataset from its joblib dump. Do not upload the file to the repo.
from joblib import load

# Keep only rows whose event date is on or before the cutoff, so that later
# events are excluded from training.
cutoff_date = pd.to_datetime("3/19/2022")
ufc_df = load("Resources/train_preprocessed.joblib")
ufc_df = ufc_df.loc[ufc_df["Event_Date"] <= cutoff_date]
ufc_df

## Select Features


In [None]:
# Dtype-based column selectors: one for numeric columns, one for pandas
# `category` columns. They are handed to the ColumnTransformer further down.
numerical_cols, categorical_cols = (
    selector(dtype_include=dtype_name) for dtype_name in ("number", "category")
)


## Split Train-Test


In [None]:
# The target is the "Winner" column; every other column stays in the feature frame.
y = ufc_df["Winner"]
X = ufc_df.drop(columns="Winner")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)


## VotingClassifier Pipeline


### Soft voting


In [None]:
# Numeric features: replace missing entries (pd.NA) with a constant, then
# standardize with `StandardScaler()` (remove the mean, scale to unit variance).
# No `fill_value` is passed, so SimpleImputer uses its default constant.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", missing_values=pd.NA)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: the same constant imputation, followed by one-hot
# encoding. `handle_unknown="ignore"` keeps transform from raising on
# categories that were not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", missing_values=pd.NA)),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns (chosen by the dtype selectors) to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Base models of the ensemble, all seeded for reproducibility.
# SVC needs probability=True so it can take part in soft voting.
voting_estimators = [
    ("gbc", GradientBoostingClassifier(random_state=0)),
    ("rf", RandomForestClassifier(random_state=0)),
    ("mlp", MLPClassifier(random_state=0)),
    ("svc", SVC(random_state=0, probability=True)),
    ("xgb", XGBClassifier(random_state=0)),
]

# Append the classifier to the preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", VotingClassifier(estimators=voting_estimators, voting="soft")),
    ]
)

# Fit on the training split and report accuracy on the held-out test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Soft Voting Score: {score:.3f}")


## GridSearch


In [None]:
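# Uncomment to run GridSearchCV on selected columns.
# IMPORTANT: each parameter was searched separately because of time constraints;
# running GridSearchCV over the whole param_grid at once will take many hours.
# NOTE: as of scikit-learn 1.1, loss="deviance" and max_features="auto" (used below)
# are deprecated; prefer "log_loss" and "sqrt" when re-running the search.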

# Features
# numerical_cols = [
# "B_Age",
# "B_Height",
# "B_Weight",
# "B_Reach",
# "B_Wins",
# "B_Losses",
# "B_Draws",
# "B_No_Contest",
# "B_Career_Significant_Strikes_Landed_PM",
# "B_Career_Striking_Accuracy",
# "B_Career_Significant_Strike_Defence",
# "B_Career_Takedown_Average",
# "B_Career_Takedown_Accuracy",
# "B_Career_Takedown_Defence",
# "B_Career_Submission_Average",
# "B_Knockdowns",
# "R_Age",
# "R_Height",
# "R_Weight",
# "R_Reach",
# "R_Wins",
# "R_Losses",
# "R_Draws",
# "R_No_Contest",
# "R_Career_Significant_Strikes_Landed_PM",
# "R_Career_Striking_Accuracy",
# "R_Career_Significant_Strike_Defence",
# "R_Career_Takedown_Average",
# "R_Career_Takedown_Accuracy",
# "R_Career_Takedown_Defence",
# "R_Career_Submission_Average",
# "R_Knockdowns"]

# categorical_cols = ["B_Stance", "R_Stance"]

# Train-Test split
# X = ufc_df.drop("Winner", axis=1)
# y = ufc_df["Winner"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply pipeline
# numeric_transformer = Pipeline(
# steps=[
# ("imputer", SimpleImputer(strategy="constant", add_indicator=True)),
# ("scaler", StandardScaler()),
# ]
# )

# categorical_transformer = Pipeline(
# steps=[
# ("onehot", OneHotEncoder(handle_unknown="ignore")),
# ]
# )

# preprocessor = ColumnTransformer(
# transformers=[
# ("num", numeric_transformer, numerical_cols),
# ("cat", categorical_transformer, categorical_cols),
# ]
# )

# clf = Pipeline(
# steps=[
# ("preprocessor", preprocessor),
# ("classifier", VotingClassifier(estimators=[
# ("gbc", GradientBoostingClassifier(random_state=0)),
# ("rf", RandomForestClassifier(random_state=0)),
# ("mlp", MLPClassifier(random_state=0)),
# ("svc", SVC(random_state=0, probability=True)),
# ("xgb", XGBClassifier(random_state=0))], voting="soft")),
# ]
# )

# clf.fit(X_train, y_train)

# param_grid = {
# "classifier__gbc__learning_rate": [0.05, 0.1, 0.5],
# "classifier__gbc__loss": ["deviance", "exponential"],
# "classifier__rf__n_estimators": [100, 200, 250, 300, 500, 700],
# "classifier__rf__max_features": ["auto", "sqrt"],
# "classifier__mlp__alpha": [0.0001, 0.001, 0.005],
# "classifier__mlp__activation": ["tanh", "relu"],
# "classifier__svc__C": [0.05, 0.1, 0.5],
# "classifier__xgb__base_score": [0.3, 0.5],
# "classifier__xgb__booster": ["gbtree", "gblinear"],
# "classifier__xgb__max_depth": [3, 5, 7]
# }

# grid = GridSearchCV(clf, param_grid=param_grid, cv=5)
# grid = grid.fit(X_train, y_train)
# print (grid.best_params_)


## VotingClassifier with modified parameters


In [None]:
# TODO: Grid Search needs to be run again to find the parameters.

# # Features
# numerical_cols = selector(dtype_include="number")
# categorical_cols = selector(dtype_include="category")

# # Train-Test split
# X = ufc_df.drop("Winner", axis=1)
# y = ufc_df["Winner"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# # Apply pipeline
# numeric_transformer = Pipeline(
#     steps=[
#         ("imputer", SimpleImputer(strategy="constant", add_indicator=True)),
#         ("scaler", StandardScaler()),
#     ]
# )

# categorical_transformer = Pipeline(
#     steps=[("onehot", OneHotEncoder(handle_unknown="ignore")),]
# )

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numerical_cols),
#         ("cat", categorical_transformer, categorical_cols),
#     ]
# )

# clf = Pipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         (
#             "classifier",
#             VotingClassifier(
#                 estimators=[
#                     (
#                         "gbc",
#                         GradientBoostingClassifier(
#                             learning_rate=0.1, loss="deviance", random_state=1
#                         ),
#                     ),
#                     (
#                         "rf",
#                         RandomForestClassifier(
#                             max_features="auto", n_estimators=500, random_state=1
#                         ),
#                     ),
#                     (
#                         "mlp",
#                         MLPClassifier(alpha=0.001, activation="relu", random_state=1),
#                     ),
#                     ("svc", SVC(C=0.1, probability=True, random_state=1)),
#                     (
#                         "xgb",
#                         XGBClassifier(
#                             base_score=0.3,
#                             booster="gbtree",
#                             max_depth=5,
#                             random_state=1,
#                         ),
#                     ),
#                 ],
#                 voting="soft",
#             ),
#         ),
#     ]
# )

# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# score = accuracy_score(y_test, y_pred)
# print("Soft Voting Score: %.3f" % score)


### Display Diagram of Pipeline


In [None]:
# Switch scikit-learn's estimator display to "diagram" mode, then show the
# pipeline as this cell's output.
set_config(display="diagram")
clf


### Classification Report


In [None]:
# Re-predict on the held-out split so this cell does not depend on an earlier
# `y_pred`, then print the per-class metrics table.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


### Confusion Matrix


In [None]:
# Plot the confusion matrix of the test-set predictions, with integer cell
# counts on a blue colour map.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    values_format="d",
    cmap="Blues",
)
plt.show()


### Model Persistence


In [None]:
# Uncomment to save the fitted classifier to disk. Do not upload the file to the repo.
# from joblib import dump
# dump(clf, "Resources/clf.joblib")
