# 🔹 UFC Fight Predictor Model Training

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

## 1. Import Libraries and Setup Environment

In [1]:
# Import necessary libraries
import os
import sys
import time

import pandas as pd

# Show up to 200 characters per column so long text cells are not truncated
pd.set_option('display.max_colwidth', 200)

# Get the current working directory
current_dir = os.getcwd()

# Navigate to the project root (the parent of the working directory)
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Import from /src: make the project's src/ directory importable.
# The membership check keeps the cell idempotent, so re-running it does not
# stack duplicate entries onto sys.path.
src_dir = os.path.join(project_root, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
from metrics import *
from model_factory import *
from io_model import *
from helpers import *
from config import *

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

## 2. Load Data

In [2]:
# Load UFCData
try:
    UFCData = load_data()
except Exception as e:
    print_header(f"Error loading training data: {e}", color='bright_red')
    # Re-raise after reporting: every later cell reads UFCData, so swallowing
    # the error here would only resurface as an unrelated NameError below.
    raise

/home/mlioi/ufc-predictor/data/processed/ufc_data.pkl
📦 UFCData object loaded from: /home/mlioi/ufc-predictor/data/processed/ufc_data.pkl


In [3]:
# Print an overview of the loaded dataset (sample counts, train/test split,
# feature types, label distribution and per-feature statistics).
UFCData.summary()

📊 UFC Dataset Summary
----------------------------------------
🧪 Total samples      : 6541
🧪 Train/Test split  : 5232 / 1309
🧪 Total features     : 42

🔢 Numerical features : 36
🔠 Categorical features: 6
    - Binary          : 2
    - Multiclass      : 4

🏷 Label distribution (raw):
   - Class 0: 3795 (58.0%)
   - Class 1: 2746 (42.0%)

✅ No missing values detected

📈 Feature summary statistics (train set):
                        mean      std       min       max
RedOdds             -115.483  274.909 -2000.000   775.000
BlueOdds              59.122  250.391 -1200.000  1300.000
NumberOfRounds         3.187    0.580     3.000     5.000
BlueAvgSigStrLanded   19.800   18.830     0.000   154.000
BlueAvgSigStrPct       0.453    0.105     0.000     1.000
BlueAvgSubAtt          0.500    0.634     0.000     8.400
BlueAvgTDLanded        1.315    1.261     0.000    10.610
BlueAvgTDPct           0.324    0.224     0.000     1.000
RedAvgSigStrLanded    21.121   19.175     0.000   141.000
RedAvgSi

<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

## 3. 🔧 Hyperparameter Tuning 🔧

In [4]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Master switch: True builds the search grids below, False leaves
# model_params as None.
hyperparameter_tuning = True

# One seed shared by every estimator that accepts random_state, so repeated
# runs of the search are comparable.
TUNING_SEED = 42

if hyperparameter_tuning:
    print_header('Hyperparameters Tuning is Activated', color = 'bright_blue')
    # Maps the display name of each model to (base estimator, parameter grid).
    # The training cells pass this dict to model_factory together with the name.
    model_params = {
        "Support Vector Machine": (
            SVC(probability=True, random_state=TUNING_SEED),
            {'C': [1, 5], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto']}
        ),
        "Random Forest": (
            RandomForestClassifier(random_state=TUNING_SEED),
            {'n_estimators': [10, 50, 100, 1000], 'max_depth': [3, 5, 10, 100]}
        ),
        "Logistic Regression": (
            LogisticRegression(),
            {'C': [1, 5, 10], 'solver': ['liblinear', 'lbfgs']}
        ),
        "K-Nearest Neighbors": (
            KNeighborsClassifier(),
            {'n_neighbors': [3, 6, 9, 12, 15], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}
        ),
        "AdaBoost": (
            AdaBoostClassifier(random_state=TUNING_SEED),
            {'n_estimators': [10, 50, 100], 'learning_rate': [0.01, 0.1, 1.0]}
        ),
        "Naive Bayes": (
            GaussianNB(),
            {'var_smoothing': [1e-8, 1e-7, 1e-6, 1e-5]}
        ),
        "Gradient Boosting": (
            GradientBoostingClassifier(random_state=TUNING_SEED),
            {'n_estimators': [10, 50, 100], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 7]}
        ),
        "Extra Trees": (
            ExtraTreesClassifier(random_state=TUNING_SEED),
            {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}
        ),
        "Quadratic Discriminant Analysis": (
            QuadraticDiscriminantAnalysis(),
            {'reg_param': [0.1, 1]}
        ),
        "Neural Network": (
            MLPClassifier(max_iter=200, random_state=TUNING_SEED),
            {
                # Architecture: number and size of hidden layers
                'hidden_layer_sizes': [
                    (200,), (50, 50),
                ],
                # Activation function for hidden layers
                'activation': ['relu', 'logistic'],
                # Optimizer for gradient descent
                'solver': ['adam', 'sgd'],
                # L2 regularization strength (higher alpha reduces overfitting but can cause underfitting)
                'alpha': [0.001],
                # Learning rate schedule
                'learning_rate': ['adaptive'],
                # Initial learning rate (for adam and sgd solvers)
                'learning_rate_init': [0.01],
                # Early stopping based on validation performance to prevent overfitting
                'early_stopping': [True],
                # Mini-batch size for training with adam/sgd
                'batch_size': [32],
                # Momentum for SGD (controls contribution of previous updates)
                'momentum': [0.8],
                # Proportion of training set used as validation for early stopping
                'validation_fraction': [0.15]
            }
        ),
        "XGBoost": (
            XGBClassifier(eval_metric='logloss', random_state=TUNING_SEED),
            {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 1.0],
                'colsample_bytree': [0.8, 1.0]
            }
        ),
    }
    display_model_params_table(model_params)
else:
    print_header('Hyperparameters Tuning Off', color = 'bright_red')
    model_params = None # Default Settings

[94m╔═══════════════════════════════════════╗
║  Hyperparameters Tuning is Activated  ║
╚═══════════════════════════════════════╝[0m


Unnamed: 0,Model,Hyperparameters
0,Support Vector Machine,"C: [1, 5]; kernel: ['linear', 'rbf', 'poly']; gamma: ['auto']"
1,Random Forest,"n_estimators: [10, 50, 100, 1000]; max_depth: [3, 5, 10, 100]"
2,Logistic Regression,"C: [1, 5, 10]; solver: ['liblinear', 'lbfgs']"
3,K-Nearest Neighbors,"n_neighbors: [3, 6, 9, 12, 15]; weights: ['uniform', 'distance']; metric: ['euclidean', 'manhattan']"
4,AdaBoost,"n_estimators: [10, 50, 100]; learning_rate: [0.01, 0.1, 1.0]"
5,Naive Bayes,"var_smoothing: [1e-08, 1e-07, 1e-06, 1e-05]"
6,Gradient Boosting,"n_estimators: [10, 50, 100]; learning_rate: [0.01, 0.1, 1.0]; max_depth: [3, 5, 7]"
7,Extra Trees,"n_estimators: [50, 100]; max_depth: [None, 10, 20]"
8,Quadratic Discriminant Analysis,"reg_param: [0.1, 1]"
9,Neural Network,"hidden_layer_sizes: [(200,), (50, 50)]; activation: ['relu', 'logistic']; solver: ['adam', 'sgd']; alpha: [0.001]; learning_rate: ['adaptive']; learning_rate_init: [0.01]; early_stopping: [True]; ..."


<div style="text-align: center;">
  🔹 <img src="../img/ufc_logo.png" width="50" /> 🔹
</div>

## 4. 🔹 UFC Machine Learning Training

### 4.1 K-Nearest Neighbors Model

#### 🚀 KNN Training 

In [5]:
# Build/fit the K-Nearest Neighbors model through model_factory and time the
# call; `duration` is logged together with the results further down.
model_name = 'K-Nearest Neighbors'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

[95m╔═══════════════════════════════════════════════════╗
║  [K-Nearest Neighbors] UFC GridSearchCV Training  ║
╚═══════════════════════════════════════════════════╝[0m


INFO:root:[K-Nearest Neighbors] 🤖 Training...


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.588 total time=   0.1s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.575 total time=   0.1s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.619 total time=   0.1s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.576 total time=   0.1s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.595 total time=   0.1s
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.588 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.575 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.618 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.577 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.595 to

INFO:root:[K-Nearest Neighbors] 🔍 Best Score: 0.6355
[K-Nearest Neighbors] 🔍 Best Params: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}


#### 🔍 KNN Metrics

In [6]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [7]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

✅ Training logged to ../data/results/training_log.csv


### 💾 Save Model 

In [8]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

✅ Model K-Nearest Neighbors saved to: /home/mlioi/ufc-predictor/models/knn_best.pkl


### 4.2 Support Vector Machine Model 

#### 🚀 Support Vector Machine Training 

In [9]:
# Build/fit the Support Vector Machine model through model_factory and time
# the call; `duration` is logged together with the results further down.
model_name = 'Support Vector Machine'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

[95m╔══════════════════════════════════════════════════════╗
║  [Support Vector Machine] UFC GridSearchCV Training  ║
╚══════════════════════════════════════════════════════╝[0m


INFO:root:[Support Vector Machine] 🤖 Training...


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ....C=1, gamma=auto, kernel=linear;, score=0.660 total time=  10.0s
[CV 2/5] END ....C=1, gamma=auto, kernel=linear;, score=0.650 total time=  10.1s
[CV 3/5] END ....C=1, gamma=auto, kernel=linear;, score=0.651 total time=  10.3s
[CV 4/5] END ....C=1, gamma=auto, kernel=linear;, score=0.667 total time=  10.0s
[CV 5/5] END ....C=1, gamma=auto, kernel=linear;, score=0.648 total time=  10.1s
[CV 1/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.665 total time=   5.4s
[CV 2/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.642 total time=   5.4s
[CV 3/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.658 total time=   5.5s
[CV 4/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.662 total time=   5.4s
[CV 5/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.662 total time=   5.4s
[CV 1/5] END ......C=1, gamma=auto, kernel=poly;, score=0.628 total time=   4.9s
[CV 2/5] END ......C=1, gamma=auto, kernel=poly;,


KeyboardInterrupt



#### 🔍 Support Vector Machine Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.3 Logistic Regression Model

#### 🚀 Logistic Regression Training

In [None]:
# Build/fit the Logistic Regression model through model_factory and time the
# call; `duration` is logged together with the results further down.
model_name = 'Logistic Regression'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Logistic Regression Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.4 Random Forest Model

#### 🚀 Random Forest Training

In [None]:
# Build/fit the Random Forest model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = 'Random Forest'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Random Forest Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.5 AdaBoost Model

#### 🚀 AdaBoost Training

In [None]:
# Build/fit the AdaBoost model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = 'AdaBoost'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 AdaBoost Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.6 Naive Bayes Model

#### 🚀 Naive Bayes Training

In [None]:
# Build/fit the Naive Bayes model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = 'Naive Bayes'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Naive Bayes Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.7 Gradient Boosting Model

#### 🚀 Gradient Boosting Training

In [None]:
# Build/fit the Gradient Boosting model through model_factory and time the
# call; `duration` is logged together with the results further down.
model_name = "Gradient Boosting"
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Gradient Boosting Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.8 Extra Trees Model

#### 🚀 Extra Trees Training

In [None]:
# Build/fit the Extra Trees model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = "Extra Trees"
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Extra Trees Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.9 Quadratic Discriminant Analysis Model

#### 🚀 Quadratic Discriminant Analysis Training

In [None]:
# Build/fit the Quadratic Discriminant Analysis model through model_factory
# and time the call; `duration` is logged together with the results further down.
model_name = "Quadratic Discriminant Analysis"
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Quadratic Discriminant Analysis Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.10 Neural Network Model

#### 🚀 Neural Network Training 

In [None]:
# Build/fit the Neural Network model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = 'Neural Network'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

#### 🔍 Neural Network Metrics

In [None]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [None]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

### 💾 Save Model 

In [None]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

### 4.11 XGBoost Model

#### 🚀 XGBoost Training 

In [5]:
# Build/fit the XGBoost model through model_factory and time the call;
# `duration` is logged together with the results further down.
model_name = 'XGBoost'
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = model_factory(model_name, UFCData, model_params)
end = time.perf_counter()
duration = end - start

[95m╔═══════════════════════════════════════╗
║  [XGBoost] UFC GridSearchCV Training  ║
╚═══════════════════════════════════════╝[0m


INFO:root:[XGBoost] 🤖 Training...


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.580 total time=   0.5s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.580 total time=   0.1s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.583 total time=   0.1s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.580 total time=   0.1s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.580 total time=   0.1s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0;, score=0.580 total time=   0.1s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0;, score=0.580 total time=   0.1s
[CV 3/5] END c

INFO:root:[XGBoost] 🔍 Best Score: 0.6074
[XGBoost] 🔍 Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}


#### 🔍 XGBoost Metrics

In [9]:
# Compute evaluation metrics for the model fitted above; the result is handed
# to log_training_result in the logging step.
metrics = evaluate_metrics(model, UFCData)

### 📝 Log Training Results

In [10]:
# Record this run: model name, the fitted model's best_params_, the metrics
# and the measured training duration.
# NOTE(review): model.best_params_ presumes model_factory returned a fitted
# search object (e.g. GridSearchCV) — confirm this holds when
# hyperparameter_tuning is False.
log_training_result(model_name, model.best_params_, metrics, duration)

✅ Training logged to ../data/results/training_log.csv


### 💾 Save Model 

In [11]:
# Persist the model under the name that file_model_name maps this model_name to.
# NOTE(review): file_model_name is not defined in this notebook; presumably it
# comes from one of the wildcard imports — confirm.
save_model(model, name=file_model_name[model_name])

✅ Model XGBoost saved to: /home/mlioi/ufc-predictor/models/xgb_best.pkl


<div style="text-align: center;">
     <img src="../img/ufc_logo.png" width="800" /> 
</div>