In [6]:
import pandas as pd
import numpy as np

# Import the first CSV file
feature_df = pd.read_csv('ufc_features.csv')

#prepare data for training

# Drop the columns
columns_to_drop = ['p1_fighter', 'p2_fighter'] #method
feature_df = feature_df.drop(columns=columns_to_drop)
cols_to_drop = [col for col in feature_df.columns if col.startswith('method_')]
feature_df.drop(columns=cols_to_drop, inplace=True)


# Clean all column names
def clean_column_name(col):
    return col.lower().replace(' ', '_').replace('.', '').replace('-', '_')

# Apply to all columns
feature_df.columns = [clean_column_name(col) for col in feature_df.columns]

# Identify all categorical columns
categorical_cols = ['p1_stance', 'p2_stance']

# One-hot encode all categorical variables
feature_df = pd.get_dummies(feature_df, columns=categorical_cols)

feature_df = feature_df.sample(frac=1, random_state=42).reset_index(drop=True)

# encode the referee using frequency
ref_counts = feature_df['referee'].value_counts()
feature_df['referee_freq'] = feature_df['referee'].map(ref_counts)
feature_df.drop(columns=['referee'], inplace=True)


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import xgboost as xgb
import pandas as pd

# Target setup
X = feature_df.drop(columns=['winner'])
y = feature_df['winner']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Identify numeric columns for imputation
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Impute missing values with median
num_imputer = SimpleImputer(strategy='median')
X_train[numeric_cols] = num_imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = num_imputer.transform(X_test[numeric_cols])

# XGBoost model training
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
xgb_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, xgb_pred)
report = classification_report(y_test, xgb_pred)

print("XGBoost Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

# Feature importance analysis
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance.to_string())

xgb_model.save_model('xgb_model_good.json') 


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Model Performance:
Accuracy: 0.7286
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.54      0.58       568
           1       0.77      0.83      0.80      1068

    accuracy                           0.73      1636
   macro avg       0.70      0.68      0.69      1636
weighted avg       0.72      0.73      0.72      1636


Feature Importances:
                            Feature  Importance
14                          p2_slpm    0.049571
29                      stracc_diff    0.014472
27                         age_diff    0.014201
19                        p2_td_acc    0.013766
30                        sapm_diff    0.011534
56          p1_age_adjusted_str_def    0.011301
46                        loss_diff    0.011299
63           p2_age_adjusted_td_def    0.010633
28                        slpm_diff    0.010259
40                        p1_losses    0.009995
64          p1_age_adjusted_sub_avg    0.009335
34      