# UFC Fights:
## Islam Makhachev vs. Della Maddalena | Shevchenko vs. Zhang

### Authors: Ali & Abdirahman

> Add blockquote



In [29]:
# Preview the first five rows of the engineered fight table.
# NOTE(review): no earlier cell visible here defines `events_df`, and this cell
# carries execution count 29 while the cells below are unexecuted -- confirm it
# still works after Restart Kernel -> Run All.
events_df.head()

Unnamed: 0,f1_id,f2_id,f1_name,f2_name,weight_class,f1_age_during,f2_age_during,f1_height_cm,f2_height_cm,f1_knockdowns,...,str_def_diff,td_avg_diff,td_acc_diff,td_def_diff,sub_avg_diff,stance_f1,stance_f2,stance_matchup,y_winner,reach_advantage
0,3363,2768,cory sandhagen,umar nurmagomedov,Bantamweight Bout,32.0,28.0,180.34,172.72,0,...,-0.07,-2.79,-0.18,-0.37,-0.1,switch,orthodox,switch_vs_orthodox,0,Small Advantage
1,2275,2800,shara magomedov,michal oleksiejczuk,Middleweight Bout,30.0,29.0,187.96,182.88,0,...,-0.16,-1.08,-0.43,0.23,0.0,orthodox,southpaw,orthodox_vs_southpaw,1,Small Disadvantage
2,1576,299,jai herbert,rolando bedoya,Lightweight Bout,36.0,27.0,185.42,180.34,1,...,0.0,0.5,0.23,0.26,0.0,orthodox,orthodox,orthodox_vs_orthodox,1,Large Advantage
3,2648,2471,azamat murzakanov,alonzo menifield,Light Heavyweight Bout,35.0,36.0,177.8,182.88,1,...,0.12,-0.01,-0.18,0.07,-0.2,southpaw,orthodox,southpaw_vs_orthodox,1,Large Disadvantage
4,997,3856,sedriques dumas,denis tiuliulin,Middleweight Bout,28.0,36.0,187.96,185.42,0,...,0.2,0.35,-0.07,-0.32,0.3,orthodox,orthodox,orthodox_vs_orthodox,1,Large Advantage


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

try:
    import optuna
except ImportError:
    !pip install optuna
    import optuna

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
def load_data(fighters_path='ufc_fighters_avg.csv', events_path='ufc_fight_train - ufc_event_fight_stats.csv'):
    """
    Read the fighter-averages CSV and the fight-events CSV.

    Parameters
    ----------
    fighters_path : str
        Path to the per-fighter averages CSV.
    events_path : str
        Path to the per-fight events CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(fighters_df, events_df)`` in that order.

    Raises
    ------
    FileNotFoundError
        If either file is missing; the message names the missing path.
    """
    print("Loading data...")

    loaded = []
    # Fighters first, then events, so the log order and the first failure are unchanged.
    for label, csv_path in (("fighters_df", fighters_path), ("events_df", events_path)):
        try:
            frame = pd.read_csv(csv_path)
        except FileNotFoundError:
            raise FileNotFoundError(f"Required file {csv_path} not found. Please ensure the file is in the project directory.")
        print(f"Loaded {label}: {frame.shape}")
        loaded.append(frame)

    fighters_df, events_df = loaded
    return fighters_df, events_df

# Step 2 - Data Cleaning

In [None]:
def clean_fighters_df(fighters_df):
    """
    Normalise column names and impute missing values in the fighters table.

    Steps: print a short report (shape, dtypes, missing counts); strip and
    lowercase the column names; fill numeric NaNs with the column median; fill
    object-column NaNs with the most frequent value ("Unknown" when the column
    has no mode); lowercase/strip the ``name`` and ``stance`` columns and map
    an empty stance to ``'unknown'``.

    The frame passed in is modified in place (columns renamed and reassigned)
    and also returned.

    Parameters
    ----------
    fighters_df : pandas.DataFrame
        Raw fighter table.

    Returns
    -------
    pandas.DataFrame
        The same, cleaned, frame.
    """
    print("\n" + "="*60)
    print("CLEANING FIGHTERS_DF")
    print("="*60)

    print(f"\nShape: {fighters_df.shape}")
    print(f"\nDtypes:\n{fighters_df.dtypes}")
    print(f"\nMissing values:\n{fighters_df.isna().sum()}")

    fighters_df.columns = fighters_df.columns.str.strip().str.lower()

    numeric_cols = fighters_df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if fighters_df[col].isna().any():
            median_val = fighters_df[col].median()
            # Assign the filled column back: `df[col].fillna(..., inplace=True)`
            # acts on a temporary selection and does not update the frame under
            # pandas Copy-on-Write.
            fighters_df[col] = fighters_df[col].fillna(median_val)
            print(f"Filled {col} with median: {median_val:.2f}")

    categorical_cols = fighters_df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        if fighters_df[col].isna().any():
            most_freq = fighters_df[col].mode()
            fill_val = most_freq[0] if len(most_freq) > 0 else "Unknown"
            fighters_df[col] = fighters_df[col].fillna(fill_val)
            print(f"Filled {col} with: {fill_val}")

    if 'name' in fighters_df.columns:
        fighters_df['name'] = fighters_df['name'].str.lower().str.strip()
    if 'stance' in fighters_df.columns:
        fighters_df['stance'] = fighters_df['stance'].str.lower().str.strip()
        fighters_df['stance'] = fighters_df['stance'].replace('', 'unknown')

    return fighters_df

In [None]:
def clean_events_df(events_df):
    """
    Normalise column names and impute missing values in the fight-events table.

    Column names are stripped and lowercased. Numeric NaNs are filled with the
    column median, object-column NaNs with the most frequent value ("Unknown"
    when there is no mode). Every column whose name contains one of
    'fighter', 'f1', 'f2', 'red', 'blue' or 'name' is then cast to a
    lowercase, stripped string.

    The frame passed in is modified in place and also returned.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Raw fight-level table.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The cleaned frame and the list of fighter-related column names.
    """
    events_df.columns = events_df.columns.str.strip().str.lower()

    fighter_cols = [col for col in events_df.columns
                   if any(term in col for term in ['fighter', 'f1', 'f2', 'red', 'blue', 'name'])]
    print(f"\nFighter columns identified: {fighter_cols}")

    numeric_cols = events_df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if events_df[col].isna().any():
            median_val = events_df[col].median()
            # Assign back instead of `df[col].fillna(..., inplace=True)`, which
            # acts on a temporary selection and is a no-op under pandas
            # Copy-on-Write.
            events_df[col] = events_df[col].fillna(median_val)

    categorical_cols = events_df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        if events_df[col].isna().any():
            most_freq = events_df[col].mode()
            fill_val = most_freq[0] if len(most_freq) > 0 else "Unknown"
            events_df[col] = events_df[col].fillna(fill_val)

    # NOTE(review): this also stringifies numeric per-fighter columns (any name
    # containing 'f1'/'f2', e.g. ids, ages, heights). Kept as-is because later
    # code may rely on it -- confirm whether only name columns were intended.
    for col in fighter_cols:
        events_df[col] = events_df[col].astype(str).str.lower().str.strip()

    return events_df, fighter_cols

In [None]:
def merge_fighter_averages(events_df, fighters_df):
    """
    Attach each fighter's row from ``fighters_df`` to every fight, twice:
    once prefixed ``f1_`` (joined on fighter 1's name) and once prefixed
    ``f2_`` (joined on fighter 2's name).

    Name columns are located heuristically (see inline comments), normalised
    to lowercase/stripped strings in ``<col>_norm`` helper columns, and used
    as left-join keys. Fighters whose normalised name occurs more than once in
    ``fighters_df`` are reduced to their first occurrence before joining, so
    the result has exactly one row per input fight.

    Side effects: a ``<col>_norm`` column is added in place to ``fighters_df``
    and two to the ``events_df`` object that was passed in.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Fight-level table containing two fighter-name columns.
    fighters_df : pandas.DataFrame
        One row per fighter, with a name column.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``events_df`` plus the ``f1_*`` / ``f2_*`` fighter columns.

    Raises
    ------
    ValueError
        If two fighter-name columns cannot be determined.
    """
    f1_name_col = None
    f2_name_col = None

    # No `break`: when several columns qualify, the last one wins.
    for col in events_df.columns:
        if 'f1' in col and 'name' in col:
            f1_name_col = col
        elif 'f2' in col and 'name' in col:
            f2_name_col = col
        elif col == 'fighter1' or col == 'fighter1_name':
            f1_name_col = col
        elif col == 'fighter2' or col == 'fighter2_name':
            f2_name_col = col

    # Fallback 1: first two columns mentioning "fighter".
    if f1_name_col is None:
        fighter_cols = [col for col in events_df.columns if 'fighter' in col.lower()]
        if len(fighter_cols) >= 1:
            f1_name_col = fighter_cols[0]
        if len(fighter_cols) >= 2:
            f2_name_col = fighter_cols[1]

    # Fallback 2 (last resort): positional columns 0 and 1.
    if f1_name_col is None or f2_name_col is None:
        print(f"Warning: Could not identify fighter name columns. Found: {f1_name_col}, {f2_name_col}")
        print(f"Available columns: {list(events_df.columns)}")
        if f1_name_col is None:
            f1_name_col = events_df.columns[0] if len(events_df.columns) > 0 else None
        if f2_name_col is None:
            f2_name_col = events_df.columns[1] if len(events_df.columns) > 1 else None

    print(f"Using f1_name_col: {f1_name_col}, f2_name_col: {f2_name_col}")

    # First fighters column containing "name", else the first column.
    fighters_name_col = None
    for col in fighters_df.columns:
        if 'name' in col.lower():
            fighters_name_col = col
            break

    if fighters_name_col is None:
        fighters_name_col = fighters_df.columns[0]

    print(f"Using fighters name column: {fighters_name_col}")

    if f1_name_col is None or f2_name_col is None:
        raise ValueError("Could not identify fighter name columns for merging")

    f1_norm_col = f1_name_col + '_norm'
    f2_norm_col = f2_name_col + '_norm'
    fighters_norm_col = fighters_name_col + '_norm'

    events_df[f1_norm_col] = events_df[f1_name_col].astype(str).str.lower().str.strip()
    events_df[f2_norm_col] = events_df[f2_name_col].astype(str).str.lower().str.strip()
    fighters_df[fighters_norm_col] = fighters_df[fighters_name_col].astype(str).str.lower().str.strip()

    # A left join against a non-unique key multiplies fight rows (one copy per
    # matching fighter record), which would duplicate samples downstream.
    # Keep only the first record per normalised name.
    dup_mask = fighters_df.duplicated(subset=fighters_norm_col, keep='first')
    if dup_mask.any():
        print(f"Warning: {int(dup_mask.sum())} duplicate fighter name(s) in fighters_df; keeping the first record of each")
    fighters_unique = fighters_df.loc[~dup_mask]

    def _prefixed(prefix):
        """Copy of the de-duplicated fighters table with every non-key column prefixed."""
        side = fighters_unique.copy()
        side.columns = [
            prefix + col if col != fighters_name_col and col != fighters_norm_col else col
            for col in side.columns
        ]
        return side

    events_df = events_df.merge(
        _prefixed('f1_'),
        left_on=f1_norm_col,
        right_on=fighters_norm_col,
        how='left',
        suffixes=('', '_f1_merge')
    )

    # The fighter name/norm columns brought in by the first merge collide with
    # the same columns here; the right-hand copies get the '_f2_merge' suffix.
    events_df = events_df.merge(
        _prefixed('f2_'),
        left_on=f2_norm_col,
        right_on=fighters_norm_col,
        how='left',
        suffixes=('', '_f2_merge')
    )

    print(f"After merge, shape: {events_df.shape}")

    # Use the first fighters column that made it into the result under its
    # 'f1_' name as a probe for how many fights found a matching fighter.
    f1_merged_col = None
    f2_merged_col = None
    for col in fighters_df.columns:
        if col != fighters_name_col:
            f1_merged_col = 'f1_' + col
            f2_merged_col = 'f2_' + col
            if f1_merged_col in events_df.columns:
                break

    if f1_merged_col and f1_merged_col in events_df.columns:
        print(f"Merge success rate f1: {events_df[f1_merged_col].notna().sum() / len(events_df):.2%}")
    else:
        print("Merge success rate f1: Could not determine")

    if f2_merged_col and f2_merged_col in events_df.columns:
        print(f"Merge success rate f2: {events_df[f2_merged_col].notna().sum() / len(events_df):.2%}")
    else:
        print("Merge success rate f2: Could not determine")

    return events_df

In [None]:
def build_pre_fight_features(events_df):
    """
    Derive model features and the binary target from the merged fight table.

    Adds, in place, to ``events_df``:

    * ``age_diff``, ``height_diff``, ``reach_diff`` and ``<stat>_diff`` for
      slpm, sapm, str_acc, str_def, td_avg, td_acc, td_def, sub_avg -- each is
      fighter 1 minus fighter 2. Values are coerced to numeric; a difference
      that cannot be computed because either side is missing is set to 0
      (no advantage). A feature whose source columns cannot be found is a
      constant 0.
    * ``weight_class`` (copied from the first column containing "weight", or
      ``'unknown'``) when not already present.
    * ``stance_f1``, ``stance_f2``, ``stance_matchup``.
    * ``y_winner``: 1 when fighter 1 won, else 0. Taken from a winner-name
      column when one exists, otherwise from ``result`` compared with the
      fighter-1 id column.

    Source columns are located by substring match on the column name; when
    several columns match, the last one is used.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Fight table, normally the output of ``merge_fighter_averages``.

    Returns
    -------
    pandas.DataFrame
        The same frame with the feature and target columns added.
    """
    print("\n" + "="*60)
    print("BUILDING PRE-FIGHT FEATURES")
    print("="*60)

    def last_match(side, token):
        """Last column whose lowercased name contains both `side` and `token`, or None."""
        found = None
        for col in events_df.columns:
            lowered = col.lower()
            if side in lowered and token in lowered:
                found = col
        return found

    def numeric_diff(f1_col, f2_col):
        """f1 - f2 on numeric-coerced values; an undefined difference becomes 0."""
        f1_vals = pd.to_numeric(events_df[f1_col], errors='coerce')
        f2_vals = pd.to_numeric(events_df[f2_col], errors='coerce')
        # Fill AFTER subtracting: zero-filling each side first would turn a
        # missing reach/height into a bogus +/-180 cm "advantage".
        return (f1_vals - f2_vals).fillna(0)

    # (output feature prefix, substring that identifies the source columns)
    diff_specs = [
        ('age', 'age'),
        ('height', 'height'),
        ('reach', 'reach'),
        ('slpm', 'slpm'),
        ('sapm', 'sapm'),
        ('str_acc', 'str_acc'),
        ('str_def', 'str_def'),
        ('td_avg', 'td_avg'),
        ('td_acc', 'td_acc'),
        ('td_def', 'td_def'),
        ('sub_avg', 'sub_avg'),
    ]
    for feature_name, token in diff_specs:
        f1_col = last_match('f1', token)
        f2_col = last_match('f2', token)
        if f1_col and f2_col:
            events_df[f'{feature_name}_diff'] = numeric_diff(f1_col, f2_col)
        else:
            events_df[f'{feature_name}_diff'] = 0
            print(f"Could not create {feature_name}_diff")

    # Prefer an exactly-named submission column pair when one exists; it
    # overrides the substring-matched sub_avg_diff computed above.
    for pattern in ['sub_avg', 'sub_attempts', 'submission_avg']:
        f1_col = f'f1_{pattern}'
        f2_col = f'f2_{pattern}'
        if f1_col in events_df.columns and f2_col in events_df.columns:
            events_df['sub_avg_diff'] = numeric_diff(f1_col, f2_col)
            break

    if 'weight_class' not in events_df.columns:
        for col in events_df.columns:
            if 'weight' in col.lower():
                events_df['weight_class'] = events_df[col]
                break
        if 'weight_class' not in events_df.columns:
            events_df['weight_class'] = 'unknown'

    f1_stance_col = None
    f2_stance_col = None

    for col in events_df.columns:
        if 'f1' in col and 'stance' in col:
            f1_stance_col = col
        elif 'f2' in col and 'stance' in col:
            f2_stance_col = col

    if f1_stance_col and f2_stance_col:
        events_df['stance_f1'] = events_df[f1_stance_col].astype(str).str.lower().str.strip()
        events_df['stance_f2'] = events_df[f2_stance_col].astype(str).str.lower().str.strip()
        events_df['stance_matchup'] = events_df['stance_f1'] + '_vs_' + events_df['stance_f2']
    else:
        events_df['stance_f1'] = 'unknown'
        events_df['stance_f2'] = 'unknown'
        events_df['stance_matchup'] = 'unknown_vs_unknown'

    # First winner-like column. 'y_winner' is this function's own output: if it
    # is already present (re-run, or a frame reloaded from a saved CSV) it must
    # not be mistaken for a winner-name column, which would zero every label.
    winner_col = None
    for col in events_df.columns:
        lowered = col.lower()
        if 'winner' in lowered and 'odds' not in lowered and lowered != 'y_winner':
            winner_col = col
            break

    f1_name_col = None
    for col in events_df.columns:
        if ('f1' in col.lower() and 'name' in col.lower()) or col == 'fighter1' or col == 'fighter1_name':
            f1_name_col = col

    if winner_col and f1_name_col:
        events_df['y_winner'] = (
            events_df[winner_col].astype(str).str.lower().str.strip() ==
            events_df[f1_name_col].astype(str).str.lower().str.strip()
        ).astype(int)

    elif 'result' in events_df.columns:
        f1_id_col = last_match('f1', 'id')
        f2_id_col = last_match('f2', 'id')

        if f1_id_col and f2_id_col:
            result_vals = pd.to_numeric(events_df['result'], errors='coerce')
            f1_ids = pd.to_numeric(events_df[f1_id_col], errors='coerce')
            f2_ids = pd.to_numeric(events_df[f2_id_col], errors='coerce')

            # Any result that is not fighter 1's id (fighter 2's id, a
            # non-numeric value, NaN) is labelled 0.
            events_df['y_winner'] = (result_vals == f1_ids).astype(int)

            if events_df['y_winner'].sum() == 0:
                # Diagnostics only: the labels stay all-zero either way.
                print("Warning: No matches found between result and f1_id")
                print("Trying alternative: checking if result matches f2_id...")
                if (result_vals == f2_ids).all():
                    print("All results match f2_id - setting y_winner to 0 for all")
        else:
            events_df['y_winner'] = 0
            print("Warning: Could not determine y_winner - missing ID columns")
    else:
        events_df['y_winner'] = 0
        print("Warning: No winner/result column found, y_winner set to default")

    print(f"\nTarget distribution:\n{events_df['y_winner'].value_counts()}")

    return events_df

In [None]:
def clean_and_merge_data(fighters_df, events_df, api_client=None):
    """
    Run the full preparation chain: clean both tables, optionally enrich the
    fights through ``api_client``, join fighter averages onto the fights and
    build the pre-fight features.

    Parameters
    ----------
    fighters_df : pandas.DataFrame
        Raw fighters table.
    events_df : pandas.DataFrame
        Raw fights table.
    api_client : object, optional
        When truthy, its ``enrich_fights_with_api_data(events_df)`` method is
        called on the cleaned fights before merging.

    Returns
    -------
    tuple
        ``(events_df, fighters_df)`` -- the feature-ready fights and the
        cleaned fighters table.
    """
    cleaned_fighters = clean_fighters_df(fighters_df)

    # The list of fighter-related columns returned alongside is not needed here.
    fights, _ = clean_events_df(events_df)

    if api_client:
        fights = api_client.enrich_fights_with_api_data(fights)

    fights = build_pre_fight_features(
        merge_fighter_averages(fights, cleaned_fighters)
    )

    return fights, cleaned_fighters

In [None]:
def perform_eda(events_df):
    """
    Exploratory data analysis and sanity checks.

    For the key difference features present in ``events_df`` this prints
    summary statistics, saves a grid of histograms to
    ``feature_distributions.png`` and a correlation heatmap to
    ``correlation_matrix.png``. It then prints fighter-1 win rates grouped by
    weight class, stance matchup and binned reach difference.

    Side effect: when ``reach_diff`` exists, a categorical ``reach_advantage``
    column is added to ``events_df`` in place.

    NOTE(review): relies on module-level ``plt`` and ``sns`` being in scope
    (presumably matplotlib.pyplot and seaborn) -- confirm they are imported
    before this function is called.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Feature table containing ``y_winner`` and the ``*_diff`` columns.
    """
    print("\n" + "="*60)
    print("EXPLORATORY DATA ANALYSIS")
    print("="*60)

    key_features = [
        'age_diff', 'height_diff', 'reach_diff', 'slpm_diff', 'sapm_diff',
        'str_acc_diff', 'str_def_diff', 'td_avg_diff', 'td_acc_diff', 'td_def_diff'
    ]

    key_features = [f for f in key_features if f in events_df.columns]

    if key_features:
        print("\nSummary Statistics of Key Features:")
        print(events_df[key_features].describe())

        n_features = len(key_features)
        n_cols = 3
        n_rows = (n_features + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array of Axes, so flatten() is
        # valid for any grid shape, including a single row.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, feature in zip(axes, key_features):
            ax.hist(events_df[feature].dropna(), bins=30, edgecolor='black', alpha=0.7)
            ax.set_title(f'Distribution of {feature}')
            ax.set_xlabel(feature)
            ax.set_ylabel('Frequency')

        # Hide the unused cells of the grid.
        for ax in axes[n_features:]:
            ax.axis('off')

        plt.tight_layout()
        plt.savefig('feature_distributions.png', dpi=150, bbox_inches='tight')
        print("\nSaved feature_distributions.png")
        plt.close(fig)

        corr_matrix = events_df[key_features].corr()
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                    square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
        plt.title('Correlation Matrix of Numeric Features')
        plt.tight_layout()
        plt.savefig('correlation_matrix.png', dpi=150, bbox_inches='tight')
        print("Saved correlation_matrix.png")
        plt.close()
    else:
        # describe() and subplots(0, 3) both fail on an empty selection.
        print("\nNo key features found in events_df; skipping summary statistics and plots.")

    if 'weight_class' in events_df.columns:
        print("\nWin Rates by Weight Class:")
        weight_class_win_rates = events_df.groupby('weight_class')['y_winner'].agg(['mean', 'count'])
        weight_class_win_rates.columns = ['win_rate', 'count']
        print(weight_class_win_rates.sort_values('count', ascending=False))

    if 'stance_matchup' in events_df.columns:
        print("\nWin Rates by Stance Matchup:")
        stance_win_rates = events_df.groupby('stance_matchup')['y_winner'].agg(['mean', 'count'])
        stance_win_rates.columns = ['win_rate', 'count']
        print(stance_win_rates.sort_values('count', ascending=False).head(10))

    if 'reach_diff' in events_df.columns:
        # Bins are right-inclusive: a reach_diff of exactly 0 falls into
        # 'Small Disadvantage' and exactly -5 into 'Large Disadvantage'.
        events_df['reach_advantage'] = pd.cut(
            events_df['reach_diff'],
            bins=[-np.inf, -5, 0, 5, np.inf],
            labels=['Large Disadvantage', 'Small Disadvantage', 'Small Advantage', 'Large Advantage']
        )
        print("\nWin Rates by Reach Advantage:")
        reach_win_rates = events_df.groupby('reach_advantage')['y_winner'].agg(['mean', 'count'])
        reach_win_rates.columns = ['win_rate', 'count']
        print(reach_win_rates)

In [None]:
def train_validation_split_data(X, y_winner):
    """
    Split features and target into an 80/20 train/validation pair, stratified
    on the target and seeded with ``random_state=42``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y_winner : pandas.Series
        Binary target (``value_counts`` is called on the resulting splits).

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    split_options = dict(test_size=0.2, stratify=y_winner, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y_winner, **split_options)

    n_train = X_train.shape[0]
    n_val = X_val.shape[0]
    print(f"\nTrain set: {n_train} samples")
    print(f"Validation set: {n_val} samples")
    print(f"Train target distribution:\n{y_train.value_counts()}")
    print(f"Validation target distribution:\n{y_val.value_counts()}")

    return X_train, X_val, y_train, y_val

In [None]:
def build_preprocessing_pipeline(X_train):
    """
    Build an (unfitted) ColumnTransformer for the training features.

    Numeric columns are standardised with ``StandardScaler``; object and
    pandas ``category`` columns are one-hot encoded (unknown categories at
    predict time are ignored). Columns of any other dtype are passed through
    unchanged.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only its dtypes and column names are inspected.

    Returns
    -------
    tuple
        ``(preprocessor, numeric_cols, categorical_cols)``.
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    # Include 'category' as well as object: a pd.cut-style categorical column
    # would otherwise fall through to remainder='passthrough' and reach the
    # classifier as raw labels.
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\nNumeric features ({len(numeric_cols)}): {numeric_cols}")
    print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
        ],
        remainder='passthrough'
    )

    return preprocessor, numeric_cols, categorical_cols

In [None]:
def train_models(X_train, y_train, preprocessor):
    """
    Fit the five candidate classifiers and return them keyed by name.

    Logistic Regression, Decision Tree, Random Forest and Gradient Boosting
    are each wrapped in a Pipeline behind the same ``preprocessor`` object and
    fitted on the training data; the fifth model comes from
    ``tune_xgboost_with_optuna``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target.
    preprocessor : sklearn transformer
        Shared preprocessing step placed first in every pipeline.

    Returns
    -------
    dict
        Keys ``log_reg``, ``decision_tree``, ``random_forest``,
        ``gradient_boosting``, ``xgboost_optuna`` mapped to fitted pipelines.
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)

    # (result key, progress message, estimator) in training order.
    baseline_specs = [
        ('log_reg',
         "\n[1/5] Training Logistic Regression...",
         LogisticRegression(max_iter=1000, random_state=42)),
        ('decision_tree',
         "[2/5] Training Decision Tree...",
         DecisionTreeClassifier(random_state=42, max_depth=10)),
        ('random_forest',
         "[3/5] Training Random Forest...",
         RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)),
        ('gradient_boosting',
         "[4/5] Training Gradient Boosting...",
         GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5)),
    ]

    models = {}
    for key, message, estimator in baseline_specs:
        print(message)
        fitted = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', estimator)
        ])
        fitted.fit(X_train, y_train)
        models[key] = fitted

    print("[5/5] Training XGBoost with Optuna tuning...")
    models['xgboost_optuna'] = tune_xgboost_with_optuna(X_train, y_train, preprocessor)

    return models

In [None]:
def tune_xgboost_with_optuna(X_train, y_train, preprocessor, n_trials=50, random_state=42):
    """
    Tune XGBoost hyperparameters using Optuna.

    The preprocessor is fitted on ``X_train`` once; each trial scores a
    candidate ``XGBClassifier`` by mean F1 over a 5-fold stratified split of
    the transformed training data. The best parameters are refitted on the
    full transformed training set and returned inside a Pipeline together
    with the fitted preprocessor.

    NOTE(review): because the preprocessor is fitted before the folds are cut,
    its fitted statistics see the validation folds too; the CV score is
    therefore slightly optimistic.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series or array-like
        Training target.
    preprocessor : sklearn transformer
        Fitted here (in place) via ``fit_transform``.
    n_trials : int, default 50
        Number of Optuna trials.
    random_state : int, default 42
        Seed for the Optuna sampler, the CV split and XGBoost, so repeated
        runs explore the same trials.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``preprocessor`` followed by the fitted best ``XGBClassifier``.
    """
    X_train_processed = preprocessor.fit_transform(X_train)

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'random_state': random_state,
            'eval_metric': 'logloss'
        }

        model = XGBClassifier(**params)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        f1_scores = []

        for train_idx, val_idx in skf.split(X_train_processed, y_train):
            X_train_fold = X_train_processed[train_idx]
            X_val_fold = X_train_processed[val_idx]
            # Positional indexing: y_train may be a Series with a non-default index.
            y_train_fold = y_train.iloc[train_idx] if hasattr(y_train, 'iloc') else y_train[train_idx]
            y_val_fold = y_train.iloc[val_idx] if hasattr(y_train, 'iloc') else y_train[val_idx]

            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
            f1_scores.append(float(f1_score(y_val_fold, y_pred)))

        return float(np.mean(f1_scores))

    print(f"  Running Optuna optimization ({n_trials} trials)...")
    # Seed the sampler: without it every run proposes different trials and may
    # return a different "best" model.
    study = optuna.create_study(
        direction='maximize',
        study_name='xgboost_optimization',
        sampler=optuna.samplers.TPESampler(seed=random_state)
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"  Best F1 score: {study.best_value:.4f}")
    print(f"  Best parameters: {study.best_params}")

    # best_params holds only the suggested values; re-add the fixed settings.
    best_params = study.best_params.copy()
    best_params['random_state'] = random_state
    best_params['eval_metric'] = 'logloss'

    best_xgb = XGBClassifier(**best_params)
    best_xgb.fit(X_train_processed, y_train)

    # Both steps are already fitted, so the pipeline can predict without a
    # further fit() call.
    xgb_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', best_xgb)
    ])

    return xgb_pipeline

In [None]:
def evaluate_models(models, X_train, X_val, y_train, y_val):
    """
    Score every fitted pipeline on the training and validation sets.

    For each model this prints accuracy, precision, recall and F1 on both
    sets, a simple over/underfitting verdict based on accuracy, and the
    validation confusion matrix. A comparison table and a diagnosis summary
    are printed at the end.

    Parameters
    ----------
    models : dict
        Model name -> fitted pipeline exposing ``predict``.
    X_train, X_val : pandas.DataFrame
        Feature sets.
    y_train, y_val : array-like
        Matching targets.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ``val_f1`` descending.
    """
    rule = "=" * 60

    def score(y_true, y_pred):
        """(accuracy, precision, recall, f1); undefined ratios count as 0."""
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, zero_division=0),
            recall_score(y_true, y_pred, zero_division=0),
            f1_score(y_true, y_pred, zero_division=0),
        )

    print("\n" + rule)
    print("MODEL EVALUATION")
    print(rule)

    rows = []

    for model_name, pipeline in models.items():
        print(f"\n{model_name}:")
        print("-" * 40)

        train_acc, train_prec, train_rec, train_f1 = score(y_train, pipeline.predict(X_train))
        y_val_pred = pipeline.predict(X_val)
        val_acc, val_prec, val_rec, val_f1 = score(y_val, y_val_pred)

        print(f"Train - Accuracy: {train_acc:.4f}, Precision: {train_prec:.4f}, "
              f"Recall: {train_rec:.4f}, F1: {train_f1:.4f}")
        print(f"Val   - Accuracy: {val_acc:.4f}, Precision: {val_prec:.4f}, "
              f"Recall: {val_rec:.4f}, F1: {val_f1:.4f}")

        # Train accuracy more than 0.15 above validation counts as overfitting;
        # both below 0.6 counts as underfitting.
        overfit = train_acc > val_acc + 0.15
        underfit = train_acc < 0.6 and val_acc < 0.6
        if overfit:
            print("  WARNING: High train but low val -> Overfitting detected")
        elif underfit:
            print("  WARNING: Low train and low val -> Underfitting or weak features")
        else:
            print("  OK: Good generalization")

        cm = confusion_matrix(y_val, y_val_pred)
        print(f"\nConfusion Matrix (Validation):\n{cm}")

        rows.append({
            'model_name': model_name,
            'train_accuracy': train_acc,
            'train_f1': train_f1,
            'val_accuracy': val_acc,
            'val_precision': val_prec,
            'val_recall': val_rec,
            'val_f1': val_f1
        })

    results_df = pd.DataFrame(rows).sort_values('val_f1', ascending=False)

    print("\n" + rule)
    print("MODEL COMPARISON SUMMARY")
    print(rule)
    print(results_df.to_string(index=False))

    print("\n" + rule)
    print("MODEL DIAGNOSIS")
    print(rule)
    # Same thresholds as above; models with neither problem are not listed.
    for model_name, train_acc, val_acc in zip(
        results_df['model_name'], results_df['train_accuracy'], results_df['val_accuracy']
    ):
        if train_acc > val_acc + 0.15:
            print(f"  {model_name}: Overfitting (train={train_acc:.3f}, val={val_acc:.3f})")
        elif train_acc < 0.6 and val_acc < 0.6:
            print(f"  {model_name}: Underfitting (train={train_acc:.3f}, val={val_acc:.3f})")

    return results_df

In [None]:
def select_best_model(models, results_df):
    """
    Return the model named in the first row of ``results_df``.

    ``results_df`` is taken as already ordered best-first (this function does
    not sort it); the first row's ``model_name`` is looked up in ``models``
    and its validation F1 and accuracy are printed.

    Parameters
    ----------
    models : dict
        Model name -> fitted model.
    results_df : pandas.DataFrame
        Must contain ``model_name``, ``val_f1`` and ``val_accuracy``.

    Returns
    -------
    object
        ``models[<first row's model_name>]``.
    """
    top_row = results_df.iloc[0]
    chosen_name = top_row['model_name']
    banner = '=' * 60

    print(f"\n{banner}")
    print(f"SELECTED BEST MODEL: {chosen_name}")
    print(f"{banner}")
    print(f"Validation F1 Score: {top_row['val_f1']:.4f}")
    print(f"Validation Accuracy: {top_row['val_accuracy']:.4f}")

    return models[chosen_name]

In [None]:
def evaluate_external_test_set(best_model, fighters_df, test_path='ufc_fight_test - Sheet1.csv',
                               train_path='ufc_fight_train - ufc_event_fight_stats.csv'):
    """
    Evaluate the trained model on a completely separate external test dataset.

    The external CSV is passed through clean_events_df, merge_fighter_averages
    and build_pre_fight_features, a binary target is derived from the 'result'
    column, and the model's predictions are scored against it.

    Args:
        best_model: Fitted estimator exposing `predict` and `predict_proba`.
        fighters_df: Fighter statistics handed to merge_fighter_averages.
        test_path: CSV file with the external fights.
        train_path: Training CSV whose header supplies the column names when
            the external file appears to have no header row.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1' and
        'results_table' (per-fight DataFrame), or None when the evaluation
        cannot be performed (missing file, no 'result' or fighter-ID columns,
        or no usable rows).
    """
    def _pick_column(columns, exact_names, tokens):
        """Prefer an exactly named column; else the first whose name contains every token."""
        for exact in exact_names:
            if exact in columns:
                return exact
        for col in columns:
            lowered = str(col).lower()
            if all(token in lowered for token in tokens):
                return col
        return None

    print("\n" + "="*60)
    print("EXTERNAL TEST SET EVALUATION")
    print("="*60)

    print(f"\nLoading external test dataset: {test_path}")
    try:
        external_test_df = pd.read_csv(test_path)
    except FileNotFoundError:
        print(f"Warning: External test file {test_path} not found. Skipping external evaluation.")
        return None

    # Heuristic header detection: a first "column name" that is a number or
    # lacks any expected keyword means the first data row was read as header.
    first_col_name = str(external_test_df.columns[0])
    is_numeric_col = first_col_name.isdigit()
    has_expected_keywords = any(c in first_col_name.lower() for c in ['id', 'name', 'age', 'height'])
    if is_numeric_col or not has_expected_keywords:
        print("No header row detected, using training CSV column structure...")
        try:
            column_names = pd.read_csv(train_path, nrows=0).columns.tolist()
        except FileNotFoundError:
            # Reported separately so a missing training file is not mistaken
            # for a missing test file.
            print(f"Warning: Training file {train_path} (needed for column names) not found. "
                  "Skipping external evaluation.")
            return None
        external_test_df = pd.read_csv(test_path, header=None, names=column_names)
    print(f"Loaded external test set: {external_test_df.shape}")

    print("\nApplying same preprocessing pipeline...")

    external_test_df, _ = clean_events_df(external_test_df)
    external_test_df = merge_fighter_averages(external_test_df, fighters_df)
    external_test_df = build_pre_fight_features(external_test_df)

    print(f"After preprocessing, shape: {external_test_df.shape}")

    print("Creating y_winner target from result column...")
    columns = list(external_test_df.columns)
    print(f"Available columns after cleaning: {[c for c in columns if 'result' in str(c).lower() or 'winner' in str(c).lower()]}")

    result_col = next((col for col in columns if str(col).lower() == 'result'), None)
    if result_col is None:
        print("Warning: No 'result' column found. Cannot create y_winner.")
        return None

    f1_id_col = _pick_column(columns, ['f1_id'], ['f1', 'id'])
    f2_id_col = _pick_column(columns, ['f2_id'], ['f2', 'id'])
    if f1_id_col is None or f2_id_col is None:
        # Scoring against a made-up target would report meaningless metrics.
        print("Warning: Could not find ID columns to create y_winner")
        return None

    result_vals = pd.to_numeric(external_test_df[result_col], errors='coerce')
    f1_ids = pd.to_numeric(external_test_df[f1_id_col], errors='coerce')
    f2_ids = pd.to_numeric(external_test_df[f2_id_col], errors='coerce')

    f1_won = (result_vals == f1_ids).to_numpy()
    f2_won = (result_vals == f2_ids).to_numpy()

    # 1 = fighter 1 won, 0 = fighter 2 won, NaN = result matches neither
    # fighter (draw, no contest, unparseable) and must not count as a loss.
    external_test_df['y_winner'] = np.where(f1_won, 1.0, np.where(f2_won, 0.0, np.nan))

    n_unresolved = int((~(f1_won | f2_won)).sum())
    if n_unresolved > 0:
        print(f"Warning: Dropping {n_unresolved} fights whose result matches neither fighter ID")

    external_test_df = external_test_df.dropna(subset=['y_winner']).copy()
    external_test_df['y_winner'] = external_test_df['y_winner'].astype(int)

    pre_fight_features = [
        'age_diff', 'height_diff', 'reach_diff', 'slpm_diff', 'sapm_diff',
        'str_acc_diff', 'str_def_diff', 'td_avg_diff', 'td_acc_diff', 'td_def_diff',
        'sub_avg_diff', 'weight_class', 'stance_f1', 'stance_f2', 'stance_matchup'
    ]

    # De-duplicate while keeping a stable order (a set would reorder the
    # columns from run to run).
    candidate_features = [f for f in pre_fight_features if f in external_test_df.columns]
    candidate_features += [col for col in external_test_df.columns if '_diff' in str(col)]
    available_features = list(dict.fromkeys(candidate_features))

    # When the model recorded its training columns, feed exactly those.
    trained_features = getattr(best_model, 'feature_names_in_', None)
    if trained_features is not None:
        trained_features = list(trained_features)
        missing_features = [f for f in trained_features if f not in external_test_df.columns]
        if missing_features:
            print(f"Warning: External data lacks training features: {missing_features}")
        else:
            available_features = trained_features

    X_ext = external_test_df[available_features].copy()
    y_ext = external_test_df['y_winner'].copy()

    print(f"\nExternal test set ready:")
    print(f"  Features: {X_ext.shape[1]}")
    print(f"  Samples: {X_ext.shape[0]}")
    print(f"  Target distribution: {y_ext.value_counts().to_dict()}")

    if len(X_ext) == 0:
        print("Error: No samples in external test set after preprocessing")
        return None

    print("\n" + "="*60)
    print("MAKING PREDICTIONS ON EXTERNAL TEST SET")
    print("="*60)

    print(f"\nMaking predictions on {len(X_ext)} samples...")
    print(f"Feature columns in X_ext: {list(X_ext.columns)[:10]}...")

    y_pred_ext = best_model.predict(X_ext)
    y_proba_ext = best_model.predict_proba(X_ext)

    print(f"\nPrediction distribution: {pd.Series(y_pred_ext).value_counts().to_dict()}")
    print(f"Actual distribution: {y_ext.value_counts().to_dict()}")

    print("\n" + "="*60)
    print("EXTERNAL TEST SET METRICS")
    print("="*60)

    ext_acc = accuracy_score(y_ext, y_pred_ext)
    ext_prec = precision_score(y_ext, y_pred_ext, zero_division=0)
    ext_rec = recall_score(y_ext, y_pred_ext, zero_division=0)
    ext_f1 = f1_score(y_ext, y_pred_ext, zero_division=0)

    print(f"\nExternal Test Accuracy:  {ext_acc:.4f} ({ext_acc*100:.2f}%)")
    print(f"External Test Precision: {ext_prec:.4f} ({ext_prec*100:.2f}%)")
    print(f"External Test Recall:    {ext_rec:.4f} ({ext_rec*100:.2f}%)")
    print(f"External Test F1 Score:  {ext_f1:.4f} ({ext_f1*100:.2f}%)")

    cm_ext = confusion_matrix(y_ext, y_pred_ext)
    print(f"\nConfusion Matrix (External Test):")
    print(cm_ext)

    print(f"\nClassification Report (External Test):")
    print(classification_report(y_ext, y_pred_ext, target_names=['Fighter2 Wins', 'Fighter1 Wins']))

    print("\n" + "="*60)
    print("PREDICTION VS ACTUAL RESULTS TABLE")
    print("="*60)

    columns = list(external_test_df.columns)
    f1_name_col = _pick_column(columns, ['f1_name', 'fighter1_name', 'fighter1'], ['f1', 'name'])
    f2_name_col = _pick_column(columns, ['f2_name', 'fighter2_name', 'fighter2'], ['f2', 'name'])

    if f1_name_col is None or f2_name_col is None:
        print("Warning: Could not find fighter name columns for results table")
        # Fall back to the first two columns so every fight is still labelled.
        if f1_name_col is None:
            f1_name_col = columns[0]
        if f2_name_col is None:
            f2_name_col = columns[1]

    f1_names = [str(value) for value in external_test_df[f1_name_col].tolist()]
    f2_names = [str(value) for value in external_test_df[f2_name_col].tolist()]

    results_table = []
    for f1_name, f2_name, actual, predicted, proba in zip(
            f1_names, f2_names, y_ext.tolist(), list(y_pred_ext), y_proba_ext):
        # Column 1 is read as P(fighter 1 wins); a single-column output
        # (model fitted on one class) is used as-is.
        p_f1_win = proba[1] if len(proba) > 1 else proba[0]
        p_f2_win = proba[0] if len(proba) > 1 else 1 - p_f1_win

        results_table.append({
            'fighter1': f1_name,
            'fighter2': f2_name,
            'actual_winner': f1_name if actual == 1 else f2_name,
            'predicted_winner': f1_name if predicted == 1 else f2_name,
            'predicted_prob_f1_win': p_f1_win,
            'predicted_prob_f2_win': p_f2_win,
            # Compare labels, not names, so identically named fighters
            # cannot produce a false "correct".
            'correct': int(actual == predicted)
        })

    results_table_df = pd.DataFrame(results_table)

    print(f"\nTotal fights: {len(results_table_df)}")
    print(f"Correct predictions: {results_table_df['correct'].sum()}")
    print(f"Accuracy: {results_table_df['correct'].mean():.2%}")

    print(f"\nFull Results Table:")
    print(results_table_df.to_string(index=False))

    print("\n" + "="*60)
    print("PERFORMANCE COMPARISON SUMMARY")
    print("="*60)

    print(f"\nExternal Test F1 Score: {ext_f1:.4f} ({ext_f1*100:.2f}%)")
    print("\nNote: Compare this with:")
    print("  - Training F1 Score (from model training output)")
    print("  - Validation F1 Score (from model evaluation output)")

    if ext_f1 >= 0.70:
        print("OK: Model generalized well to external test set")
        print("  The model maintains strong performance on unseen data.")
    elif ext_f1 >= 0.60:
        print("WARNING: Model performance dropped moderately on external test set")
        print("  Some performance degradation, but still reasonable.")
    else:
        print("WARNING: Model performance dropped significantly on external test set")
        print("  May indicate overfitting or distribution shift between train/test data.")

    return {
        'accuracy': ext_acc,
        'precision': ext_prec,
        'recall': ext_rec,
        'f1': ext_f1,
        'results_table': results_table_df
    }

In [None]:
def build_matchup_row(fighter1_name, fighter2_name, weight_class, fighters_df):
    """
    Build a pre-fight feature row for a specific matchup.

    Fighters are looked up by name in `fighters_df`, from most to least
    specific match: exact full name, then full-name containment, then the
    last two words, then the last word. Each numeric feature is
    "fighter 1 minus fighter 2"; when the statistic is unavailable for either
    fighter the difference is set to 0.0 (no advantage) and a warning is
    printed.

    Args:
        fighter1_name: Name of first fighter
        fighter2_name: Name of second fighter
        weight_class: Weight class of the fight
        fighters_df: DataFrame with fighter averages

    Returns:
        DataFrame with single row of pre-fight features

    Raises:
        ValueError: If either fighter cannot be found in `fighters_df`.
    """
    fighter1_name = str(fighter1_name).lower().strip()
    fighter2_name = str(fighter2_name).lower().strip()

    # First column whose name mentions 'name', else the very first column.
    fighters_name_col = None
    for col in fighters_df.columns:
        if 'name' in str(col).lower():
            fighters_name_col = col
            break
    if fighters_name_col is None:
        fighters_name_col = fighters_df.columns[0]

    # Missing names become '' (and are skipped below) rather than the string
    # 'nan', which would otherwise match queries such as 'fernando'.
    known_names = [
        '' if pd.isna(raw) else str(raw).lower().strip()
        for raw in fighters_df[fighters_name_col].tolist()
    ]

    def _find_fighter(query):
        """Return the best-matching fighter row for `query`, or None."""
        if not query:
            return None
        parts = query.split()
        partial_terms = [' '.join(parts[-2:]), parts[-1]] if len(parts) > 1 else []

        # Ordered from most to least specific so a same-surname fighter
        # listed earlier cannot shadow an exact match further down.
        tiers = [
            lambda name: name == query,
            lambda name: query in name or name in query,
        ]
        tiers += [(lambda name, term=term: term in name or name in term) for term in partial_terms]

        for matches in tiers:
            for position, name in enumerate(known_names):
                if name and matches(name):
                    return fighters_df.iloc[position]
        return None

    f1_row = _find_fighter(fighter1_name)
    f2_row = _find_fighter(fighter2_name)

    for label, query, found in (("Fighter 1", fighter1_name, f1_row),
                                ("Fighter 2", fighter2_name, f2_row)):
        if found is None:
            # Print available fighters for debugging
            print(f"\nAvailable fighters (first 20):")
            for shown in fighters_df[fighters_name_col].head(20).tolist():
                print(f"  - {shown}")
            raise ValueError(f"{label} '{query}' not found in fighters_df. Try checking the exact name in the CSV.")

    def _column_matches(pattern, column, whole_token):
        """Check whether `pattern` names `column`, as a whole token or as a substring."""
        column = str(column).lower()
        pattern = pattern.lower()
        if not whole_token:
            return pattern in column
        normalized = ''.join(ch if ch.isalnum() else '_' for ch in column)
        return f"_{pattern}_" in f"_{normalized}_"

    def get_val(row, key_patterns):
        """Return the first usable numeric value for the patterns, or None."""
        for whole_token in (True, False):
            column_seen = False
            for pattern in key_patterns:
                # Very short patterns are too ambiguous as substrings
                # ('ht' is inside 'weight', 'age' inside 'percentage').
                if not whole_token and len(pattern) < 4:
                    continue
                for col in row.index:
                    if not _column_matches(pattern, col, whole_token):
                        continue
                    column_seen = True
                    val = row[col]
                    if pd.isna(val) or val == '':
                        continue
                    try:
                        return float(val)
                    except (ValueError, TypeError):
                        continue
            if column_seen:
                # The right column exists but holds no value; do not fall
                # back to looser matching and pick up an unrelated column.
                return None
        return None

    # Build difference features (dict order defines the output column order)
    stat_patterns = [
        ('age_diff', ['age']),
        ('height_diff', ['height', 'ht']),
        ('reach_diff', ['reach']),
        # Striking stats
        ('slpm_diff', ['slpm', 'strikes_landed_per_min']),
        ('sapm_diff', ['sapm', 'strikes_absorbed_per_min']),
        ('str_acc_diff', ['str_acc', 'striking_accuracy']),
        ('str_def_diff', ['str_def', 'striking_defense']),
        # Grappling stats
        ('td_avg_diff', ['td_avg', 'takedown_avg']),
        ('td_acc_diff', ['td_acc', 'takedown_accuracy']),
        ('td_def_diff', ['td_def', 'takedown_defense']),
        # Submission stats
        ('sub_avg_diff', ['sub_avg', 'submission_avg']),
    ]

    matchup_data = {}
    for feature, patterns in stat_patterns:
        f1_val = get_val(f1_row, patterns)
        f2_val = get_val(f2_row, patterns)
        if f1_val is None or f2_val is None:
            # Subtracting from a 0 placeholder would fabricate a huge
            # advantage (e.g. a 180cm "reach difference").
            print(f"Warning: {feature} unavailable for at least one fighter; using 0.0")
            matchup_data[feature] = 0.0
        else:
            matchup_data[feature] = f1_val - f2_val

    # Categorical features
    # NOTE(review): the events_df preview at the top of the notebook shows
    # weight_class values such as 'Bantamweight Bout'; confirm this lowercase
    # form matches the categories the model was trained on.
    matchup_data['weight_class'] = str(weight_class).lower().strip()

    def _stance_of(row):
        """First non-empty value among the row's 'stance' columns, else 'unknown'."""
        for col in row.index:
            if 'stance' in str(col).lower():
                val = row[col]
                if pd.notna(val) and val != '':
                    return str(val).lower().strip()
        return 'unknown'

    f1_stance = _stance_of(f1_row)
    f2_stance = _stance_of(f2_row)

    matchup_data['stance_f1'] = f1_stance
    matchup_data['stance_f2'] = f2_stance
    matchup_data['stance_matchup'] = f1_stance + '_vs_' + f2_stance

    matchup_df = pd.DataFrame([matchup_data])

    return matchup_df

In [None]:
def predict_specific_fights(best_model, fighters_df, matchups=None):
    """
    Predict outcomes for specific upcoming fights.

    Args:
        best_model: Fitted estimator exposing `predict` and `predict_proba`;
            class 1 is read as "fighter 1 wins".
        fighters_df: Fighter statistics passed to build_matchup_row.
        matchups: Optional list of dicts with keys 'fighter1', 'fighter2',
            'weight_class' and optionally 'event_type'. Defaults to the two
            fights this notebook was written for.

    Returns:
        DataFrame with one row per successfully predicted fight (columns
        fighter1, fighter2, weight_class, p_f1_win, p_f2_win,
        predicted_winner). A fight that raises is reported and skipped.
    """
    import traceback

    print("\n" + "="*60)
    print("PREDICTING SPECIFIC FIGHTS")
    print("="*60)

    if matchups is None:
        matchups = [
            {
                'fighter1': 'Jack Della Maddalena',
                'fighter2': 'Islam Makhachev',
                'weight_class': 'Welterweight',
                'event_type': 'Main Event'
            },
            {
                'fighter1': 'Valentina Shevchenko',
                'fighter2': 'Zhang Weili',  # Name as it appears in CSV (last name first)
                'weight_class': "Women Flyweight",
                'event_type': 'Co-main'
            }
        ]

    feature_list = [
        ('reach_diff', 'Reach Difference'),
        ('height_diff', 'Height Difference'),
        ('age_diff', 'Age Difference'),
        ('slpm_diff', 'Strikes Landed/Min Difference'),
        ('str_acc_diff', 'Striking Accuracy Difference'),
        ('td_avg_diff', 'Takedown Average Difference'),
        ('td_def_diff', 'Takedown Defense Difference'),
    ]

    # (feature, minimum |difference| worth reporting, True when a positive
    # difference favours fighter 1, message template fed with |difference|).
    # Age is the only feature where the lower value is the advantage.
    advantage_rules = [
        ('reach_diff', 2, True, "has {:.1f}cm reach advantage"),
        ('height_diff', 2, True, "has {:.1f}cm height advantage"),
        ('age_diff', 2, False, "is {:.0f} years younger"),
        ('td_avg_diff', 0.5, True, "has superior takedown average (+{:.2f} per 15min)"),
        ('td_def_diff', 5, True, "has better takedown defense (+{:.1f}%)"),
        ('str_acc_diff', 3, True, "has higher striking accuracy (+{:.1f}%)"),
        ('slpm_diff', 0.5, True, "lands more strikes per minute (+{:.2f})"),
    ]

    predictions = []

    for matchup in matchups:
        fighter1 = matchup['fighter1']
        fighter2 = matchup['fighter2']

        print(f"\n{'='*60}")
        print(f"{matchup.get('event_type', 'Fight')}: {fighter1} vs {fighter2}")
        print(f"Weight Class: {matchup['weight_class']}")
        print(f"{'='*60}")

        try:
            matchup_df = build_matchup_row(
                fighter1,
                fighter2,
                matchup['weight_class'],
                fighters_df
            )

            print("\nDetailed Feature Breakdown:")
            print("-" * 60)

            for feat_key, feat_name in feature_list:
                if feat_key in matchup_df.columns:
                    val = matchup_df[feat_key].iloc[0]
                    print(f"  {feat_name:35s}: {val:8.2f}")
                else:
                    print(f"  {feat_name:35s}: Not available")

            if 'stance_matchup' in matchup_df.columns:
                print(f"\n  Stance Matchup: {matchup_df['stance_matchup'].iloc[0]}")

            proba = best_model.predict_proba(matchup_df)[0]
            prediction = best_model.predict(matchup_df)[0]

            # Column 1 is read as P(fighter 1 wins); a single-column output
            # (model fitted on one class) is used as-is.
            p_f1_win = proba[1] if len(proba) > 1 else proba[0]
            p_f2_win = proba[0] if len(proba) > 1 else 1 - p_f1_win
            predicted_winner = fighter1 if prediction == 1 else fighter2

            print(f"\n{'='*60}")
            print("PREDICTION RESULTS:")
            print(f"{'='*60}")
            print(f"{fighter1:30s}: {p_f1_win:.2%} win probability")
            print(f"{fighter2:30s}: {p_f2_win:.2%} win probability")
            print(f"\nPredicted Winner: {predicted_winner}")

            print(f"\n{'='*60}")
            print("MODEL INTERPRETATION:")
            print(f"{'='*60}")

            # Determine favored fighter
            favored_fighter = fighter1 if p_f1_win > 0.5 else fighter2
            favored_prob = max(p_f1_win, p_f2_win)

            print(f"\nModel favors {favored_fighter} with {favored_prob:.1%} win probability.")

            # Analyze key advantages. These are fixed-threshold comparisons
            # of the input features, not importances taken from the model.
            advantages = []
            for feat_key, threshold, higher_favours_f1, template in advantage_rules:
                if feat_key not in matchup_df.columns:
                    continue
                diff = matchup_df[feat_key].iloc[0]
                if not abs(diff) > threshold:
                    continue
                holder = fighter1 if (diff > 0) == higher_favours_f1 else fighter2
                advantages.append(f"{holder} {template.format(abs(diff))}")

            if advantages:
                print("\nKey advantages identified by the model:")
                for i, advantage in enumerate(advantages, 1):
                    print(f"  {i}. {advantage}")
            else:
                print("\nFighters appear relatively balanced across key metrics.")

            predictions.append({
                'fighter1': fighter1,
                'fighter2': fighter2,
                'weight_class': matchup['weight_class'],
                'p_f1_win': p_f1_win,
                'p_f2_win': p_f2_win,
                'predicted_winner': predicted_winner
            })

        except Exception as e:
            # Deliberately best-effort: one failing matchup (e.g. a fighter
            # missing from fighters_df) must not abort the remaining ones.
            print(f"\nError predicting {fighter1} vs {fighter2}: {e}")
            traceback.print_exc()

    predictions_df = pd.DataFrame(predictions)
    print(f"\n{'='*60}")
    print("PREDICTIONS SUMMARY")
    print(f"{'='*60}")
    print(predictions_df.to_string(index=False))

    return predictions_df

In [None]:
if __name__ == "__main__":
    # End-to-end driver: load and clean the data, assemble the feature
    # matrix, train and compare the models, score the selected model on the
    # external test set and finally predict the upcoming fights.
    print("="*60)
    print("UFC FIGHT PREDICTION PIPELINE")
    print("="*60)

    fighters_df, events_df = load_data(events_path='ufc_fight_train.csv')

    events_df, fighters_df = clean_and_merge_data(fighters_df, events_df, api_client=None)

    pre_fight_features = [
        'age_diff', 'height_diff', 'reach_diff', 'slpm_diff', 'sapm_diff',
        'str_acc_diff', 'str_def_diff', 'td_avg_diff', 'td_acc_diff', 'td_def_diff',
        'sub_avg_diff', 'weight_class', 'stance_f1', 'stance_f2', 'stance_matchup'
    ]

    # Keep the listed pre-fight features that exist plus any other '_diff'
    # column, in events_df column order. A set() here would make the column
    # order, and therefore anything order-sensitive downstream, vary from
    # run to run.
    # NOTE(review): every '_diff' column is accepted; confirm none of them is
    # derived from in-fight statistics, which would leak the outcome.
    available_features = list(dict.fromkeys(
        col for col in events_df.columns
        if col in pre_fight_features or '_diff' in str(col)
    ))

    # A fabricated constant target would only fail later and less clearly.
    if 'y_winner' not in events_df.columns:
        raise ValueError("y_winner not found in events_df; cannot train without a target column")

    # Remove rows with missing target
    events_df = events_df.dropna(subset=['y_winner'])

    X = events_df[available_features].copy()
    y_winner = events_df['y_winner'].copy()

    print(f"\nFinal modeling dataset:")
    print(f"  Features: {X.shape[1]}")
    print(f"  Samples: {X.shape[0]}")
    print(f"  Target distribution: {y_winner.value_counts().to_dict()}")

    # Imported here, right before the EDA step, as in the original script.
    # NOTE(review): presumably perform_eda reads plt/sns as globals - confirm
    # before moving these to the notebook's import cell.
    import matplotlib.pyplot as plt
    import seaborn as sns
    perform_eda(events_df)

    X_train, X_val, y_train, y_val = train_validation_split_data(X, y_winner)

    preprocessor, numeric_cols, categorical_cols = build_preprocessing_pipeline(X_train)

    models = train_models(X_train, y_train, preprocessor)

    results_df = evaluate_models(models, X_train, X_val, y_train, y_val)

    best_model = select_best_model(models, results_df)

    ext_results = evaluate_external_test_set(best_model, fighters_df)

    if ext_results:
        print("\n" + "="*60)
        print("FINAL PERFORMANCE COMPARISON")
        print("="*60)

        # results_df comes back from evaluate_models ordered by validation
        # F1, so its first row describes the selected model.
        best_model_row = results_df.iloc[0]

        train_f1_best = best_model_row.get('train_f1')
        if train_f1_best is None:
            y_train_pred_best = best_model.predict(X_train)
            train_f1_best = f1_score(y_train, y_train_pred_best, zero_division=0)

        val_f1 = best_model_row['val_f1']
        ext_f1 = ext_results['f1']

        print(f"\n{'Metric':<25} {'F1 Score':<15} {'Percentage':<15}")
        print("-" * 55)
        for label, score in (('Training F1', train_f1_best),
                             ('Validation F1', val_f1),
                             ('External Test F1', ext_f1)):
            # The percent sign is attached to the number; padding the number
            # first would leave it floating at the end of the column.
            print(f"{label:<25} {score:<15.4f} {score*100:.2f}%")

        print("\n" + "="*60)
        print("GENERALIZATION ASSESSMENT")
        print("="*60)

        # Positive f1_diff means the model did worse on the external set.
        f1_diff = val_f1 - ext_f1
        if abs(f1_diff) < 0.02:
            print("OK: Model generalized very well")
            print(f"  External test F1 ({ext_f1:.4f}) is very close to validation F1 ({val_f1:.4f})")
            print(f"  Difference: {f1_diff:.4f}")
        elif f1_diff > 0.05:
            print("WARNING: Model performance dropped on external test set")
            print(f"  External test F1 ({ext_f1:.4f}) is lower than validation F1 ({val_f1:.4f})")
            print(f"  Drop: {f1_diff:.4f} ({f1_diff*100:.2f} percentage points)")
            print("  Possible causes: distribution shift, different data quality, or overfitting")
        elif f1_diff < -0.02:
            print("OK: Model improved unexpectedly on external test set")
            print(f"  External test F1 ({ext_f1:.4f}) is higher than validation F1 ({val_f1:.4f})")
            print(f"  Improvement: {abs(f1_diff):.4f} ({abs(f1_diff)*100:.2f} percentage points)")
        else:
            # Remaining case: a drop between 0.02 and 0.05.
            print("OK: Model generalized well")
            print(f"  External test F1 ({ext_f1:.4f}) is close to validation F1 ({val_f1:.4f})")
            print(f"  Difference: {f1_diff:.4f}")

    predictions_df = predict_specific_fights(best_model, fighters_df)

    print("\n" + "="*60)
    print("PIPELINE COMPLETE")
    print("="*60)
    # Nothing in this script writes results to disk, so the message no
    # longer claims they were saved.
    print("\nAll predictions have been generated.")

UFC FIGHT PREDICTION PIPELINE
Loading data...
Loaded fighters_df: (4262, 42)
Loaded events_df: (5898, 58)

CLEANING FIGHTERS_DF

Shape: (4262, 42)

Dtypes:
fighter_id                int64
fighter_name             object
fighter_dob              object
fighter_height_cm       float64
fighter_weight_lbs      float64
fighter_reach_cm        float64
fighter_stance           object
fighter_wins              int64
fighter_losses            int64
fighter_draws             int64
fighter_slpm            float64
fighter_str_acc_%       float64
fighter_sapm            float64
fighter_str_def_%       float64
fighter_td_avg          float64
fighter_td_acc_%        float64
fighter_td_def_%        float64
fighter_sub_avg         float64
fighter_url              object
avg_knockdowns          float64
avg_sig_strike_atts     float64
avg_sig_strikes         float64
avg_tot_strike_atts     float64
avg_tot_strikes         float64
avg_takedown_atts       float64
avg_takedowns           float64
avg_clinch_a

[I 2025-11-15 02:52:30,638] A new study created in memory with name: xgboost_optimization


[5/5] Training XGBoost with Optuna tuning...
  Running Optuna optimization (50 trials)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-15 02:52:36,151] Trial 0 finished with value: 0.6902594121103081 and parameters: {'n_estimators': 650, 'max_depth': 6, 'learning_rate': 0.03657350149013256, 'subsample': 0.5086145995253384, 'colsample_bytree': 0.6615458050256791, 'min_child_weight': 4, 'gamma': 0.00015280069974534825}. Best is trial 0 with value: 0.6902594121103081.
[I 2025-11-15 02:52:45,625] Trial 1 finished with value: 0.695913268558028 and parameters: {'n_estimators': 700, 'max_depth': 7, 'learning_rate': 0.0170889791221588, 'subsample': 0.8478973929379439, 'colsample_bytree': 0.6234731169796359, 'min_child_weight': 3, 'gamma': 1.3357562540320559e-06}. Best is trial 1 with value: 0.695913268558028.
[I 2025-11-15 02:52:49,841] Trial 2 finished with value: 0.6763613800644064 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.09975949301873742, 'subsample': 0.7735288788303628, 'colsample_bytree': 0.8521461980035168, 'min_child_weight': 10, 'gamma': 5.319937221439926e-07}. Best is trial