In [1]:
import pandas as pd
import numpy as np

# Import the first CSV file
methods_df = pd.read_csv('ufc_method_aug.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
#process data for training

# Drop the columns
columns_to_drop = ['p1_fighter', 'p2_fighter', 'p1_Other_wins', 'p2_Other_wins'] #method
methods_df = methods_df.drop(columns=columns_to_drop)
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Clean all column names
def clean_column_name(col):
  return col.lower().replace(' ', '_').replace('.', '').replace('-', '_')

# Apply to all columns
methods_df.columns = [clean_column_name(col) for col in methods_df.columns]

# Identify all categorical columns
categorical_cols = ['p1_stance', 'p2_stance']

# One-hot encode all categorical variables
methods_df = pd.get_dummies(methods_df, columns=categorical_cols)

methods_df = methods_df.sample(frac=1, random_state=42).reset_index(drop=True)

# encode the referee using frequency
ref_counts = methods_df['referee'].value_counts()
methods_df['referee_freq'] = methods_df['referee'].map(ref_counts)
methods_df.drop(columns=['referee'], inplace=True)


methods_df.head()

Unnamed: 0,winner,event_date,p1_height,p1_weight,p1_reach,p1_slpm,p1_str_acc,p1_sapm,p1_str_def,p1_td_avg,p1_td_acc,p1_td_def,p1_sub_avg,p2_height,p2_weight,p2_reach,p2_slpm,p2_str_acc,p2_sapm,p2_str_def,p2_td_avg,p2_td_acc,p2_td_def,p2_sub_avg,p1_age_at_event,p2_age_at_event,height_diff,reach_diff,weight_diff,age_diff,slpm_diff,stracc_diff,sapm_diff,strdef_diff,tdavg_diff,tdacc_diff,tddef_diff,subavg_diff,p1_days_since_last_fight,p2_days_since_last_fight,days_since_last_fight_diff,p1_wins,p1_losses,p1_total,p2_wins,p2_losses,p2_total,win_diff,loss_diff,total_diff,p1_win_streak,p2_win_streak,p1_age_adjusted_slpm,p2_age_adjusted_slpm,p1_age_adjusted_str_acc,p2_age_adjusted_str_acc,p1_age_adjusted_sapm,p2_age_adjusted_sapm,p1_age_adjusted_str_def,p2_age_adjusted_str_def,p1_age_adjusted_td_avg,p2_age_adjusted_td_avg,p1_age_adjusted_td_acc,p2_age_adjusted_td_acc,p1_age_adjusted_td_def,p2_age_adjusted_td_def,p1_age_adjusted_sub_avg,p2_age_adjusted_sub_avg,p1_kd_ema,p2_kd_ema,p1_sig_str_pct_ema,p2_sig_str_pct_ema,p1_td_pct_ema,p2_td_pct_ema,p1_sub_att_ema,p2_sub_att_ema,p1_rev_ema,p2_rev_ema,p1_ctrl_ema,p2_ctrl_ema,p1_r1_kd_ema,p2_r1_kd_ema,p1_r1_sig_str_pct_ema,p2_r1_sig_str_pct_ema,p1_r1_td_pct_ema,p2_r1_td_pct_ema,p1_r1_sub_att_ema,p2_r1_sub_att_ema,p1_r1_rev_ema,p2_r1_rev_ema,p1_r1_ctrl_ema,p2_r1_ctrl_ema,p1_sig_str_pct_detailed_ema,p2_sig_str_pct_detailed_ema,p1_r1_sig_str_pct_detailed_ema,p2_r1_sig_str_pct_detailed_ema,p1_sig_str_landed_ema,p2_sig_str_landed_ema,p1_sig_str_attempted_ema,p2_sig_str_attempted_ema,p1_total_str_landed_ema,p2_total_str_landed_ema,p1_total_str_attempted_ema,p2_total_str_attempted_ema,p1_td_landed_ema,p2_td_landed_ema,p1_td_attempted_ema,p2_td_attempted_ema,p1_r1_sig_str_landed_ema,p2_r1_sig_str_landed_ema,p1_r1_sig_str_attempted_ema,p2_r1_sig_str_attempted_ema,p1_r1_total_str_landed_ema,p2_r1_total_str_landed_ema,p1_r1_total_str_attempted_ema,p2_r1_total_str_attempted_ema,p1_r1_td_landed_ema,p2_r1_td_landed_ema,p1_r1_td_attempted_ema,p2_r1_td_attempted_ema,p1_head_landed_ema,p2_head_landed_ema,p1_head_attempted_ema,p2_head_attempted_ema,p1_body_landed_ema,p2_body_landed_ema,p1_body_attempted_ema,p2_body_attempted_ema,p1_leg_landed_ema,p2_leg_landed_ema,p1_leg_attempted_ema,p2_leg_attempted_ema,p1_distance_landed_ema,p2_distance_landed_ema,p1_distance_attempted_ema,p2_distance_attempted_ema,p1_clinch_landed_ema,p2_clinch_landed_ema,p1_clinch_attempted_ema,p2_clinch_attempted_ema,p1_ground_landed_ema,p2_ground_landed_ema,p1_ground_attempted_ema,p2_ground_attempted_ema,p1_r1_head_landed_ema,p2_r1_head_landed_ema,p1_r1_head_attempted_ema,p2_r1_head_attempted_ema,p1_r1_body_landed_ema,p2_r1_body_landed_ema,p1_r1_body_attempted_ema,p2_r1_body_attempted_ema,p1_r1_leg_landed_ema,p2_r1_leg_landed_ema,p1_r1_leg_attempted_ema,p2_r1_leg_attempted_ema,p1_r1_distance_landed_ema,p2_r1_distance_landed_ema,p1_r1_distance_attempted_ema,p2_r1_distance_attempted_ema,p1_r1_clinch_landed_ema,p2_r1_clinch_landed_ema,p1_r1_clinch_attempted_ema,p2_r1_clinch_attempted_ema,p1_r1_ground_landed_ema,p2_r1_ground_landed_ema,p1_r1_ground_attempted_ema,p2_r1_ground_attempted_ema,method_grouped,p1_decision_wins,p2_decision_wins,p1_ko/tko_wins,p2_ko/tko_wins,p1_submission_wins,p2_submission_wins,p1_stance_Open Stance,p1_stance_Orthodox,p1_stance_Sideways,p1_stance_Southpaw,p1_stance_Switch,p2_stance_Open Stance,p2_stance_Orthodox,p2_stance_Sideways,p2_stance_Southpaw,p2_stance_Switch,referee_freq
0,1,2014-03-01,71.0,155.0,71.0,1.28,0.48,1.88,0.53,1.75,0.3,0.69,0.8,72.0,170.0,,3.17,0.46,2.27,0.53,0.0,0.0,0.77,0.0,23.975359,28.125941,-1.0,,-15.0,-4.150582,-1.89,0.02,-0.39,-0.0,1.75,0.3,-0.08,0.8,,,,0,0,0,0,0,0,0,0,0,0,0,0.053388,0.112707,0.020021,0.016355,0.078414,0.080708,0.022106,0.018844,0.072992,0.0,0.012513,0.0,0.02878,0.027377,0.033368,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Decision,0,0,0,0,0,0,False,False,False,True,False,False,False,False,True,False,174.0
1,1,2006-06-24,73.0,205.0,76.0,3.69,0.42,3.8,0.61,3.0,0.34,0.91,0.0,74.0,185.0,75.0,2.11,0.24,4.06,0.53,1.79,0.75,0.66,1.2,29.716632,21.664613,-1.0,1.0,20.0,8.052019,1.58,0.18,-0.26,0.08,1.21,-0.41,0.25,-1.2,,,,0,0,0,0,0,0,0,0,0,0,0,0.124173,0.097394,0.014133,0.011078,0.127875,0.187402,0.020527,0.024464,0.100954,0.082623,0.011441,0.034619,0.030623,0.030464,0.0,0.05539,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,KO/TKO,0,0,0,0,0,0,False,True,False,False,False,False,False,False,True,False,528.0
2,1,2021-09-18,66.0,125.0,67.0,4.04,0.41,3.87,0.53,0.52,0.55,0.68,0.3,67.0,125.0,71.0,2.51,0.45,3.24,0.6,0.72,0.18,0.75,0.7,27.644079,32.136893,-1.0,-4.0,0.0,-4.492813,1.53,-0.04,0.63,-0.07,-0.2,0.37,-0.07,-0.4,105.0,,,2,4,6,0,0,0,2,4,6,0,0,0.146143,0.078103,0.014831,0.014003,0.139994,0.100819,0.019172,0.01867,0.018811,0.022404,0.019896,0.005601,0.024598,0.023338,0.010852,0.021782,0.063492,,0.513016,,0.058824,,0.190476,,0.063492,,52.349206,,0.063492,,0.506984,,0.0,,0.126984,,0.0,,24.269841,,0.513016,,0.506984,,14.746032,,34.349206,,40.904762,,68.714286,,0.015873,,0.777778,,8.936508,,19.920635,,23.285714,,39.920635,,0.0,,0.507937,,7.666667,,24.079365,,5.761905,,8.126984,,1.31746,,2.142857,,13.142857,,32.253968,,0.730159,,0.809524,,0.873016,,1.285714,,3.936508,,12.920635,,4.396825,,6.015873,,0.603175,,0.984127,,7.666667,,18.365079,,0.634921,,0.698413,,0.634921,,0.857143,,Decision,1,0,0,0,1,0,False,True,False,False,False,False,True,False,False,False,864.0
3,0,2014-03-01,72.0,260.0,75.0,2.87,0.5,3.3,0.47,1.97,0.38,0.76,0.1,75.0,265.0,82.0,3.55,0.5,2.79,0.64,0.0,0.0,0.55,0.7,29.357974,35.627652,-3.0,-7.0,-5.0,-6.269678,-0.68,-0.0,0.51,-0.17,1.97,0.38,0.21,-0.6,133.0,161.0,-28.0,3,2,5,6,3,9,-3,-1,-4,0,0,0.097759,0.099642,0.017031,0.014034,0.112406,0.07831,0.016009,0.017964,0.067103,0.0,0.012944,0.0,0.025887,0.015437,0.003406,0.019648,0.258065,0.127202,0.430968,0.621311,0.666667,,0.0,0.015656,0.0,0.0,43.064516,6.154599,0.258065,0.093933,0.417097,0.615577,0.0,,0.0,0.011742,0.0,0.0,11.967742,5.35225,0.430968,0.621311,0.417097,0.615577,12.064516,10.407045,23.064516,19.119374,16.064516,10.894325,27.645161,19.755382,0.258065,0.0,0.580645,0.0,6.16129,8.362035,13.129032,14.450098,8.225806,8.571429,15.451613,14.792564,0.0,0.0,0.129032,0.0,10.870968,7.377691,21.709677,15.493151,0.806452,1.11546,0.967742,1.592955,0.387097,1.913894,0.387097,2.033268,5.935484,8.103718,15.096774,16.677104,2.129032,0.567515,3.483871,0.65362,4.0,1.735812,4.483871,1.78865,5.516129,5.868885,12.451613,11.587084,0.322581,0.994129,0.354839,1.244618,0.322581,1.499022,0.322581,1.618395,3.580645,6.205479,9.870968,12.225049,1.483871,0.508806,2.064516,0.524462,1.096774,1.64775,1.193548,1.700587,KO/TKO,0,1,3,5,0,0,False,False,False,True,False,False,False,False,False,True,960.0
4,0,2014-04-16,69.0,155.0,70.0,2.48,0.51,2.97,0.55,2.36,0.33,0.7,0.5,70.0,170.0,71.0,4.78,0.42,3.59,0.65,1.05,0.54,0.78,0.0,25.141684,27.731691,-1.0,-1.0,-15.0,-2.590007,-2.3,0.09,-0.62,-0.1,1.31,-0.21,-0.08,0.5,,,,0,0,0,0,0,0,0,0,0,0,0,0.098641,0.172366,0.020285,0.015145,0.118131,0.129455,0.021876,0.023439,0.093868,0.037863,0.013126,0.019472,0.027842,0.028127,0.019887,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Decision,0,0,0,0,0,0,False,False,False,True,False,False,True,False,False,False,528.0


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, balanced_accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
from collections import Counter

# Load and prepare data
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Create targets with proper null handling
methods_df['p1_method_target'] = np.where(methods_df['winner'] == 1, methods_df['method_grouped'], np.nan)
methods_df['p2_method_target'] = np.where(methods_df['winner'] == 0, methods_df['method_grouped'], np.nan)

# Feature engineering
excluded_cols = ['winner', 'method_grouped', 'p1_method_target', 'p2_method_target']
feature_cols = [col for col in methods_df.columns if col not in excluded_cols]

def train_method_model(target_column):
    # Filter and encode data
    df = methods_df.dropna(subset=[target_column]).copy()
    X = df[feature_cols]
    y = df[target_column]
    
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    submission_label = le.transform(['Submission'])[0]
    
    # Split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
    )
    
    # Get class counts and define strategies
    train_class_counts = Counter(y_train)
    smote_strategy = {
        cls: min(count * 2, 4000) #2000
        for cls, count in train_class_counts.items() 
        if cls != submission_label
    }
    adasyn_strategy = {
        submission_label: min(train_class_counts[submission_label] * 3, 5000) #5000
    }

    # Pipeline with combined sampling
    numeric_cols = X_train.select_dtypes(include=np.number).columns
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(f_classif, k=50)),
        ('smote', SMOTE(
            sampling_strategy=smote_strategy,
            k_neighbors=5,
            random_state=42
        )),
        ('adasyn', ADASYN(
            sampling_strategy=adasyn_strategy,
            n_neighbors=3,
            random_state=42
        ))
    ])
    
    # Process training data
    X_train_processed, y_train_processed = pipeline.fit_resample(
        X_train[numeric_cols], y_train
    )

    # Get selected features
    selector = pipeline.named_steps['selector']
    selected_features = X_train[numeric_cols].columns[selector.get_support()]
    
    # Save features to file
    pd.Series(list(selected_features)).to_json(f"{target_column}_features.json")
    
    # Process test data
    X_test_processed = pipeline[:-2].transform(X_test[numeric_cols])
    
    # Get selected features
    selected_features = X_train[numeric_cols].columns[
        pipeline.named_steps['selector'].get_support()
    ]

    # Class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_processed), y=y_train_processed)
    weight_dict = dict(zip(np.unique(y_train_processed), class_weights))

    # Model configurations
    models = {
        'Logistic Regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            solver='saga'
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=300,
            class_weight=weight_dict,
            min_samples_leaf=15,
            max_depth=8,
            random_state=42
        ),
        'XGBoost': xgb.XGBClassifier(
            objective='multi:softmax',
            num_class=len(np.unique(y_train_processed)),
            eval_metric='mlogloss',
            use_label_encoder=False,
            random_state=42
        ),
        'LightGBM': lgb.LGBMClassifier(
            class_weight=weight_dict,
            objective='multiclass',
            num_leaves=31,
            min_child_samples=20,
            random_state=42,
            verbose=-1
        )
    }

    # Train and evaluate
    results = {}
    for name, model in models.items():
        if name == 'LightGBM':
            # Preserve feature names
            train_df = pd.DataFrame(X_train_processed, columns=selected_features)
            test_df = pd.DataFrame(X_test_processed, columns=selected_features)
            model.fit(
                train_df, y_train_processed,
                eval_set=(test_df, y_test)
            )
        elif name == 'XGBoost':
            model.fit(X_train_processed, y_train_processed)
            # Save the XGBoost model to a JSON file
            model.save_model(f"{target_column}_xgboost_model.json") 
        else:
            model.fit(X_train_processed, y_train_processed)
        
        pred = model.predict(X_test_processed)
        results[name] = {
            'accuracy': accuracy_score(y_test, pred),
            'balanced_acc': balanced_accuracy_score(y_test, pred),
            'f1_macro': f1_score(y_test, pred, average='macro'),
            'report': classification_report(y_test, pred, target_names=le.classes_, zero_division=0)
        }
    
    print(f"\n{target_column} - Selected Features:")
    print(list(selected_features))
    
    return results, selected_features

# Train models
print("Training p1_method_target model...")
p1_results, p1_features = train_method_model('p1_method_target')

print("\nTraining p2_method_target model...")
p2_results, p2_features = train_method_model('p2_method_target')

# Save features to files
p1_features.to_series().to_json("p1_method_features.json")
p2_features.to_series().to_json("p2_method_features.json")

# Evaluation function
def print_results(results, model_name):
    print(f"\n{model_name} Results:")
    for name, res in results.items():
        print(f"\n{name}:")
        print(f"  Accuracy: {res['accuracy']:.4f}")
        print(f"  Balanced Accuracy: {res['balanced_acc']:.4f}")
        print(f"  Macro F1: {res['f1_macro']:.4f}")
        print("  Classification Report:\n", res['report'])

print_results(p1_results, "Fighter 1 Method Prediction")
print_results(p2_results, "Fighter 2 Method Prediction")


Training p1_method_target model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p1_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_slpm', 'p1_str_def', 'p1_td_avg', 'p1_td_def', 'p1_sub_avg', 'p2_height', 'p2_weight', 'p2_reach', 'p2_sapm', 'p2_str_def', 'p2_td_def', 'slpm_diff', 'tdavg_diff', 'subavg_diff', 'p1_age_adjusted_slpm', 'p2_age_adjusted_str_def', 'p1_age_adjusted_td_avg', 'p1_age_adjusted_td_def', 'p1_age_adjusted_sub_avg', 'p1_kd_ema', 'p1_sub_att_ema', 'p1_ctrl_ema', 'p1_r1_sub_att_ema', 'p1_sig_str_landed_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p1_total_str_landed_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p1_td_attempted_ema', 'p1_r1_sig_str_attempted_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p1_body_landed_ema', 'p1_body_attempted_ema', 'p1_leg_landed_ema', 'p1_leg_attempted_ema', 'p1_distance_landed_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p1_r1_distance_landed_ema', 'p1_r1_distanc

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p2_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_sapm', 'p1_str_def', 'p1_td_def', 'p2_height', 'p2_weight', 'p2_reach', 'p2_slpm', 'p2_str_def', 'p2_td_avg', 'p2_td_def', 'p2_sub_avg', 'slpm_diff', 'tdavg_diff', 'subavg_diff', 'p2_age_adjusted_slpm', 'p1_age_adjusted_str_def', 'p2_age_adjusted_td_avg', 'p2_age_adjusted_td_def', 'p2_age_adjusted_sub_avg', 'p2_kd_ema', 'p2_sub_att_ema', 'p2_ctrl_ema', 'p2_r1_sub_att_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p2_total_str_landed_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p2_td_attempted_ema', 'p2_r1_sig_str_attempted_ema', 'p2_r1_td_attempted_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p2_body_landed_ema', 'p2_body_attempted_ema', 'p2_leg_landed_ema', 'p2_leg_attempted_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p2_r1_body_attempted_ema', 'p2_r1_distance_landed_ema', 'p2_r1_dist

