In [4]:
import pandas as pd
import numpy as np

# Import the first CSV file
methods_df = pd.read_csv('ufc_method_aug.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#process data for training

# Drop the columns
columns_to_drop = ['p1_fighter', 'p2_fighter', 'p1_Other_wins', 'p2_Other_wins'] #method
methods_df = methods_df.drop(columns=columns_to_drop)
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Clean all column names
def clean_column_name(col):
  return col.lower().replace(' ', '_').replace('.', '').replace('-', '_')

# Apply to all columns
methods_df.columns = [clean_column_name(col) for col in methods_df.columns]

# Identify all categorical columns
categorical_cols = ['p1_stance', 'p2_stance']

# One-hot encode all categorical variables
methods_df = pd.get_dummies(methods_df, columns=categorical_cols)

methods_df = methods_df.sample(frac=1, random_state=42).reset_index(drop=True)

# encode the referee using frequency
ref_counts = methods_df['referee'].value_counts()
methods_df['referee_freq'] = methods_df['referee'].map(ref_counts)
methods_df.drop(columns=['referee'], inplace=True)


methods_df.head()

Unnamed: 0,winner,event_date,p1_height,p1_weight,p1_reach,p1_slpm,p1_str_acc,p1_sapm,p1_str_def,p1_td_avg,p1_td_acc,p1_td_def,p1_sub_avg,p2_height,p2_weight,p2_reach,p2_slpm,p2_str_acc,p2_sapm,p2_str_def,p2_td_avg,p2_td_acc,p2_td_def,p2_sub_avg,p1_age_at_event,p2_age_at_event,height_diff,reach_diff,weight_diff,age_diff,slpm_diff,stracc_diff,sapm_diff,strdef_diff,tdavg_diff,tdacc_diff,tddef_diff,subavg_diff,p1_days_since_last_fight,p2_days_since_last_fight,days_since_last_fight_diff,p1_wins,p1_losses,p1_total,p2_wins,p2_losses,p2_total,win_diff,loss_diff,total_diff,p1_win_streak,p2_win_streak,p1_age_adjusted_slpm,p2_age_adjusted_slpm,p1_age_adjusted_str_acc,p2_age_adjusted_str_acc,p1_age_adjusted_sapm,p2_age_adjusted_sapm,p1_age_adjusted_str_def,p2_age_adjusted_str_def,p1_age_adjusted_td_avg,p2_age_adjusted_td_avg,p1_age_adjusted_td_acc,p2_age_adjusted_td_acc,p1_age_adjusted_td_def,p2_age_adjusted_td_def,p1_age_adjusted_sub_avg,p2_age_adjusted_sub_avg,p1_kd_ema,p2_kd_ema,p1_sig_str_pct_ema,p2_sig_str_pct_ema,p1_td_pct_ema,p2_td_pct_ema,p1_sub_att_ema,p2_sub_att_ema,p1_rev_ema,p2_rev_ema,p1_ctrl_ema,p2_ctrl_ema,p1_r1_kd_ema,p2_r1_kd_ema,p1_r1_sig_str_pct_ema,p2_r1_sig_str_pct_ema,p1_r1_td_pct_ema,p2_r1_td_pct_ema,p1_r1_sub_att_ema,p2_r1_sub_att_ema,p1_r1_rev_ema,p2_r1_rev_ema,p1_r1_ctrl_ema,p2_r1_ctrl_ema,p1_sig_str_pct_detailed_ema,p2_sig_str_pct_detailed_ema,p1_r1_sig_str_pct_detailed_ema,p2_r1_sig_str_pct_detailed_ema,p1_sig_str_landed_ema,p2_sig_str_landed_ema,p1_sig_str_attempted_ema,p2_sig_str_attempted_ema,p1_total_str_landed_ema,p2_total_str_landed_ema,p1_total_str_attempted_ema,p2_total_str_attempted_ema,p1_td_landed_ema,p2_td_landed_ema,p1_td_attempted_ema,p2_td_attempted_ema,p1_r1_sig_str_landed_ema,p2_r1_sig_str_landed_ema,p1_r1_sig_str_attempted_ema,p2_r1_sig_str_attempted_ema,p1_r1_total_str_landed_ema,p2_r1_total_str_landed_ema,p1_r1_total_str_attempted_ema,p2_r1_total_str_attempted_ema,p1_r1_td_landed_ema,p2_r1_td_landed_ema,p1_r1_td_attempted_ema,p2_r1_td_attempted_ema,p1_head_landed_ema,p2_head_landed_ema,p1_head_attempted_ema,p2_head_attempted_ema,p1_body_landed_ema,p2_body_landed_ema,p1_body_attempted_ema,p2_body_attempted_ema,p1_leg_landed_ema,p2_leg_landed_ema,p1_leg_attempted_ema,p2_leg_attempted_ema,p1_distance_landed_ema,p2_distance_landed_ema,p1_distance_attempted_ema,p2_distance_attempted_ema,p1_clinch_landed_ema,p2_clinch_landed_ema,p1_clinch_attempted_ema,p2_clinch_attempted_ema,p1_ground_landed_ema,p2_ground_landed_ema,p1_ground_attempted_ema,p2_ground_attempted_ema,p1_r1_head_landed_ema,p2_r1_head_landed_ema,p1_r1_head_attempted_ema,p2_r1_head_attempted_ema,p1_r1_body_landed_ema,p2_r1_body_landed_ema,p1_r1_body_attempted_ema,p2_r1_body_attempted_ema,p1_r1_leg_landed_ema,p2_r1_leg_landed_ema,p1_r1_leg_attempted_ema,p2_r1_leg_attempted_ema,p1_r1_distance_landed_ema,p2_r1_distance_landed_ema,p1_r1_distance_attempted_ema,p2_r1_distance_attempted_ema,p1_r1_clinch_landed_ema,p2_r1_clinch_landed_ema,p1_r1_clinch_attempted_ema,p2_r1_clinch_attempted_ema,p1_r1_ground_landed_ema,p2_r1_ground_landed_ema,p1_r1_ground_attempted_ema,p2_r1_ground_attempted_ema,method_grouped,p1_decision_wins,p2_decision_wins,p1_ko/tko_wins,p2_ko/tko_wins,p1_submission_wins,p2_submission_wins,p1_stance_Open Stance,p1_stance_Orthodox,p1_stance_Sideways,p1_stance_Southpaw,p1_stance_Switch,p2_stance_Open Stance,p2_stance_Orthodox,p2_stance_Sideways,p2_stance_Southpaw,p2_stance_Switch,referee_freq
0,0,2021-04-10,73.0,170.0,72.0,6.69,0.53,5.44,0.52,0.72,1.0,0.7,0.0,71.0,170.0,75.0,4.96,0.48,3.66,0.58,1.75,0.58,0.66,0.2,32.071184,27.227926,2.0,-3.0,-0.0,4.843258,1.73,0.05,1.78,-0.06,-1.03,0.42,0.04,-0.2,140.0,182.0,-42.0,1,0,1,1,1,2,0,-1,-1,1,0,0.208598,0.182166,0.016526,0.017629,0.169623,0.134421,0.016214,0.021302,0.02245,0.064272,0.031181,0.021302,0.021826,0.02424,0.0,0.007345,0.0,0.0,0.52,0.373333,1.0,0.0,0.0,0.0,0.0,0.0,196.0,5.0,0.0,0.0,0.54,0.4,1.0,,0.0,0.0,0.0,0.0,77.0,3.666667,0.52,0.373333,0.54,0.4,125.0,47.0,237.0,118.666667,160.0,49.333333,275.0,123.0,1.0,0.0,1.0,0.333333,56.0,24.0,103.0,59.666667,65.0,26.333333,113.0,64.0,1.0,0.0,1.0,0.0,100.0,34.0,202.0,97.333333,24.0,7.333333,34.0,15.0,1.0,5.666667,1.0,6.333333,82.0,46.0,180.0,116.0,25.0,1.0,33.0,2.666667,18.0,0.0,24.0,0.0,42.0,16.0,85.0,49.0,13.0,4.333333,17.0,6.333333,1.0,3.666667,1.0,4.333333,42.0,23.666667,86.0,58.666667,12.0,0.333333,13.0,1.0,2.0,0.0,4.0,0.0,Submission,0,1,1,0,0,0,False,False,False,False,True,False,True,False,False,False,814.0
1,0,2009-07-11,76.0,205.0,79.0,2.76,0.38,3.01,0.52,1.32,0.4,0.6,1.0,73.0,205.0,75.0,1.88,0.52,2.62,0.4,3.89,0.4,0.5,0.8,32.268309,44.555784,3.0,4.0,-0.0,-12.287474,0.88,-0.14,0.39,0.12,-2.57,-0.0,0.1,0.2,161.0,175.0,-14.0,5,4,9,6,4,10,-1,0,-1,0,0,0.085533,0.042194,0.011776,0.011671,0.09328,0.058803,0.016115,0.008978,0.040907,0.087306,0.012396,0.008978,0.018594,0.011222,0.03099,0.017955,0.015656,0.0,0.465519,0.469003,0.01763,0.578103,0.348337,0.226784,0.003914,0.0,139.414873,185.180841,0.0,0.0,0.421957,0.47088,0.017185,0.57552,0.266145,0.226784,0.0,0.0,47.843444,51.049853,0.465519,0.469003,0.421957,0.47088,30.056751,26.73998,65.455969,57.538612,62.266145,60.820137,100.534247,94.504399,0.043053,4.544477,1.146771,7.493646,7.661448,15.353861,18.340509,31.513196,15.203523,31.416422,26.16047,48.959922,0.019569,2.542522,0.544031,4.490714,16.964775,23.341153,47.393346,53.639296,6.553816,2.147605,8.751468,2.397849,6.53816,1.251222,9.311155,1.501466,8.037182,8.723363,35.927593,31.192571,15.32681,4.30694,19.731898,4.699902,6.692759,13.709677,9.796477,21.646139,1.726027,14.457478,10.007828,30.616813,2.692759,0.14565,3.978474,0.14565,3.242661,0.750733,4.354207,0.750733,2.058708,3.843597,11.401174,12.924731,5.054795,2.304985,6.078278,2.447703,0.547945,9.205279,0.861057,16.140762,Decision,2,0,1,3,2,3,False,True,False,False,False,False,True,False,False,False,2298.0
2,1,2022-10-29,76.0,260.0,78.0,5.76,0.49,3.47,0.55,0.49,0.5,0.66,0.2,76.0,265.0,80.0,4.96,0.44,5.67,0.45,0.18,0.5,0.2,0.0,31.071869,30.464066,0.0,-2.0,-5.0,0.607803,0.8,0.05,-2.2,0.1,0.31,0.0,0.46,0.2,,112.0,,0,0,0,1,5,6,-1,-5,-6,0,0,0.185377,0.162815,0.01577,0.014443,0.111677,0.186121,0.017701,0.014772,0.01577,0.005909,0.016092,0.016413,0.021241,0.006565,0.006437,0.0,,0.0,,0.412381,,,,0.0,,0.0,,35.301587,,0.0,,0.365397,,,,0.0,,0.0,,18.285714,,0.412381,,0.365397,,72.936508,,174.793651,,80.269841,,183.301587,,0.0,,0.0,,23.428571,,67.857143,,26.666667,,71.47619,,0.0,,0.0,,48.126984,,143.952381,,13.952381,,18.746032,,10.857143,,12.095238,,67.730159,,167.968254,,3.428571,,4.539683,,1.777778,,2.285714,,13.349206,,55.52381,,4.714286,,6.873016,,5.365079,,5.460317,,20.539683,,64.206349,,1.111111,,1.365079,,1.777778,,2.285714,Decision,0,1,0,0,0,0,False,True,False,False,False,False,True,False,False,False,862.0
3,0,2017-04-15,72.0,263.0,73.0,2.18,0.36,4.79,0.46,1.14,0.27,0.65,0.1,79.0,250.0,80.0,4.97,0.57,2.99,0.54,0.61,0.66,0.72,0.2,40.818617,28.473648,-7.0,-7.0,13.0,12.344969,-2.79,-0.21,1.8,-0.08,0.53,-0.39,-0.07,-0.1,203.0,147.0,56.0,9,9,18,1,0,1,8,9,17,1,1,0.053407,0.174547,0.00882,0.020019,0.117348,0.105009,0.011269,0.018965,0.027928,0.021423,0.006615,0.023179,0.015924,0.025287,0.00245,0.007024,0.548598,0.0,0.417889,0.53,0.205246,0.5,0.000122,0.0,0.0,0.0,222.371187,225.0,0.017345,0.0,0.347418,0.57,0.217617,0.5,0.0,0.0,0.0,0.0,104.988842,55.0,0.417889,0.53,0.347418,0.57,29.311277,45.0,64.233876,84.0,47.866069,91.0,84.978466,141.0,2.015759,1.0,3.971474,2.0,10.894657,19.0,24.126652,33.0,19.106148,34.0,33.463362,52.0,0.625067,1.0,1.705687,2.0,21.176205,35.0,54.849014,71.0,3.35864,9.0,4.338777,12.0,4.776431,1.0,5.046086,1.0,18.511,31.0,50.286611,68.0,8.100754,14.0,10.934543,16.0,2.699523,0.0,3.012722,0.0,6.219468,18.0,19.330766,31.0,2.151162,1.0,2.256234,2.0,2.524027,0.0,2.539652,0.0,7.007797,10.0,19.259759,24.0,3.81314,9.0,4.542536,9.0,0.073719,0.0,0.324357,0.0,Decision,1,1,8,0,0,0,False,True,False,False,False,False,True,False,False,False,956.0
4,0,2015-01-31,74.0,185.0,,1.17,0.28,1.83,0.67,0.0,0.0,1.0,0.0,74.0,185.0,78.0,1.93,0.38,2.63,0.5,2.0,0.68,0.81,0.9,30.836413,32.202601,0.0,,0.0,-1.366188,-0.76,-0.1,-0.8,0.17,-2.0,-0.68,0.19,-0.9,245.0,210.0,35.0,0,1,1,3,2,5,-3,-1,-4,0,0,0.037942,0.059933,0.00908,0.0118,0.059345,0.08167,0.021728,0.015527,0.0,0.062107,0.0,0.021116,0.032429,0.025153,0.0,0.027948,0.0,0.0,0.75,0.420968,,0.62,0.0,0.483871,0.0,0.0,0.0,154.580645,0.0,0.0,0.75,0.574839,,0.467742,0.0,0.129032,0.0,0.0,0.0,40.258065,0.75,0.420968,0.75,0.574839,3.0,29.483871,4.0,68.806452,4.0,41.387097,5.0,84.064516,0.0,2.193548,0.0,3.516129,3.0,5.096774,4.0,10.16129,4.0,10.290323,5.0,17.935484,0.0,0.741935,0.0,1.290323,1.0,18.064516,2.0,54.516129,2.0,8.129032,2.0,10.741935,0.0,3.290323,0.0,3.548387,0.0,16.709677,0.0,52.193548,3.0,7.290323,4.0,7.903226,0.0,5.483871,0.0,8.709677,1.0,2.516129,2.0,6.419355,2.0,2.387097,2.0,3.290323,0.0,0.193548,0.0,0.451613,0.0,2.580645,0.0,5.741935,3.0,1.419355,4.0,2.032258,0.0,1.096774,0.0,2.387097,Decision,0,2,0,0,0,1,False,True,False,False,False,False,True,False,False,False,1066.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, balanced_accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
from collections import Counter

# Load and prepare data
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Create targets with proper null handling
methods_df['p1_method_target'] = np.where(methods_df['winner'] == 1, methods_df['method_grouped'], np.nan)
methods_df['p2_method_target'] = np.where(methods_df['winner'] == 0, methods_df['method_grouped'], np.nan)

# Feature engineering
excluded_cols = ['winner', 'method_grouped', 'p1_method_target', 'p2_method_target']
feature_cols = [col for col in methods_df.columns if col not in excluded_cols]

def train_method_model(target_column):
    # Filter and encode data
    df = methods_df.dropna(subset=[target_column]).copy()
    X = df[feature_cols]
    y = df[target_column]
    
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    submission_label = le.transform(['Submission'])[0]
    
    # Split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
    )
    
    # Get class counts and define strategies
    train_class_counts = Counter(y_train)
    smote_strategy = {
        cls: min(count * 2, 4000) #2000
        for cls, count in train_class_counts.items() 
        if cls != submission_label
    }
    adasyn_strategy = {
        submission_label: min(train_class_counts[submission_label] * 3, 5000) #5000
    }

    # Pipeline with combined sampling
    numeric_cols = X_train.select_dtypes(include=np.number).columns
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(f_classif, k=50)),
        ('smote', SMOTE(
            sampling_strategy=smote_strategy,
            k_neighbors=5,
            random_state=42
        )),
        ('adasyn', ADASYN(
            sampling_strategy=adasyn_strategy,
            n_neighbors=3,
            random_state=42
        ))
    ])
    
    # Process training data
    X_train_processed, y_train_processed = pipeline.fit_resample(
        X_train[numeric_cols], y_train
    )

    # Get selected features
    selector = pipeline.named_steps['selector']
    selected_features = X_train[numeric_cols].columns[selector.get_support()]
    
    # Save features to file
    pd.Series(list(selected_features)).to_json(f"{target_column}_features.json")
    
    # Process test data
    X_test_processed = pipeline[:-2].transform(X_test[numeric_cols])
    
    # Get selected features
    selected_features = X_train[numeric_cols].columns[
        pipeline.named_steps['selector'].get_support()
    ]

    # Class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_processed), y=y_train_processed)
    weight_dict = dict(zip(np.unique(y_train_processed), class_weights))

    # Model configurations
    models = {
        'Logistic Regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            solver='saga'
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=300,
            class_weight=weight_dict,
            min_samples_leaf=15,
            max_depth=8,
            random_state=42
        ),
        'XGBoost': xgb.XGBClassifier(
            objective='multi:softmax',
            num_class=len(np.unique(y_train_processed)),
            eval_metric='mlogloss',
            use_label_encoder=False,
            random_state=42
        ),
        'LightGBM': lgb.LGBMClassifier(
            class_weight=weight_dict,
            objective='multiclass',
            num_leaves=31,
            min_child_samples=20,
            random_state=42,
            verbose=-1
        )
    }

    # Train and evaluate
    results = {}
    for name, model in models.items():
        if name == 'LightGBM':
            # Preserve feature names
            train_df = pd.DataFrame(X_train_processed, columns=selected_features)
            test_df = pd.DataFrame(X_test_processed, columns=selected_features)
            model.fit(
                train_df, y_train_processed,
                eval_set=(test_df, y_test)
            )
        elif name == 'XGBoost':
            model.fit(X_train_processed, y_train_processed)
            # Save the XGBoost model to a JSON file
            model.save_model(f"{target_column}_xgboost_model.json") 
        else:
            model.fit(X_train_processed, y_train_processed)
        
        pred = model.predict(X_test_processed)
        results[name] = {
            'accuracy': accuracy_score(y_test, pred),
            'balanced_acc': balanced_accuracy_score(y_test, pred),
            'f1_macro': f1_score(y_test, pred, average='macro'),
            'report': classification_report(y_test, pred, target_names=le.classes_, zero_division=0)
        }
    
    print(f"\n{target_column} - Selected Features:")
    print(list(selected_features))
    
    return results, selected_features

# Train models
print("Training p1_method_target model...")
p1_results, p1_features = train_method_model('p1_method_target')

print("\nTraining p2_method_target model...")
p2_results, p2_features = train_method_model('p2_method_target')

# Save features to files
p1_features.to_series().to_json("p1_method_features.json")
p2_features.to_series().to_json("p2_method_features.json")

# Evaluation function
def print_results(results, model_name):
    print(f"\n{model_name} Results:")
    for name, res in results.items():
        print(f"\n{name}:")
        print(f"  Accuracy: {res['accuracy']:.4f}")
        print(f"  Balanced Accuracy: {res['balanced_acc']:.4f}")
        print(f"  Macro F1: {res['f1_macro']:.4f}")
        print("  Classification Report:\n", res['report'])

print_results(p1_results, "Fighter 1 Method Prediction")
print_results(p2_results, "Fighter 2 Method Prediction")


Training p1_method_target model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p1_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_slpm', 'p1_str_def', 'p1_td_avg', 'p1_td_def', 'p1_sub_avg', 'p2_height', 'p2_weight', 'p2_reach', 'p2_sapm', 'p2_str_def', 'p2_td_def', 'tdavg_diff', 'subavg_diff', 'p1_age_adjusted_slpm', 'p2_age_adjusted_str_def', 'p1_age_adjusted_td_avg', 'p1_age_adjusted_td_def', 'p1_age_adjusted_sub_avg', 'p1_kd_ema', 'p1_sub_att_ema', 'p1_r1_sub_att_ema', 'p1_sig_str_landed_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p2_total_str_landed_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p1_td_attempted_ema', 'p1_r1_sig_str_attempted_ema', 'p2_head_landed_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p1_body_landed_ema', 'p1_body_attempted_ema', 'p2_body_attempted_ema', 'p1_leg_landed_ema', 'p1_leg_attempted_ema', 'p1_distance_landed_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p1_r1_distance_landed_e

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p2_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_sapm', 'p1_str_def', 'p1_td_def', 'p2_height', 'p2_weight', 'p2_reach', 'p2_slpm', 'p2_str_def', 'p2_td_avg', 'p2_td_def', 'p2_sub_avg', 'slpm_diff', 'tdavg_diff', 'subavg_diff', 'p2_age_adjusted_slpm', 'p1_age_adjusted_str_def', 'p2_age_adjusted_td_avg', 'p2_age_adjusted_td_def', 'p2_age_adjusted_sub_avg', 'p2_kd_ema', 'p2_sub_att_ema', 'p2_ctrl_ema', 'p2_r1_sub_att_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p2_total_str_landed_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p2_td_attempted_ema', 'p2_r1_sig_str_attempted_ema', 'p2_head_landed_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p2_body_landed_ema', 'p2_body_attempted_ema', 'p2_leg_landed_ema', 'p2_leg_attempted_ema', 'p1_distance_landed_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p2_r1_distance_landed_ema', 'p2_r1_distance_a

