In [4]:
import pandas as pd
import numpy as np

# Import the first CSV file
methods_df = pd.read_csv('ufc_methods.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#process data for training

# Drop the columns
columns_to_drop = ['p1_fighter', 'p2_fighter', 'p1_Other_wins', 'p2_Other_wins'] #method
methods_df = methods_df.drop(columns=columns_to_drop)
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Clean all column names
def clean_column_name(col):
  return col.lower().replace(' ', '_').replace('.', '').replace('-', '_')

# Apply to all columns
methods_df.columns = [clean_column_name(col) for col in methods_df.columns]

# Identify all categorical columns
categorical_cols = ['p1_stance', 'p2_stance']

# One-hot encode all categorical variables
methods_df = pd.get_dummies(methods_df, columns=categorical_cols)

methods_df = methods_df.sample(frac=1, random_state=42).reset_index(drop=True)

# encode the referee using frequency
ref_counts = methods_df['referee'].value_counts()
methods_df['referee_freq'] = methods_df['referee'].map(ref_counts)
methods_df.drop(columns=['referee'], inplace=True)


methods_df.head()

Unnamed: 0,winner,p1_height,p1_weight,p1_reach,p1_slpm,p1_str_acc,p1_sapm,p1_str_def,p1_td_avg,p1_td_acc,p1_td_def,p1_sub_avg,p2_height,p2_weight,p2_reach,p2_slpm,p2_str_acc,p2_sapm,p2_str_def,p2_td_avg,p2_td_acc,p2_td_def,p2_sub_avg,p1_age_at_event,p2_age_at_event,height_diff,reach_diff,weight_diff,age_diff,slpm_diff,stracc_diff,sapm_diff,strdef_diff,tdavg_diff,tdacc_diff,tddef_diff,subavg_diff,p1_days_since_last_fight,p2_days_since_last_fight,days_since_last_fight_diff,p1_wins,p1_losses,p1_total,p2_wins,p2_losses,p2_total,win_diff,loss_diff,total_diff,p1_win_streak,p2_win_streak,p1_age_adjusted_slpm,p2_age_adjusted_slpm,p1_age_adjusted_str_acc,p2_age_adjusted_str_acc,p1_age_adjusted_sapm,p2_age_adjusted_sapm,p1_age_adjusted_str_def,p2_age_adjusted_str_def,p1_age_adjusted_td_avg,p2_age_adjusted_td_avg,p1_age_adjusted_td_acc,p2_age_adjusted_td_acc,p1_age_adjusted_td_def,p2_age_adjusted_td_def,p1_age_adjusted_sub_avg,p2_age_adjusted_sub_avg,p1_kd_ema,p2_kd_ema,p1_sig_str_pct_ema,p2_sig_str_pct_ema,p1_td_pct_ema,p2_td_pct_ema,p1_sub_att_ema,p2_sub_att_ema,p1_rev_ema,p2_rev_ema,p1_ctrl_ema,p2_ctrl_ema,p1_r1_kd_ema,p2_r1_kd_ema,p1_r1_sig_str_pct_ema,p2_r1_sig_str_pct_ema,p1_r1_td_pct_ema,p2_r1_td_pct_ema,p1_r1_sub_att_ema,p2_r1_sub_att_ema,p1_r1_rev_ema,p2_r1_rev_ema,p1_r1_ctrl_ema,p2_r1_ctrl_ema,p1_sig_str_pct_detailed_ema,p2_sig_str_pct_detailed_ema,p1_r1_sig_str_pct_detailed_ema,p2_r1_sig_str_pct_detailed_ema,p1_sig_str_landed_ema,p2_sig_str_landed_ema,p1_sig_str_attempted_ema,p2_sig_str_attempted_ema,p1_total_str_landed_ema,p2_total_str_landed_ema,p1_total_str_attempted_ema,p2_total_str_attempted_ema,p1_td_landed_ema,p2_td_landed_ema,p1_td_attempted_ema,p2_td_attempted_ema,p1_r1_sig_str_landed_ema,p2_r1_sig_str_landed_ema,p1_r1_sig_str_attempted_ema,p2_r1_sig_str_attempted_ema,p1_r1_total_str_landed_ema,p2_r1_total_str_landed_ema,p1_r1_total_str_attempted_ema,p2_r1_total_str_attempted_ema,p1_r1_td_landed_ema,p2_r1_td_landed_ema,p1_r1_td_attempted_ema,p2_r1_td_attempted_ema,p1_head_landed_ema,p2_head_landed_ema,p1_head_attempted_ema,p2_head_attempted_ema,p1_body_landed_ema,p2_body_landed_ema,p1_body_attempted_ema,p2_body_attempted_ema,p1_leg_landed_ema,p2_leg_landed_ema,p1_leg_attempted_ema,p2_leg_attempted_ema,p1_distance_landed_ema,p2_distance_landed_ema,p1_distance_attempted_ema,p2_distance_attempted_ema,p1_clinch_landed_ema,p2_clinch_landed_ema,p1_clinch_attempted_ema,p2_clinch_attempted_ema,p1_ground_landed_ema,p2_ground_landed_ema,p1_ground_attempted_ema,p2_ground_attempted_ema,p1_r1_head_landed_ema,p2_r1_head_landed_ema,p1_r1_head_attempted_ema,p2_r1_head_attempted_ema,p1_r1_body_landed_ema,p2_r1_body_landed_ema,p1_r1_body_attempted_ema,p2_r1_body_attempted_ema,p1_r1_leg_landed_ema,p2_r1_leg_landed_ema,p1_r1_leg_attempted_ema,p2_r1_leg_attempted_ema,p1_r1_distance_landed_ema,p2_r1_distance_landed_ema,p1_r1_distance_attempted_ema,p2_r1_distance_attempted_ema,p1_r1_clinch_landed_ema,p2_r1_clinch_landed_ema,p1_r1_clinch_attempted_ema,p2_r1_clinch_attempted_ema,p1_r1_ground_landed_ema,p2_r1_ground_landed_ema,p1_r1_ground_attempted_ema,p2_r1_ground_attempted_ema,method_grouped,p1_decision_wins,p2_decision_wins,p1_ko/tko_wins,p2_ko/tko_wins,p1_submission_wins,p2_submission_wins,p1_stance_Open Stance,p1_stance_Orthodox,p1_stance_Sideways,p1_stance_Southpaw,p1_stance_Switch,p2_stance_Open Stance,p2_stance_Orthodox,p2_stance_Sideways,p2_stance_Southpaw,p2_stance_Switch,referee_freq
0,1,70.0,145.0,74.0,2.41,0.4,1.85,0.63,2.22,0.33,0.6,1.3,69.0,155.0,68.0,1.57,0.5,1.29,0.51,1.24,0.5,0.25,1.2,31.112936,24.974675,1.0,6.0,-10.0,6.138261,0.84,-0.1,0.56,0.12,0.98,-0.17,0.35,0.1,93.0,,,4,2,6,0,0,0,4,2,6,1,0,0.07746,0.062864,0.012856,0.02002,0.059461,0.051652,0.020249,0.020421,0.071353,0.04965,0.010607,0.02002,0.019285,0.01001,0.041783,0.048049,0.0,,0.544762,,0.620645,,1.777778,,0.253968,,260.380952,,0.0,,0.516508,,0.426667,,0.444444,,0.0,,98.539683,,0.544762,,0.516508,,27.650794,,51.238095,,76.777778,,103.349206,,0.857143,,1.587302,,7.269841,,15.460317,,22.301587,,30.936508,,0.253968,,0.698413,,17.349206,,39.253968,,4.365079,,5.412698,,5.936508,,6.571429,,11.301587,,23.793651,,2.873016,,3.793651,,13.47619,,23.650794,,4.650794,,11.984127,,1.507937,,2.301587,,1.111111,,1.174603,,2.222222,,6.68254,,1.730159,,2.650794,,3.31746,,6.126984,,KO/TKO,0,0,1,0,3,0,False,False,False,True,False,False,False,False,True,False,264.0
1,1,67.0,145.0,73.0,2.32,0.38,3.23,0.47,0.37,0.07,0.35,1.5,72.0,145.0,,1.13,0.35,2.36,0.48,2.67,0.53,0.12,0.7,25.552361,34.294319,-5.0,,0.0,-8.741958,1.19,0.03,0.87,-0.01,-2.3,-0.46,0.23,0.8,210.0,210.0,0.0,0,1,1,1,0,1,-1,1,0,0,1,0.090794,0.03295,0.014871,0.010206,0.126407,0.068816,0.018394,0.013996,0.01448,0.077855,0.002739,0.015454,0.013697,0.003499,0.058703,0.020412,0.0,0.0,0.38,0.38,0.0,0.33,1.0,0.0,0.0,0.0,65.0,197.0,0.0,0.0,0.47,0.39,0.0,1.0,1.0,0.0,0.0,0.0,30.0,197.0,0.38,0.38,0.47,0.39,32.0,30.0,83.0,78.0,74.0,67.0,129.0,118.0,0.0,1.0,4.0,3.0,10.0,9.0,21.0,23.0,27.0,39.0,38.0,55.0,0.0,1.0,1.0,1.0,16.0,17.0,61.0,60.0,6.0,10.0,9.0,14.0,10.0,3.0,13.0,4.0,24.0,24.0,71.0,65.0,1.0,1.0,1.0,3.0,7.0,5.0,11.0,10.0,7.0,6.0,17.0,19.0,3.0,2.0,4.0,3.0,0.0,1.0,0.0,1.0,3.0,4.0,13.0,13.0,1.0,0.0,1.0,0.0,6.0,5.0,7.0,10.0,Decision,0,1,0,0,0,0,False,True,False,False,False,False,True,False,False,False,389.0
2,0,75.0,205.0,76.0,3.78,0.41,3.29,0.46,0.0,0.0,0.0,0.0,71.0,205.0,70.0,2.08,0.55,4.21,0.4,5.08,0.36,0.0,1.4,38.513347,30.255989,4.0,6.0,0.0,8.257358,1.7,-0.14,-0.92,0.06,-5.08,-0.36,0.0,-1.4,118.0,175.0,-57.0,1,1,2,1,0,1,0,1,1,1,1,0.098148,0.068747,0.010646,0.018178,0.085425,0.139146,0.011944,0.013221,0.0,0.167901,0.0,0.011898,0.0,0.0,0.0,0.046272,1.0,0.0,0.42,0.0,0.0,1.0,0.0,1.0,0.0,0.0,277.333333,29.0,0.0,0.0,0.366667,0.0,,1.0,0.0,1.0,0.0,0.0,116.0,29.0,0.42,0.0,0.366667,0.0,44.0,0.0,104.666667,4.0,124.666667,0.0,191.666667,4.0,0.0,1.0,0.666667,1.0,17.666667,0.0,47.666667,4.0,51.0,0.0,83.333333,4.0,0.0,1.0,0.0,1.0,30.0,0.0,87.333333,4.0,11.333333,0.0,12.666667,0.0,2.666667,0.0,4.666667,0.0,22.333333,0.0,70.666667,4.0,13.666667,0.0,20.0,0.0,8.0,0.0,14.0,0.0,13.0,0.0,41.666667,4.0,3.333333,0.0,4.0,0.0,1.333333,0.0,2.0,0.0,10.333333,0.0,34.666667,4.0,4.666667,0.0,7.0,0.0,2.666667,0.0,6.0,0.0,Submission,0,0,1,0,0,1,False,True,False,False,False,False,True,False,False,False,14.0
3,0,74.0,265.0,75.0,2.08,0.46,1.52,0.59,1.83,0.41,0.66,0.1,74.0,240.0,80.0,3.29,0.48,3.8,0.44,1.82,0.46,0.37,1.9,28.301164,37.423682,0.0,-5.0,25.0,-9.122519,-1.21,-0.02,-2.28,0.15,0.01,-0.05,0.29,-1.8,147.0,147.0,0.0,3,0,3,1,0,1,2,0,2,3,1,0.073495,0.087912,0.016254,0.012826,0.053708,0.10154,0.020847,0.011757,0.064662,0.048632,0.014487,0.012292,0.023321,0.009887,0.003533,0.05077,0.0,0.0,0.655714,0.45,0.385714,0.25,0.0,1.0,0.0,0.0,654.571429,42.0,0.0,0.0,0.585714,0.45,0.285714,0.25,0.0,1.0,0.0,0.0,186.571429,42.0,0.655714,0.45,0.585714,0.45,39.0,10.0,62.714286,22.0,130.428571,13.0,165.714286,26.0,1.571429,1.0,4.0,4.0,9.142857,10.0,17.285714,22.0,38.571429,13.0,50.428571,26.0,0.285714,1.0,2.142857,4.0,17.285714,6.0,38.428571,18.0,10.714286,4.0,12.571429,4.0,11.0,0.0,11.714286,0.0,7.0,7.0,25.142857,18.0,13.714286,3.0,15.857143,4.0,18.285714,0.0,21.714286,0.0,2.571429,6.0,9.857143,18.0,2.857143,4.0,3.571429,4.0,3.714286,0.0,3.857143,0.0,1.714286,7.0,7.571429,18.0,3.428571,3.0,3.428571,4.0,4.0,0.0,6.285714,0.0,KO/TKO,3,0,0,0,0,1,False,True,False,False,False,False,True,False,False,False,117.0
4,0,73.0,253.0,75.0,3.7,0.57,2.9,0.51,1.4,0.45,0.59,0.5,71.0,205.0,70.0,2.08,0.55,4.21,0.4,5.08,0.36,0.0,1.4,31.403149,29.776865,2.0,5.0,48.0,1.626283,1.62,0.02,-1.31,0.11,-3.68,0.09,0.59,-0.9,210.0,,,3,1,4,0,0,0,3,1,4,1,0,0.117823,0.069853,0.018151,0.018471,0.092347,0.141385,0.01624,0.013433,0.044582,0.170602,0.01433,0.01209,0.018788,0.0,0.015922,0.047016,0.733333,,0.666667,,,,1.066667,,0.0,,140.2,,0.733333,,0.666667,,,,1.066667,,0.0,,140.2,,0.666667,,0.666667,,19.666667,,28.0,,51.0,,63.2,,0.0,,0.0,,19.666667,,28.0,,51.0,,63.2,,0.0,,0.0,,15.0,,22.733333,,1.333333,,1.933333,,3.333333,,3.333333,,6.2,,11.066667,,1.333333,,1.4,,12.133333,,15.533333,,15.0,,22.733333,,1.333333,,1.933333,,3.333333,,3.333333,,6.2,,11.066667,,1.333333,,1.4,,12.133333,,15.533333,,Submission,0,0,2,0,1,0,False,True,False,False,False,False,True,False,False,False,20.0


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, balanced_accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
from collections import Counter

# Load and prepare data
methods_df = methods_df[methods_df['method_grouped'] != 'Other']

# Create targets with proper null handling
methods_df['p1_method_target'] = np.where(methods_df['winner'] == 1, methods_df['method_grouped'], np.nan)
methods_df['p2_method_target'] = np.where(methods_df['winner'] == 0, methods_df['method_grouped'], np.nan)

# Feature engineering
excluded_cols = ['winner', 'method_grouped', 'p1_method_target', 'p2_method_target']
feature_cols = [col for col in methods_df.columns if col not in excluded_cols]

def train_method_model(target_column):
    # Filter and encode data
    df = methods_df.dropna(subset=[target_column]).copy()
    X = df[feature_cols]
    y = df[target_column]
    
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    submission_label = le.transform(['Submission'])[0]
    
    # Split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
    )
    
    # Get class counts and define strategies
    train_class_counts = Counter(y_train)
    smote_strategy = {
        cls: min(count * 2, 2000) 
        for cls, count in train_class_counts.items() 
        if cls != submission_label
    }
    adasyn_strategy = {
        submission_label: min(train_class_counts[submission_label] * 3, 2500)
    }

    # Pipeline with combined sampling
    numeric_cols = X_train.select_dtypes(include=np.number).columns
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(f_classif, k=50)),
        ('smote', SMOTE(
            sampling_strategy=smote_strategy,
            k_neighbors=5,
            random_state=42
        )),
        ('adasyn', ADASYN(
            sampling_strategy=adasyn_strategy,
            n_neighbors=3,
            random_state=42
        ))
    ])
    
    # Process training data
    X_train_processed, y_train_processed = pipeline.fit_resample(
        X_train[numeric_cols], y_train
    )

    # Get selected features
    selector = pipeline.named_steps['selector']
    selected_features = X_train[numeric_cols].columns[selector.get_support()]
    
    # Save features to file
    pd.Series(list(selected_features)).to_json(f"{target_column}_features.json")
    
    # Process test data
    X_test_processed = pipeline[:-2].transform(X_test[numeric_cols])
    
    # Get selected features
    selected_features = X_train[numeric_cols].columns[
        pipeline.named_steps['selector'].get_support()
    ]

    # Class weights
    class_weights = compute_class_weight('balanced', 
                                       classes=np.unique(y_train_processed),
                                       y=y_train_processed)
    weight_dict = dict(zip(np.unique(y_train_processed), class_weights))

    # Model configurations
    models = {
        'Logistic Regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            solver='saga'
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=300,
            class_weight=weight_dict,
            min_samples_leaf=15,
            max_depth=8,
            random_state=42
        ),
        'XGBoost': xgb.XGBClassifier(
            objective='multi:softmax',
            num_class=len(np.unique(y_train_processed)),
            eval_metric='mlogloss',
            use_label_encoder=False,
            random_state=42
        ),
        'LightGBM': lgb.LGBMClassifier(
            class_weight=weight_dict,
            objective='multiclass',
            num_leaves=31,
            min_child_samples=20,
            random_state=42,
            verbose=-1
        )
    }

    # Train and evaluate
    results = {}
    for name, model in models.items():
        if name == 'LightGBM':
            # Preserve feature names
            train_df = pd.DataFrame(X_train_processed, columns=selected_features)
            test_df = pd.DataFrame(X_test_processed, columns=selected_features)
            model.fit(
                train_df, y_train_processed,
                eval_set=(test_df, y_test)
            )
        elif name == 'XGBoost':
            model.fit(X_train_processed, y_train_processed)
            # Save the XGBoost model to a JSON file
            model.save_model(f"{target_column}_xgboost_model.json") 
        else:
            model.fit(X_train_processed, y_train_processed)
        
        pred = model.predict(X_test_processed)
        results[name] = {
            'accuracy': accuracy_score(y_test, pred),
            'balanced_acc': balanced_accuracy_score(y_test, pred),
            'f1_macro': f1_score(y_test, pred, average='macro'),
            'report': classification_report(y_test, pred, target_names=le.classes_, zero_division=0)
        }
    
    print(f"\n{target_column} - Selected Features:")
    print(list(selected_features))
    
    return results, selected_features

# Train models
print("Training p1_method_target model...")
p1_results, p1_features = train_method_model('p1_method_target')

print("\nTraining p2_method_target model...")
p2_results, p2_features = train_method_model('p2_method_target')

# Save features to files
p1_features.to_series().to_json("p1_method_features.json")
p2_features.to_series().to_json("p2_method_features.json")

# Evaluation function
def print_results(results, model_name):
    print(f"\n{model_name} Results:")
    for name, res in results.items():
        print(f"\n{name}:")
        print(f"  Accuracy: {res['accuracy']:.4f}")
        print(f"  Balanced Accuracy: {res['balanced_acc']:.4f}")
        print(f"  Macro F1: {res['f1_macro']:.4f}")
        print("  Classification Report:\n", res['report'])

print_results(p1_results, "Fighter 1 Method Prediction")
print_results(p2_results, "Fighter 2 Method Prediction")


Training p1_method_target model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p1_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_slpm', 'p1_str_def', 'p1_td_avg', 'p1_td_def', 'p1_sub_avg', 'p2_height', 'p2_weight', 'p2_reach', 'p2_sapm', 'p2_str_def', 'p2_td_def', 'slpm_diff', 'subavg_diff', 'p1_age_adjusted_slpm', 'p2_age_adjusted_str_def', 'p1_age_adjusted_td_avg', 'p1_age_adjusted_td_def', 'p1_age_adjusted_sub_avg', 'p1_kd_ema', 'p1_sub_att_ema', 'p1_r1_sub_att_ema', 'p1_sig_str_landed_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p1_td_attempted_ema', 'p1_r1_sig_str_attempted_ema', 'p1_r1_td_attempted_ema', 'p2_head_landed_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p1_body_landed_ema', 'p1_body_attempted_ema', 'p1_leg_landed_ema', 'p1_leg_attempted_ema', 'p1_distance_landed_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p1_r1_body_attempted_ema', 'p1_r1_distance_landed_

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



p2_method_target - Selected Features:
['p1_height', 'p1_weight', 'p1_reach', 'p1_str_acc', 'p1_str_def', 'p2_height', 'p2_weight', 'p2_reach', 'p2_slpm', 'p2_sapm', 'p2_str_def', 'p2_td_avg', 'p2_td_def', 'p2_sub_avg', 'slpm_diff', 'tdavg_diff', 'subavg_diff', 'p2_age_adjusted_slpm', 'p2_age_adjusted_sapm', 'p1_age_adjusted_str_def', 'p2_age_adjusted_str_def', 'p2_age_adjusted_td_avg', 'p2_age_adjusted_td_def', 'p2_age_adjusted_sub_avg', 'p2_kd_ema', 'p2_sub_att_ema', 'p2_ctrl_ema', 'p2_r1_sub_att_ema', 'p2_sig_str_landed_ema', 'p1_sig_str_attempted_ema', 'p2_sig_str_attempted_ema', 'p2_total_str_landed_ema', 'p1_total_str_attempted_ema', 'p2_total_str_attempted_ema', 'p2_td_landed_ema', 'p2_td_attempted_ema', 'p2_r1_sig_str_attempted_ema', 'p2_r1_total_str_attempted_ema', 'p2_r1_td_landed_ema', 'p2_r1_td_attempted_ema', 'p2_head_landed_ema', 'p1_head_attempted_ema', 'p2_head_attempted_ema', 'p2_distance_landed_ema', 'p1_distance_attempted_ema', 'p2_distance_attempted_ema', 'p2_r1_dis

