In [1]:
# Shell out to list the entries of ../feat_selection/ whose names contain "json".
!ls ../feat_selection/ | grep json

feat_selection_intersection_at_95.json
feat_selection_intersection_at_96.json
feat_selection_intersection_at_97.json
feat_selection_intersection_at_98.json
feat_selection_intersection_at_99.json
feat_selection_union_at_80.json
feat_selection_union_at_85.json
feat_selection_union_at_90.json
feat_selection_union_at_95.json
select_optuna_catb.json
select_optuna_combined_v23.json
select_optuna_combined_v24.json
select_optuna_combined_v34.json
select_optuna_lgbm_v1.json
select_optuna_lgbm_v2.json
select_optuna_lgbm_v3.json


In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

import sys
sys.path.append("../src")
from preproc import process_train_data

In [3]:
# Data directories, all relative to the notebook's working directory
path_raw = Path("../data/raw")
path_processed = Path("../data/processed")
path_results = Path("../data/results")

# Preprocessing switches: scaling, position features and text features are all off
preproc_options = dict(
    scale=False,
    include_position_features=False,
    include_text_features=False,
)

# Read the raw training table and hand it to the project's preprocessing routine.
# Five values are unpacked from the call; only the processed frame and the two
# column-name lists are reported below.
df_train = pd.read_csv(path_raw / "train.csv")
df_train, numerical_cols, categorical_cols, encoder, scaler = process_train_data(
    df_train, **preproc_options
)

# Report how many columns of each kind came back
print("Numerical Columns:", len(numerical_cols))
print("Categorical Columns:", len(categorical_cols))

number of all nan cols:  18
number of constant cols:  198
Numerical Columns: 588
Categorical Columns: 10


***
### feature selection overlap

In [4]:
def _read_selection(json_path):
    """Parse one feature-selection JSON file and return its decoded contents."""
    with open(json_path, 'r') as handle:
        return json.load(handle)


# Load the four select_optuna_* feature selections.
# Mind the numbering: fsv3 is read from the "catb" file, while fsv4 is read
# from the "lgbm_v3" file.
fsv1 = _read_selection('../feat_selection/select_optuna_lgbm_v1.json')
fsv2 = _read_selection('../feat_selection/select_optuna_lgbm_v2.json')
fsv3 = _read_selection('../feat_selection/select_optuna_catb.json')
fsv4 = _read_selection('../feat_selection/select_optuna_lgbm_v3.json')

In [5]:
# Pull the numerical / categorical feature lists out of each loaded selection.
fsv1_numerical, fsv1_categorical = fsv1["numerical"], fsv1["categorical"]
fsv2_numerical, fsv2_categorical = fsv2["numerical"], fsv2["categorical"]
fsv3_numerical, fsv3_categorical = fsv3["numerical"], fsv3["categorical"]
fsv4_numerical, fsv4_categorical = fsv4["numerical"], fsv4["categorical"]

# (label, numerical features, categorical features) for every selection being compared
fs_versions = [
    ('fsv1', fsv1_numerical, fsv1_categorical),
    ('fsv2', fsv2_numerical, fsv2_categorical),
    ('fsv3', fsv3_numerical, fsv3_categorical),
    ('fsv4', fsv4_numerical, fsv4_categorical),
]


def _overlap_pct(n_shared, n_total):
    """Return n_shared as a percentage of n_total, or 0.0 when n_total is 0."""
    # Guard: a selection may legitimately contain no features of one kind,
    # which would otherwise raise ZeroDivisionError.
    return n_shared / n_total * 100 if n_total else 0.0


# Build each set once instead of rebuilding it for every pairwise comparison.
fs_sets = [(set(num), set(cat)) for _, num, cat in fs_versions]

# Print a summary of each selection followed by its overlap with every other one.
for i, (name, num, cat) in enumerate(fs_versions):
    print('-'*100)
    print(f"{name}:")
    print(f"Number of numerical features: {len(num)}")
    print(f"Number of categorical features: {len(cat)}")
    print("\nCategorical features:")
    print(cat)

    num_set, cat_set = fs_sets[i]

    # Percentages are relative to the CURRENT selection's own size, so the
    # figure for A-vs-B generally differs from the one for B-vs-A.
    for j, (other_name, _, _) in enumerate(fs_versions):
        if i == j:  # Don't compare a selection with itself
            continue

        other_num_set, other_cat_set = fs_sets[j]
        num_overlap_set = num_set & other_num_set
        cat_overlap_set = cat_set & other_cat_set
        num_overlap_pct = _overlap_pct(len(num_overlap_set), len(num))
        cat_overlap_pct = _overlap_pct(len(cat_overlap_set), len(cat))

        print(f"\nMatch with {other_name}:")
        print(f"Numerical features overlap: {len(num_overlap_set)} features ({num_overlap_pct:.1f}%)")
        print(f"Categorical features overlap: {len(cat_overlap_set)} features ({cat_overlap_pct:.1f}%)")


----------------------------------------------------------------------------------------------------
fsv1:
Number of numerical features: 281
Number of categorical features: 6

Categorical features:
['agent1', 'agent2', 'agent1_playout', 'agent1_score_bounds', 'agent2_exploration_const', 'agent2_score_bounds']

Match with fsv2:
Numerical features overlap: 144 features (51.2%)
Categorical features overlap: 3 features (50.0%)

Match with fsv3:
Numerical features overlap: 142 features (50.5%)
Categorical features overlap: 4 features (66.7%)

Match with fsv4:
Numerical features overlap: 145 features (51.6%)
Categorical features overlap: 4 features (66.7%)
----------------------------------------------------------------------------------------------------
fsv2:
Number of numerical features: 296
Number of categorical features: 5

Categorical features:
['agent1', 'agent2', 'agent1_selection', 'agent1_playout', 'agent2_selection']

Match with fsv1:
Numerical features overlap: 144 features (48.6

In [6]:
# Combine fsv2 and fsv3.
#   * numerical features:   INTERSECTION of the two selections (`&`)
#   * categorical features: UNION of the two selections (`|`)
# sorted() gives the lists a fixed order, so the JSON written below is the same
# on every run; list(set(...)) order can change between kernel sessions.
fsv23_numerical = sorted(set(fsv2_numerical) & set(fsv3_numerical))
fsv23_categorical = sorted(set(fsv2_categorical) | set(fsv3_categorical))

print("FSV2 & FSV3 Intersection:")
print(f"Number of numerical features: {len(fsv23_numerical)}")
print(f"Number of categorical features: {len(fsv23_categorical)}")
print("\nCategorical features:")
print(fsv23_categorical)

# Save the combined selection using the same "numerical"/"categorical" keys
# that the input selection files are read with.
output = {
    "numerical": fsv23_numerical,
    "categorical": fsv23_categorical
}

with open("../feat_selection/select_optuna_combined_v23.json", "w") as f:
    json.dump(output, f)

FSV2 & FSV3 Intersection:
Number of numerical features: 141
Number of categorical features: 9

Categorical features:
['agent1_playout', 'agent2', 'agent2_exploration_const', 'agent2_playout', 'agent1_selection', 'agent1_exploration_const', 'agent2_score_bounds', 'agent2_selection', 'agent1']


In [7]:
# Combine fsv2 and fsv4.
#   * numerical features:   INTERSECTION of the two selections (`&`)
#   * categorical features: UNION of the two selections (`|`)
# sorted() gives the lists a fixed order, so the JSON written below is the same
# on every run; list(set(...)) order can change between kernel sessions.
fsv24_numerical = sorted(set(fsv2_numerical) & set(fsv4_numerical))
fsv24_categorical = sorted(set(fsv2_categorical) | set(fsv4_categorical))

print("FSV2 & FSV4 Intersection:")
print(f"Number of numerical features: {len(fsv24_numerical)}")
print(f"Number of categorical features: {len(fsv24_categorical)}")
print("\nCategorical features:")
print(fsv24_categorical)

# Save the combined selection using the same "numerical"/"categorical" keys
# that the input selection files are read with.
output = {
    "numerical": fsv24_numerical,
    "categorical": fsv24_categorical
}

with open("../feat_selection/select_optuna_combined_v24.json", "w") as f:
    json.dump(output, f)

FSV2 & FSV4 Intersection:
Number of numerical features: 147
Number of categorical features: 8

Categorical features:
['agent1_playout', 'agent2', 'agent2_exploration_const', 'agent2_playout', 'agent1_selection', 'agent2_score_bounds', 'agent2_selection', 'agent1']


In [8]:
# Combine fsv3 and fsv4.
#   * numerical features:   INTERSECTION of the two selections (`&`)
#   * categorical features: UNION of the two selections (`|`)
# sorted() gives the lists a fixed order, so the JSON written below is the same
# on every run; list(set(...)) order can change between kernel sessions.
fsv34_numerical = sorted(set(fsv3_numerical) & set(fsv4_numerical))
fsv34_categorical = sorted(set(fsv3_categorical) | set(fsv4_categorical))

print("FSV3 & FSV4 Intersection:")
print(f"Number of numerical features: {len(fsv34_numerical)}")
print(f"Number of categorical features: {len(fsv34_categorical)}")
print("\nCategorical features:")
print(fsv34_categorical)

# Save the combined selection using the same "numerical"/"categorical" keys
# that the input selection files are read with.
output = {
    "numerical": fsv34_numerical,
    "categorical": fsv34_categorical
}

with open("../feat_selection/select_optuna_combined_v34.json", "w") as f:
    json.dump(output, f)

FSV3 & FSV4 Intersection:
Number of numerical features: 148
Number of categorical features: 9

Categorical features:
['agent2', 'agent1_playout', 'agent2_exploration_const', 'agent2_playout', 'agent1_selection', 'agent1_exploration_const', 'agent2_score_bounds', 'agent2_selection', 'agent1']


In [9]:
# Get intersection of numerical and categorical features for fsv2, fsv3, fsv4
numerical_intersection = set(fsv2_numerical) & set(fsv3_numerical) & set(fsv4_numerical)
categorical_intersection = set(fsv2_categorical) & set(fsv3_categorical) & set(fsv4_categorical)

print("Intersection between fsv2, fsv3 and fsv4:")
print(f"Number of numerical features: {len(numerical_intersection)}")
print(f"Number of categorical features: {len(categorical_intersection)}")

Intersection between fsv2, fsv3 and fsv4:
Number of numerical features: 76
Number of categorical features: 2


***
### importance selection

In [10]:
# Shell out to list the entries of ../data/results whose names contain "importance".
!ls ../data/results | grep importance

feat_importance_catboost_cv1.csv
feat_importance_lightgbm_cv1.csv
feat_importance_xgboost_cv1.csv


In [11]:
def _load_importance(csv_path):
    """
    Read one feature-importance CSV and add a cumulative-importance column.

    The running total in `importance_relative_cumsum` is only meaningful as a
    "share covered by the top-k features" when rows are in descending order of
    `importance_relative`, so that order is enforced here rather than assumed
    from the file. The sort is stable: a file that is already in descending
    order keeps its exact row order.

    Args:
        csv_path (str): Path to a CSV with an `importance_relative` column.

    Returns:
        pandas.DataFrame: The table, sorted, with `importance_relative_cumsum` added.
    """
    imp = pd.read_csv(csv_path)
    imp = imp.sort_values(
        "importance_relative", ascending=False, kind="mergesort"
    ).reset_index(drop=True)
    imp["importance_relative_cumsum"] = imp["importance_relative"].cumsum()
    return imp


imp_lgb = _load_importance("../data/results/feat_importance_lightgbm_cv1.csv")
imp_cat = _load_importance("../data/results/feat_importance_catboost_cv1.csv")
imp_xgb = _load_importance("../data/results/feat_importance_xgboost_cv1.csv")

# Row count of the LightGBM importance table (shown as the cell's output)
len(imp_lgb)

598

In [12]:
def get_important_features(cut_off=0.9, join_type='intersection', importances=None, categorical=None):
    """
    Get important features based on importance scores from multiple models.

    For each model, the rows whose `importance_relative_cumsum` is strictly
    below `cut_off` are kept (so the feature whose row reaches or crosses the
    threshold is not included). The per-model feature sets are then combined
    and split into numerical and categorical names.

    Args:
        cut_off (float): Importance threshold between 0 and 1, compared against
            the `importance_relative_cumsum` column.
        join_type (str): Either 'intersection' or 'union' to determine how the
            per-model feature sets are combined.
        importances (dict | None): Mapping of model label -> DataFrame with
            `feature` and `importance_relative_cumsum` columns. When None, the
            notebook-level tables `imp_lgb`, `imp_cat` and `imp_xgb` are used.
        categorical (iterable | None): Names to treat as categorical. When
            None, the notebook-level `categorical_cols` is used.

    Returns:
        tuple: (numerical_features, categorical_features), two lists of feature
        names, each sorted so the result is identical from run to run.

    Raises:
        ValueError: If `join_type` is neither 'intersection' nor 'union'.
    """
    # Reject a bad join_type up front, before any work or printing happens.
    if join_type not in ('intersection', 'union'):
        raise ValueError("join_type must be either 'intersection' or 'union'")

    # Fall back to the notebook globals only when the caller gave nothing,
    # which keeps existing two-argument calls working unchanged.
    if importances is None:
        importances = {"LightGBM": imp_lgb, "CatBoost": imp_cat, "XGBoost": imp_xgb}
    if categorical is None:
        categorical = categorical_cols
    categorical_lookup = set(categorical)  # O(1) membership tests below

    # Get features for each model under the cutoff threshold
    print(f"Number of features after {cut_off*100}% importance cutoff:")
    feature_sets = []
    for model_name, imp in importances.items():
        below_cut = imp["importance_relative_cumsum"] < cut_off
        kept = imp.loc[below_cut, "feature"]
        print(f"{model_name}: {len(kept)} features")
        feature_sets.append(set(kept))

    if join_type == 'intersection':
        # set.intersection() needs at least one operand; no models -> nothing selected
        selected_features = set.intersection(*feature_sets) if feature_sets else set()
        print(f"\nFeatures common to all models: {len(selected_features)} features")
    else:
        selected_features = set().union(*feature_sets)
        print(f"Features in any model: {len(selected_features)} features")

    # Split into numerical and categorical features. Iterating in sorted order
    # removes the run-to-run variation of iterating a set of strings.
    numerical_features = []
    categorical_features = []
    for feat in sorted(selected_features):
        if feat in categorical_lookup:
            categorical_features.append(feat)
        else:
            numerical_features.append(feat)

    return numerical_features, categorical_features

In [13]:
# Features that every model keeps below a cumulative relative importance of 0.95
num_cols, cat_cols = get_important_features(cut_off=0.95, join_type='intersection')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_intersection_at_95.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 95.0% importance cutoff:
LightGBM: 271 features
CatBoost: 181 features
XGBoost: 379 features

Features common to all models: 141 features


In [14]:
# Features that every model keeps below a cumulative relative importance of 0.96
num_cols, cat_cols = get_important_features(cut_off=0.96, join_type='intersection')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_intersection_at_96.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 96.0% importance cutoff:
LightGBM: 293 features
CatBoost: 202 features
XGBoost: 399 features

Features common to all models: 157 features


In [15]:
# Features that every model keeps below a cumulative relative importance of 0.97
num_cols, cat_cols = get_important_features(cut_off=0.97, join_type='intersection')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_intersection_at_97.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 97.0% importance cutoff:
LightGBM: 319 features
CatBoost: 232 features
XGBoost: 421 features

Features common to all models: 183 features


In [16]:
# Features that every model keeps below a cumulative relative importance of 0.98
num_cols, cat_cols = get_important_features(cut_off=0.98, join_type='intersection')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_intersection_at_98.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 98.0% importance cutoff:
LightGBM: 353 features
CatBoost: 273 features
XGBoost: 448 features

Features common to all models: 227 features


In [17]:
# Features that every model keeps below a cumulative relative importance of 0.99
num_cols, cat_cols = get_important_features(cut_off=0.99, join_type='intersection')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_intersection_at_99.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 99.0% importance cutoff:
LightGBM: 408 features
CatBoost: 337 features
XGBoost: 481 features

Features common to all models: 296 features


In [18]:
# Features that at least one model keeps below a cumulative relative importance of 0.80
num_cols, cat_cols = get_important_features(cut_off=0.80, join_type='union')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_union_at_80.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 80.0% importance cutoff:
LightGBM: 98 features
CatBoost: 66 features
XGBoost: 204 features
Features in any model: 223 features


In [19]:
# Features that at least one model keeps below a cumulative relative importance of 0.85
num_cols, cat_cols = get_important_features(cut_off=0.85, join_type='union')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_union_at_85.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 85.0% importance cutoff:
LightGBM: 138 features
CatBoost: 85 features
XGBoost: 248 features
Features in any model: 272 features


In [20]:
# Features that at least one model keeps below a cumulative relative importance of 0.90
num_cols, cat_cols = get_important_features(cut_off=0.90, join_type='union')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_union_at_90.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 90.0% importance cutoff:
LightGBM: 192 features
CatBoost: 118 features
XGBoost: 304 features
Features in any model: 336 features


In [21]:
# Features that at least one model keeps below a cumulative relative importance of 0.95
num_cols, cat_cols = get_important_features(cut_off=0.95, join_type='union')

# Write the selection to disk. The keys here are "numerical_features" /
# "categorical_features", unlike the "numerical" / "categorical" keys read from
# the select_optuna_* files, so readers must use the matching key names.
output = {
    "numerical_features": num_cols,
    "categorical_features": cat_cols,
}
with open("../feat_selection/feat_selection_union_at_95.json", "w") as json_file:
    json.dump(output, json_file)

Number of features after 95.0% importance cutoff:
LightGBM: 271 features
CatBoost: 181 features
XGBoost: 379 features
Features in any model: 406 features


***