In [None]:
# import dataset
import pandas as pd
import numpy as np

# Read the CP949-encoded CSV export. `data` is kept as the untouched raw frame;
# every preprocessing step below operates on the independent copy `df`.
data = pd.read_csv(
    '2022__20240527_29067.csv',
    encoding='CP949',
    engine='python',
)
df = data.copy()

In [None]:
# Full per-column summary (dtype and non-null count) of the freshly loaded frame.
df.info(verbose=True, show_counts=True)

In [None]:
# Replace the original CSV headers with positional string labels '1'..'611'.
# The assignment raises if the frame does not have exactly 611 columns.
columns = list(map(str, range(1, 612)))
df.columns = columns
df.info(verbose=True, show_counts=True)

In [None]:
# Positional column labels that are removed from `df` before modelling.
drop_cols = [
    '1', '3', '4', '5', '6', '58', '59',
    '196', '317', '318', '319', '560', '593',
    '120', '121', '122', '123', '124',
    '168', '176', '556',
]

In [None]:
# Columns treated as continuous/numeric; every other column of `df` is handled
# as an ordinal (label-encoded) feature.
num_features = [
    '5', '6', '13', '15', '32', '33', '61', '65', '67', '75', '76', '78', '79',
    '83', '97', '153', '201', '214', '215', '217', '218', '219', '220', '226',
    '283', '314', '523', '534', '535', '570', '572', '573', '574', '575', '583',
]

# Index.difference returns the remaining labels as a sorted Index.
ordinal_features = df.columns.difference(num_features)

In [None]:
# Keep only rows that have a value in column '119', then remove the columns
# listed in `drop_cols`.
df = df.dropna(subset=['119']).drop(columns=drop_cols)
df.info(verbose=True, show_counts=True)

In [None]:
# Discard columns in which more than 70% of the values are missing, i.e. keep
# a column only when at least 30% of its rows are non-null.
min_non_null = len(df) * 0.3
df = df.dropna(axis=1, thresh=min_non_null)
df.info(verbose=True, show_counts=True)

In [None]:
# Distribution of column '119' before it is regrouped.
df.loc[:, '119'].value_counts()

In [None]:
# Collapse the five original levels of column '119' into three classes:
# 1, 2 -> 0; 3 -> 1; 4, 5 -> 2.
reclassification_map = dict.fromkeys((1.0, 2.0), 0)
reclassification_map[3.0] = 1
reclassification_map.update(dict.fromkeys((4.0, 5.0), 2))

# Recode the column in place on `df`.
recoded_target = df['119'].replace(reclassification_map)
df['119'] = recoded_target

In [None]:
# Distribution of column '119' after regrouping into classes 0/1/2.
df.loc[:, '119'].value_counts()

In [None]:
# Recompute the numeric / ordinal column lists against the columns that
# survived the drops above. Numeric columns keep the frame's column order;
# the ordinal list is sorted because it comes from Index.difference.
is_ordinal = df.columns.isin(ordinal_features)
new_num_features = df.columns[~is_ordinal].tolist()
new_ordinal_features = list(df.columns.difference(new_num_features))

In [None]:
# Label-encode every ordinal column in place and keep one fitted encoder per
# column so the mapping can be reused or inverted later.
from sklearn.preprocessing import LabelEncoder

label_encoders = {col: LabelEncoder() for col in new_ordinal_features}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [None]:
# Standardise the numeric columns in place.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split that follows, so held-out rows contribute to the mean/std used here.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df[new_num_features])
df[new_num_features] = scaler.transform(df[new_num_features])

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target column '119'.
target = df['119']
features = df.drop(columns=['119'])
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=123,
    stratify=target,
)

BorutaShap

In [None]:
from BorutaShap import BorutaShap
from catboost import CatBoostClassifier

# BorutaShap feature selection driven by a class-balanced CatBoost model.
model = CatBoostClassifier(
    loss_function='MultiClass',
    auto_class_weights='Balanced',
    random_state=123,
    verbose=False,
)
Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True,
)
Feature_Selector.fit(
    X=X_train,
    y=Y_train,
    n_trials=100,
    random_state=123,
    stratify=Y_train,
)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_cb_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the CatBoost model.
selected_cb_columns

In [None]:
from BorutaShap import BorutaShap
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# BorutaShap feature selection driven by XGBoost.
# XGBClassifier has no `class_weights` / `class_weight` parameter: the original
# keyword was forwarded as an unknown booster parameter and had no balancing
# effect. Class balancing is applied through per-row sample weights instead,
# which BorutaShap.fit forwards to the model's fit call.
model = XGBClassifier(random_state=123)
balanced_weights = compute_sample_weight(class_weight='balanced', y=Y_train)

Feature_Selector = BorutaShap(model=model, importance_measure='shap', classification=True)
Feature_Selector.fit(
    X=X_train,
    y=Y_train,
    sample_weight=balanced_weights,
    n_trials=100,
    random_state=123,
    stratify=Y_train,
)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_xgb_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the XGBoost model.
selected_xgb_columns

In [None]:
from BorutaShap import BorutaShap
from lightgbm import LGBMClassifier

# BorutaShap feature selection driven by a class-balanced LightGBM model.
# The scikit-learn wrapper's parameter is `class_weight` (singular); the
# original `class_weights` was passed through as an unknown extra parameter,
# so no class balancing was applied.
model = LGBMClassifier(class_weight='balanced', random_state=123, verbose=-1)
Feature_Selector = BorutaShap(model=model, importance_measure='shap', classification=True)
Feature_Selector.fit(X=X_train, y=Y_train, n_trials=100, random_state=123, stratify=Y_train)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_lgb_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the LightGBM model.
selected_lgb_columns

In [None]:
from BorutaShap import BorutaShap
from sklearn.ensemble import RandomForestClassifier

# BorutaShap feature selection driven by a class-balanced random forest.
model = RandomForestClassifier(
    class_weight='balanced',
    random_state=123,
)
Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True,
)
Feature_Selector.fit(
    X=X_train,
    y=Y_train,
    n_trials=100,
    random_state=123,
    stratify=Y_train,
)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_rf_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the random forest model.
selected_rf_columns

In [None]:
from BorutaShap import BorutaShap
from sklearn.ensemble import HistGradientBoostingClassifier

# BorutaShap feature selection driven by a class-balanced
# histogram gradient boosting model.
model = HistGradientBoostingClassifier(
    class_weight='balanced',
    random_state=123,
)
Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True,
)
Feature_Selector.fit(
    X=X_train,
    y=Y_train,
    n_trials=100,
    random_state=123,
    stratify=Y_train,
)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_hgb_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the histogram gradient boosting model.
selected_hgb_columns

In [None]:
from BorutaShap import BorutaShap
from sklearn.ensemble import ExtraTreesClassifier

# BorutaShap feature selection driven by a class-balanced extra-trees model.
model = ExtraTreesClassifier(
    class_weight='balanced',
    random_state=123,
)
Feature_Selector = BorutaShap(
    model=model,
    importance_measure='shap',
    classification=True,
)
Feature_Selector.fit(
    X=X_train,
    y=Y_train,
    n_trials=100,
    random_state=123,
    stratify=Y_train,
)

# Keep the column labels of the subset returned by the selector and plot the accepted ones.
selected_et_columns = Feature_Selector.Subset().columns
Feature_Selector.plot(which_features='Accepted', X_size=11)

In [None]:
# Display the columns BorutaShap kept when driven by the extra-trees model.
selected_et_columns

In [None]:
# Hard-coded feature subsets, one per selector model. These assignments
# overwrite the values computed by the BorutaShap cells above, so the
# downstream cells use these lists rather than a fresh selection run.
selected_cb_columns = ['531', '171', '167', '170', '558', '90', '175']
selected_xgb_columns = ['166', '175', '575', '558', '574']
selected_lgb_columns = [
    '574', '531', '171', '166', '575', '170', '558', '175',
]
selected_rf_columns = [
    '574', '531', '171', '166', '173', '167',
    '575', '172', '170', '558', '175', '169',
]
selected_hgb_columns = ['574', '175', '170', '558']
selected_et_columns = [
    '125', '12', '72', '172', '170', '558', '531', '116', '463', '539',
    '90', '118', '169', '171', '166', '167', '452', '576', '553', '175',
]

In [None]:
from sklearn.model_selection import StratifiedKFold

# Shared 5-fold stratified splitter (no shuffling) used by every
# cross-validation run below.
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=False)


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Baseline estimators (library defaults, fixed seed) compared throughout the notebook.
models_dict = dict(
    catboost=CatBoostClassifier(random_state=123, verbose=False),
    xgboost=XGBClassifier(random_state=123),
    lightgbm=LGBMClassifier(random_state=123, verbose=-1),
    random_forest=RandomForestClassifier(random_state=123),
    extra_trees=ExtraTreesClassifier(random_state=123),
    hist_gradient_boosting=HistGradientBoostingClassifier(random_state=123),
)

# 5-fold cross-validation of every baseline on all training features.
# Metrics: weighted F1, weighted precision, weighted recall and plain accuracy.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train, Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_cb_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_cb_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_xgb_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_xgb_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_lgb_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_lgb_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_rf_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_rf_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_hgb_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_hgb_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
# 5-fold cross-validation of every baseline model, restricted to the
# feature subset stored in selected_et_columns.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict.items():
    print(f"Model: {model_name}")
    scores = cross_validate(model, X_train[selected_et_columns], Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

Solving imbalance

In [None]:
# Restrict both splits to the feature subset in selected_rf_columns; copies
# are taken so later edits do not touch X_train / X_test.
X_train_final = X_train.loc[:, selected_rf_columns].copy()
X_test_final = X_test.loc[:, selected_rf_columns].copy()

# Show the resulting (rows, columns) of each split.
(X_train_final.shape, X_test_final.shape)

In [None]:
# Apply SMOTE-Tomek resampling inside each cross-validation fold.
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

smote_tomek = SMOTETomek(random_state=123)

# Same baseline estimators as before, now each wrapped in a resampling pipeline.
models_dict_smote = dict(
    catboost=CatBoostClassifier(random_state=123, verbose=False),
    xgboost=XGBClassifier(random_state=123),
    lightgbm=LGBMClassifier(random_state=123, verbose=-1),
    random_forest=RandomForestClassifier(random_state=123),
    extra_trees=ExtraTreesClassifier(random_state=123),
    hist_gradient_boosting=HistGradientBoostingClassifier(random_state=123),
)

metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in models_dict_smote.items():
    print(f"Model: {model_name}")
    # imblearn's Pipeline applies the sampler during fit only.
    pipeline = Pipeline(steps=[('smote', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

Hyperparameters Fine-tuning

In [None]:
import optuna
from optuna.samplers import TPESampler
from imblearn.pipeline import Pipeline
# CatBoost hyperparameter tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + CatBoost pipeline.

    The trial samples tree count, depth, L2 leaf regularisation, border count,
    random strength and learning rate (`eta`); seed and verbosity are fixed.
    """
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 1, 10),
        border_count=trial.suggest_int('border_count', 20, 100),
        random_strength=trial.suggest_float('random_strength', 0.1, 1.0),
        eta=trial.suggest_float('eta', 0.01, 0.3),
    )
    candidate = CatBoostClassifier(verbose=False, random_state=123, **sampled)
    resampled_model = Pipeline([('rebalance', smote_tomek), ('model', candidate)])
    cv_result = cross_validate(
        resampled_model, X_train_final, Y_train, cv=skf,
        scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'],
    )
    return cv_result['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

In [None]:
# LightGBM Hyperparameter Tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + LightGBM pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    params = {
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 0.1),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'num_leaves': trial.suggest_int('num_leaves', 30, 300),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # equivalent log-scale sampler.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e-2, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e-2, log=True),
        'random_state': 123,
        # Silence per-fit logging, matching the other LightGBM models in this notebook.
        'verbose': -1,
    }
    model = LGBMClassifier(**params)
    pipeline = Pipeline([('rebalance', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'])
    return scores['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

In [None]:
# XGBoost Hyperparameter Tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + XGBoost pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    params = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'device': 'gpu',  # This needs to match your hardware capability
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # equivalent log-scale sampler.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 1e-4, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 1e-4, log=True),
        # NOTE(review): scale_pos_weight is documented for binary problems; the
        # target here has three classes, so this dimension may have no effect — confirm.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.2, 0.8),
        # Fixed for reproducibility (the original dict listed this key twice).
        'random_state': 123,
    }
    model = XGBClassifier(**params)
    pipeline = Pipeline([('rebalance', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'])
    return scores['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

In [None]:
# HGBoost Hyperparameter Tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + HistGradientBoosting pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the HistGradientBoostingClassifier hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    params = {
        'max_iter': trial.suggest_int('max_iter', 50, 200),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # equivalent log-scale sampler.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-5, 1e-1, log=True),
        'random_state': 123
    }
    model = HistGradientBoostingClassifier(**params)
    pipeline = Pipeline([('smote', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'])
    return scores['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

In [None]:
# ET Hyperparameter Tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + ExtraTrees pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the ExtraTreesClassifier hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    params = {
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample']),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        # suggest_uniform / suggest_loguniform are deprecated; suggest_float
        # (with log=True for the log-scale case) is the equivalent sampler.
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-7, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'random_state': 123
    }
    model = ExtraTreesClassifier(**params)
    pipeline = Pipeline([('smote', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'])
    return scores['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

In [None]:
# RF Hyperparameter Tuning
def objective(trial):
    """Return the mean 5-fold weighted F1 of a SMOTE-Tomek + RandomForest pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the RandomForestClassifier hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        # 'auto' was dropped from the choices: it is no longer an accepted
        # value in recent scikit-learn, and for classifiers it meant 'sqrt'.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'random_state': 123
    }
    # This study tunes the random forest; the original built an
    # ExtraTreesClassifier here, so the forest itself was never tuned.
    model = RandomForestClassifier(**params)
    pipeline = Pipeline([('smote', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'])
    return scores['test_f1_weighted'].mean()

# Seeded TPE search maximising the objective over 100 trials.
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_params

Evaluating optimized models

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Estimators configured with hard-coded hyperparameter values; the random
# forest entry uses library defaults apart from the seed.
final_models_dict = dict(
    catboost=CatBoostClassifier(
        n_estimators=252, depth=7, l2_leaf_reg=8, random_strength=0.77,
        border_count=32, eta=0.2285, random_state=123, verbose=False,
    ),
    xgboost=XGBClassifier(
        colsample_bytree=0.9981, learning_rate=0.08, max_depth=3,
        min_child_weight=2, n_estimators=142, reg_alpha=6.97e-06,
        reg_lambda=1.79e-05, scale_pos_weight=17.9, subsample=0.32,
        random_state=123,
    ),
    lightgbm=LGBMClassifier(
        bagging_fraction=0.77, bagging_freq=4, feature_fraction=0.469,
        learning_rate=0.04, max_depth=-1, min_child_samples=98,
        min_child_weight=0.001, min_split_gain=0.27, n_estimators=227,
        num_leaves=232, reg_alpha=0.01, reg_lambda=0.01,
        random_state=123, verbose=-1,
    ),
    random_forest=RandomForestClassifier(random_state=123),
    extra_trees=ExtraTreesClassifier(
        bootstrap=True, class_weight='balanced_subsample', criterion='entropy',
        max_depth=11, max_features=0.85, min_impurity_decrease=2.14e-07,
        min_samples_leaf=3, min_samples_split=9, n_estimators=227,
        random_state=123,
    ),
    hist_gradient_boosting=HistGradientBoostingClassifier(
        max_iter=150, max_leaf_nodes=31, max_depth=22, min_samples_leaf=20,
        learning_rate=0.052, l2_regularization=0.0005, random_state=123,
    ),
)

# 5-fold cross-validation of each configured model behind SMOTE-Tomek.
# Metrics: weighted F1, weighted precision, weighted recall and plain accuracy.
metric_labels = {
    'F1': 'f1_weighted',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'Accuracy': 'accuracy',
}
for model_name, model in final_models_dict.items():
    print(f"Model: {model_name}")
    pipeline = Pipeline(steps=[('smote', smote_tomek), ('model', model)])
    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf,
                            scoring=list(metric_labels.values()))
    for label, metric in metric_labels.items():
        print(f"{label}:", scores[f'test_{metric}'].mean())
    print()

In [None]:
#Fine-tuning the Voting Classifier
import optuna
from optuna.samplers import TPESampler
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# Base learners of the soft-voting ensemble, with hard-coded hyperparameters.
estimators = []
final_et = ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample', criterion='entropy', max_depth=11, max_features=0.85, min_impurity_decrease=2.14e-07, min_samples_leaf=3, min_samples_split=9, n_estimators=227, random_state=123)
final_catboost=CatBoostClassifier(n_estimators=252, depth=7, l2_leaf_reg=8, random_strength=0.77, border_count=32, eta=0.2285, random_state=123, verbose=False)
final_xgboost=XGBClassifier(colsample_bytree=0.9981, learning_rate=0.08, max_depth=3, min_child_weight=2, n_estimators=142, reg_alpha=6.97e-06, reg_lambda=1.79e-05, subsample=0.32, random_state=123)

estimators.append(('et', final_et))
estimators.append(('catboost', final_catboost))
estimators.append(('xgb', final_xgboost))

# Define StratifiedKFold cross-validator
skf = StratifiedKFold(n_splits=5)

# Define SMOTETomek for handling class imbalance
smote_tomek = SMOTETomek(random_state=123)

# Scoring metrics are identical for every trial, so they are built once
# here rather than inside the objective.
voting_scoring = {
    'f1_weighted': make_scorer(f1_score, average='weighted'),
    'precision_weighted': make_scorer(precision_score, average='weighted'),
    'recall_weighted': make_scorer(recall_score, average='weighted'),
    'accuracy': make_scorer(accuracy_score)
}

# Define the objective function for Optuna
def objective(trial):
    """Return the mean 5-fold weighted F1 of the weighted soft-voting ensemble.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one voting weight in [0.1, 1] per base learner.

    Returns
    -------
    float
        Mean of the per-fold `f1_weighted` test scores.
    """
    weight_et = trial.suggest_float("weight_et", 0.1, 1)
    weight_catboost = trial.suggest_float("weight_catboost", 0.1, 1)
    weight_xgb = trial.suggest_float("weight_xgb", 0.1, 1)

    model = VotingClassifier(
        estimators=estimators,
        voting='soft',
        weights=[weight_et, weight_catboost, weight_xgb]
    )

    pipeline = Pipeline([('smote', smote_tomek), ('model', model)])

    scores = cross_validate(pipeline, X_train_final, Y_train, cv=skf, scoring=voting_scoring)

    return scores['test_f1_weighted'].mean()

# Setup Optuna study
sampler = TPESampler(seed=123)
study = optuna.create_study(direction='maximize', sampler=sampler)
# A single trial only evaluates one sampled weight vector, so nothing is
# optimised; use the same 100-trial budget as the other studies.
study.optimize(objective, n_trials=100)

# Output the best parameters
best_params = study.best_params
best_params

In [None]:
# Build the final feature matrix and target from the preprocessed frame `df`.
# The raw `data` frame still carries the original CSV headers, still contains
# rows with a missing '119', and still has the un-recoded target: the column
# renaming, the row filter and the 3-class recoding were applied to `df` only.
# NOTE(review): the numeric columns of `df` were already standardised earlier,
# so the scaling applied after this split is a second pass on those columns.
X = df[selected_rf_columns].copy()
Y = df['119'].copy()

from sklearn.model_selection import train_test_split

# Same 80/20 stratified split settings as the earlier hold-out split.
x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(
    X, Y, test_size=0.2, random_state=123, stratify=Y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits.
# Both names are rebound to the arrays returned by the scaler.
scaler = StandardScaler()
scaler.fit(x_train_final)
x_train_final = scaler.transform(x_train_final)
x_test_final = scaler.transform(x_test_final)

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

# Base learners of the soft-voting ensemble, with hard-coded hyperparameters.
estimators = []
final_et = ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample', criterion='entropy', max_depth=11, max_features=0.85, min_impurity_decrease=2.14e-07, min_samples_leaf=3, min_samples_split=9, n_estimators=227, random_state=123)
final_catboost=CatBoostClassifier(n_estimators=252, depth=7, l2_leaf_reg=8, random_strength=0.77, border_count=32, eta=0.2285, random_state=123, verbose=False)
final_xgboost=XGBClassifier(colsample_bytree=0.9981, learning_rate=0.08, max_depth=3, min_child_weight=2, n_estimators=142, reg_alpha=6.97e-06, reg_lambda=1.79e-05, subsample=0.32, random_state=123)

estimators.append(('et', final_et))
estimators.append(('catboost', final_catboost))
estimators.append(('xgb', final_xgboost))

# Define StratifiedKFold cross-validator
skf = StratifiedKFold(n_splits=5)

# Define SMOTETomek for handling class imbalance
smote_tomek = SMOTETomek(random_state=123)

# Despite its name, `hardvote` is a weighted *soft*-voting ensemble
# (voting='soft'); the name is kept because later cells refer to `pipeline`
# built from it.
hardvote = VotingClassifier(estimators=estimators, voting='soft', weights=[0.5886,0.6677,0.4478])
pipeline = Pipeline([('smote', smote_tomek), ('model', hardvote)])

# Use a list of scorer names: the original braces built a `set`, which is not
# among the container types cross_validate accepts for `scoring` in recent
# scikit-learn releases.
scoring = [
    'f1_weighted',
    'precision_weighted',
    'recall_weighted',
    'accuracy',
]

scores = cross_validate(pipeline, x_train_final, y_train_final, cv=skf, scoring=scoring)

print("F1:", scores['test_f1_weighted'].mean())
print("Precision:", scores['test_precision_weighted'].mean())
print("Recall:", scores['test_recall_weighted'].mean())
print("Accuracy:", scores['test_accuracy'].mean())

In [None]:
from sklearn.metrics import classification_report

# Refit the resampling + voting pipeline on the full training split.
pipeline.fit(x_train_final, y_train_final)

# Score the held-out split and print the per-class report to 4 decimals.
y_pred = pipeline.predict(x_test_final)
report_text = classification_report(y_test_final, y_pred, digits=4)
print(report_text)

In [None]:
# Plot the confusion matrix of the hold-out predictions.
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test_final, y_pred)

class_ticks = [0, 1, 2]
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Greens',
    cbar=False,
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_title('Weighted Soft Voting Classifier Confusion Matrix')
ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')
plt.show()

LIME Explanations

In [None]:
def run_pred(x):
  """Scale `x` with the fitted `scaler` and return the pipeline's class probabilities.

  `x` must be in the unscaled feature space, because this function applies
  `scaler.transform` before calling `pipeline.predict_proba`.
  """
  return pipeline.predict_proba(scaler.transform(x))

In [None]:
import lime
import lime.lime_tabular

# After the scaling cell, x_train_final is the NumPy array returned by
# scaler.fit_transform, so it has no `.values` attribute. It is also already
# scaled, while run_pred scales its input again. Map the array back to the
# pre-scaling feature space so LIME perturbs in the same space run_pred expects.
lime_training_data = scaler.inverse_transform(x_train_final)

explainer = lime.lime_tabular.LimeTabularExplainer(
    lime_training_data,
    feature_names=selected_rf_columns,
    class_names=['Dissatisfied', 'Normal', 'Satisfied'],
    discretize_continuous=True,
    random_state=123,
)

In [None]:
# x_test_final is a scaled NumPy array after the scaling cell (no `.values`).
# Undo the scaling so the explained row is in the space run_pred expects
# (run_pred applies scaler.transform itself).
instance = scaler.inverse_transform(x_test_final)[60]

# Explain held-out row 60 with up to 12 features, then render the
# explanation for class index 0.
exp = explainer.explain_instance(instance, run_pred, num_features=12, top_labels=3, num_samples=15000)
exp.show_in_notebook(show_table=True, show_all=False, show_predicted_value=True, labels=(0,))