In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split
from src.data_loader import *
from sklearn.preprocessing import label_binarize, PowerTransformer
from PIL import Image
from IPython.display import Image, display
from src.pipeline import *
import joblib
from src.convert_graphml_to_pyg_data_multithread import *
from tqdm import tqdm

import networkx as nx
%aimport src.data_loader
%aimport src.visualization
%aimport src.training
%aimport src.evaluation
%aimport src.pipeline
%aimport src.convert_graphml_to_pyg_data_multithread

In [3]:
features = [
    "faces", "edges", "vertices", "quantity",
    "height", "width", "depth", "volume", "area",
    "bbox_height", "bbox_width", "bbox_depth", "bbox_volume",
    "bbox_area",
]
data = pd.read_csv("./data/synced_dataset_final.csv")

In [4]:
X = data[features]
X_train, X_test, y_train_index, y_test_index = train_test_split(
    X, range(len(X)),
    test_size=0.2,
    random_state=100,
    stratify=data["is_cnc"])
y_multi_train = data["multiclass_labels"].iloc[y_train_index]
y_multi_test = data["multiclass_labels"].iloc[y_test_index]

y_binary_train = data["is_cnc"].iloc[y_train_index]
y_binary_test = data["is_cnc"].iloc[y_test_index]

params = {
    "n_estimators": randint(100, 300),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.3),
}

In [6]:
best_search_binary = randomizedsearchcv_xgboost(X_train, y_binary_train,
                                                params)

In [7]:
y_binary_pred = best_search_binary.best_estimator_.predict(X_test)
y_binary_prob = best_search_binary.best_estimator_.predict_proba(X_test)

In [8]:
print("Binary Model Evaluation")
metrics_binary = evaluate_classification(
    y_true=y_binary_test,
    y_pred=y_binary_pred,
    y_prob=y_binary_prob,
)
pprint(metrics_binary)

Binary Model Evaluation
{'accuracy': 0.8711414790996784,
 'average_precision': 0.9604947229991485,
 'f1_macro': 0.8609816827884358,
 'f1_weighted': 0.8718514262103362,
 'roc_auc': 0.9350805013844213}


In [9]:
y_binary_pred_train = best_search_binary.best_estimator_.predict(X_train)
y_binary_prob_train = best_search_binary.best_estimator_.predict_proba(X_train)

metrics_binary_train = evaluate_classification(
    y_true=y_binary_train,
    y_pred=y_binary_pred_train,
    y_prob=y_binary_prob_train,
)
pprint(metrics_binary_train)

{'accuracy': 0.9465814542385144,
 'average_precision': 0.9942039999295788,
 'f1_macro': 0.9427195107495587,
 'f1_weighted': 0.9470214527536648,
 'roc_auc': 0.9892845813353901}


In [8]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
    }
    sample_weight = compute_sample_weight(
        class_weight="balanced", y=y_binary_train
    )
    model = XGBClassifier(**params, use_label_encoder=False,
                          eval_metric='logloss')
    model.fit(
        X_train, y_binary_train,
        sample_weight=sample_weight,
    )

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    metrics = evaluate_classification(
        y_true=y_binary_test,
        y_pred=y_pred,
        y_prob=y_prob,
    )
    return metrics['roc_auc']  # or any other metric you want to optimize

In [9]:
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.CmaEsSampler(seed=100))
study.optimize(objective, n_trials=2000, timeout=600, n_jobs=-1)

[I 2025-08-17 15:31:43,273] A new study created in memory with name: no-name-3800ceca-3228-48b3-95f2-bad0703c3788
[I 2025-08-17 15:31:44,270] Trial 0 finished with value: 0.8060988323958828 and parameters: {'n_estimators': 101, 'max_depth': 3, 'learning_rate': 0.11339261828987716, 'min_child_weight': 4, 'subsample': 0.9384451355399128, 'colsample_bytree': 0.8729683090830276, 'reg_alpha': 2.913829738793342, 'reg_lambda': 6.002469401758164}. Best is trial 0 with value: 0.8060988323958828.
[I 2025-08-17 15:31:44,445] Trial 1 finished with value: 0.8590546332803406 and parameters: {'n_estimators': 79, 'max_depth': 5, 'learning_rate': 0.2526138457963263, 'min_child_weight': 5, 'subsample': 0.8268462974354694, 'colsample_bytree': 0.9678998304239301, 'reg_alpha': 5.560773553890458, 'reg_lambda': 3.6819055455792995}. Best is trial 1 with value: 0.8590546332803406.
[I 2025-08-17 15:31:44,630] Trial 3 finished with value: 0.876454216305264 and parameters: {'n_estimators': 54, 'max_depth': 9, 'le

In [10]:
study.best_params

{'n_estimators': 263,
 'max_depth': 10,
 'learning_rate': 0.24515821198545437,
 'min_child_weight': 1,
 'subsample': 0.980170170005214,
 'colsample_bytree': 0.892489256328304,
 'reg_alpha': 0.7674498549340549,
 'reg_lambda': 3.0585710555472234}

In [11]:
study.best_value

0.9393747021148229

In [19]:
fold_results = {}

for fold in data.binary_fold.unique():
    print(f"Running fold {fold}")

    # Samples not in fold will be training data
    X_train = data[data.binary_fold != fold][features]
    y_train = data[data.binary_fold != fold]["is_cnc"]

    # Samples in fold will be test data
    X_test = data[data.binary_fold == fold][features]
    y_test = data[data.binary_fold == fold]["is_cnc"]
    def objective(trial):
        try:
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
            }

            sample_weight = compute_sample_weight(
                class_weight="balanced", y=y_train
            )

            model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss')
            model.fit(X_train, y_train, sample_weight=sample_weight)

            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)

            metrics = evaluate_classification(
                y_true=y_test,
                y_pred=y_pred,
                y_prob=y_prob,
            )
            return metrics['roc_auc']
        # Return low score if an error occurs
        except Exception as e:
            print(f"Error during trial number: {trial.number}: {e}")
            return 0.0

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.CmaEsSampler(seed=100))
    study.optimize(objective, n_trials=2000, n_jobs=-1)

    fold_results[f"fold_{fold}"] = {
            'best_params': study.best_params,
            'best_score': study.best_value,
            'study': study
        }

[I 2025-08-17 17:53:11,394] A new study created in memory with name: no-name-98df772b-39f4-4ff1-80c8-21dabc8ac17c


Running fold 5.0


[I 2025-08-17 17:53:13,397] Trial 2 finished with value: 0.8287633918211388 and parameters: {'n_estimators': 123, 'max_depth': 5, 'learning_rate': 0.05119136278161413, 'min_child_weight': 3, 'subsample': 0.8886085697106936, 'colsample_bytree': 0.732067667688567, 'reg_alpha': 3.5914031711815255, 'reg_lambda': 9.558554553018887}. Best is trial 2 with value: 0.8287633918211388.
[I 2025-08-17 17:53:13,728] Trial 9 finished with value: 0.9040026854999138 and parameters: {'n_estimators': 83, 'max_depth': 9, 'learning_rate': 0.18564204410739085, 'min_child_weight': 9, 'subsample': 0.9080023501469553, 'colsample_bytree': 0.9593494452125929, 'reg_alpha': 9.323442848223149, 'reg_lambda': 2.8074943527100227}. Best is trial 9 with value: 0.9040026854999138.
[I 2025-08-17 17:53:13,740] Trial 4 finished with value: 0.8767879124073954 and parameters: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.10586722259947284, 'min_child_weight': 1, 'subsample': 0.9141555748086332, 'colsample_bytree': 

Running fold 8.0


[I 2025-08-17 18:10:30,447] Trial 1 finished with value: 0.7641485390902144 and parameters: {'n_estimators': 55, 'max_depth': 3, 'learning_rate': 0.05033696449286893, 'min_child_weight': 8, 'subsample': 0.8876979770809832, 'colsample_bytree': 0.8592069512276834, 'reg_alpha': 5.452807159060113, 'reg_lambda': 0.18425471705175278}. Best is trial 1 with value: 0.7641485390902144.
[I 2025-08-17 18:10:30,499] Trial 4 finished with value: 0.7558169458085823 and parameters: {'n_estimators': 61, 'max_depth': 3, 'learning_rate': 0.035345140255274035, 'min_child_weight': 10, 'subsample': 0.7066803367316082, 'colsample_bytree': 0.817586872266057, 'reg_alpha': 3.9347129291655736, 'reg_lambda': 5.076539977990055}. Best is trial 1 with value: 0.7641485390902144.
[I 2025-08-17 18:10:30,802] Trial 13 finished with value: 0.775021755349107 and parameters: {'n_estimators': 79, 'max_depth': 4, 'learning_rate': 0.024355690038437468, 'min_child_weight': 1, 'subsample': 0.6214063166724066, 'colsample_bytree'

Running fold 1.0


[I 2025-08-17 18:26:52,949] Trial 15 finished with value: 0.829750397761253 and parameters: {'n_estimators': 78, 'max_depth': 6, 'learning_rate': 0.04318873537818854, 'min_child_weight': 9, 'subsample': 0.7329972536837649, 'colsample_bytree': 0.9723369228129758, 'reg_alpha': 8.510584920638273, 'reg_lambda': 5.132548957806441}. Best is trial 15 with value: 0.829750397761253.
[I 2025-08-17 18:26:53,169] Trial 7 finished with value: 0.8716180702090925 and parameters: {'n_estimators': 111, 'max_depth': 5, 'learning_rate': 0.21507933194503698, 'min_child_weight': 3, 'subsample': 0.9744528716540036, 'colsample_bytree': 0.8480136299392371, 'reg_alpha': 0.09166798215625938, 'reg_lambda': 3.349279347418639}. Best is trial 7 with value: 0.8716180702090925.
[I 2025-08-17 18:26:53,281] Trial 11 finished with value: 0.8175676194130059 and parameters: {'n_estimators': 184, 'max_depth': 3, 'learning_rate': 0.0936187016308341, 'min_child_weight': 10, 'subsample': 0.840142820939902, 'colsample_bytree':

Running fold 9.0


[I 2025-08-17 18:44:16,049] Trial 5 finished with value: 0.7877687533366667 and parameters: {'n_estimators': 64, 'max_depth': 4, 'learning_rate': 0.07960849243898346, 'min_child_weight': 3, 'subsample': 0.8957022421291816, 'colsample_bytree': 0.7446889160523195, 'reg_alpha': 8.212344598535534, 'reg_lambda': 1.9812518622668218}. Best is trial 5 with value: 0.7877687533366667.
[I 2025-08-17 18:44:16,455] Trial 8 finished with value: 0.8486247571922452 and parameters: {'n_estimators': 59, 'max_depth': 7, 'learning_rate': 0.13620562193624386, 'min_child_weight': 4, 'subsample': 0.7902557439362036, 'colsample_bytree': 0.6757892427128795, 'reg_alpha': 8.969268979019196, 'reg_lambda': 4.825436699654869}. Best is trial 8 with value: 0.8486247571922452.
[I 2025-08-17 18:44:16,775] Trial 1 finished with value: 0.8682088603808851 and parameters: {'n_estimators': 79, 'max_depth': 7, 'learning_rate': 0.12197580493638621, 'min_child_weight': 1, 'subsample': 0.9511392548623576, 'colsample_bytree': 0.

Running fold 2.0


[I 2025-08-17 19:02:00,671] Trial 7 finished with value: 0.7973037429051805 and parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.21253034665320472, 'min_child_weight': 3, 'subsample': 0.7773407355798484, 'colsample_bytree': 0.6976850869981404, 'reg_alpha': 2.9840569594422606, 'reg_lambda': 6.335363385579168}. Best is trial 7 with value: 0.7973037429051805.
[I 2025-08-17 19:02:01,174] Trial 2 finished with value: 0.825805790952483 and parameters: {'n_estimators': 119, 'max_depth': 3, 'learning_rate': 0.23925304970705108, 'min_child_weight': 5, 'subsample': 0.8720468352642555, 'colsample_bytree': 0.6219428867868461, 'reg_alpha': 6.271739313368428, 'reg_lambda': 1.1388974446644429}. Best is trial 2 with value: 0.825805790952483.
[I 2025-08-17 19:02:01,556] Trial 13 finished with value: 0.8893366696381219 and parameters: {'n_estimators': 80, 'max_depth': 7, 'learning_rate': 0.24476017791741098, 'min_child_weight': 4, 'subsample': 0.8246235816780014, 'colsample_bytree': 0

Running fold 6.0


[I 2025-08-17 19:20:12,014] Trial 11 finished with value: 0.8028534621163891 and parameters: {'n_estimators': 70, 'max_depth': 3, 'learning_rate': 0.235812209083307, 'min_child_weight': 3, 'subsample': 0.7061364054620148, 'colsample_bytree': 0.8975149405984868, 'reg_alpha': 6.793317194132844, 'reg_lambda': 7.993742282939672}. Best is trial 11 with value: 0.8028534621163891.
[I 2025-08-17 19:20:12,662] Trial 3 finished with value: 0.8060787921884038 and parameters: {'n_estimators': 122, 'max_depth': 4, 'learning_rate': 0.06917659392474061, 'min_child_weight': 3, 'subsample': 0.7604824617611197, 'colsample_bytree': 0.7921970546344121, 'reg_alpha': 5.174932146628205, 'reg_lambda': 5.951599621162504}. Best is trial 3 with value: 0.8060787921884038.
[I 2025-08-17 19:20:12,776] Trial 15 finished with value: 0.8189979375893159 and parameters: {'n_estimators': 128, 'max_depth': 4, 'learning_rate': 0.12594925711138466, 'min_child_weight': 10, 'subsample': 0.6289874455789648, 'colsample_bytree':

Running fold 3.0


[I 2025-08-17 19:38:04,650] Trial 15 finished with value: 0.8132103564617869 and parameters: {'n_estimators': 50, 'max_depth': 4, 'learning_rate': 0.19988105225122796, 'min_child_weight': 5, 'subsample': 0.8969384485438525, 'colsample_bytree': 0.6160941488097361, 'reg_alpha': 9.221548013141046, 'reg_lambda': 0.9153058689977078}. Best is trial 15 with value: 0.8132103564617869.
[I 2025-08-17 19:38:04,979] Trial 1 finished with value: 0.8285574525225398 and parameters: {'n_estimators': 82, 'max_depth': 4, 'learning_rate': 0.20926063461448693, 'min_child_weight': 6, 'subsample': 0.8518302273084827, 'colsample_bytree': 0.9721256883985164, 'reg_alpha': 8.102751931340357, 'reg_lambda': 7.816179047989981}. Best is trial 1 with value: 0.8285574525225398.
[I 2025-08-17 19:38:05,001] Trial 11 finished with value: 0.836869929249275 and parameters: {'n_estimators': 77, 'max_depth': 4, 'learning_rate': 0.2848114788076276, 'min_child_weight': 2, 'subsample': 0.716876138899655, 'colsample_bytree': 0.

Running fold 0.0


[I 2025-08-17 19:55:13,411] Trial 8 finished with value: 0.8271344263774951 and parameters: {'n_estimators': 176, 'max_depth': 3, 'learning_rate': 0.18532715670430083, 'min_child_weight': 2, 'subsample': 0.6314569137304321, 'colsample_bytree': 0.8735009604403037, 'reg_alpha': 2.302929722778514, 'reg_lambda': 9.391579951613798}. Best is trial 8 with value: 0.8271344263774951.
[I 2025-08-17 19:55:13,542] Trial 11 finished with value: 0.8568531724986178 and parameters: {'n_estimators': 153, 'max_depth': 4, 'learning_rate': 0.2829622200618917, 'min_child_weight': 8, 'subsample': 0.6183960158437348, 'colsample_bytree': 0.8866801203304886, 'reg_alpha': 7.736271927163965, 'reg_lambda': 1.9850472544635411}. Best is trial 11 with value: 0.8568531724986178.
[I 2025-08-17 19:55:13,862] Trial 5 finished with value: 0.8645207682152086 and parameters: {'n_estimators': 113, 'max_depth': 7, 'learning_rate': 0.08270298003007126, 'min_child_weight': 8, 'subsample': 0.8506048057876658, 'colsample_bytree'

Running fold 7.0


[I 2025-08-17 20:12:22,195] Trial 5 finished with value: 0.8111926816855652 and parameters: {'n_estimators': 96, 'max_depth': 3, 'learning_rate': 0.27054764040093016, 'min_child_weight': 3, 'subsample': 0.6901915879227365, 'colsample_bytree': 0.960926993207345, 'reg_alpha': 0.9297001064496235, 'reg_lambda': 0.5800083484551322}. Best is trial 5 with value: 0.8111926816855652.
[I 2025-08-17 20:12:22,314] Trial 3 finished with value: 0.8378195080968318 and parameters: {'n_estimators': 71, 'max_depth': 5, 'learning_rate': 0.21277328746081864, 'min_child_weight': 7, 'subsample': 0.936971162205232, 'colsample_bytree': 0.8501195445666259, 'reg_alpha': 3.0156231743787068, 'reg_lambda': 9.887492089703002}. Best is trial 3 with value: 0.8378195080968318.
[I 2025-08-17 20:12:22,417] Trial 14 finished with value: 0.8566526987728539 and parameters: {'n_estimators': 56, 'max_depth': 7, 'learning_rate': 0.12822937060585599, 'min_child_weight': 6, 'subsample': 0.8345794355600157, 'colsample_bytree': 0

Running fold 4.0


[I 2025-08-17 20:29:46,351] Trial 8 finished with value: 0.7944744180862169 and parameters: {'n_estimators': 54, 'max_depth': 5, 'learning_rate': 0.050299733124122135, 'min_child_weight': 6, 'subsample': 0.8568425138710836, 'colsample_bytree': 0.6140936177813762, 'reg_alpha': 0.9419988658052303, 'reg_lambda': 9.000559025045924}. Best is trial 8 with value: 0.7944744180862169.
[I 2025-08-17 20:29:46,575] Trial 15 finished with value: 0.7738279734459972 and parameters: {'n_estimators': 69, 'max_depth': 5, 'learning_rate': 0.01539050869883881, 'min_child_weight': 10, 'subsample': 0.8177094872560521, 'colsample_bytree': 0.9219982763331406, 'reg_alpha': 9.362105594294448, 'reg_lambda': 8.07246054309725}. Best is trial 8 with value: 0.7944744180862169.
[I 2025-08-17 20:29:47,077] Trial 13 finished with value: 0.8894853156607493 and parameters: {'n_estimators': 69, 'max_depth': 8, 'learning_rate': 0.15518504861024224, 'min_child_weight': 1, 'subsample': 0.695892090838657, 'colsample_bytree': 

In [20]:
with open(r"./data/xgboost_binary_fold_results.pkl", "wb") as f:
    joblib.dump(fold_results, f)
fold_results

{'fold_5.0': {'best_params': {'n_estimators': 275,
   'max_depth': 10,
   'learning_rate': 0.258369482337815,
   'min_child_weight': 1,
   'subsample': 0.9622152335407611,
   'colsample_bytree': 0.7632647309349613,
   'reg_alpha': 0.6589152897804043,
   'reg_lambda': 2.817375553286472},
  'best_score': 0.9435175761817683,
  'study': <optuna.study.study.Study at 0x1e22208afd0>},
 'fold_8.0': {'best_params': {'n_estimators': 253,
   'max_depth': 10,
   'learning_rate': 0.2824805487379577,
   'min_child_weight': 1,
   'subsample': 0.9881189932354381,
   'colsample_bytree': 0.9275765347718531,
   'reg_alpha': 0.3348110778687682,
   'reg_lambda': 1.0581349364901365},
  'best_score': 0.940324259104797,
  'study': <optuna.study.study.Study at 0x1e22369fd50>},
 'fold_1.0': {'best_params': {'n_estimators': 274,
   'max_depth': 10,
   'learning_rate': 0.21529986627479275,
   'min_child_weight': 1,
   'subsample': 0.9126117624876511,
   'colsample_bytree': 0.9329792451878993,
   'reg_alpha': 0.36

In [23]:
auroc_scores = [item["best_score"] for item in fold_results.values()]
average_mean = np.mean(auroc_scores)
average_std = np.std(auroc_scores)
print(f"Average AUROC: {average_mean:.4f} ± {average_std:.4f}")

Average AUROC: 0.9425 ± 0.0019
