In [None]:
import pandas as pd
# Enable IPython's autoreload extension. Mode 1 only reloads modules that were
# explicitly registered with `%aimport` before each cell execution.
%load_ext autoreload
%autoreload 1

In [None]:
# NOTE(review): the star import is what appears to bring `optuna`, `np`,
# `train_test_split`, `randint`, `uniform` and the project helpers used in the
# cells below into scope -- none of them is imported explicitly in this
# notebook. Confirm against src/pipeline.py before replacing it with explicit
# imports.
from src.pipeline import *
from scipy.stats import sem
# from src.convert_graphml_to_pyg_data_multithread import *

# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Register src.pipeline with autoreload so edits to it are picked up.
%aimport src.pipeline
# %aimport src.convert_graphml_to_pyg_data_multithread

In [None]:
# Raw table read from the project's data folder (path is relative to the
# notebook's working directory).
data = pd.read_csv("./data/synced_dataset_final.csv")

# Column names selected from `data` as model inputs; the order is preserved
# when the feature matrix is built.
features = [
    "faces",
    "edges",
    "vertices",
    "quantity",
    "height",
    "width",
    "depth",
    "volume",
    "area",
    # bbox_* counterparts of the size columns above
    "bbox_height",
    "bbox_width",
    "bbox_depth",
    "bbox_volume",
    "bbox_area",
]

In [None]:
# Feature matrix restricted to the selected columns.
X = data[features]
binary_labels = data["is_cnc"]
multiclass_labels = data["multiclass_labels"]

# Split row *positions* alongside X so that both label columns can be sliced
# with the very same 80/20 partition (stratified on `is_cnc`, fixed seed).
X_train, X_test, y_train_index, y_test_index = train_test_split(
    X,
    range(len(X)),
    test_size=0.2,
    random_state=100,
    stratify=binary_labels,
)

# The split carried positions rather than index labels, hence `.iloc`.
y_multi_train = multiclass_labels.iloc[y_train_index]
y_multi_test = multiclass_labels.iloc[y_test_index]

y_binary_train = binary_labels.iloc[y_train_index]
y_binary_test = binary_labels.iloc[y_test_index]

# Search distributions handed to the project's randomized-search helper.
# NOTE(review): if `uniform` is scipy.stats.uniform, its arguments are
# (loc, scale), so learning_rate is drawn from [0.005, 0.305] rather than
# [0.005, 0.3] -- confirm this is intended.
params = dict(
    n_estimators=randint(100, 300),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.005, 0.3),
)
best_search_binary = randomizedsearchcv_xgboost(
    X_train, y_binary_train, params, n_iter=50
)

In [None]:
# Best estimator found by the search; reused for every prediction below.
binary_model = best_search_binary.best_estimator_

# Held-out split: predictions, probabilities and metrics.
y_binary_pred = binary_model.predict(X_test)
y_binary_prob = binary_model.predict_proba(X_test)
metrics_binary_validation = evaluate_classification(
    y_true=y_binary_test, y_pred=y_binary_pred, y_prob=y_binary_prob
)

# Training split: same evaluation, shown next to the held-out numbers so the
# two columns can be compared directly.
y_binary_pred_train = binary_model.predict(X_train)
y_binary_prob_train = binary_model.predict_proba(X_train)
metrics_binary_train = evaluate_classification(
    y_true=y_binary_train, y_pred=y_binary_pred_train, y_prob=y_binary_prob_train
)

# One column per split after the transpose.
metrics_rows = [metrics_binary_train, metrics_binary_validation]
pd.DataFrame(metrics_rows, index=["train", "validation"]).T

In [None]:
# Importance score reported by the fitted best estimator for each input
# column, paired with the column names of the training frame.
feature_importance = best_search_binary.best_estimator_.feature_importances_
feature_names = X_train.columns

# Rank features from highest to lowest importance.
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

# Bare last expression: the notebook renders the frame as a rich table
# instead of the plain-text dump that print() produces.
importance_df

In [None]:
# Run the project's `xgboost_optuna` helper on the full table with the
# selected feature columns and n_trials=100. The result is indexed per fold
# (see the "fold_0" lookup in the next cell).
fold_results = xgboost_optuna(data, features=features, n_trials=100)

In [None]:
# Inspect the best parameters stored under the "fold_0" entry.
first_fold_result = fold_results["fold_0"]
first_fold_result["best_params"]

In [None]:
# Fixed hyper-parameter set, hard-coded so this cell does not depend on
# re-running the search (presumably copied from an earlier search output --
# TODO confirm which run these values came from).
best_params = dict(
    n_estimators=229,
    max_depth=10,
    learning_rate=0.23010627398694375,
    min_child_weight=5,
    subsample=0.8515134589080573,
    colsample_bytree=0.7674160634634193,
    reg_alpha=3.8657297966420128,
    reg_lambda=0.6141160901516102,
)

# Evaluate that single configuration via the project helper.
fold_all_results = xgboost_optuna_run_with_best_params(
    data=data, features=features, best_params=best_params
)

In [None]:
# Persist the transposed results table next to the notebook (path is relative
# to the working directory).
fold_results_table = pd.DataFrame(fold_all_results).T
fold_results_table.to_csv(r"xgboost_optuna_results.csv")

In [None]:
# Transposed view of the results (presumably one row per fold and one column
# per metric -- TODO confirm against the helper's return value).
results_df = pd.DataFrame(fold_all_results).transpose()
results_df

In [None]:
# "roc_auc" column of the XGBoost results as a plain Python list, in row order.
xg_auroc = results_df.loc[:, "roc_auc"].tolist()
xg_auroc

In [None]:
# (standard error of the mean, mean) of the XGBoost "roc_auc" column.
xgboost_roc_auc = results_df["roc_auc"]
sem(xgboost_roc_auc), np.mean(xgboost_roc_auc)

In [None]:
import json
from pathlib import Path

# Saved GNN fold results. The path is relative to the notebook's working
# directory, like the other paths used in this notebook
# ("./data/synced_dataset_final.csv", "xgboost_optuna_results.csv"), instead of
# an absolute path inside one user's home directory.
# NOTE(review): assumes the notebook is run from the project root, which is
# where the original absolute path pointed -- adjust if it lives elsewhere.
GNN_RESULTS_PATH = Path("optuna_gine_all_fold_results.json")

# Binary mode is kept on purpose: json.load accepts bytes and detects the
# UTF encoding itself.
with GNN_RESULTS_PATH.open("rb") as f:
    all_fold_results = json.load(f)

In [None]:
# Pull the "auroc" entry of each result's "val_tracker" record, in the order
# the entries appear in the loaded JSON object.
gnn_auroc = [
    fold_result["val_tracker"]["auroc"]
    for fold_result in all_fold_results.values()
]

# (standard error of the mean, mean) of those values.
sem(gnn_auroc), np.mean(gnn_auroc)

In [None]:
# Display the raw list of GNN AUROC values.
gnn_auroc

In [None]:
from scipy.stats import ttest_rel

# Paired (related-samples) t-test between the GNN and XGBoost AUROC lists.
# NOTE(review): the pairing is positional, so both lists must describe the
# same folds in the same order -- confirm the fold ordering of the two result
# sources matches.
paired_result = ttest_rel(gnn_auroc, xg_auroc)
t_stat = paired_result.statistic
p_value = paired_result.pvalue

print("t-statistic:", t_stat)
print("p-value:", p_value)