# models for opcode frequency

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import time
import json
import pandas as pd
from pathlib import Path
import joblib

import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier


## Const

In [3]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/processed')
MODEL_PATH = os.path.join(PATH, 'models')

In [4]:
with open(os.path.join(DATA_PATH, 'feature-opcode-freq_list.json')) as f:
    feature_list = json.load(f)

with open(os.path.join(DATA_PATH, 'labels-opcode-freq.json')) as f:
    labels = json.load(f)

In [5]:
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train-opcode-freq.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test-opcode-freq.csv'))

X_train = train_df[feature_list]
y_train = train_df[labels]

X_test = test_df[feature_list]
y_test = test_df[labels]

In [6]:
X_train.head()

Unnamed: 0,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,DIV,...,UNKNOWN_0xc6,UNKNOWN_0xe1,INVALID_0x70,PUSH30,DUP16,UNKNOWN_0x2b,UNKNOWN_0xd8,INVALID_0x7a,UNKNOWN_0xf9,INVALID_0x7f
0,1,0,2,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,16,2,2,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40,5,5,2,32,10,2,0,7,0,...,0,0,0,0,1,1,0,0,0,0
3,53,7,5,2,43,9,2,0,8,0,...,0,0,0,0,0,0,0,0,0,0
4,178,32,7,7,134,41,6,0,66,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
y_train.head()

Unnamed: 0,mint,leak,limit
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


## Traditional Machine Learning Models

In [8]:
models = {
    "Logistic Regression": MultiOutputClassifier(LogisticRegression(max_iter=1000)),
    "Random Forest": MultiOutputClassifier(RandomForestClassifier()),
    "Gradient Boosting": MultiOutputClassifier(GradientBoostingClassifier()),
    "AdaBoost": MultiOutputClassifier(AdaBoostClassifier()),
    "SVM (Linear)": MultiOutputClassifier(SVC(kernel="linear")),
    "KNN": MultiOutputClassifier(KNeighborsClassifier()),
    "Naive Bayes": MultiOutputClassifier(GaussianNB()),
    "MLP Classifier": MultiOutputClassifier(MLPClassifier(max_iter=300)),
    "XGBoost": MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    "LightGBM": MultiOutputClassifier(LGBMClassifier()),
    "DecisionTree": MultiOutputClassifier(DecisionTreeClassifier())
}

In [9]:
results = []

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    y_pred = model.predict(X_test)

    results.append({
        "Classifier": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro"),  # change to 'macro' if multi-class
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1-Score": f1_score(y_test, y_pred, average="macro"),
        "Training Time": round(end - start, 3)
    })

[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

### Result

In [10]:
df = pd.DataFrame(results)
df.sort_values(by="F1-Score", ascending=False, inplace=True)
df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,Training Time
9,LightGBM,0.428571,0.888889,0.388889,0.484127,0.084
5,KNN,0.285714,0.571429,0.407407,0.475,0.007
8,XGBoost,0.428571,0.805556,0.388889,0.471306,0.509
0,Logistic Regression,0.285714,0.527778,0.351852,0.422222,0.497
10,DecisionTree,0.214286,0.654762,0.351852,0.39881,0.016
3,AdaBoost,0.285714,0.477778,0.333333,0.387302,0.296
2,Gradient Boosting,0.285714,0.694444,0.277778,0.360195,0.762
1,Random Forest,0.428571,0.833333,0.240741,0.349784,0.336
4,SVM (Linear),0.285714,0.416667,0.277778,0.318627,0.103
7,MLP Classifier,0.214286,0.305556,0.185185,0.206349,0.863


### Tuning

In [11]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 500),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "num_leaves": trial.suggest_int("num_leaves", 3, 255),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1
    }

    model = MultiOutputClassifier(LGBMClassifier(**params))

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="macro", zero_division=0)

In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Tuned LGBMClassifier (MultiOutput):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="macro", zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))

[I 2025-07-12 11:31:58,712] A new study created in memory with name: no-name-6933e261-4a85-43b0-af01-07de9e812e9d
[I 2025-07-12 11:31:59,062] Trial 0 finished with value: 0.26666666666666666 and parameters: {'n_estimators': 104, 'learning_rate': 0.0032999242371406423, 'max_depth': 5, 'num_leaves': 115, 'min_child_samples': 23, 'subsample': 0.748377228674205, 'colsample_bytree': 0.6099281713050817, 'reg_alpha': 8.793892192615997e-08, 'reg_lambda': 1.225559812056827e-07}. Best is trial 0 with value: 0.26666666666666666.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1878
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1878
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:31:59,318] Trial 1 finished with value: 0.3841269841269841 and parameters: {'n_estimators': 350, 'learning_rate': 0.06591592108633305, 'max_depth': 15, 'num_leaves': 162, 'min_child_samples': 44, 'subsample': 0.7298690872300824, 'colsample_bytree': 0.9999032555231511, 'reg_alpha': 1.5503254174626225e-06, 'reg_lambda': 0.00012478193830898964}. Best is trial 1 with value: 0.3841269841269841.
[I 2025-07-12 11:31:59,443] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 400, 'learning_rate': 0.00020328415881537752, 'max_depth': 13, 'num_leaves': 242, 'min_child_samples': 42, 'subsample': 0.9095849995249684, 'colsample_bytree': 0.9949599918305516, 'reg_alpha': 0.00014425496082959074, 'reg_lambda': 2.577363498196816e-06}. Best is trial 1 with value: 0.3841269841269841.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:31:59,879] Trial 3 finished with value: 0.0 and parameters: {'n_estimators': 370, 'learning_rate': 1.3007962261002906e-06, 'max_depth': 10, 'num_leaves': 247, 'min_child_samples': 16, 'subsample': 0.5772875271537765, 'colsample_bytree': 0.7755882982617321, 'reg_alpha': 0.00012146566667649076, 'reg_lambda': 1.4931863313128469e-05}. Best is trial 1 with value: 0.3841269841269841.


[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1918
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info

[I 2025-07-12 11:31:59,955] Trial 4 finished with value: 0.0 and parameters: {'n_estimators': 227, 'learning_rate': 5.7721850581607865e-05, 'max_depth': 6, 'num_leaves': 82, 'min_child_samples': 60, 'subsample': 0.7407732833190854, 'colsample_bytree': 0.8328595818047566, 'reg_alpha': 0.315811820158821, 'reg_lambda': 4.969981276599551e-06}. Best is trial 1 with value: 0.3841269841269841.
[I 2025-07-12 11:32:00,004] Trial 5 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 101, 'learning_rate': 0.02597238796550384, 'max_depth': 8, 'num_leaves': 17, 'min_child_samples': 55, 'subsample': 0.7363181491615947, 'colsample_bytree': 0.6867159216244006, 'reg_alpha': 1.818936511049541e-06, 'reg_lambda': 1.921661894130783e-05}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:00,060] Trial 6 finished with value: 0.0 and parameters: {'n_estimators': 49, 'learning_rate': 0.004282854010780806, 'max_depth': 10, 'num_leaves': 148, 'min_child_samples': 66, 'subsa

[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000331 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of

[I 2025-07-12 11:32:00,212] Trial 7 finished with value: 0.0 and parameters: {'n_estimators': 451, 'learning_rate': 0.00017150097470483377, 'max_depth': 13, 'num_leaves': 153, 'min_child_samples': 45, 'subsample': 0.525249975716763, 'colsample_bytree': 0.8222472170658124, 'reg_alpha': 7.471615877196962, 'reg_lambda': 7.306186621196338e-06}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:00,263] Trial 8 finished with value: 0.0 and parameters: {'n_estimators': 20, 'learning_rate': 0.00026249375689118854, 'max_depth': 9, 'num_leaves': 193, 'min_child_samples': 5, 'subsample': 0.5445295415827363, 'colsample_bytree': 0.7152468847774957, 'reg_alpha': 8.510970507085673e-05, 'reg_lambda': 1.363075029244112}. Best is trial 5 with value: 0.4386724386724386.


[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of

[I 2025-07-12 11:32:00,570] Trial 9 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 422, 'learning_rate': 0.004017291135608417, 'max_depth': 13, 'num_leaves': 21, 'min_child_samples': 30, 'subsample': 0.6072439895278492, 'colsample_bytree': 0.8755836540594015, 'reg_alpha': 1.0162092633789091e-05, 'reg_lambda': 1.6731684463113405}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:00,643] Trial 10 finished with value: 0.0 and parameters: {'n_estimators': 185, 'learning_rate': 0.06947543130581627, 'max_depth': 3, 'num_leaves': 16, 'min_child_samples': 97, 'subsample': 0.9809711002325441, 'colsample_bytree': 0.5570969984056607, 'reg_alpha': 1.1359157096367405e-08, 'reg_lambda': 0.011123335072521012}. Best is trial 5 with value: 0.4386724386724386.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of positive: 3

[I 2025-07-12 11:32:00,733] Trial 11 finished with value: 0.0 and parameters: {'n_estimators': 287, 'learning_rate': 0.0039368667257014085, 'max_depth': 8, 'num_leaves': 11, 'min_child_samples': 82, 'subsample': 0.650071540504983, 'colsample_bytree': 0.6704156394795772, 'reg_alpha': 1.0645079729924686e-06, 'reg_lambda': 8.972273616656025}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:00,871] Trial 12 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 146, 'learning_rate': 0.014109305892622295, 'max_depth': 13, 'num_leaves': 61, 'min_child_samples': 30, 'subsample': 0.6573513501854421, 'colsample_bytree': 0.896077948774064, 'reg_alpha': 3.3782815402181435e-06, 'reg_lambda': 0.01634104247197628}. Best is trial 5 with value: 0.4386724386724386.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

[I 2025-07-12 11:32:01,019] Trial 13 finished with value: 0.3463203463203463 and parameters: {'n_estimators': 467, 'learning_rate': 0.018326778440294247, 'max_depth': 7, 'num_leaves': 49, 'min_child_samples': 66, 'subsample': 0.6388296656049723, 'colsample_bytree': 0.6717055226259643, 'reg_alpha': 0.004180532552632754, 'reg_lambda': 0.004154656973277594}. Best is trial 5 with value: 0.4386724386724386.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1856
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1856
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:32:01,199] Trial 14 finished with value: 0.0 and parameters: {'n_estimators': 292, 'learning_rate': 0.0010424891617123783, 'max_depth': 11, 'num_leaves': 100, 'min_child_samples': 33, 'subsample': 0.831597758792354, 'colsample_bytree': 0.5224532710928018, 'reg_alpha': 2.0297222803843614e-05, 'reg_lambda': 1.164982323825514e-08}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:01,366] Trial 15 finished with value: 0.0 and parameters: {'n_estimators': 91, 'learning_rate': 3.782214161446785e-05, 'max_depth': 15, 'num_leaves': 33, 'min_child_samples': 55, 'subsample': 0.8110530119564932, 'colsample_bytree': 0.7559819817992816, 'reg_alpha': 0.002271495799131081, 'reg_lambda': 0.14826054996589513}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:01,460] Trial 16 finished with value: 0.0 and parameters: {'n_estimators': 497, 'learning_rate': 0.012953839968457458, 'max_depth': 12, 'num_leaves': 6, 'min_child_samples': 82, 'subsample': 0.

[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1781
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1781
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

[I 2025-07-12 11:32:01,917] Trial 17 finished with value: 0.0 and parameters: {'n_estimators': 226, 'learning_rate': 0.0011333281543882752, 'max_depth': 4, 'num_leaves': 70, 'min_child_samples': 10, 'subsample': 0.6925586094620185, 'colsample_bytree': 0.6217453594000026, 'reg_alpha': 9.468755281199573e-06, 'reg_lambda': 0.00028095335358382364}. Best is trial 5 with value: 0.4386724386724386.




[I 2025-07-12 11:32:02,067] Trial 18 finished with value: 0.0 and parameters: {'n_estimators': 313, 'learning_rate': 6.524157160054076e-06, 'max_depth': 7, 'num_leaves': 48, 'min_child_samples': 32, 'subsample': 0.7955021770881501, 'colsample_bytree': 0.9515985324737535, 'reg_alpha': 2.375610667796241e-08, 'reg_lambda': 0.7852271918241903}. Best is trial 5 with value: 0.4386724386724386.
[I 2025-07-12 11:32:02,157] Trial 19 finished with value: 0.0 and parameters: {'n_estimators': 404, 'learning_rate': 0.0010941357252504259, 'max_depth': 8, 'num_leaves': 91, 'min_child_samples': 73, 'subsample': 0.69494729459208, 'colsample_bytree': 0.7957247781913191, 'reg_alpha': 3.6039379301508254e-07, 'reg_lambda': 0.001692868832635917}. Best is trial 5 with value: 0.4386724386724386.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1856
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1856
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

[I 2025-07-12 11:32:02,294] Trial 20 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 150, 'learning_rate': 0.03161045697730621, 'max_depth': 11, 'num_leaves': 32, 'min_child_samples': 20, 'subsample': 0.5959292074740044, 'colsample_bytree': 0.7093210817839888, 'reg_alpha': 0.0008389840721495638, 'reg_lambda': 4.762882448431826e-05}. Best is trial 20 with value: 0.48412698412698413.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001464 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

[I 2025-07-12 11:32:02,628] Trial 21 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 155, 'learning_rate': 0.03441130906134932, 'max_depth': 11, 'num_leaves': 33, 'min_child_samples': 20, 'subsample': 0.5817334706208886, 'colsample_bytree': 0.7155693399145102, 'reg_alpha': 0.04424457283766692, 'reg_lambda': 4.5202239283561795e-05}. Best is trial 20 with value: 0.48412698412698413.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1889
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:32:02,778] Trial 22 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 155, 'learning_rate': 0.03131172203155254, 'max_depth': 11, 'num_leaves': 33, 'min_child_samples': 19, 'subsample': 0.5163927415752301, 'colsample_bytree': 0.6842619756436005, 'reg_alpha': 0.06731835121241198, 'reg_lambda': 4.205065959654405e-05}. Best is trial 20 with value: 0.48412698412698413.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1902
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1902
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:32:02,914] Trial 23 finished with value: 0.5225885225885226 and parameters: {'n_estimators': 161, 'learning_rate': 0.055515616551427935, 'max_depth': 11, 'num_leaves': 41, 'min_child_samples': 23, 'subsample': 0.5126142252592383, 'colsample_bytree': 0.7260476593472516, 'reg_alpha': 0.07204564551235534, 'reg_lambda': 7.296145062812182e-05}. Best is trial 23 with value: 0.5225885225885226.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406


[I 2025-07-12 11:32:03,228] Trial 24 finished with value: 0.5793650793650793 and parameters: {'n_estimators': 181, 'learning_rate': 0.06611816875700782, 'max_depth': 11, 'num_leaves': 44, 'min_child_samples': 3, 'subsample': 0.5624680948195944, 'colsample_bytree': 0.7301864281809478, 'reg_alpha': 0.10997450452659199, 'reg_lambda': 4.0617803221958314e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number 

[I 2025-07-12 11:32:03,475] Trial 25 finished with value: 0.42857142857142855 and parameters: {'n_estimators': 204, 'learning_rate': 0.0929716978513467, 'max_depth': 10, 'num_leaves': 65, 'min_child_samples': 6, 'subsample': 0.555618524534667, 'colsample_bytree': 0.6244720841503089, 'reg_alpha': 1.3094357055488899, 'reg_lambda': 3.9852592824047303e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Total Bins 2014
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2014
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B

[I 2025-07-12 11:32:03,788] Trial 26 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 250, 'learning_rate': 0.007983069453077288, 'max_depth': 12, 'num_leaves': 118, 'min_child_samples': 13, 'subsample': 0.508040907286742, 'colsample_bytree': 0.7354363681865353, 'reg_alpha': 0.022035473484929607, 'reg_lambda': 5.732872996659405e-08}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1878
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1878
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of 

[I 2025-07-12 11:32:03,974] Trial 27 finished with value: 0.5225885225885226 and parameters: {'n_estimators': 129, 'learning_rate': 0.09196960309491482, 'max_depth': 14, 'num_leaves': 49, 'min_child_samples': 23, 'subsample': 0.5006695902761314, 'colsample_bytree': 0.797790771856623, 'reg_alpha': 0.3193633245047678, 'reg_lambda': 7.418683434867378e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517


[I 2025-07-12 11:32:04,190] Trial 28 finished with value: 0.5079365079365079 and parameters: {'n_estimators': 85, 'learning_rate': 0.08417796696039095, 'max_depth': 14, 'num_leaves': 78, 'min_child_samples': 3, 'subsample': 0.5518194962169984, 'colsample_bytree': 0.8068019945580027, 'reg_alpha': 0.3389908567376643, 'reg_lambda': 9.3344796389766e-07}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:04,296] Trial 29 finished with value: 0.16666666666666666 and parameters: {'n_estimators': 127, 'learning_rate': 0.008324307679016613, 'max_depth': 14, 'num_leaves': 55, 'min_child_samples': 25, 'subsample': 0.5072499230247075, 'colsample_bytree': 0.7619233057847223, 'reg_alpha': 9.932842861124495, 'reg_lambda': 1.6340834308365454e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1868
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number o

[I 2025-07-12 11:32:04,574] Trial 30 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 197, 'learning_rate': 0.04798809912316574, 'max_depth': 12, 'num_leaves': 108, 'min_child_samples': 26, 'subsample': 0.5538029545160461, 'colsample_bytree': 0.5750928892986106, 'reg_alpha': 0.42932789732908094, 'reg_lambda': 5.333249110615037e-08}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:32:04,702] Trial 31 finished with value: 0.5793650793650793 and parameters: {'n_estimators': 61, 'learning_rate': 0.09277912896761566, 'max_depth': 14, 'num_leaves': 79, 'min_child_samples': 4, 'subsample': 0.5467594334318817, 'colsample_bytree': 0.8171870297520896, 'reg_alpha': 0.20146135062453105, 'reg_lambda': 7.178454382370265e-07}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:04,808] Trial 32 finished with value: 0.5079365079365079 and parameters: {'n_estimators': 49, 'learning_rate': 0.09880289602501152, 'max_depth': 14, 'num_leaves': 45, 'min_child_samples': 11, 'subsample': 0.6211477231579943, 'colsample_bytree': 0.8528413651866452, 'reg_alpha': 0.01772846667413602, 'reg_lambda': 1.4019027158798916e-08}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2099
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 177
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2099
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 177
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number 

[I 2025-07-12 11:32:04,930] Trial 33 finished with value: 0.37806637806637805 and parameters: {'n_estimators': 73, 'learning_rate': 0.03969596201582151, 'max_depth': 15, 'num_leaves': 72, 'min_child_samples': 11, 'subsample': 0.5021684098251796, 'colsample_bytree': 0.7731275189064415, 'reg_alpha': 2.0223060520431257, 'reg_lambda': 3.5157169733197464e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1951
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1951
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info

[I 2025-07-12 11:32:05,185] Trial 34 finished with value: 0.32857142857142857 and parameters: {'n_estimators': 125, 'learning_rate': 0.011863372902871958, 'max_depth': 14, 'num_leaves': 121, 'min_child_samples': 38, 'subsample': 0.5714198988839558, 'colsample_bytree': 0.8003200090258911, 'reg_alpha': 0.09144310272262848, 'reg_lambda': 2.6255917918686982e-06}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1918
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1918
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info

[I 2025-07-12 11:32:05,307] Trial 35 finished with value: 0.0 and parameters: {'n_estimators': 19, 'learning_rate': 0.0018794362300085302, 'max_depth': 12, 'num_leaves': 95, 'min_child_samples': 16, 'subsample': 0.5356774968854087, 'colsample_bytree': 0.737502131631936, 'reg_alpha': 1.704172183363061, 'reg_lambda': 5.495372444303906e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1918
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 83
[LightGBM] [Info] [binary:BoostF

[I 2025-07-12 11:32:05,494] Trial 36 finished with value: 0.47130647130647124 and parameters: {'n_estimators': 177, 'learning_rate': 0.0553358672591853, 'max_depth': 9, 'num_leaves': 132, 'min_child_samples': 40, 'subsample': 0.6881643968321962, 'colsample_bytree': 0.6476115416654286, 'reg_alpha': 0.010165443896473395, 'reg_lambda': 1.2913031994270237e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1821
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1821
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 80
[LightGBM] [Info] [binary:BoostFro

[I 2025-07-12 11:32:05,723] Trial 37 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 117, 'learning_rate': 0.021011875977160078, 'max_depth': 15, 'num_leaves': 189, 'min_child_samples': 48, 'subsample': 0.5748535665951495, 'colsample_bytree': 0.8320141240915486, 'reg_alpha': 0.14188515821881625, 'reg_lambda': 6.210938220497454e-06}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info

[I 2025-07-12 11:32:06,243] Trial 38 finished with value: 0.0 and parameters: {'n_estimators': 228, 'learning_rate': 1.3298911390379226e-06, 'max_depth': 10, 'num_leaves': 88, 'min_child_samples': 3, 'subsample': 0.5295275361452662, 'colsample_bytree': 0.9526484664683161, 'reg_alpha': 0.0006575468958690703, 'reg_lambda': 3.8703119753124887e-08}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1983
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1983
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info

[I 2025-07-12 11:32:06,508] Trial 39 finished with value: 0.3841269841269841 and parameters: {'n_estimators': 72, 'learning_rate': 0.00855758593398567, 'max_depth': 13, 'num_leaves': 227, 'min_child_samples': 8, 'subsample': 0.7692729815886961, 'colsample_bytree': 0.7804840417672669, 'reg_alpha': 0.19724555024133492, 'reg_lambda': 2.0314091453876986e-06}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:06,606] Trial 40 finished with value: 0.0 and parameters: {'n_estimators': 48, 'learning_rate': 3.553204505957047e-05, 'max_depth': 9, 'num_leaves': 42, 'min_child_samples': 15, 'subsample': 0.9223570294696986, 'colsample_bytree': 0.8542941884379854, 'reg_alpha': 0.9415069771106691, 'reg_lambda': 1.3036280437132262e-05}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:06,741] Trial 41 finished with value: 0.5079365079365079 and parameters: {'n_estimators': 91, 'learning_rate': 0.0971043697394403, 'max_depth': 14, 'num_leaves': 81, 'min_child_samp

[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 106
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 106
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number o

[I 2025-07-12 11:32:06,882] Trial 42 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 72, 'learning_rate': 0.05446676071564176, 'max_depth': 14, 'num_leaves': 71, 'min_child_samples': 8, 'subsample': 0.5694390401004045, 'colsample_bytree': 0.7881989736590127, 'reg_alpha': 3.4060921384742353, 'reg_lambda': 8.516853424259674e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1983
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1983
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number o

[I 2025-07-12 11:32:07,128] Trial 43 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 174, 'learning_rate': 0.02333269375387936, 'max_depth': 13, 'num_leaves': 79, 'min_child_samples': 24, 'subsample': 0.5254779902357264, 'colsample_bytree': 0.7357235428547656, 'reg_alpha': 0.03582210891452675, 'reg_lambda': 3.162916075645392e-06}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406


[I 2025-07-12 11:32:07,304] Trial 44 finished with value: 0.5793650793650793 and parameters: {'n_estimators': 104, 'learning_rate': 0.059296915964562345, 'max_depth': 15, 'num_leaves': 22, 'min_child_samples': 3, 'subsample': 0.5927759428616858, 'colsample_bytree': 0.8189487005360685, 'reg_alpha': 0.34783106660280527, 'reg_lambda': 2.4315915395396065e-07}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2161
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Inf

[I 2025-07-12 11:32:07,406] Trial 45 finished with value: 0.4386724386724386 and parameters: {'n_estimators': 114, 'learning_rate': 0.051195995329301774, 'max_depth': 15, 'num_leaves': 24, 'min_child_samples': 36, 'subsample': 0.6207235641802464, 'colsample_bytree': 0.8444300052935388, 'reg_alpha': 4.449319537298403, 'reg_lambda': 2.0824050958150181e-07}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:07,531] Trial 46 finished with value: 0.48412698412698413 and parameters: {'n_estimators': 130, 'learning_rate': 0.021466515332207843, 'max_depth': 15, 'num_leaves': 6, 'min_child_samples': 18, 'subsample': 0.5904322551812387, 'colsample_bytree': 0.7032994146087561, 'reg_alpha': 0.009294376599246996, 'reg_lambda': 3.8044712399430286e-08}. Best is trial 24 with value: 0.5793650793650793.
[I 2025-07-12 11:32:07,633] Trial 47 finished with value: 0.0 and parameters: {'n_estimators': 33, 'learning_rate': 0.0053615499655660075, 'max_depth': 10, 'num_leaves': 55, 'min_chil

[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1913
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1913
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] Start training from score -0.585517
[LightGBM] [Info] Number o

[I 2025-07-12 11:32:07,889] Trial 48 finished with value: 0.0 and parameters: {'n_estimators': 212, 'learning_rate': 0.000545958092139341, 'max_depth': 13, 'num_leaves': 20, 'min_child_samples': 28, 'subsample': 0.6684670664428018, 'colsample_bytree': 0.7569702151168766, 'reg_alpha': 0.13979419787419872, 'reg_lambda': 8.274293602350496e-06}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 39, number of negative: 98
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.284672 -> initscore=-0.921406
[LightGBM] [Info] Start training from score -0.921406
[LightGBM] [Info] Number of positive: 49, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357664 -> initscore=-0.585517
[LightGBM] [Info] 

[I 2025-07-12 11:32:08,108] Trial 49 finished with value: 0.0 and parameters: {'n_estimators': 169, 'learning_rate': 0.00012951608828948337, 'max_depth': 13, 'num_leaves': 41, 'min_child_samples': 7, 'subsample': 0.6375624202443606, 'colsample_bytree': 0.9180533037815783, 'reg_alpha': 0.001916903645218976, 'reg_lambda': 2.5638701327825622e-05}. Best is trial 24 with value: 0.5793650793650793.


[LightGBM] [Info] Number of positive: 32, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1992
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233577 -> initscore=-1.188224
[LightGBM] [Info] Start training from score -1.188224
Tuned LGBMClassifier (MultiOutput):
Accuracy: 0.21428571428571427
Precision: 0.6547619047619048
Recall: 0.35185185185185186
F1 Score: 0.3988095238095238


In [13]:
joblib.dump(model, os.path.join(MODEL_PATH, f'best_lgbm-moc_model_on_crpwarner_opcode_freq.pkl'))

['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_lgbm-moc_model_on_crpwarner_opcode_freq.pkl']

### K-Fold (K=3)

In [14]:
NUM_FOLDS = 3
results = []
best_model = MultiOutputClassifier(LGBMClassifier(**study.best_params, random_state=42))
best_f1 = 0
best_fold = 0

In [15]:
for fold in range(NUM_FOLDS):
    print(f"=========== Fold-{fold} ===========")
    train_path = os.path.join(DATA_PATH, f'train_fold_{fold}-opcode-freq.csv')
    val_path = os.path.join(DATA_PATH, f'val_fold_{fold}-opcode-freq.csv')

    train_df = pd.read_csv(train_path)
    val_df   = pd.read_csv(val_path)

    X_train = train_df[feature_list]
    y_train = train_df[labels]

    X_val = val_df[feature_list]
    y_val = val_df[labels]

    # Train model
    model = MultiOutputClassifier(LGBMClassifier(**study.best_params, random_state=42))
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    report = classification_report(y_val, y_pred, target_names=labels, output_dict=True)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

    results.append({'fold': fold, 'accuracy': acc, 'report': report})
    print(f"Accuracy: {acc}")
    print("Precision:", precision_score(y_val, y_pred, average="macro", zero_division=0))
    print("Recall:", recall_score(y_val, y_pred, average="macro", zero_division=0))
    print("F1 Score:", f1)

    if best_f1 < f1:
        best_model = model
        best_fold = fold
## Step 6: Average Performance Summary
print("\n===== Overall Summary =====")
avg_acc = sum([r['accuracy'] for r in results]) / NUM_FOLDS
print(f"Average Accuracy: {avg_acc:.4f}")

# Save model
joblib.dump(best_model, os.path.join(MODEL_PATH, f'best_lgbm-ovr_model_on_crpwarner_opcode_freq_from_fold{best_fold}.pkl'))

[LightGBM] [Info] Number of positive: 13, number of negative: 33
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.282609 -> initscore=-0.931558
[LightGBM] [Info] Start training from score -0.931558
[LightGBM] [Info] Number of positive: 6, number of negative: 40
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 46, number of used features: 182
[LightGBM] [Info] [binary:BoostFrom

['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_lgbm-ovr_model_on_crpwarner_opcode_freq_from_fold2.pkl']