# Baseline Model using Graph Stat for binary

## Set up

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score


In [3]:
PATH = Path.cwd().parents[2]
DATA_PATH = os.path.join(PATH, 'data/processed/graphs_stat')

## List Traditional ML model to compare

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

models = {
  "LogisticRegression()": LogisticRegression(),
  "DecisionTreeClassifier()": DecisionTreeClassifier(),
  "RandomForestClassifier()": RandomForestClassifier(),
  "AdaBoostClassifier()": AdaBoostClassifier(),
  "ExtraTreesClassifier()": ExtraTreesClassifier(),
  "XGBClassifier()": XGBClassifier(),
  "LGBMClassifier()": LGBMClassifier(),
  "SVC()": SVC(),
  "GaussianNB()": GaussianNB(),
  "KNeighborsClassifier()": KNeighborsClassifier(),
  "SGDClassifier()": SGDClassifier(),
  "MLPClassifier()": MLPClassifier(),
}

## Load Dataset

In [5]:
df = pd.read_csv(os.path.join(DATA_PATH, 'dataset.csv')).set_index('address')

with open(os.path.join(DATA_PATH, 'features.json'), "r") as f:
    features = json.load(f)

with open(os.path.join(DATA_PATH, 'labels.json'), "r") as f:
    labels = json.load(f)

In [6]:
df.head()

Unnamed: 0_level_0,mint,leak,limit,num_nodes,num_edges,avg_degree,density,connected_components,avg_clustering
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1,234,113,0.965812,0.002073,142,0.005698
0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1,339,178,1.050147,0.001553,192,0.003933
0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0,9,4,0.888889,0.055556,5,0.0
0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0,477,145,0.607966,0.000639,347,0.0
0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1,649,170,0.523883,0.000404,499,0.0


In [7]:
features[:5]

['num_nodes', 'num_edges', 'avg_degree', 'density', 'connected_components']

In [8]:
labels[:5]

['mint', 'leak', 'limit']

## Prepare Features and Labels

In [9]:
X = df[features]
y_mint = df[['mint']]
y_leak = df[['leak']]
y_limit = df[['limit']]

## Train/Test Split

In [10]:
X_mint_train, X_mint_test, y_mint_train, y_mint_test = train_test_split(
    X, y_mint, test_size=0.2, random_state=42
)

X_leak_train, X_leak_test, y_leak_train, y_leak_test = train_test_split(
    X, y_leak, test_size=0.2, random_state=42
)

X_limit_train, X_limit_test, y_limit_train, y_limit_test = train_test_split(
    X, y_limit, test_size=0.2, random_state=42
)

## Run all models and collect reports

In [11]:
def get_report_all_ml(X_train, y_train, X_test, y_test):
    report_list = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Generate classification report (as dict)
        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        # Average scores across all labels (macro average)
        avg_scores = report_dict["macro avg"]

        report_list.append({
            "Model": name,
            "Precision": avg_scores["precision"],
            "Recall": avg_scores["recall"],
            "F1-score": avg_scores["f1-score"]
        })

    df_report = pd.DataFrame(report_list)
    df_report = df_report.sort_values("F1-score", ascending=False).reset_index(drop=True)

    return df_report

In [12]:
def display_cm(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [0, 1])
    return cm_display

## Show report

In [13]:
data = {
  'mint': {
    "X_train": X_mint_train,
    "X_test": X_mint_test,
    "y_train": y_mint_train,
    "y_test": y_mint_test
  },
  'leak': {
    "X_train": X_leak_train,
    "X_test": X_leak_test,
    "y_train": y_leak_train,
    "y_test": y_leak_test
  },
  'limit': {
    "X_train": X_limit_train,
    "X_test": X_limit_test,
    "y_train": y_limit_train,
    "y_test": y_limit_test
  }
}

reports = []

for d in data.values():
    X_train = d['X_train']
    X_test = d['X_test']
    y_train = d['y_train']
    y_test = d['y_test']
    df_report = get_report_all_ml(X_train, y_train, X_test, y_test)
    reports.append(df_report)

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000640 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 109
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 6, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.109091 -> initscore=-2.100061
[LightGBM] [Info] Start t

In [14]:
reports[0]

Unnamed: 0,Model,Precision,Recall,F1-score
0,KNeighborsClassifier(),0.9,0.833333,0.844444
1,AdaBoostClassifier(),0.785714,0.791667,0.784615
2,RandomForestClassifier(),0.785714,0.791667,0.784615
3,DecisionTreeClassifier(),0.708333,0.708333,0.708333
4,XGBClassifier(),0.708333,0.708333,0.708333
5,ExtraTreesClassifier(),0.708333,0.708333,0.708333
6,LGBMClassifier(),0.833333,0.666667,0.65
7,MLPClassifier(),0.633333,0.625,0.625668
8,GaussianNB(),0.772727,0.6875,0.625668
9,SVC(),0.285714,0.5,0.363636


In [15]:
reports[1]

Unnamed: 0,Model,Precision,Recall,F1-score
0,DecisionTreeClassifier(),0.666667,0.621212,0.634783
1,LGBMClassifier(),0.666667,0.621212,0.634783
2,SGDClassifier(),0.525,0.530303,0.52381
3,LogisticRegression(),0.392857,0.5,0.44
4,MLPClassifier(),0.392857,0.5,0.44
5,RandomForestClassifier(),0.392857,0.5,0.44
6,SVC(),0.392857,0.5,0.44
7,ExtraTreesClassifier(),0.392857,0.5,0.44
8,KNeighborsClassifier(),0.392857,0.5,0.44
9,AdaBoostClassifier(),0.384615,0.454545,0.416667


In [16]:
reports[2]

Unnamed: 0,Model,Precision,Recall,F1-score
0,DecisionTreeClassifier(),0.777778,0.777778,0.714286
1,RandomForestClassifier(),0.777778,0.777778,0.714286
2,AdaBoostClassifier(),0.777778,0.777778,0.714286
3,ExtraTreesClassifier(),0.777778,0.777778,0.714286
4,XGBClassifier(),0.666667,0.677778,0.641026
5,KNeighborsClassifier(),0.625,0.633333,0.625668
6,LogisticRegression(),0.466667,0.466667,0.428571
7,MLPClassifier(),0.692308,0.555556,0.377778
8,GaussianNB(),0.291667,0.388889,0.333333
9,LGBMClassifier(),0.30303,0.355556,0.270833


## MLP

### Build model

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import MeanSquaredError

2025-07-15 15:06:40.520755: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
def Model(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(512, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.4),

        Dense(256, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.3),

        Dense(128, kernel_regularizer=regularizers.l1_l2(1e-6)),
        BatchNormalization(),
        LeakyReLU(0.01),
        Dropout(0.2),

        Dense(output_dim, activation='sigmoid')  # sigmoid for multi-label
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-6),
        metrics=['accuracy']
    )
    return model

In [19]:
model = Model(input_dim=X.shape[1], output_dim=1)

### Train

In [20]:
model.fit(X_mint_train, y_mint_train, validation_split=0.2, epochs=100, batch_size=32,
                    callbacks=[
                                EarlyStopping(monitor='val_loss',
                                             patience=5,
                                             restore_best_weights=True),
                                ReduceLROnPlateau(
                                  monitor='val_loss',
                                  factor=0.5,
                                  patience=5,
                                  verbose=1)
                               ]
                    )

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 302ms/step - accuracy: 0.2964 - loss: 1.1274 - val_accuracy: 0.1818 - val_loss: 4.7056 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.2850 - loss: 1.1744 - val_accuracy: 0.1818 - val_loss: 3.5179 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.3002 - loss: 0.9865 - val_accuracy: 0.1818 - val_loss: 2.9802 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.3778 - loss: 1.0347 - val_accuracy: 0.1818 - val_loss: 2.4116 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.3826 - loss: 1.0152 - val_accuracy: 0.1818 - val_loss: 2.2059 - learning_rate: 1.0000e-06
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1326cd010>

### Predict

In [21]:
y_mint_test_prob = model.predict(X_mint_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


In [22]:
def tune_thresholds(y_true, y_pred_prob, metric='f1'):
    y_true = np.asarray(y_true)          # Fix: convert to NumPy
    y_pred_prob = np.asarray(y_pred_prob)

    best_thresholds = []
    best_scores = []

    for i in range(y_true.shape[1]):
        label_true = y_true[:, i]
        label_probs = y_pred_prob[:, i]  # Fix here too

        thresholds = np.linspace(0.0, 1.0, 101)
        scores = []

        for t in thresholds:
            label_pred = (label_probs >= t).astype(int)
            if metric == 'f1':
                score = f1_score(label_true, label_pred, zero_division=0)
            scores.append(score)

        best_t = thresholds[np.argmax(scores)]
        best_score = np.max(scores)

        best_thresholds.append(best_t)
        best_scores.append(best_score)

        print(f"Label {i}: Best threshold = {best_t:.2f}, Best {metric} = {best_score:.4f}")

    return best_thresholds, best_scores


In [23]:
best_thresholds, _ = tune_thresholds(y_mint_test, y_mint_test_prob)

Label 0: Best threshold = 0.46, Best f1 = 0.6316


In [24]:
y_mint_pred = (model.predict(X_mint_test) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


### Report

In [25]:
print(classification_report(y_mint_test, y_mint_pred))

              precision    recall  f1-score   support

           0       1.00      0.12      0.22         8
           1       0.46      1.00      0.63         6

    accuracy                           0.50        14
   macro avg       0.73      0.56      0.43        14
weighted avg       0.77      0.50      0.40        14



## Autoencoder + MPL

In [26]:
def Autoencoder(input_dim=256):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128),
        LeakyReLU(0.01),

        Dense(32),
        BatchNormalization(),
        LeakyReLU(0.01),

        Dense(128),
        BatchNormalization(),
        LeakyReLU(0.01),

        Dense(input_dim, activation='sigmoid')  # sigmoid for multi-label
    ])

    model.compile(optimizer=Adam(1e-6), loss=MeanSquaredError())
    return model


In [27]:
autoencoder = Autoencoder(input_dim=X.shape[1])
autoencoder.fit(X_mint_train, X_mint_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 207ms/step - loss: 72812.5312 - val_loss: 81898.8750
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 57756.7070 - val_loss: 81909.9141
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 72941.1016 - val_loss: 81909.2656
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 71992.7109 - val_loss: 81909.2031
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 71206.6250 - val_loss: 81909.5469
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 58771.4102 - val_loss: 81911.1406
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 61025.4883 - val_loss: 81912.9766
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 60093.8828 - val_loss: 81913.9609

<keras.src.callbacks.history.History at 0x1331d8da0>

In [28]:
X_train_encoded = autoencoder.predict(X_mint_train)
X_test_encoded = autoencoder.predict(X_mint_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [29]:
model = Model(input_dim=X_train_encoded.shape[1], output_dim=1)

In [30]:
model.fit(X_train_encoded, y_mint_train, validation_split=0.2, epochs=100, batch_size=32,
                    callbacks=[
                                EarlyStopping(monitor='val_loss',
                                             patience=5,
                                             restore_best_weights=True),
                                ReduceLROnPlateau(
                                  monitor='val_loss',
                                  factor=0.5,
                                  patience=5,
                                  verbose=1)
                               ]
                    )

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 302ms/step - accuracy: 0.5152 - loss: 0.7281 - val_accuracy: 0.3636 - val_loss: 0.7025 - learning_rate: 1.0000e-06
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.5152 - loss: 0.8184 - val_accuracy: 0.5455 - val_loss: 0.7005 - learning_rate: 1.0000e-06
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.5767 - loss: 0.7315 - val_accuracy: 0.7273 - val_loss: 0.6967 - learning_rate: 1.0000e-06
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.5000 - loss: 0.7900 - val_accuracy: 0.8182 - val_loss: 0.6921 - learning_rate: 1.0000e-06
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.7197 - loss: 0.5530 - val_accuracy: 0.8182 - val_loss: 0.6895 - learning_rate: 1.0000e-06
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x133556750>

In [31]:
y_pred_prob = model.predict(X_test_encoded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


In [32]:
best_thresholds, _ = tune_thresholds(y_mint_test, y_pred_prob)

Label 0: Best threshold = 0.00, Best f1 = 0.6000


In [33]:
y_pred = (model.predict(X_test) >= best_thresholds).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


### Report

In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.64      1.00      0.78         9

    accuracy                           0.64        14
   macro avg       0.32      0.50      0.39        14
weighted avg       0.41      0.64      0.50        14

