# Time series for multi-label

## Set up

In [1]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Dense, Masking
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

from utils.threshold import tune_thresholds
from utils.comparing import evaluate_multilabel_classification

PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/labeled')
TXN_PATH = os.path.join(DATA_PATH, 'txn')

2025-07-27 10:09:38.397431: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
seq_len = 500
feature_dim = 5

In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, 'groundtruth.csv')).set_index('Address')
labels_name = [col for col in df.columns]
X, y = [], []

for path in tqdm(list(Path(TXN_PATH).glob('*.json'))):
    addr = path.stem
    with open(path) as f:
        data = json.load(f)
    txns = sorted(data.get("transaction", []), key=lambda x: int(x.get("timeStamp", 0)))
    seq = [[
        int(tx.get("gas", 0)),
        int(tx.get("gasPrice", 0)),
        int(tx.get("value", 0)),
        int(tx.get("isError", 0)),
        int(tx.get("txreceipt_status", 0))
    ] for tx in txns]

    if len(seq) < seq_len:
        seq += [[0]*feature_dim] * (seq_len - len(seq))
    else:
        seq = seq[:seq_len]

    seq = MinMaxScaler().fit_transform(seq)
    X.append(seq)
    y.append(df.loc[addr].tolist())

100%|██████████| 69/69 [00:00<00:00, 81.74it/s]


In [4]:
X = np.array(X)
y = np.array(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
len(y_train), len(y_test)

(55, 14)

## GRU

In [7]:
# === Model ===
model = Sequential([
    Masking(mask_value=0.0, input_shape=(seq_len, feature_dim)),
    GRU(64),
    Dense(3, activation="sigmoid")
])

  super().__init__(**kwargs)


In [8]:
model.compile(loss=BinaryCrossentropy(), optimizer=Adam(1e-3), metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, verbose=1)

Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 193ms/step - accuracy: 0.6413 - loss: 0.6767
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step - accuracy: 0.5407 - loss: 0.6664
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - accuracy: 0.5720 - loss: 0.6552
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - accuracy: 0.4402 - loss: 0.6515
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - accuracy: 0.4714 - loss: 0.6411
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - accuracy: 0.4350 - loss: 0.6325
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step - accuracy: 0.4021 - loss: 0.6233
Epoch 8/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - accuracy: 0.4437 - loss: 0.6079
Epoch 9/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x141634140>

In [9]:
# === Predict ===
prob = model.predict(X_test)
thresholds, _ = tune_thresholds(y_test, prob)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
Label 0: Best threshold = 0.35, Best f1 = 0.4615
Label 1: Best threshold = 0.02, Best f1 = 0.5000
Label 2: Best threshold = 0.41, Best f1 = 0.6667


### Result

In [10]:
evaluate_multilabel_classification(y_test, prob, labels_name, thresholds)

({'micro_precision': 0.39285714285714285,
  'micro_recall': 0.9166666666666666,
  'micro_f1': 0.55,
  'macro_precision': 0.38888888888888884,
  'macro_recall': 0.9166666666666666,
  'macro_f1': 0.5427350427350427,
  'weighted_precision': 0.40277777777777773,
  'weighted_recall': 0.9166666666666666,
  'weighted_f1': 0.5566239316239315,
  'subset_accuracy': 0.21428571428571427},
               precision    recall  f1-score  support
 Mint           0.333333  0.750000  0.461538      4.0
 Leak           0.333333  1.000000  0.500000      3.0
 Limit          0.500000  1.000000  0.666667      5.0
 micro avg      0.392857  0.916667  0.550000     12.0
 macro avg      0.388889  0.916667  0.542735     12.0
 weighted avg   0.402778  0.916667  0.556624     12.0
 samples avg    0.333333  0.571429  0.400000     12.0)

## LSTM

In [11]:
# === Model ===
model = Sequential([
    Masking(mask_value=0.0, input_shape=(seq_len, feature_dim)),
    LSTM(64),
    Dense(3, activation="sigmoid")
])

  super().__init__(**kwargs)


In [12]:
model.compile(loss=BinaryCrossentropy(), optimizer=Adam(1e-3), metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, verbose=1)

Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 332ms/step - accuracy: 0.4437 - loss: 0.6945
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 319ms/step - accuracy: 0.2426 - loss: 0.6831
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 366ms/step - accuracy: 0.2409 - loss: 0.6727
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 352ms/step - accuracy: 0.2409 - loss: 0.6628
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 391ms/step - accuracy: 0.2305 - loss: 0.6482
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 394ms/step - accuracy: 0.2305 - loss: 0.6323
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 361ms/step - accuracy: 0.2617 - loss: 0.6193
Epoch 8/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 481ms/step - accuracy: 0.2409 - loss: 0.6152
Epoch 9/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x14165fad0>

In [13]:
# === Predict ===
prob = model.predict(X_test)
thresholds, _ = tune_thresholds(y_test, prob)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
Label 0: Best threshold = 0.38, Best f1 = 0.4615
Label 1: Best threshold = 0.09, Best f1 = 0.5000
Label 2: Best threshold = 0.45, Best f1 = 0.6154


### Result

In [14]:
evaluate_multilabel_classification(y_test, prob, labels_name, thresholds)

({'micro_precision': 0.38461538461538464,
  'micro_recall': 0.8333333333333334,
  'micro_f1': 0.5263157894736842,
  'macro_precision': 0.38888888888888884,
  'macro_recall': 0.85,
  'macro_f1': 0.5256410256410257,
  'weighted_precision': 0.40277777777777773,
  'weighted_recall': 0.8333333333333334,
  'weighted_f1': 0.5352564102564102,
  'subset_accuracy': 0.14285714285714285},
               precision    recall  f1-score  support
 Mint           0.333333  0.750000  0.461538      4.0
 Leak           0.333333  1.000000  0.500000      3.0
 Limit          0.500000  0.800000  0.615385      5.0
 micro avg      0.384615  0.833333  0.526316     12.0
 macro avg      0.388889  0.850000  0.525641     12.0
 weighted avg   0.402778  0.833333  0.535256     12.0
 samples avg    0.333333  0.535714  0.376190     12.0)