# All Models Fusion

## Set up

###  Install required packages.

In [1]:
# Export the running torch version as the TORCH environment variable so the
# shell commands below can build the matching PyG wheel-index URL from it.
import os
import torch
os.environ['TORCH'] = torch.__version__
# Sets PYTHONWARNINGS in this process' environment; presumably meant to quiet
# warnings in the child processes started below — TODO confirm.
os.environ['PYTHONWARNINGS'] = "ignore"
print(torch.__version__)
# torch-scatter / torch-sparse are resolved against the wheel page for ${TORCH}.
# NOTE(review): none of these installs pin a version; the last one tracks the
# PyG git default branch, so a re-run may install different code.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

2.6.0+cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_sparse-0.6.18%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt26cu124
Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pyto

### Unzip Data and Utils

In [2]:
# Extract utils.zip into the working directory; the utils/ package it creates
# is imported at the end of the notebook (utils.threshold, utils.comparing).
!unzip "utils.zip" -d '.'

Archive:  utils.zip
   creating: ./utils/
   creating: ./utils/__pycache__/
  inflating: ./__MACOSX/utils/.___pycache__  
  inflating: ./utils/threshold.py    
  inflating: ./utils/comparing.py    
  inflating: ./utils/__pycache__/comparing.cpython-312.pyc  
  inflating: ./__MACOSX/utils/__pycache__/._comparing.cpython-312.pyc  
  inflating: ./utils/__pycache__/threshold.cpython-312.pyc  
  inflating: ./__MACOSX/utils/__pycache__/._threshold.cpython-312.pyc  


In [3]:
# Extract data.zip into the working directory (creates ./data/...).
# NOTE(review): this prints one line per extracted file; consider `unzip -q`
# to keep the saved notebook output small.
!unzip "data.zip" -d '.'

[1;30;43mเอาต์พุตของการสตรีมมีการตัดเหลือเพียง 5000 บรรทัดสุดท้าย[0m
  inflating: ./__MACOSX/data/external/rphunter/Dataset/Incidents-Source-Code/2024-5-2-NovaMind/._@openzeppelin  
  inflating: ./data/external/rphunter/Dataset/Incidents-Source-Code/2021-11-25-Sloth Inu/0x85bcaee8befa419becb90e8b405d131ee9e31877.sol  
  inflating: ./__MACOSX/data/external/rphunter/Dataset/Incidents-Source-Code/2021-11-25-Sloth Inu/._0x85bcaee8befa419becb90e8b405d131ee9e31877.sol  
  inflating: ./data/external/rphunter/Dataset/Incidents-Source-Code/2021-8-26-Snakedefi/0xf882413338a88F3d41f0df4c31d8CA5d33e3e3d1.sol  
  inflating: ./__MACOSX/data/external/rphunter/Dataset/Incidents-Source-Code/2021-8-26-Snakedefi/._0xf882413338a88F3d41f0df4c31d8CA5d33e3e3d1.sol  
  inflating: ./data/external/rphunter/Dataset/Incidents-Source-Code/2022-9-22-SiriusToken/0x9bc699780A326C059C660C6ee2EF1D8583D01aEa.sol  
  inflating: ./__MACOSX/data/external/rphunter/Dataset/Incidents-Source-Code/2022-9-22-SiriusToken/._0x9b

### Imports and Variables

In [4]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
import os
import pandas as pd
from pathlib import Path
import json
import pickle
import numpy as np
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from torch_geometric.utils import from_networkx

PATH = './data/labeled'

## Load Data

### Train/Test Split

In [None]:
# Ground-truth table: first CSV column is the index, lower-cased so it can be
# matched against the (also lower-cased) feature tables.
y = pd.read_csv(os.path.join(PATH, 'groundtruth.csv'), index_col=0)
y.index = y.index.str.lower()

labels_name = list(y.columns)
addresses = y.index.to_list()

# Fixed 80/20 split over the index labels; the seed keeps it reproducible.
train_idx, test_idx = train_test_split(
    addresses,
    test_size=0.2,
    random_state=42,
)

# Set views of both partitions for O(1) membership checks.
train_idx_set = set(train_idx)
test_idx_set = set(test_idx)

y_train, y_test = y.loc[train_idx], y.loc[test_idx]

In [None]:
# Load features (ensure index is lowercase)
def load_feature(file):
    """Read one feature CSV from ``PATH`` and lower-case its index labels.

    Args:
        file: CSV file name, joined onto the module-level ``PATH``.

    Returns:
        A DataFrame indexed by the CSV's first column, with every index
        label converted to lower case.
    """
    csv_path = os.path.join(PATH, file)
    features = pd.read_csv(csv_path, index_col=0)
    features.index = features.index.str.lower()
    return features

In [None]:
# Load each feature table from PATH; load_feature lower-cases the index of
# every frame it returns.
transaction_feature_df = load_feature('transaction_feature.csv')
bytecode_feature_df = load_feature('bytecode_feature.csv')
# NOTE(review): presumably TF-IDF features computed from contract source code — confirm.
sourcecode_feature_df = load_feature('tf_idf.csv')
# Graph-level feature tables; these are the ones handed to load_graph as
# `graph_detail` for the transaction graphs and the CFG graphs respectively.
txn_graph_features_df = load_feature('txn_graph_features.csv')
cfg_graph_features_df = load_feature('cfg_graph_features.csv')

In [None]:
def load_graph(graphs, graph_detail, ground, labels_name):
    """Convert labelled networkx graphs into a list of PyG ``Data`` objects.

    Args:
        graphs: Mapping of address -> networkx graph (iterated via ``.items()``).
        graph_detail: DataFrame of graph-level features, indexed by address.
        ground: Ground-truth DataFrame, indexed by address.
        labels_name: Columns of ``ground`` used as the target vector.

    Returns:
        A list with one ``Data`` object per address found in ``ground.index``.
        ``x`` is the graph-level feature row repeated once per node and ``y``
        is the label row with a leading batch dimension (assumes both indexes
        are unique, so each lookup yields a single row).

    Raises:
        KeyError: If a labelled address has no row in ``graph_detail``.
    """
    dataset = []

    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): enumerate has no
    # len(), which would leave the progress bar without a total.
    for i, (address, graph_data) in enumerate(tqdm(graphs.items())):
        # Graphs without a ground-truth row are skipped.
        if address not in ground.index:
            continue

        # Graph-level feature row, shaped (1, n_features).
        feature = graph_detail.loc[address].values.reshape(1, -1)
        data = from_networkx(graph_data)
        # Every node receives the same graph-level feature vector.
        data.x = torch.tensor(feature, dtype=torch.float32).repeat(data.num_nodes, 1)
        # Single (row, columns) lookup; unsqueeze adds the batch dimension.
        data.y = torch.tensor(
            ground.loc[address, labels_name].values, dtype=torch.float32
        ).unsqueeze(0)

        # Preview: print graphs that sit in the first ten positions of `graphs`.
        if i < 10:
            print(data)
        dataset.append(data)
    return dataset

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load .pkl files from a trusted source.
# Context managers close each file handle as soon as the graphs are loaded.
with open(os.path.join(PATH, 'txn.pkl'), 'rb') as fh:
    txn_graph = pickle.load(fh)
txn_graph_dataset = load_graph(txn_graph, txn_graph_features_df, y, labels_name)

with open(os.path.join(PATH, 'cfg.pkl'), 'rb') as fh:
    cfg_graph = pickle.load(fh)
cfg_graph_dataset = load_graph(cfg_graph, cfg_graph_features_df, y, labels_name)

In [None]:
# for time series
seq_len = 500      # maximum number of transactions kept per address
feature_dim = 5    # gas, gasPrice, value, isError, txreceipt_status
X_ts, y_ts = [], []

for path in tqdm(list(Path(os.path.join(PATH, 'txn')).glob('*.json'))):
    # y.index is lower-cased, so normalise the file stem the same way before
    # looking it up; addresses without labels are skipped, as load_graph does.
    addr = path.stem.lower()
    if addr not in y.index:
        continue

    # Context manager so the file handle is closed on every iteration.
    with open(path) as fh:
        data = json.load(fh)

    # Chronological order; only the earliest `seq_len` transactions are used.
    txns = sorted(data.get("transaction", []), key=lambda x: int(x.get("timeStamp", 0)))
    rows = [[
        int(tx.get("gas", 0)),
        int(tx.get("gasPrice", 0)),
        int(tx.get("value", 0)),
        int(tx.get("isError", 0)),
        int(tx.get("txreceipt_status", 0))
    ] for tx in txns[:seq_len]]

    # Standardise the real transactions only and leave the padding at exactly
    # 0.0. Scaling after padding shifts the padded rows away from zero, so the
    # Masking(mask_value=0.0) layer of the GRU model would never skip them.
    # (A real timestep whose scaled features are all exactly 0.0 is masked too.)
    seq = np.zeros((seq_len, feature_dim), dtype=np.float64)
    if rows:  # StandardScaler cannot be fitted on an empty sequence
        seq[:len(rows)] = StandardScaler().fit_transform(
            np.asarray(rows, dtype=np.float64)
        )

    X_ts.append(seq)
    y_ts.append(y.loc[addr].tolist())

In [None]:
# For txn_static_feature_model
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# bytecode_static_feature_model
from sklearn.neural_network import MLPClassifier
# {'estimator__activation': 'tanh', 'estimator__alpha': 1e-05, 'estimator__early_stopping': True, 'estimator__hidden_layer_sizes': (200,), 'estimator__learning_rate_init': 0.001, 'estimator__solver': 'adam'}

In [None]:
# sourcecode_static_feature_model
from sklearn.linear_model import LogisticRegression
# max_iter=500, class_weight='balanced', random_state=4

In [None]:
# time series for multi-label
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Masking
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
# GRU
# === Model ===
# GRU classifier over the per-address transaction sequences.
# NOTE(review): Masking only skips timesteps whose features are all exactly
# 0.0 — confirm padded timesteps are still zero after preprocessing.
# The output width follows the number of label columns, so it always matches
# the length of each y_ts row instead of a hard-coded 3.
model = Sequential([
    Masking(mask_value=0.0, input_shape=(seq_len, feature_dim)),
    GRU(64),
    Dense(len(labels_name), activation="sigmoid")
])

In [None]:
# GCN
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GCN(nn.Module):
    """Two-layer GCN followed by mean pooling and a linear head.

    Args:
        in_channels: Size of each input node feature vector.
        hidden: Width of the second convolution; the first is twice as wide.
        out_channels: Size of the per-graph output vector.
    """

    def __init__(self, in_channels, hidden, out_channels):
        super().__init__()
        wide = hidden * 2
        self.conv1 = GCNConv(in_channels, wide)
        self.conv2 = GCNConv(wide, hidden)
        self.lin = nn.Linear(hidden, out_channels)

    def forward(self, data):
        """Return one output row per graph in ``data`` (no final activation)."""
        h = data.x
        edges = data.edge_index
        graph_ids = data.batch

        # Both convolutions are followed by a ReLU.
        for conv in (self.conv1, self.conv2):
            h = F.relu(conv(h, edges))

        # Average node embeddings per graph, then project to the outputs.
        pooled = global_mean_pool(h, graph_ids)
        return self.lin(pooled)

In [None]:
from utils.threshold import tune_thresholds
from utils.comparing import evaluate_multilabel_classification