# Split Dataset -- Opcode Frequency

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

## CRPWarner

In [2]:
# Name of the sub-directory under data/interim that holds the input files.
source = "crpwarner"

# Labeled dataset file (alternative: sample_dataset-modified.csv).
dataset = "dataset-modified.csv"

# Opcode-frequency feature file (alternative: large-feature-opcode-frequency.csv).
feature = "groundtruth-feature-opcode-frequency.csv"

In [3]:
# Root directory: two levels above the current working directory.
PATH = Path.cwd().parents[1]

# data/ layout: interim/<source>/ is read from, processed/ is written to.
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH = os.path.join(DATA_PATH, 'interim')
PROCESS_PATH = os.path.join(DATA_PATH, 'processed')
CRP_PATH = os.path.join(IN_PATH, source)

### Load Labeled Dataset

In [4]:
# Read the labeled dataset selected in the configuration cell and preview it.
df = pd.read_csv(Path(CRP_PATH) / dataset)
df.head()

Unnamed: 0,address,mint,leak,limit
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1
2,0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0
3,0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0
4,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1


In [5]:
# Column dtypes and non-null counts of the labeled dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   address  69 non-null     object
 1   mint     69 non-null     int64 
 2   leak     69 non-null     int64 
 3   limit    69 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.3+ KB


In [6]:
# Read the opcode-frequency feature table selected in the configuration cell and preview it.
feature_df = pd.read_csv(Path(CRP_PATH) / feature)
feature_df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,...,UNKNOWN_0xc6,UNKNOWN_0xe1,INVALID_0x70,PUSH30,DUP16,UNKNOWN_0x2b,UNKNOWN_0xd8,INVALID_0x7a,UNKNOWN_0xf9,INVALID_0x7f
0,0x8275ebf521dc217aa79c88132017a5bcef001dd9,386,112,10,7,126,53,17,1,209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x292e89d5d5bdab3af2f5838c194c1983f0140b43,413,115,10,8,188,63,7,0,89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,660,176,23,13,378,101,7,0,157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0xe1a0ce8b94c6a5e4791401086763d7bd0a6c18f5,264,77,8,13,116,36,15,0,103,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x9db8a10c7fe60d84397860b3af2e686d4f90c2b7,801,248,20,29,312,100,35,0,310,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Attach features to labels by address; the inner join keeps only addresses
# that appear in both tables.
dataset_df = df.merge(feature_df, on="address", how="inner")

In [8]:
# Preview the first rows of the merged label + feature table.
dataset_df.head()

Unnamed: 0,address,mint,leak,limit,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,...,UNKNOWN_0xc6,UNKNOWN_0xe1,INVALID_0x70,PUSH30,DUP16,UNKNOWN_0x2b,UNKNOWN_0xd8,INVALID_0x7a,UNKNOWN_0xf9,INVALID_0x7f
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1,405,119,14,9,255,67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1,660,176,23,13,378,101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0,16,2,2,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0,489,107,9,10,469,96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1,725,150,18,18,694,115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Row count, column count and dtype summary of the merged table.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Columns: 269 entries, address to INVALID_0x7f
dtypes: float64(199), int64(69), object(1)
memory usage: 145.1+ KB


In [10]:
labels = ["mint", "leak", "limit"]

# Multi-label targets, in the order given by `labels`.
y = dataset_df[labels]

# Feature matrix: every column except the address and the label columns.
X = dataset_df.drop(columns=["address", *labels])

# Ordered list of the opcode feature column names.
feature_list = X.columns.tolist()


In [11]:
# Create the output directory if it is missing so the writes below do not
# fail when the notebook is run on a fresh checkout.
os.makedirs(PROCESS_PATH, exist_ok=True)

# Both frames are written with their row index as the first CSV column
# (no index=False here, unlike the train/test files saved later).
X.to_csv(os.path.join(PROCESS_PATH, 'X_features-opcode-freq_only.csv'))
y.to_csv(os.path.join(PROCESS_PATH, 'Y_labels_only-opcode-freq.csv'))

# Persist the feature column order and the label names as JSON lists.
with open(os.path.join(PROCESS_PATH, 'feature-opcode-freq_list.json'), "w") as f:
    json.dump(feature_list, f, indent=2)

with open(os.path.join(PROCESS_PATH, 'labels-opcode-freq.json'), "w") as f:
    json.dump(labels, f, indent=2)

### Train/Test Split

#### Split into train/test

In [12]:
# Hold out 20% of the rows as the test set (80% train); rows are shuffled
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

#### Apply SMOTETomek to each label and collect (X, y_label)

In [13]:
# Resample the training split once per label. Each collected frame holds the
# resampled features plus that single label column.
Xy_resampled = []

for label, target in y_train.items():
    X_res, y_res = SMOTETomek(random_state=42).fit_resample(X_train, target)
    Xy_resampled.append(
        pd.concat([pd.DataFrame(X_res), pd.DataFrame({label: y_res})], axis=1)
    )

#### Merge all (X + y_label) together

In [14]:
# Stack the per-label frames row-wise with a fresh index. Each frame carries
# only its own label column, so the other label columns are NaN for its rows.
df_full = pd.concat(Xy_resampled, ignore_index=True)

#### Group by features and aggregate all label columns (preserve 1 if any resampled label has it)

In [15]:
# Select feature columns by name rather than by position, so the result does
# not depend on the label columns being the trailing columns of df_full.
label_cols = y_train.columns
feature_cols = df_full.columns.drop(label_cols)

In [16]:
# Collapse rows that share an identical feature vector into a single row,
# taking the maximum of each label column so a label is 1 if any merged row
# has it. A label with no value in a group stays NaN here and is zero-filled
# when X and y are separated afterwards.
df_final = df_full.groupby(list(feature_cols), as_index=False)[label_cols.tolist()].max()

#### Separate back X and y

In [17]:
# Rebuild the training features and labels from the aggregated frame:
# missing values become 0 and every column is cast to int.
# NOTE(review): astype(int) truncates any non-integer feature values that
# resampling may have produced, and zero-filling marks labels that were never
# resampled for a row as 0 — confirm both are intended.
X_train, y_train = (
    df_final[cols].fillna(0).astype(int) for cols in (feature_cols, label_cols)
)

In [18]:
# Put features and labels side by side so each saved file is self-contained.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Write both splits without the row index.
for frame, filename in (
    (train_df, 'train-opcode-freq.csv'),
    (test_df, 'test-opcode-freq.csv'),
):
    frame.to_csv(os.path.join(PROCESS_PATH, filename), index=False)

print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 137
Test size: 14


In [19]:
# Preview the first rows of the resampled training table (features followed by labels).
train_df.head()

Unnamed: 0,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,DIV,...,PUSH30,DUP16,UNKNOWN_0x2b,UNKNOWN_0xd8,INVALID_0x7a,UNKNOWN_0xf9,INVALID_0x7f,mint,leak,limit
0,1,0,2,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,16,2,2,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40,5,5,2,32,10,2,0,7,0,...,0,1,1,0,0,0,0,0,0,0
3,53,7,5,2,43,9,2,0,8,0,...,0,0,0,0,0,0,0,0,0,0
4,178,32,7,7,134,41,6,0,66,0,...,0,0,0,0,0,0,0,0,0,0


### K-Fold Split

In [20]:
# Features and labels side by side so each fold file is self-contained.
# The folds are drawn from the full X / y, independently of the train/test
# split made earlier in this notebook.
X_y = pd.concat([X, y], axis=1)

# The stratifier is built without a random_state, so seed NumPy's global RNG
# to keep fold membership stable across runs (same seed, 42, as the other
# stochastic steps in this notebook).
# NOTE(review): this assumes the installed scikit-multilearn draws its random
# choices from NumPy's global RNG — confirm against the installed version.
np.random.seed(42)
stratifier = IterativeStratification(n_splits=3, order=1)

for fold, (train_idx, val_idx) in enumerate(stratifier.split(X, y)):
    # Positional indices from the stratifier select the rows of each fold.
    train_fold = X_y.iloc[train_idx]
    val_fold = X_y.iloc[val_idx]
    train_fold.to_csv(os.path.join(PROCESS_PATH, f'train_fold_{fold}-opcode-freq.csv'), index=False)
    val_fold.to_csv(os.path.join(PROCESS_PATH, f'val_fold_{fold}-opcode-freq.csv'), index=False)

print("✅ Saved 3-fold CV sets (train_fold_*-opcode-freq.csv, val_fold_*-opcode-freq.csv)")

✅ Saved 3-fold CV sets (train_fold_*-opcode-freq.csv, val_fold_*-opcode-freq.csv)
