# Split Dataset — Opcode N-Gram

## Import Libraries

In [1]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

## CRPWarner

In [2]:
# Sub-directory of data/interim that holds this dataset's files.
source = 'crpwarner'

# Labeled dataset file. Alternative: sample_dataset-modified.csv
dataset = 'dataset-modified.csv'

# Opcode n-gram feature file for the ground-truth contracts.
# NOTE(review): the previous comment listed "groundtruth-feature-opcode-n-gramuency.csv |
# large-feature-opcode-n-gramuency.csv", which looks like a find/replace slip; the
# large variant is presumably large-feature-opcode-n-gram.csv — TODO confirm.
feature = 'groundtruth-feature-opcode-n-gram.csv'

In [3]:
# Project root: two levels above the directory the notebook is run from.
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data')

# Output location for everything this notebook writes.
PROCESS_PATH = os.path.join(DATA_PATH, 'processed')

# Input location: data/interim/<source>.
IN_PATH = os.path.join(DATA_PATH, 'interim')
CRP_PATH = os.path.join(IN_PATH, source)

### Load Labeled Dataset

In [4]:
# Labeled contracts: one row per address with the mint / leak / limit flags.
labeled_csv = os.path.join(CRP_PATH, dataset)
df = pd.read_csv(labeled_csv)
df.head()

Unnamed: 0,address,mint,leak,limit
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1
2,0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0
3,0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0
4,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1


In [5]:
# Column dtypes and non-null counts of the labeled dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   address  69 non-null     object
 1   mint     69 non-null     int64 
 2   leak     69 non-null     int64 
 3   limit    69 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.3+ KB


In [6]:
# Opcode n-gram count features, keyed by contract address.
feature_csv = os.path.join(CRP_PATH, feature)
feature_df = pd.read_csv(feature_csv)
feature_df.head()

Unnamed: 0,address,add,add add,add add dup2,add add gt,add add mload,add add mstore,add add swap1,add add swap2,add add swap3,...,xor gasprice push1,xor invalid_0x72,xor mul,xor mul extcodehash,xor push32,xor push32 push9,xor sload,xor sload smod,xor unknown_0xe3,xor unknown_0xe3 push5
0,0x8275ebf521dc217aa79c88132017a5bcef001dd9,153,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0x292e89d5d5bdab3af2f5838c194c1983f0140b43,98,2,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,153,3,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0xe1a0ce8b94c6a5e4791401086763d7bd0a6c18f5,78,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0x9db8a10c7fe60d84397860b3af2e686d4f90c2b7,206,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Attach the n-gram features to the labels. The inner join keeps only
# addresses that appear in both frames.
dataset_df = df.merge(feature_df, on="address", how="inner")

In [8]:
# Preview the merged frame: address, the three labels, then the n-gram columns.
dataset_df.head()

Unnamed: 0,address,mint,leak,limit,add,add add,add add dup2,add add gt,add add mload,add add mstore,...,xor gasprice push1,xor invalid_0x72,xor mul,xor mul extcodehash,xor push32,xor push32 push9,xor sload,xor sload smod,xor unknown_0xe3,xor unknown_0xe3 push5
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1,101,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1,153,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0,148,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1,234,4,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Row/column counts and dtypes of the merged frame.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Columns: 13751 entries, address to xor unknown_0xe3 push5
dtypes: int64(13750), object(1)
memory usage: 7.2+ MB


In [10]:
# Multi-label targets.
labels = ["mint", "leak", "limit"]
y = dataset_df[labels]

# Feature matrix: everything except the address key and the label columns.
X = dataset_df.drop(columns=["address", *labels])

# Feature names in column order.
feature_list = X.columns.tolist()


In [11]:
# This is the first cell that writes to PROCESS_PATH. Create the directory
# up front so a fresh checkout does not fail on a missing data/processed.
os.makedirs(PROCESS_PATH, exist_ok=True)

# Feature matrix and label matrix as separate CSVs. Unlike the split files
# written with index=False elsewhere in this notebook, these keep the row index.
X.to_csv(os.path.join(PROCESS_PATH, 'X_features-opcode-n-gram_only.csv'))
y.to_csv(os.path.join(PROCESS_PATH, 'Y_labels_only-opcode-n-gram.csv'))

# Ordered feature names.
with open(os.path.join(PROCESS_PATH, 'feature-opcode-n-gram_list.json'), "w") as f:
    json.dump(feature_list, f, indent=2)

# Label names.
with open(os.path.join(PROCESS_PATH, 'labels-opcode-n-gram.json'), "w") as f:
    json.dump(labels, f, indent=2)

### Train/Test Split (no stratification)

In [12]:
# Shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Put features and labels back side by side so each CSV is self-contained.
train_df = pd.concat([X_train, y_train], axis="columns")
test_df = pd.concat([X_test, y_test], axis="columns")

# Write both partitions without the row index.
train_csv = os.path.join(PROCESS_PATH, 'train-opcode-n-gram.csv')
test_csv = os.path.join(PROCESS_PATH, 'test-opcode-n-gram.csv')
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 55
Test size: 14


### K-Fold Split (iterative multi-label stratification)

In [13]:
# Features and labels side by side, so every fold file is self-contained.
X_y = pd.concat([X, y], axis=1)

# The stratifier is created without a seed. Seed NumPy's global RNG first so
# fold membership is reproducible across runs, using the same seed (42) as
# the train/test split.
# NOTE(review): assumes the stratifier draws its random choices from NumPy's
# global RNG — confirm against the installed scikit-multilearn version.
np.random.seed(42)
stratifier = IterativeStratification(n_splits=3, order=1)

# split() yields positional row indices, hence .iloc.
for fold, (train_idx, val_idx) in enumerate(stratifier.split(X, y)):
    train_fold = X_y.iloc[train_idx]
    val_fold = X_y.iloc[val_idx]
    train_fold.to_csv(os.path.join(PROCESS_PATH, f'train_fold_{fold}-opcode-n-gram.csv'), index=False)
    val_fold.to_csv(os.path.join(PROCESS_PATH, f'val_fold_{fold}-opcode-n-gram.csv'), index=False)

print("✅ Saved 3-fold CV sets (train_fold_*-opcode-n-gram.csv, val_fold_*-opcode-n-gram.csv)")

✅ Saved 3-fold CV sets (train_fold_*-opcode-n-gram.csv, val_fold_*-opcode-n-gram.csv)
