# Split Dataset -- CRPWarner

## Import Libraries

In [8]:
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split, KFold

In [9]:
# Dataset identifier; used below to locate this dataset's interim sub-folder.
NAME = 'crpwarner'

# Project root: two directory levels above the current working directory.
PATH = Path.cwd().parents[1]

# Data folders, kept as plain strings built with os.path.join.
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH, PROCESS_PATH = (
    os.path.join(DATA_PATH, stage) for stage in ('interim', 'processed')
)

## Load Labeled Dataset

In [10]:
# Location of the labeled CSV for this dataset inside the interim folder.
labeled_csv = os.path.join(IN_PATH, f'{NAME}/dataset-modified.csv')

df = pd.read_csv(labeled_csv)
df.head()  # last expression: preview of the first rows as the cell output

Unnamed: 0,address,mint,leak,limit
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,1,0,1
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1
2,0x94b7d24552933f50a5a5705c446528806dcea381,0,0,0
3,0xe0b9d4146aad6936cbfcbe4dae47e34aab96b093,0,0,0
4,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,1,0,1


In [11]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   address  69 non-null     object
 1   mint     69 non-null     int64 
 2   leak     69 non-null     int64 
 3   limit    69 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.3+ KB


## Train/Test Split (no stratification)

In [12]:
# Split 80% train / 20% test.
# The split is shuffled with a fixed seed so re-running the cell reproduces
# the same partition. No `stratify` argument is passed, so the label
# distribution is not balanced between the two parts.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Sanity check: every row ends up in exactly one of the two parts.
assert len(train_df) + len(test_df) == len(df)

# Make sure the output folder exists; to_csv does not create missing
# directories and would fail on a fresh checkout otherwise.
os.makedirs(PROCESS_PATH, exist_ok=True)

# Save to processed folder (index=False keeps the row index out of the files)
train_df.to_csv(os.path.join(PROCESS_PATH, 'train.csv'), index=False)
test_df.to_csv(os.path.join(PROCESS_PATH, 'test.csv'), index=False)

# Print shapes
print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 55
Test size: 14


## K-Fold Split

In [13]:
# Build shuffled, seeded K-fold splits and write each fold's train/validation
# rows to the processed folder as train_fold_<i>.csv / val_fold_<i>.csv.
# NOTE(review): the folds are drawn from the full `df`, not from `train_df`,
# so rows saved to test.csv also appear in these fold files — confirm that
# this is the intended evaluation protocol.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Make sure the output folder exists; to_csv does not create missing
# directories and would fail on a fresh checkout otherwise.
os.makedirs(PROCESS_PATH, exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    # KFold yields positional indices, hence .iloc rather than .loc.
    train_fold = df.iloc[train_idx]
    val_fold = df.iloc[val_idx]

    train_fold.to_csv(os.path.join(PROCESS_PATH, f'train_fold_{fold}.csv'), index=False)
    val_fold.to_csv(os.path.join(PROCESS_PATH, f'val_fold_{fold}.csv'), index=False)

# Report the fold count from the splitter itself so the message cannot drift
# from the configured n_splits.
print(f"✅ Saved {kf.get_n_splits()}-fold CV sets (train_fold_*.csv, val_fold_*.csv)")

✅ Saved 3-fold CV sets (train_fold_*.csv, val_fold_*.csv)
