## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', None)

## 1. Load Data

In [2]:
df = pd.read_csv('../data/procressed/elliptic_bitcoin_dataset_v0.csv')

# Rename 
df = df.rename(columns={'feat_0': 'time_step'})

# Cleanup
if 'class' in df.columns:
    df = df.drop(columns=['class'])

print(f"Data Shape: {df.shape}")
print(f"Time steps present: {sorted(df['time_step'].unique())}")

Data Shape: (46564, 168)
Time steps present: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49)]


## 2. Train/Test Split แบบ time-base


In [3]:
train_df = df[df['time_step'] <= 34]
test_df = df[df['time_step'] > 34]

print(f"Train set (Time 1-34): {train_df.shape}")
print(f"Test set (Time 35-49): {test_df.shape}")

drop_cols = ['label', 'txId', 'time_step']

X_train = train_df.drop(columns=drop_cols)
y_train = train_df['label']

X_test = test_df.drop(columns=drop_cols)
y_test = test_df['label']

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Train set (Time 1-34): (29894, 168)
Test set (Time 35-49): (16670, 168)
X_train shape: (29894, 165), y_train shape: (29894,)
X_test shape: (16670, 165), y_test shape: (16670,)


## 3. Apply SMOTE 
- จำลองโจรเพิ่มมากขึ้นเพิ่อไม่ให้ Model ลำเอียง

In [4]:
# Check imbalance before SMOTE
print("Before SMOTE (Train):", y_train.value_counts().to_dict())

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("After SMOTE (Train):", y_train_res.value_counts().to_dict())

Before SMOTE (Train): {0: 26432, 1: 3462}
After SMOTE (Train): {0: 26432, 1: 26432}


## 4. Train Models

In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
}

trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_res, y_train_res)
    trained_models[name] = model
    print(f"{name} trained")

Training Logistic Regression...
Logistic Regression trained
Training Random Forest...
Random Forest trained
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained


## 5. Save Models

In [6]:
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

for name, model in trained_models.items():
    filename = f"{name.replace(' ', '_').lower()}_smote_v1.pkl"
    path = os.path.join(models_dir, filename)
    with open(path, 'wb') as file:
        pickle.dump(model, file)
    print(f"Saved {name} to {path}")
    
print("All models saved successfully.")

Saved Logistic Regression to ../models\logistic_regression_smote_v1.pkl
Saved Random Forest to ../models\random_forest_smote_v1.pkl
Saved XGBoost to ../models\xgboost_smote_v1.pkl
All models saved successfully.
