In [1]:
import sys

# Put the parent directory on the import search path.
# Guarded so that re-running this cell does not stack duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# Directory layout, relative to the notebook's working directory.
# All later cells combine these with filenames via os.path.join, so the
# inconsistent trailing slashes are harmless.
raw_data_dir = '../data/raw/'  # where the input CSV is read from
processed_data_dir = '../data/processed'  # where the scaled splits and targets are written
model_dir = '../models/'  # where the fitted scaler is saved

In [3]:
import os
import pandas as pd

# Build the path to the raw CSV and load it into a DataFrame.
# `data` is the raw frame that the feature/target selection cell reads from.
file_path = os.path.join(raw_data_dir, 'creditcard.csv')
data = pd.read_csv(file_path)

In [4]:
# Target vector: the 'Class' column.
y = data['Class']
# Feature matrix: every column except the target and 'Time'.
X = data.drop(['Class', 'Time'], axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: carve the validation set out of the remaining rows
# (15% of the non-test rows, again stratified on the label).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42, stratify=y_train_val
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to every split so validation/test data never influence
# the fitted parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [7]:
import joblib

# Persist the fitted scaler to disk.
# Create the target directory first: joblib.dump does not create missing
# parent folders, so this cell would otherwise fail when model_dir is absent.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(scaler, os.path.join(model_dir, 'scaler.pkl'))

['../models/scaler.pkl']

In [8]:
# Map each output filename to the object written under that name.
datasets = {
    # Features
    'X_train_scaled.csv': X_train_scaled,
    'X_val_scaled.csv': X_val_scaled,
    'X_test_scaled.csv': X_test_scaled,
    # Targets
    'y_train.csv': y_train,
    'y_val.csv': y_val,
    'y_test.csv': y_test
}

# Make sure the output directory exists before writing into it.
os.makedirs(processed_data_dir, exist_ok=True)

# The loop variable is deliberately NOT named `data`: that name holds the raw
# DataFrame loaded earlier, and rebinding it here would leave `data` pointing
# at the last item written, breaking any re-run of the cells that read it.
# NOTE(review): the scaled features are wrapped with pd.DataFrame(...) without
# explicit column names, so presumably their CSV headers are positional
# integers rather than the original feature names; confirm downstream readers
# expect that.
for filename, dataset in datasets.items():
    pd.DataFrame(dataset).to_csv(
        os.path.join(processed_data_dir, filename),
        index=False
    )


In [9]:
# List what was written to the processed-data directory.
print(f"Files in {processed_data_dir}:")
print(os.listdir(processed_data_dir))

# Report the shape of each scaled split, one line per split. The labels are
# left-padded to a common width so the shapes line up.
print('\nDataset shapes:')
print('\n'.join(
    f"{label:<6} {features.shape} samples"
    for label, features in (
        ('Train:', X_train_scaled),
        ('Val:', X_val_scaled),
        ('Test:', X_test_scaled),
    )
))

Files in ../data/processed:
['y_test.csv', 'X_train_scaled.csv', 'X_val_scaled.csv', 'X_test_scaled.csv', 'y_val.csv', '.gitkeep', 'y_train.csv']

Dataset shapes:
Train: (193668, 29) samples
Val:   (34177, 29) samples
Test:  (56962, 29) samples
