In [1]:
import sys

# Make the parent directory importable so that `src.*` modules resolve when the
# notebook is run from its own folder. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate ".." entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import os

from src.download_data import download_kaggle_dataset

# Kaggle dataset identifier, passed to the project's download helper together
# with the target folder name. The helper's return value is used below as a
# directory path (presumably the download location — confirm in src.download_data).
dataset_name = "mlg-ulb/creditcardfraud"
temp_dir = download_kaggle_dataset(dataset_name, "temp_download")

# Show what is currently present in that directory.
print(f"Files in {temp_dir}: {os.listdir(temp_dir)}")

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
Dataset downloaded to directory: temp_download
Files in temp_download: ['y_test.csv', 'X_test_selected', 'LightGBM.pkl', 'X_val_selected.csv', 'raw', 'X_val_selected', 'XGBoost.pkl', 'X_train_scaled.csv', 'X_test_selected.csv', 'X_train_enriched.csv', 'Logistic_Regression_predictions.csv', 'encoded_features_train.csv', 'LightGBM_predictions.csv', 'best_autoencoder.pth', 'y_train_res.csv', 'Random_Forest_predictions.csv', 'Random_Forest.pkl', 'X_train_selected', 'creditcard.csv', 'X_val_scaled.csv', 'X_train_selected.csv', 'X_test_enriched.csv', 'X_val_enriched.csv', 'selected_features.csv', 'X_train_res.csv', 'comparison_report.csv', 'model_performance.csv', 'encoded_features_val.csv', 'scaler.pkl', 'XGBoost_predictions.csv', 'X_test_scaled.csv', 'processed', 'Logistic_Regression.pkl', 'y_val.csv', 'encoded_features_test.csv', 'y_train.csv']


In [3]:
import pandas as pd

# Location of the raw CSV inside the download directory.
file_path = os.path.join(temp_dir, "creditcard.csv")

# Read the whole file into a DataFrame; later cells split it into features/target.
data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Features: every column except the "Class" label.
X = data.drop(columns="Class")
# Target: the "Class" column on its own.
y = data["Class"]

In [5]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set, stratified on the label
# so the class ratio is kept in both parts.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: take 15% of the remaining rows as the validation set (again
# stratified); the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, stratify=y_train_val, random_state=42
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so validation/test statistics
# never influence the scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to all three splits.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [7]:
import joblib

# Persist the fitted scaler next to the data so the same transformation can be
# reloaded later. Kept as the cell's last expression so the written path(s)
# returned by joblib.dump are displayed as the cell output.
joblib.dump(value=scaler, filename=os.path.join(temp_dir, "scaler.pkl"))

['temp_download/scaler.pkl']

In [8]:
# Map each output filename to the split it stores.
datasets = {
    # Features
    "X_train_scaled.csv": X_train_scaled,
    "X_val_scaled.csv": X_val_scaled,
    "X_test_scaled.csv": X_test_scaled,
    # Targets
    "y_train.csv": y_train,
    "y_val.csv": y_val,
    "y_test.csv": y_test
}

# Write every split as a CSV (without the index column) into temp_dir.
# The loop variable is deliberately NOT named `data`: module-level loop
# variables outlive the loop, so using `data` here would overwrite the
# DataFrame loaded from creditcard.csv and break a re-run of the cell that
# does data.drop("Class", ...).
# NOTE(review): the scaled feature sets are presumably NumPy arrays (the
# default output of StandardScaler), in which case their CSV headers are
# positional column numbers rather than feature names — confirm that
# downstream readers expect this.
for filename, split_values in datasets.items():
    pd.DataFrame(split_values).to_csv(
        os.path.join(temp_dir, filename),
        index=False
    )


In [9]:
# Directory listing: header line followed by the list of entries.
print(f"Files in {temp_dir}:", os.listdir(temp_dir), sep="\n")

# Shape (rows, columns) of each scaled feature split, one per line.
print("\nDataset shapes:")
print(
    f"Train: {X_train_scaled.shape} samples",
    f"Val:   {X_val_scaled.shape} samples",
    f"Test:  {X_test_scaled.shape} samples",
    sep="\n",
)

Files in temp_download:
['y_test.csv', 'X_test_selected', 'LightGBM.pkl', 'X_val_selected.csv', 'raw', 'X_val_selected', 'XGBoost.pkl', 'X_train_scaled.csv', 'X_test_selected.csv', 'X_train_enriched.csv', 'Logistic_Regression_predictions.csv', 'encoded_features_train.csv', 'LightGBM_predictions.csv', 'best_autoencoder.pth', 'y_train_res.csv', 'Random_Forest_predictions.csv', 'Random_Forest.pkl', 'X_train_selected', 'creditcard.csv', 'X_val_scaled.csv', 'X_train_selected.csv', 'X_test_enriched.csv', 'X_val_enriched.csv', 'selected_features.csv', 'X_train_res.csv', 'comparison_report.csv', 'model_performance.csv', 'encoded_features_val.csv', 'scaler.pkl', 'XGBoost_predictions.csv', 'X_test_scaled.csv', 'processed', 'Logistic_Regression.pkl', 'y_val.csv', 'encoded_features_test.csv', 'y_train.csv']

Dataset shapes:
Train: (193668, 30) samples
Val:   (34177, 30) samples
Test:  (56962, 30) samples
