In [1]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw creditcard.csv file into a DataFrame.
raw_csv_path = '../data/raw/creditcard.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Separate the 'Class' label column from the remaining feature columns.
y = df['Class']
X = df.drop(columns='Class')

In [4]:
# Show how many rows fall into each class before any resampling.
class_counts = y.value_counts()
print(f"Original class distribution:\n{class_counts}")

Original class distribution:
Class
0    284315
1       492
Name: count, dtype: int64


In [5]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [6]:
# Report split sizes and the share of label 1 (fraud) in each split.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

print(f"\nTrain set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Train fraud rate: {train_class_share[1]:.4f}")
print(f"Test fraud rate: {test_class_share[1]:.4f}")


Train set shape: (199364, 30), Test set shape: (85443, 30)
Train fraud rate: 0.0017
Test fraud rate: 0.0017


In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Re-wrap the scaled arrays as DataFrames, restoring both the column names
# and the original row index. Keeping the index means the scaled features
# stay aligned with y_train / y_test (which still carry the original row
# labels) in any later index-based operation such as concat or masking.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

In [9]:
# Oversample the training split only (the test split is left untouched),
# using SMOTE with 5 neighbours and a fixed seed for repeatability.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(
    X_train_scaled,
    y_train,
)

In [10]:
# Summarise the label balance of the resampled training set.
balanced_labels = pd.Series(y_train_balanced)
balanced_share = balanced_labels.value_counts(normalize=True)
balanced_counts = balanced_labels.value_counts()

print("\nAfter SMOTE:")
print(f"Train fraud rate: {balanced_share[1]:.4f}")
print(f"Class distribution: {balanced_counts}")


After SMOTE:
Train fraud rate: 0.5000
Class distribution: Class
0    199020
1    199020
Name: count, dtype: int64


In [11]:
# Write the processed splits to disk without the row index.
# The output directory is created first so this cell also runs on a fresh
# checkout where ../data/processed does not exist yet.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

X_train_balanced.to_csv(processed_dir / 'X_train.csv', index=False)
# NOTE: this file was previously written as 'X-test.csv'; it now follows the
# underscore naming of the other three outputs. Any reader that still
# expects the hyphenated name must be updated.
X_test_scaled.to_csv(processed_dir / 'X_test.csv', index=False)
y_train_balanced.to_csv(processed_dir / 'y_train.csv', index=False)
y_test.to_csv(processed_dir / 'y_test.csv', index=False)

In [12]:
import joblib

# Persist the scaler that was fitted on the training split so the same
# transform can be re-applied later. The target directory is created first
# because joblib.dump raises if ../models does not exist.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')

['../models/scaler.pkl']