In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Read the processed fraud dataset from disk
df = pd.read_csv('../data/processed/cleaned_fraud_data.csv')

# Target is the `class` label; the feature matrix is every remaining column
# except `device_id`, which is excluded before any modelling step.
y = df['class']
X = df.drop(columns=['class', 'device_id'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Partition the feature columns by dtype so each group gets its own transformer.
# NOTE: ColumnTransformer is built below without `remainder`, whose default is
# 'drop' -- any column matching neither selection (e.g. a bool column) is
# silently discarded from the transformed output.
numeric_cols = (
    X_train
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .tolist()
)
categorical_cols = (
    X_train
    .select_dtypes(include=['object', 'category'])
    .columns
    .tolist()
)

# Numeric features are standardised; categorical features are one-hot encoded,
# with categories not seen during fit ignored instead of raising at transform.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Chain preprocessing with SMOTE so that oversampling operates on the
# already-encoded feature matrix rather than on the raw columns.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
])

# Fit the preprocessing and oversample using the training partition only;
# the test partition is never passed through this call.
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
print(
    "Original training set class distribution:",
    y_train.value_counts(),
    sep="\n",
)
print(
    "\nResampled training set class distribution:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)

# Apply the fitted preprocessing to the test partition (no SMOTE: resampling is
# a training-only step). The preprocessor is taken from the fitted pipeline
# rather than from the standalone `preprocessor` variable: the pipeline is the
# object that was fitted, and it only shares state with the outer variable when
# its steps are not cloned during fit (i.e. when no `memory` cache is set).
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

# X_train_resampled and X_test_transformed are ready for modeling



Original training set class distribution:
class
0    109568
1     11321
Name: count, dtype: int64

Resampled training set class distribution:
class
0    109568
1    109568
Name: count, dtype: int64
