In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import os

In [10]:
from sklearn.preprocessing import OrdinalEncoder


In [3]:
# Read both source tables from the shared data directory into DataFrames.
fraud_data_path, creditcard_data_path = (
    "../data/processed_fraud_data.csv",
    "../data/creditcard.csv",
)

fraud_data, creditcard_data = (
    pd.read_csv(csv_path) for csv_path in (fraud_data_path, creditcard_data_path)
)

In [4]:
# Fraud_Data.csv (E-commerce transactions): the "class" column is the target,
# every remaining column is kept as a feature.
X_fraud, y_fraud = fraud_data.drop(columns="class"), fraud_data["class"]

In [5]:
# Creditcard.csv (Bank transactions): the "Class" column (capital C) is the
# target, every remaining column is kept as a feature.
X_creditcard, y_creditcard = (
    creditcard_data.drop(columns="Class"),
    creditcard_data["Class"],
)

In [6]:
# Hold out 20% of each dataset. Both splits are stratified on their label and
# use the same fixed seed, so the partition is reproducible.
_split_kwargs = {"test_size": 0.2, "random_state": 42}

X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, **_split_kwargs
)

X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(
    X_creditcard, y_creditcard, stratify=y_creditcard, **_split_kwargs
)

In [16]:
# 📌 Step 1: Data Preparation for Model Training

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import os

# Input CSV locations, given relative to the current working directory.
creditcard_data_path = "../data/creditcard.csv"
fraud_data_path = "../data/processed_fraud_data.csv"

# Parse each CSV into its own DataFrame.
fraud_data = pd.read_csv(filepath_or_buffer=fraud_data_path)
creditcard_data = pd.read_csv(filepath_or_buffer=creditcard_data_path)

# 📌 Separate Features (X) and Target Variable (y)

# E-commerce transactions (Fraud_Data.csv): the label column is "class".
y_fraud = fraud_data["class"]
X_fraud = fraud_data.drop("class", axis="columns")

# Bank transactions (Creditcard.csv): the label column is "Class" (capital C).
y_creditcard = creditcard_data["Class"]
X_creditcard = creditcard_data.drop("Class", axis="columns")

# 📌 Convert Datetime Columns into Unix Timestamp
datetime_columns = ["signup_time", "purchase_time"]
for col in datetime_columns:
    # Columns missing from the frame are skipped rather than raising.
    if col in X_fraud.columns:
        parsed = pd.to_datetime(X_fraud[col])
        # Subtract the epoch and floor-divide by one second instead of casting
        # with `.astype(int) // 10**9`: the cast depends on the platform's
        # default integer width and assumes nanosecond resolution, whereas
        # this form yields whole seconds regardless of either.
        # Timezone-aware values are measured against the UTC epoch.
        epoch = pd.Timestamp("1970-01-01", tz="UTC" if parsed.dt.tz is not None else None)
        X_fraud[col] = (parsed - epoch) // pd.Timedelta(seconds=1)

# 📌 Perform Train-Test Split (80% Training, 20% Testing)
# Each split is stratified on its own label; a shared seed keeps both reproducible.
holdout_fraction, seed = 0.2, 42

fraud_parts = train_test_split(
    X_fraud,
    y_fraud,
    test_size=holdout_fraction,
    random_state=seed,
    stratify=y_fraud,
)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = fraud_parts

credit_parts = train_test_split(
    X_creditcard,
    y_creditcard,
    test_size=holdout_fraction,
    random_state=seed,
    stratify=y_creditcard,
)
X_credit_train, X_credit_test, y_credit_train, y_credit_test = credit_parts

# 📌 Encode Categorical Variables Using OrdinalEncoder
categorical_columns = ["browser", "source", "country"]

# Categories not seen during fitting are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Take explicit copies before assigning columns: the frames returned by
# train_test_split are selections of X_fraud, and writing into them directly
# can trigger pandas' SettingWithCopyWarning on some library versions.
X_fraud_train = X_fraud_train.copy()
X_fraud_test = X_fraud_test.copy()

# Apply encoding only to categorical columns in Fraud Data.
# The mapping is learned from the training split and reused for the test split.
X_fraud_train[categorical_columns] = ordinal_encoder.fit_transform(X_fraud_train[categorical_columns])
X_fraud_test[categorical_columns] = ordinal_encoder.transform(X_fraud_test[categorical_columns])

# 📌 Normalize Numerical Features Using MinMaxScaler
scaler = MinMaxScaler()

# Every column with a numeric dtype is scaled. Columns that are neither numeric
# nor listed in categorical_columns are not carried into the scaled frames.
numeric_columns = X_fraud_train.select_dtypes(include=["number"]).columns

# Learn the scaling from the training split only, then apply it to both splits.
# No index is passed to the new frames, so they get a default 0..n-1 index
# rather than the original row labels of the split.
scaler.fit(X_fraud_train[numeric_columns])
X_fraud_train_scaled, X_fraud_test_scaled = (
    pd.DataFrame(scaler.transform(split[numeric_columns]), columns=numeric_columns)
    for split in (X_fraud_train, X_fraud_test)
)

# Write the categorical columns back with their unscaled values. If a categorical
# column was numeric at this point, this overwrites its scaled version; otherwise
# the column is appended.
for scaled, unscaled in (
    (X_fraud_train_scaled, X_fraud_train),
    (X_fraud_test_scaled, X_fraud_test),
):
    for col in categorical_columns:
        scaled[col] = unscaled[col].values

# 📌 Save Processed Train-Test Datasets
processed_data_path = "../data/processed/"

# Create the output directory (and any missing parents); exist_ok avoids the
# separate existence check and the gap between checking and creating.
os.makedirs(processed_data_path, exist_ok=True)

# File stem -> object to write. The fraud features are the scaled frames; the
# credit-card features are written exactly as returned by the split.
outputs = {
    "X_fraud_train": X_fraud_train_scaled,
    "X_fraud_test": X_fraud_test_scaled,
    "y_fraud_train": y_fraud_train,
    "y_fraud_test": y_fraud_test,
    "X_credit_train": X_credit_train,
    "X_credit_test": X_credit_test,
    "y_credit_train": y_credit_train,
    "y_credit_test": y_credit_test,
}

# index=False: row labels are not written, so rows in matching X/y files line up
# by position only.
for file_stem, frame in outputs.items():
    frame.to_csv(f"{processed_data_path}{file_stem}.csv", index=False)

print("✅ Data Preparation Complete! Processed train-test datasets saved in 'data/processed/'.")


✅ Data Preparation Complete! Processed train-test datasets saved in 'data/processed/'.


In [None]:
# 📌 Save Processed Train-Test Datasets
processed_data_path = "../data/processed/"

# Create the output directory (and any missing parents) if it is not there yet.
os.makedirs(processed_data_path, exist_ok=True)

# File stem -> object to write. Only the fraud features have scaled frames in the
# cells visible here; the credit-card features are saved as returned by the split
# (X_credit_train / X_credit_test), since no X_credit_*_scaled frames are defined.
outputs = {
    "X_fraud_train": X_fraud_train_scaled,
    "X_fraud_test": X_fraud_test_scaled,
    "y_fraud_train": y_fraud_train,
    "y_fraud_test": y_fraud_test,
    "X_credit_train": X_credit_train,
    "X_credit_test": X_credit_test,
    "y_credit_train": y_credit_train,
    "y_credit_test": y_credit_test,
}

# index=False: row labels are not written to the CSV files.
for file_stem, frame in outputs.items():
    frame.to_csv(f"{processed_data_path}{file_stem}.csv", index=False)

print("✅ Data Preparation Complete! Processed train-test datasets saved in 'data/processed/'.")