In [None]:
#  Data Preprocessing EDA

In [1]:

#  Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

In [2]:


#  Load Datasets
fraud_data_path = "../data/Fraud_Data.csv"
ip_data_path = "../data/IpAddress_to_Country.csv"
creditcard_data_path = "../data/creditcard.csv"

fraud_data = pd.read_csv(fraud_data_path)
ip_data = pd.read_csv(ip_data_path)
creditcard_data = pd.read_csv(creditcard_data_path)

#  Step 1: Handling Missing Values
print("Missing Values Before Processing:")
print(fraud_data.isnull().sum(), "\n")

# Handling missing values:
for col in fraud_data.select_dtypes(include=["float64", "int64"]).columns:
    fraud_data[col].fillna(fraud_data[col].median(), inplace=True)  # Use median for numerical

for col in fraud_data.select_dtypes(include=["object"]).columns:
    fraud_data[col].fillna(fraud_data[col].mode()[0], inplace=True)  # Use mode for categorical



Missing Values Before Processing:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64 



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fraud_data[col].fillna(fraud_data[col].median(), inplace=True)  # Use median for numerical
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fraud_data[col].fillna(fraud_data[col].median(), inplace=True)  # Use median for numerical
The behavior will change in pandas 3.0. This inpla

In [None]:
# Step 2: Data Cleaning
# Convert timestamps to datetime format

In [3]:
fraud_data["signup_time"] = pd.to_datetime(fraud_data["signup_time"])
fraud_data["purchase_time"] = pd.to_datetime(fraud_data["purchase_time"])

# Remove duplicates
fraud_data.drop_duplicates(inplace=True)

#  Step 3: Merge IP Address Data

In [4]:

ip_data["lower_bound_ip_address"] = ip_data["lower_bound_ip_address"].astype(float)
ip_data["upper_bound_ip_address"] = ip_data["upper_bound_ip_address"].astype(float)

# Merging by matching IP range
fraud_data["country"] = fraud_data["ip_address"].apply(
    lambda x: ip_data.loc[
        (x >= ip_data["lower_bound_ip_address"]) & (x <= ip_data["upper_bound_ip_address"]), "country"
    ].values[0] if len(ip_data.loc[
        (x >= ip_data["lower_bound_ip_address"]) & (x <= ip_data["upper_bound_ip_address"]), "country"
    ].values) > 0 else "Unknown"
)


#  Step 4: Feature Engineering

In [5]:
fraud_data["transaction_delay"] = (fraud_data["purchase_time"] - fraud_data["signup_time"]).dt.total_seconds()
fraud_data["hour_of_day"] = fraud_data["purchase_time"].dt.hour
fraud_data["day_of_week"] = fraud_data["purchase_time"].dt.dayofweek



#  Step 5: Normalization & Encoding

In [6]:

scaler = MinMaxScaler()
fraud_data["purchase_value"] = scaler.fit_transform(fraud_data[["purchase_value"]])
fraud_data["transaction_delay"] = scaler.fit_transform(fraud_data[["transaction_delay"]])

encoder = LabelEncoder()
fraud_data["browser"] = encoder.fit_transform(fraud_data["browser"])
fraud_data["source"] = encoder.fit_transform(fraud_data["source"])
fraud_data["country"] = encoder.fit_transform(fraud_data["country"])

# 📌 Step 6: Save Processed Data
processed_fraud_path = "../data/processed_fraud_data.csv"
fraud_data.to_csv(processed_fraud_path, index=False)

print(f"s Preprocessing completed! Processed file saved: {processed_fraud_path}")


✅ Preprocessing completed! Processed file saved: ../data/processed_fraud_data.csv
