# EDA - Fraud Data (E-commerce)

This notebook performs Exploratory Data Analysis and Feature Engineering on the E-commerce Fraud dataset.

In [None]:
import pandas as pd
import sys
import os

# Add src to path.
# The relative path is resolved against the current working directory, and the
# membership check keeps sys.path from accumulating duplicate entries when this
# cell is re-run.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local helpers, importable because ../src was appended to sys.path.
from data_loader import (
    load_data,
    basic_cleaning,
)
from preprocessing import (
    assign_countries,
    feature_engineering_dates,
    feature_engineering_velocity,
    ip_to_country,
)
from visualization import (
    set_style,
    plot_class_distribution,
    plot_numerical_distributions,
    plot_categorical_breakdown,
)

# Apply the visualization module's style settings before any plots are drawn.
# NOTE(review): exactly what set_style configures is defined in src/visualization.
set_style()

## 1. Data Loading

In [None]:
# Load Datasets
# Raw input locations, relative to the notebook's working directory.
fraud_csv = '../data/raw/Fraud_Data.csv'
ip_country_csv = '../data/raw/IpAddress_to_Country.csv'

fraud_df = load_data(fraud_csv)            # e-commerce transactions
ip_country_df = load_data(ip_country_csv)  # IP-to-country lookup data

## 2. Basic Cleaning
- Duplicate removal
- Date conversion (`signup_time`, `purchase_time`)

In [None]:
# Columns handed to basic_cleaning as date columns.
date_cols = ['signup_time', 'purchase_time']
fraud_df = basic_cleaning(fraud_df, date_columns=date_cols)

# Print the frame summary to inspect the result of the cleaning step.
fraud_df.info()

## 3. Feature Engineering & Merging
- Merge with IP Country data
- Extract date features
- Calculate velocity features per user (`user_id`)

In [None]:
# Run the feature-engineering steps as one chain; each step receives the
# frame returned by the previous one.
# NOTE(review): assumes every helper returns a DataFrame — confirm in src/preprocessing.
fraud_df = (
    fraud_df
    # Map IPs to Countries
    .pipe(assign_countries, ip_country_df)
    # Date Features
    .pipe(feature_engineering_dates, ['signup_time', 'purchase_time'])
    # Velocity Features
    .pipe(feature_engineering_velocity, user_col='user_id')
)

fraud_df.head()

## 4. Exploratory Data Analysis

In [None]:
# Class Imbalance
# Plot how the rows are distributed across the values of the 'class' column.
class_labels = fraud_df['class']
plot_class_distribution(class_labels, title="Fraud Class Distribution (E-commerce)")

In [None]:
# Relationship: Country vs Fraud
# Restrict the breakdown to the 10 most frequent countries
# (value_counts orders by descending frequency).
country_counts = fraud_df['country'].value_counts()
top_countries = country_counts.index[:10]

in_top_countries = fraud_df['country'].isin(top_countries)
subset_df = fraud_df[in_top_countries]

plot_categorical_breakdown(subset_df, ['country'], target='class')

In [None]:
# Numerical Distributions
# Columns to plot, with 'class' passed as the hue.
numeric_cols = ['purchase_value', 'age', 'time_diff_minutes']
plot_numerical_distributions(fraud_df, numeric_cols, hue='class')