In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style("whitegrid")


# Load Data

In [None]:
# Load the raw CSV into `df`; the path is relative, so it resolves against the
# kernel's current working directory.
df = pd.read_csv("../data/raw/data.csv")
# Preview the first rows as the cell's output.
df.head()


In [None]:
# (n_rows, n_columns) of the loaded frame.
df.shape


In [None]:
# Prints each column's non-null count and dtype, plus overall memory usage.
df.info()


# Column Overview

In [None]:
# Column labels of the loaded frame.
df.columns


# Column Groups

In [None]:
# Column names grouped by how the rest of the notebook treats them.

# Columns treated as identifiers (not analysed as features here).
identifier_cols = [
    "TransactionId",
    "BatchId",
    "AccountId",
    "SubscriptionId",
    "CustomerId",
]

# Columns treated as categorical features.
categorical_cols = [
    "CurrencyCode",
    "ProviderId",
    "ProductId",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy",
]

# Columns summarised with describe().
# NOTE(review): CountryCode is grouped as numeric although its name suggests a
# nominal code — confirm this is intended.
numerical_cols = [
    "Amount",
    "Value",
    "CountryCode",
]

# Label column, kept as a one-element list.
target_like_col = [
    "FraudResult",
]


# Summary Statistics

In [None]:
# describe() statistics for the numeric group, transposed so each column is a row.
df[numerical_cols].describe().T



# Fraud Distribution

In [None]:
# Relative frequency of each FraudResult value, scaled to a percentage
# (value_counts excludes NaN by default).
df["FraudResult"].value_counts(normalize=True).mul(100)


In [None]:
# Bar chart of row counts per FraudResult value.
ax = sns.countplot(data=df, x="FraudResult")
ax.set_title("Fraud Result Distribution")
plt.show()


# Missing Values

df.isna().sum().sort_values(ascending=False)


# Missing Values Heatmap

In [None]:
# Row-by-column map of df.isnull(); the colour bar is switched off.
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


# Numerical Feature Distributions

# Amount

In [None]:
# 50-bin histogram of Amount with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(6, 4))
amount = df["Amount"]
sns.histplot(amount, bins=50, kde=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()



# Value

In [None]:
# 50-bin histogram of Value with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(6, 4))
value = df["Value"]
sns.histplot(value, bins=50, kde=True, ax=ax)
ax.set_title("Transaction Value Distribution")
plt.show()


# CountryCode

In [None]:
# Draw one bar per distinct CountryCode instead of a 20-bin numeric histogram:
# binning would merge unrelated codes into ranges and imply an ordering.
# NOTE(review): assumes CountryCode is a nominal identifier, as its name
# suggests — confirm against the data dictionary.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="CountryCode", data=df, ax=ax)
ax.set_title("Country Code Distribution")
plt.show()


# Outlier Detection (Boxplots)

In [None]:
# One horizontal boxplot each for Amount and Value, each on its own figure.
for column_name in ("Amount", "Value"):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column_name], ax=ax)
    ax.set_title(f"Outliers in {column_name}")
    plt.show()


# Categorical Feature Distributions

# CurrencyCode

In [None]:
# Bar chart of row counts per CurrencyCode, most frequent first.
currency_counts = df["CurrencyCode"].value_counts()
ax = currency_counts.plot.bar()
ax.set_title("Currency Code Distribution")
plt.show()



# ProductCategory (Top 10)

In [None]:
# Bar chart of the ten most frequent ProductCategory values.
category_counts = df["ProductCategory"].value_counts()
ax = category_counts.iloc[:10].plot.bar()
ax.set_title("Top 10 Product Categories")
plt.show()


# ChannelId

In [None]:
# Bar chart of row counts per ChannelId, most frequent first.
channel_counts = df["ChannelId"].value_counts()
ax = channel_counts.plot.bar()
ax.set_title("Channel Usage Distribution")
plt.show()


# PricingStrategy

In [None]:
# Bar chart of row counts per PricingStrategy, most frequent first.
pricing_counts = df["PricingStrategy"].value_counts()
ax = pricing_counts.plot.bar()
ax.set_title("Pricing Strategy Distribution")
plt.show()


# Time-Based Feature Analysis

In [None]:
# Parse TransactionStartTime to pandas datetimes, overwriting the column in
# `df` so the `.dt` accessors used in the following cells work.
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])


# Hour of Transaction

In [None]:
# Add the hour component of the parsed timestamp as a new column, then count
# rows per hour.
start_time = df["TransactionStartTime"]
df["transaction_hour"] = start_time.dt.hour

ax = sns.countplot(data=df, x="transaction_hour")
ax.set_title("Transactions by Hour of Day")
plt.show()


# Day of Week

In [None]:
# Add the weekday name of each transaction as a new column.
df["transaction_dayofweek"] = df["TransactionStartTime"].dt.day_name()

# Fix the bar order to Monday..Sunday; otherwise seaborn picks the order itself.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x="transaction_dayofweek", order=weekday_order, ax=ax)
plt.xticks(rotation=45)
ax.set_title("Transactions by Day of Week")
plt.show()


# Correlation Analysis

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the selected
# columns and the fraud label.
corr = df[["Amount", "Value", "CountryCode", "FraudResult"]].corr()

plt.figure(figsize=(6,4))
# Pin the diverging colormap to the full correlation range [-1, 1]. Without
# explicit limits the scale is fitted to the observed min/max, so the neutral
# midpoint colour would not correspond to zero correlation.
sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()



# Fraud vs Transaction Behavior

# Fraud vs Amount

In [None]:
# Boxplot of Amount, one box per FraudResult value.
ax = sns.boxplot(data=df, x="FraudResult", y="Amount")
ax.set_title("Fraud Result vs Transaction Amount")
plt.show()



# Fraud vs Channel

In [None]:
# Row counts per ChannelId, split by FraudResult.
# The notebook's own findings report that fraud is a small fraction of rows,
# so on a linear axis the fraud bars are indistinguishable from zero. A log
# y-axis keeps both classes readable.
ax = sns.countplot(x="ChannelId", hue="FraudResult", data=df)
ax.set_yscale("log")
ax.set_ylabel("count (log scale)")
ax.set_title("Fraud by Channel")
plt.show()



## Key EDA Insights

1. Transaction amounts are highly right-skewed, with a small number of very large transactions indicating potential outliers.
2. Most transactions go through a small number of channels, with web and mobile platforms dominating usage.
3. Several categorical features such as ProductCategory and ProviderId are highly imbalanced.
4. Fraudulent transactions represent a small fraction of the dataset, indicating class imbalance.
5. Time-based features show that transactions cluster around specific hours of the day, suggesting behavioral patterns.
