# Exploratory Data Analysis - Credit Risk Model

This notebook performs comprehensive exploratory data analysis on the transaction fraud dataset to understand patterns, detect anomalies, and derive insights for credit risk modeling.

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's 'whitegrid' theme and a 12x6 inch default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries loaded successfully!")

In [None]:
# Read the raw CSV into `df`; the relative path is resolved against the kernel's working directory
RAW_DATA_PATH = '../data/raw/data.csv'
df = pd.read_csv(RAW_DATA_PATH)
print("Data loaded successfully!", f"Dataset shape: {df.shape}", sep="\n")

## 2. Data Overview

In [None]:
# Size summary (rows, columns, deep memory footprint) followed by a preview of the first rows
n_rows, n_cols = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
banner = "=" * 70

print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"\nNumber of rows: {n_rows:,}")
print(f"Number of columns: {n_cols}")
print(f"\nMemory usage: {memory_mb:.2f} MB")
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table
df.head()

In [None]:
# Per-column dtype and non-null count; df.info() prints its own report
print("\nColumn Information:", "=" * 70, sep="\n")
df.info()

In [None]:
# Number of rows that exactly repeat an earlier row, and their share of the dataset
duplicates = df.duplicated().sum()
duplicate_pct = duplicates / len(df) * 100
print(f"\nDuplicate rows: {duplicates} ({duplicate_pct:.2f}%)")

## 3. Summary Statistics

In [None]:
# describe() statistics, transposed so that each feature becomes a row
print("\nNumerical Features Summary:", "=" * 100, sep="\n")
df.describe().transpose()

In [None]:
# Object-dtype columns are treated as the categorical features
categorical_cols = list(df.select_dtypes(include=['object']).columns)
print(f"\nCategorical Features ({len(categorical_cols)}): {categorical_cols}")
print("\nUnique value counts:")
# nunique() excludes missing values by default
for col, n_unique in df[categorical_cols].nunique().items():
    print(f"{col}: {n_unique} unique values")

In [None]:
# Fraud distribution
# value_counts() orders classes by frequency and leaves out classes that never
# occur, so labelling its rows by position is fragile. Both series are
# reindexed to the fixed class order [0, 1] first; a class with no rows is
# reported as 0 instead of raising.
# NOTE(review): assumes FraudResult is coded 0 = legitimate, 1 = fraud (as the
# box-plot cell does); any other code would be dropped here — confirm.
print("\nFraud Distribution:")
print("=" * 70)
class_labels = {0: 'Legitimate', 1: 'Fraud'}
class_order = list(class_labels)

fraud_counts = df['FraudResult'].value_counts().reindex(class_order, fill_value=0)
fraud_pct = (
    df['FraudResult'].value_counts(normalize=True).reindex(class_order, fill_value=0.0) * 100
)

fraud_summary = pd.DataFrame({
    'Count': fraud_counts,
    'Percentage': fraud_pct
})
# Translate the numeric class codes into readable row labels
fraud_summary.index = [class_labels[code] for code in fraud_summary.index]
print(fraud_summary)
print(f"\nFraud rate: {fraud_pct.loc[1]:.4f}%")

## 4. Missing Values Analysis

In [None]:
# Null count and null share per column, restricted to columns that have at least one gap
missing = df.isnull().sum()
missing_pct = missing / len(df) * 100

missing_df = (
    pd.DataFrame({'Missing_Count': missing, 'Percentage': missing_pct})
    .sort_values('Missing_Count', ascending=False)
    .loc[lambda table: table['Missing_Count'] > 0]
)

if missing_df.empty:
    print("\n✓ No missing values found in the dataset!")
else:
    print("\nMissing Values:")
    print("=" * 70)
    print(missing_df)

## 5. Distribution Analysis

In [None]:
# Amount and Value distributions
# Top row: raw histograms. Bottom row: the same columns on a signed log scale,
# sign(x) * log1p(|x|). For x >= 0 this equals log1p(x); for x <= -1 plain
# log1p produces NaN/-inf, and the RuntimeWarning that would flag it is hidden
# by the notebook-wide warnings filter.
# NOTE(review): whether Amount/Value actually contain negative values is not
# visible in this notebook — confirm against the data.
def _signed_log1p(values):
    """Return sign(x) * log1p(|x|) element-wise; finite for every finite x."""
    return np.sign(values) * np.log1p(np.abs(values))


fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (feature, raw-histogram colour, log-histogram colour); one grid column per feature
panels = [
    ('Amount', 'skyblue', 'lightgreen'),
    ('Value', 'lightcoral', 'plum'),
]

for grid_col, (feature, raw_color, log_color) in enumerate(panels):
    # Raw distribution
    raw_ax = axes[0, grid_col]
    raw_ax.hist(df[feature], bins=50, color=raw_color, edgecolor='black')
    raw_ax.set_title(f'{feature} Distribution', fontsize=14, fontweight='bold')
    raw_ax.set_xlabel(feature)
    raw_ax.set_ylabel('Frequency')

    # Signed log-scale distribution
    log_ax = axes[1, grid_col]
    log_ax.hist(_signed_log1p(df[feature]), bins=50, color=log_color, edgecolor='black')
    log_ax.set_title(f'{feature} Distribution (Log Scale)', fontsize=14, fontweight='bold')
    log_ax.set_xlabel(f'sign({feature}) * Log(|{feature}| + 1)')
    log_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Fraud count, transaction volume and fraud rate per product category, highest rate first
fraud_by_category = (
    df.groupby('ProductCategory')['FraudResult']
    .agg(Fraud_Count='sum', Total_Transactions='count', Fraud_Rate='mean')
    .sort_values('Fraud_Rate', ascending=False)
)

print("\nFraud Rate by Product Category:")
print("=" * 70)
print(fraud_by_category)

# Bar chart of the fraud rate; bars are shaded by the same rate
category_rates = fraud_by_category.reset_index()
axis_labels = {'Fraud_Rate': 'Fraud Rate', 'ProductCategory': 'Product Category'}
fig = px.bar(
    category_rates,
    x='ProductCategory',
    y='Fraud_Rate',
    color='Fraud_Rate',
    color_continuous_scale='Reds',
    labels=axis_labels,
    title='Fraud Rate by Product Category',
)
fig.show()

In [None]:
# Share of transactions per PricingStrategy value, drawn as a donut chart (hole=0.4)
pricing_dist = df['PricingStrategy'].value_counts().sort_index()
donut_kwargs = {
    'names': pricing_dist.index,
    'values': pricing_dist.values,
    'title': 'Pricing Strategy Distribution',
    'hole': 0.4,
}
fig = px.pie(**donut_kwargs)
fig.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations (DataFrame.corr defaults) across every numeric column
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# FraudResult column of the correlation matrix, sorted from highest to lowest
fraud_corr = correlation_matrix['FraudResult'].sort_values(ascending=False)
print("\nCorrelation with Fraud:", "=" * 70, sep="\n")
print(fraud_corr)

## 7. Outlier Detection

In [None]:
# Side-by-side box plots of Amount and Value, each split into legitimate vs. fraud rows
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

legit_mask = df['FraudResult'] == 0
fraud_mask = df['FraudResult'] == 1
box_colors = ['lightgreen', 'lightcoral']  # order matches [Legitimate, Fraud]

boxplots = []
for ax, column in zip(axes, ['Amount', 'Value']):
    groups = [
        df.loc[legit_mask, column].dropna(),
        df.loc[fraud_mask, column].dropna(),
    ]
    bp = ax.boxplot(groups, labels=['Legitimate', 'Fraud'], patch_artist=True)
    ax.set_title(f'{column} Distribution by Fraud Status', fontsize=14, fontweight='bold')
    ax.set_ylabel(column)
    # patch_artist=True makes the boxes fillable patches
    for box, color in zip(bp['boxes'], box_colors):
        box.set_facecolor(color)
    boxplots.append(bp)

# Expose each plot's artist dict as bp1 (Amount) and bp2 (Value)
bp1, bp2 = boxplots

plt.tight_layout()
plt.show()

In [None]:
# IQR method for outlier detection
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Count the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not counted, and missing
    values never compare as outside, so they are not counted either.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the numeric column to inspect.
        multiplier: Fence width in units of IQR (default 1.5).

    Returns:
        Tuple ``(outlier_count, lower_bound, upper_bound)``.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # A boolean mask is enough for counting; no filtered copy of the frame is needed
    is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
    return int(is_outlier.sum()), lower_bound, upper_bound

# Report the IQR outlier count, its share of all rows, and the fence values for each column
print("\nOutlier Detection (IQR Method):")
print("=" * 70)
total_rows = len(df)
for col in ('Amount', 'Value'):
    count, lower, upper = detect_outliers_iqr(df, col)
    pct = count / total_rows * 100
    print(
        f"{col}:",
        f"  Outliers: {count:,} ({pct:.2f}%)",
        f"  Bounds: [{lower:.2f}, {upper:.2f}]",
        "",
        sep="\n",
    )

## 8. Time Series Analysis (runs only if `TransactionStartTime` is present)

In [None]:
# Parse the timestamp column, derive hour / weekday / date columns on `df`,
# then plot transaction volume and fraud rate for each hour of the day
if 'TransactionStartTime' in df.columns:
    timestamps = pd.to_datetime(df['TransactionStartTime'])
    df['TransactionStartTime'] = timestamps
    df['Hour'] = timestamps.dt.hour
    df['DayOfWeek'] = timestamps.dt.dayofweek
    df['Date'] = timestamps.dt.date

    # Transactions by hour
    hourly_dist = df.groupby('Hour').size()

    volume_ax = plt.figure(figsize=(12, 5)).add_subplot()
    volume_ax.bar(hourly_dist.index, hourly_dist.values, color='steelblue', edgecolor='black')
    volume_ax.set_title('Transaction Volume by Hour of Day', fontsize=14, fontweight='bold')
    volume_ax.set_xlabel('Hour')
    volume_ax.set_ylabel('Number of Transactions')
    volume_ax.set_xticks(range(24))
    volume_ax.grid(axis='y', alpha=0.3)
    plt.show()

    # Fraud rate by hour (mean of FraudResult within each hour)
    hourly_fraud = df.groupby('Hour')['FraudResult'].mean()

    rate_ax = plt.figure(figsize=(12, 5)).add_subplot()
    rate_ax.plot(
        hourly_fraud.index,
        hourly_fraud.values,
        marker='o',
        color='crimson',
        linewidth=2,
        markersize=6,
    )
    rate_ax.set_title('Fraud Rate by Hour of Day', fontsize=14, fontweight='bold')
    rate_ax.set_xlabel('Hour')
    rate_ax.set_ylabel('Fraud Rate')
    rate_ax.set_xticks(range(24))
    rate_ax.grid(True, alpha=0.3)
    plt.show()

## 9. Channel and Provider Analysis

In [None]:
# The (up to) ten most frequent ChannelId values, drawn as bars shaded by their count
channel_volume = df['ChannelId'].value_counts().head(10)

volume_counts = channel_volume.values
fig = px.bar(
    x=channel_volume.index,
    y=volume_counts,
    color=volume_counts,
    color_continuous_scale='Blues',
    labels={'x': 'Channel ID', 'y': 'Transaction Count'},
    title='Top 10 Channels by Transaction Volume',
)
fig.show()

In [None]:
# Fraud statistics per channel; channels with fewer than 100 transactions are
# excluded before ranking by fraud rate, and at most ten rows are kept
channel_fraud = (
    df.groupby('ChannelId')['FraudResult']
    .agg(Fraud_Count='sum', Total='count', Fraud_Rate='mean')
    .loc[lambda stats: stats['Total'] >= 100]
    .sort_values('Fraud_Rate', ascending=False)
    .head(10)
)

print("\nTop Channels by Fraud Rate (min 100 transactions):")
print("=" * 70)
print(channel_fraud)

## 10. Key Insights and Summary

In [None]:
# Print the closing summary: data-driven insights followed by modelling recommendations
print("\n" + "="*80)
print(" " * 25 + "KEY INSIGHTS")
print("="*80)

# The time-series cell adds Hour / DayOfWeek / Date to `df`. They are excluded
# here so the feature count matches the column count shown in the overview.
# NOTE(review): assumes the raw file has no columns with these names — confirm.
derived_time_cols = {'Hour', 'DayOfWeek', 'Date'}
n_features = sum(1 for name in df.columns if name not in derived_time_cols)

# Describe the Amount/Value correlation from its computed value rather than
# asserting "strong" unconditionally. The 0.7 / 0.3 cut-offs are conventional
# rules of thumb, not derived from this dataset.
amount_value_corr = correlation_matrix.loc['Amount', 'Value']
abs_corr = abs(amount_value_corr)
if pd.isna(amount_value_corr):
    corr_insight = "5. Correlation between Amount and Value is undefined (nan)"
elif abs_corr >= 0.7:
    corr_insight = (
        f"5. Strong correlation ({amount_value_corr:.3f}) between Amount and Value "
        "suggests consistent pricing"
    )
elif abs_corr >= 0.3:
    corr_insight = f"5. Moderate correlation ({amount_value_corr:.3f}) between Amount and Value"
else:
    corr_insight = f"5. Weak correlation ({amount_value_corr:.3f}) between Amount and Value"

# Insights 3 and 4 are fixed narrative statements; nothing in this cell computes them
insights = [
    f"1. Dataset contains {len(df):,} transactions with {n_features} features",
    f"2. Overall fraud rate is {df['FraudResult'].mean()*100:.4f}%, indicating highly imbalanced classes",
    "3. Amount and Value distributions are right-skewed with significant outliers",
    "4. Product categories show varying fraud rates, with some categories riskier than others",
    corr_insight,
]

for insight in insights:
    print(f"\n{insight}")

print("\n" + "="*80)
print("\nRECOMMENDATIONS FOR MODELING:")
print("=" * 80)
recommendations = [
    "• Use SMOTE or other resampling techniques to address class imbalance",
    "• Consider log transformation for Amount and Value features",
    "• Include interaction features between Channel, Provider, and Product Category",
    "• Implement time-based features (hour, day of week) for temporal patterns",
    "• Use ensemble methods (XGBoost, Random Forest) for better performance on imbalanced data"
]

for rec in recommendations:
    print(f"\n{rec}")

print("\n" + "="*80)