# Fraud Detection - Exploratory Data Analysis
## Interactive Analysis with Plotly

**Objective:** Analyze transaction data to understand fraud patterns and prepare for machine learning modeling.

**Dataset:** PesaPal Mobile Money Transactions

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence every Python warning for the rest of the kernel session so that cell
# output stays uncluttered.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# raised by pandas/plotly in later cells are hidden as well.
warnings.filterwarnings('ignore')

# Confirmation message, reached only if all imports above succeeded
print('✅ Libraries loaded successfully')

✅ Libraries loaded successfully


In [2]:
# Load data
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

# Shapes are reported for the raw files, before any derived columns are added
print(f"Training data: {train.shape}")
print(f"Test data: {test.shape}")


def add_time_features(frame):
    """Return a copy of ``frame`` with calendar features derived from its timestamp.

    The ``TransactionStartTime`` column is parsed to datetime and five columns
    are appended, in this order: ``Hour``, ``Day``, ``Month``, ``Weekday``
    (Monday=0) and ``Date``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``TransactionStartTime`` column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    started = pd.to_datetime(frame['TransactionStartTime'])
    # NOTE(review): the features use whatever timezone the parsed timestamps
    # carry; no conversion to local time is applied -- confirm this is intended.
    return frame.assign(
        TransactionStartTime=started,
        Hour=started.dt.hour,
        Day=started.dt.day,
        Month=started.dt.month,
        Weekday=started.dt.weekday,
        Date=started.dt.date,
    )


# Apply the same feature engineering to both splits so that any feature explored
# on the training data is also available on the test data.
train = add_time_features(train)
test = add_time_features(test)

print('\n✅ Data loaded and time features created')

Training data: (95662, 16)
Test data: (45019, 15)

✅ Data loaded and time features created


## 1. Dataset Overview

In [3]:
# Basic statistics: headline class-balance numbers for the training set
print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)

# FraudResult is summed to obtain the number of frauds, i.e. it is used as a 0/1 flag
fraud_count = train['FraudResult'].sum()
total_count = len(train)
# Guard against an empty frame so the rate is a clean 0.0 instead of NaN
fraud_rate = (fraud_count / total_count) * 100 if total_count else 0.0

print(f"Total Transactions: {total_count:,}")
print(f"Fraudulent Transactions: {fraud_count:,}")
print(f"Legitimate Transactions: {total_count - fraud_count:,}")
print(f"Fraud Rate: {fraud_rate:.3f}%")
# "1 fraud in every N transactions" (truncated). With zero frauds the ratio is
# undefined, and converting the resulting inf/NaN to int would raise.
if fraud_count:
    print(f"Class Imbalance Ratio: 1:{int(total_count/fraud_count)}")
else:
    print("Class Imbalance Ratio: n/a (no fraudulent transactions)")

DATASET SUMMARY
Total Transactions: 95,662
Fraudulent Transactions: 193
Legitimate Transactions: 95,469
Fraud Rate: 0.202%
Class Imbalance Ratio: 1:495


## 2. Fraud Distribution - Interactive Visualization

In [4]:
# Interactive Pie Chart - Fraud vs Legitimate
# value_counts() orders rows by frequency, so pairing its values with a fixed
# label list is only correct while class 0 happens to be the majority. Reindex
# to an explicit class order (0 = legitimate, 1 = fraud) so that labels, colours
# and counts always line up, and a class with no rows shows up as 0.
class_labels = {0: 'Legitimate', 1: 'Fraud'}
class_colors = {0: '#2E86C1', 1: '#E74C3C'}

fraud_counts = (
    train['FraudResult']
    .value_counts()
    .reindex(list(class_labels), fill_value=0)
)

fig = go.Figure(data=[go.Pie(
    labels=[class_labels[cls] for cls in fraud_counts.index],
    values=fraud_counts.values,
    hole=0.4,  # donut-style chart
    marker=dict(colors=[class_colors[cls] for cls in fraud_counts.index]),
    textinfo='label+percent',
    hovertemplate="<b>%{label}</b><br>Count: %{value:,}<br>Percentage: %{percent}<extra></extra>"
)])

fig.update_layout(
    title='Overall Fraud Distribution',
    height=500,
    showlegend=True
)

fig.show()

## 3. Temporal Fraud Patterns

In [5]:
# Temporal fraud analysis: fraud rate per hour of day and per calendar date
def _fraud_rate_table(key):
    """Group ``train`` by ``key`` and return fraud count ('sum'), row count ('count') and rate in % ('rate')."""
    table = train.groupby(key)['FraudResult'].agg(['sum', 'count'])
    table['rate'] = (table['sum'] / table['count']) * 100
    return table


hourly_fraud = _fraud_rate_table('Hour')
daily_fraud = _fraud_rate_table('Date')

# Both traces share the same hover body; only the bold first line differs
_hover_body = (
    "<br>Fraud Rate: %{y:.2f}%"
    "<br>Frauds: %{customdata[0]}"
    "<br>Total: %{customdata[1]}"
    "<extra></extra>"
)

# Top panel: bar chart of the hourly rate
hourly_trace = go.Bar(
    x=hourly_fraud.index,
    y=hourly_fraud['rate'],
    name='Hourly Fraud Rate',
    marker_color='#FF6B6B',
    hovertemplate="<b>Hour: %{x}</b>" + _hover_body,
    # raw counts travel alongside each bar so the hover can show them
    customdata=np.column_stack((hourly_fraud['sum'], hourly_fraud['count'])),
)

# Bottom panel: line chart of the daily rate
daily_trace = go.Scatter(
    x=daily_fraud.index,
    y=daily_fraud['rate'],
    name='Daily Fraud Rate',
    line=dict(color='#4ECDC4', width=2),
    mode='lines+markers',
    hovertemplate="<b>Date: %{x}</b>" + _hover_body,
    customdata=np.column_stack((daily_fraud['sum'], daily_fraud['count'])),
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Fraud Rate by Hour of Day', 'Daily Fraud Rate Trend'),
    vertical_spacing=0.15,
)
fig.add_trace(hourly_trace, row=1, col=1)
fig.add_trace(daily_trace, row=2, col=1)

# Axis titles, one subplot row at a time
for _row, _x_title in enumerate(("Hour", "Date"), start=1):
    fig.update_xaxes(title_text=_x_title, row=_row, col=1)
    fig.update_yaxes(title_text="Fraud Rate (%)", row=_row, col=1)

fig.update_layout(height=700, showlegend=False, title_text="Temporal Fraud Analysis")
fig.show()

# Headline numbers taken from the two tables above
_hourly_rate = hourly_fraud['rate']
_daily_rate = daily_fraud['rate']
print("\n📊 KEY FRAUD STATISTICS:")
print(f"Peak fraud hour: {_hourly_rate.idxmax()}:00 ({_hourly_rate.max():.2f}%)")
print(f"Lowest fraud hour: {_hourly_rate.idxmin()}:00 ({_hourly_rate.min():.2f}%)")
print(f"Average daily fraud rate: {_daily_rate.mean():.2f}%")
print(f"Highest daily fraud rate: {_daily_rate.max():.2f}%")


📊 KEY FRAUD STATISTICS:
Peak fraud hour: 21:00 (1.01%)
Lowest fraud hour: 1:00 (0.00%)
Average daily fraud rate: 0.21%
Highest daily fraud rate: 1.70%


## 4. Transaction Amount Analysis

In [6]:
# Box plots of transaction amount, legitimate vs fraud, on an outlier-trimmed sample
# Trim with the 1.5 x IQR rule so the boxes are not flattened by extreme amounts.
# NOTE(review): the bounds are computed on all transactions together, so fraud
# rows whose Amount falls outside them are dropped from the plot as well.
amount = train['Amount']
Q1 = amount.quantile(0.25)
Q3 = amount.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR
train_clean = train[amount.between(lower_bound, upper_bound)]  # bounds inclusive

fig = go.Figure()

# One horizontal box per class: (legend name, FraudResult value, colour)
for _name, _flag, _color in (
    ('Legitimate', 0, '#2E86C1'),
    ('Fraud', 1, '#E74C3C'),
):
    _amounts = train_clean.loc[train_clean['FraudResult'] == _flag, 'Amount']
    fig.add_trace(go.Box(x=_amounts, name=_name, marker_color=_color))

fig.update_layout(
    title='Transaction Amount Distribution (Outliers Removed)',
    xaxis_title='Amount',
    height=400,
)

fig.show()

## 5. Feature Correlation Heatmap

In [7]:
# Pairwise correlations between the numeric columns and the fraud label
numerical_features = [
    'Amount', 'Value', 'PricingStrategy',
    'Hour', 'Day', 'Month', 'Weekday',
    'FraudResult',
]
correlation_matrix = train[numerical_features].corr()

# The matrix is square, so the same labels serve both axes
_corr_values = correlation_matrix.values
_axis_labels = correlation_matrix.columns

heatmap = go.Heatmap(
    z=_corr_values,
    x=_axis_labels,
    y=_axis_labels,
    colorscale='RdBu',
    zmid=0,  # centre the diverging colour scale on zero correlation
    text=_corr_values,
    texttemplate='%{text:.2f}',
    textfont=dict(size=10),
    hovertemplate='%{y} vs %{x}<br>Correlation: %{z:.3f}<extra></extra>',
)

fig = go.Figure(data=heatmap)
fig.update_layout(title='Feature Correlation Matrix', height=600, width=700)
fig.show()

# Rank every feature by its correlation with the label, strongest positive first
fraud_corr = correlation_matrix['FraudResult'].sort_values(ascending=False)
print("\n📊 CORRELATIONS WITH FRAUD:")
print(fraud_corr)


📊 CORRELATIONS WITH FRAUD:
FraudResult        1.000000
Value              0.566739
Amount             0.557370
Hour               0.008295
Weekday           -0.006913
Day               -0.008636
Month             -0.008887
PricingStrategy   -0.033821
Name: FraudResult, dtype: float64


## 6. Categorical Feature Analysis

In [8]:
# Fraud rate per product category, highest rate first
product_fraud = (
    train.groupby('ProductCategory')['FraudResult']
    .agg(['sum', 'count'])  # 'sum' = frauds, 'count' = all transactions
    .assign(rate=lambda t: (t['sum'] / t['count']) * 100)
    .sort_values('rate', ascending=False)
)

category_bars = go.Bar(
    x=product_fraud.index,
    y=product_fraud['rate'],
    marker_color='#95A5A6',
    hovertemplate="<b>%{x}</b><br>Fraud Rate: %{y:.2f}%<br>Frauds: %{customdata[0]}<br>Total: %{customdata[1]}<extra></extra>",
    # raw counts ride along with each bar for the hover text
    customdata=np.column_stack((product_fraud['sum'], product_fraud['count'])),
)

fig = go.Figure(data=[category_bars])
fig.update_layout(
    title='Fraud Rate by Product Category',
    xaxis_title='Product Category',
    yaxis_title='Fraud Rate (%)',
    height=400,
)

fig.show()

## 7. Summary Statistics

In [9]:
# Collect the headline figures computed in earlier cells into one table.
# Relies on total_count, fraud_count, fraud_rate, hourly_fraud and daily_fraud
# already being defined by the cells above.
_hourly_rate = hourly_fraud['rate']
_summary_rows = [
    ('Total Transactions', f"{total_count:,}"),
    ('Fraudulent Transactions', f"{fraud_count:,}"),
    ('Fraud Rate (%)', f"{fraud_rate:.3f}"),
    ('Peak Fraud Hour', f"{_hourly_rate.idxmax()}:00"),
    ('Peak Fraud Rate (%)', f"{_hourly_rate.max():.2f}"),
    ('Avg Daily Fraud Rate (%)', f"{daily_fraud['rate'].mean():.2f}"),
]

summary_data = {
    'Metric': [metric for metric, _ in _summary_rows],
    'Value': [value for _, value in _summary_rows],
}
summary_df = pd.DataFrame(summary_data)

# Render as a plotly table: dark header row, light body rows
table_header = dict(
    values=['<b>Metric</b>', '<b>Value</b>'],
    fill_color='#2C3E50',
    font=dict(color='white', size=14),
    align='left',
)
table_cells = dict(
    values=[summary_df['Metric'], summary_df['Value']],
    fill_color='#ECF0F1',
    font=dict(size=12),
    align='left',
    height=30,
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(title='Key Fraud Detection Statistics', height=350)
fig.show()

## Conclusions

### Key Findings:

1. **Severe Class Imbalance**: Fraud represents only ~0.2% of transactions, requiring specialized modeling techniques

2. **Temporal Patterns**: 
   - Clear hourly variations in fraud rates
   - Peak fraud activity during specific hours
   - Daily fluctuations suggest time-based features are important

3. **Transaction Characteristics**:
   - Fraud transactions show different amount distributions
   - Certain product categories show higher fraud rates

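4. **Feature Correlations**:
   - `Value` (0.57) and `Amount` (0.56) are the only features with a meaningful linear correlation with fraud; `PricingStrategy` is weak (-0.03)
   - Engineered time-based features (`Hour`, `Day`, `Month`, `Weekday`) show near-zero linear correlation with fraud (|r| < 0.01); the hourly and daily fraud-rate patterns suggest any predictive value they add is non-linear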

### Next Steps:
- Apply SMOTE or other resampling techniques for class imbalance
- Use PR-AUC as primary evaluation metric
- Test ensemble methods (LightGBM, XGBoost, Random Forest)
- Engineer additional features based on EDA insights