# Credit Card Fraud Detection - Exploratory Data Analysis

This notebook performs comprehensive EDA on the credit card fraud dataset.

In [None]:
import sys
import os

# Make the project importable when the notebook runs from its own directory.
# `from src.<module> import ...` below needs the directory that CONTAINS the
# `src` package on sys.path; the '../src' entry alone does not provide that.
# '../src' is kept for backward compatibility.
# NOTE(review): presumably this notebook lives one level below the project
# root — confirm against the repository layout.
# The membership checks keep re-running this cell from piling up duplicates.
if '../src' not in sys.path:
    sys.path.append('../src')
if '..' not in sys.path:
    sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from src.data_utils import load_data, basic_eda, get_feature_info
from src.config import SEED
from src.utils import set_seed

# Set style and seed
plt.style.use('default')
sns.set_palette("husl")
set_seed(SEED)

# Display settings: show every column when a DataFrame is rendered
pd.set_option('display.max_columns', None)
# Seed NumPy's global RNG explicitly as well.
# NOTE(review): set_seed may already do this — confirm in src.utils.
np.random.seed(SEED)

## 1. Data Loading and Basic Information

In [None]:
# Load the dataset. A missing file is reported with download instructions
# rather than a traceback; in that case `df` stays undefined and the
# `'df' in locals()` guards in the following cells skip their work.
try:
    df = load_data()
except FileNotFoundError:
    print(
        "Dataset not found. Please download creditcard.csv from Kaggle and place it in data/creditcard.csv",
        "Download link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud",
        sep="\n",
    )
else:
    print(f"Data loaded successfully: {df.shape}")

In [None]:
# Basic dataset information (runs only when the load cell defined `df`)
if 'df' in locals():
    # deep=True also counts the payload of object columns; convert bytes -> MB
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    overview_lines = [
        "Dataset Info:",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        f"Data types: {df.dtypes.value_counts()}",
        f"Memory usage: {memory_mb:.2f} MB",
    ]
    print("\n".join(overview_lines))

    # Rich preview of the first few rows
    display(df.head())

## 2. Class Distribution Analysis

In [None]:
if 'df' in locals():
    # Summary produced by the project helper; this cell reads its
    # 'class_counts' and 'total_samples' entries.
    eda_results = basic_eda(df)
    counts_by_class = eda_results['class_counts']
    total_samples = eda_results['total_samples']

    print("Class Distribution:")
    for class_label, count in counts_by_class.items():
        # Label 0 is reported as Normal, anything else as Fraud
        label = 'Normal' if class_label == 0 else 'Fraud'
        percentage = count / total_samples * 100
        print(f"  {label}: {count:,} ({percentage:.2f}%)")

    # Number of class-0 rows per class-1 row
    imbalance_ratio = counts_by_class[0] / counts_by_class[1]
    print(f"\nImbalance Ratio: {imbalance_ratio:.1f}:1")

In [None]:
if 'df' in locals():
    # Visualize class distribution: log-scaled bar chart plus a pie chart
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # value_counts() orders by frequency, so pin the order to [0, 1] explicitly;
    # otherwise the 'Normal'/'Fraud' labels below would silently swap on data
    # where class 1 is the more frequent one.
    class_counts = df['Class'].value_counts().reindex([0, 1], fill_value=0)
    class_names = ['Normal', 'Fraud']
    colors = ['lightblue', 'lightcoral']

    # Count plot (log scale so the minority class bar stays visible)
    ax1.bar(class_names, class_counts.values, color=colors)
    ax1.set_ylabel('Count')
    ax1.set_title('Class Distribution (Count)')
    ax1.set_yscale('log')

    # Add count labels on top of each bar
    for i, v in enumerate(class_counts.values):
        ax1.text(i, v, f'{v:,}', ha='center', va='bottom')

    # Pie chart: derive the percentages in the labels from the data instead of
    # hard-coding them, so they always agree with the autopct values.
    class_shares = class_counts / class_counts.sum()
    labels = [f'{name} ({share:.2%})' for name, share in zip(class_names, class_shares.values)]
    sizes = class_counts.values
    ax2.pie(sizes, labels=labels, colors=colors, autopct='%1.2f%%', startangle=90)
    ax2.set_title('Class Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

## 3. Feature Analysis

In [None]:
if 'df' in locals():
    # Per-column count of missing entries, then the grand total
    missing_values = df.isnull().sum()
    total_missing = missing_values.sum()
    print(f"Missing values: {total_missing}")

    # Summary statistics for every numeric column, rendered as a rich table
    print("\nDescriptive Statistics:")
    summary_table = df.describe()
    display(summary_table)

In [None]:
if 'df' in locals():
    # Time analysis: overall histogram (left) and per-class densities (right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: all transactions
    ax1.hist(df['Time'], bins=50, alpha=0.7, edgecolor='black')
    ax1.set(
        xlabel='Time (seconds)',
        ylabel='Frequency',
        title='Distribution of Transaction Times',
    )
    ax1.grid(True, alpha=0.3)

    # Right panel: one normalized histogram per class so the two shapes are
    # comparable despite the very different class sizes
    for class_val, label in ((0, 'Normal'), (1, 'Fraud')):
        subset = df.loc[df['Class'] == class_val, 'Time']
        ax2.hist(subset, bins=50, alpha=0.7, label=label, density=True)

    ax2.set(
        xlabel='Time (seconds)',
        ylabel='Density',
        title='Time Distribution by Class',
    )
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
if 'df' in locals():
    # Amount analysis: 2x2 grid of histogram, per-class densities, box plot
    # and per-class summary statistics
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Amount distribution (log-scaled counts so sparse high-amount bins show)
    ax1.hist(df['Amount'], bins=50, alpha=0.7, edgecolor='black')
    ax1.set_xlabel('Amount')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Distribution of Transaction Amounts')
    ax1.set_yscale('log')
    ax1.grid(True, alpha=0.3)

    # Amount by class (normalized, log-scaled density)
    for class_val in [0, 1]:
        subset = df[df['Class'] == class_val]['Amount']
        label = 'Normal' if class_val == 0 else 'Fraud'
        ax2.hist(subset, bins=50, alpha=0.7, label=label, density=True)

    ax2.set_xlabel('Amount')
    ax2.set_ylabel('Density')
    ax2.set_title('Amount Distribution by Class')
    ax2.legend()
    ax2.set_yscale('log')
    ax2.grid(True, alpha=0.3)

    # Box plot on a random subset for readability. The sample size is capped at
    # the frame length because DataFrame.sample raises when n exceeds it.
    df_sample = df.sample(n=min(10000, len(df)), random_state=SEED)
    sns.boxplot(data=df_sample, x='Class', y='Amount', ax=ax3)
    ax3.set_title('Amount Distribution by Class (Boxplot)')
    ax3.set_yscale('log')

    # Statistics: describe() returns one ROW per class and one COLUMN per
    # statistic. Plotting it directly puts Class on the x-axis with one bar
    # series per statistic, which contradicts the 'Statistic' x-label and a
    # Normal/Fraud legend. Transpose so each statistic is an x tick and each
    # class is a series. 'count' is left out of the plot because it is a row
    # count, not an amount; it is still shown in the table below.
    amount_stats = df.groupby('Class')['Amount'].describe()
    stats_for_plot = (
        amount_stats
        .drop(columns='count')
        .T
        .rename(columns={0: 'Normal', 1: 'Fraud'})
    )
    stats_for_plot.plot(kind='bar', ax=ax4)
    ax4.set_title('Amount Statistics by Class')
    ax4.set_xlabel('Statistic')
    ax4.set_ylabel('Amount')
    ax4.legend(title='Class')
    ax4.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("Amount Statistics by Class:")
    display(amount_stats)

## 4. PCA Features Analysis

In [None]:
if 'df' in locals():
    # Every column whose name starts with 'V'
    v_features = [col for col in df.columns if col.startswith('V')]

    # Correlation of each V feature with the target, strongest |r| first
    class_column_corr = df[v_features + ['Class']].corr()['Class']
    correlations = (
        class_column_corr
        .drop('Class')
        .sort_values(key=abs, ascending=False)
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: the ten strongest correlations, red for negative values
    # and blue otherwise
    top_features = correlations.head(10)
    bar_positions = range(len(top_features))
    colors = ['red' if value < 0 else 'blue' for value in top_features.values]
    ax1.barh(bar_positions, top_features.values, color=colors, alpha=0.7)
    ax1.set_yticks(bar_positions)
    ax1.set_yticklabels(top_features.index)
    ax1.set(
        xlabel='Correlation with Class',
        title='Top 10 Features Correlated with Fraud',
    )
    ax1.grid(True, alpha=0.3)

    # Right panel: per-class density of the single strongest feature
    top_feature = correlations.abs().idxmax()
    for class_val, label in ((0, 'Normal'), (1, 'Fraud')):
        subset = df.loc[df['Class'] == class_val, top_feature]
        ax2.hist(subset, bins=50, alpha=0.7, label=label, density=True)

    ax2.set(
        xlabel=top_feature,
        ylabel='Density',
        title=f'Distribution of {top_feature} by Class',
    )
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    strongest_r = correlations[top_feature]
    print(f"Most correlated feature with fraud: {top_feature} (r = {strongest_r:.3f})")

## 5. Correlation Analysis

In [None]:
if 'df' in locals():
    # Correlation matrix for Time, Amount, V1..V10 and the target
    features_to_plot = ['Time', 'Amount', *(f'V{i}' for i in range(1, 11)), 'Class']
    corr_matrix = df[features_to_plot].corr()

    # Hide the upper triangle (diagonal included): the matrix is symmetric,
    # so those cells repeat the lower triangle
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap_options = dict(
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        fmt='.2f',
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Correlation Matrix (Selected Features)')
    plt.tight_layout()
    plt.show()

## 6. Dimensionality Reduction Visualization

In [None]:
if 'df' in locals():
    # Sample data for visualization (t-SNE is computationally expensive).
    # Cap the size at the frame length: DataFrame.sample raises when n is
    # larger than the number of rows.
    sample_size = min(5000, len(df))
    df_sample = df.sample(n=sample_size, random_state=SEED)

    # Prepare features (exclude Class)
    X_sample = df_sample.drop('Class', axis=1)
    y_sample = df_sample['Class']

    # Standardize every feature to zero mean / unit variance. PCA and t-SNE
    # both work on raw distances, so without this the columns with the largest
    # numeric range (e.g. Time in seconds) dominate the projection.
    # Constant columns get a divisor of 1 to avoid 0/0 -> NaN.
    feature_std = X_sample.std(ddof=0).replace(0, 1.0)
    X_scaled = (X_sample - X_sample.mean()) / feature_std

    print(f"Performing dimensionality reduction on {sample_size} samples...")

    # PCA
    pca = PCA(n_components=2, random_state=SEED)
    X_pca = pca.fit_transform(X_scaled)

    # t-SNE (this may take a while).
    # The iteration count is left at the library default of 1000: the keyword
    # was renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so passing
    # either name explicitly breaks on some installed versions.
    print("Computing t-SNE... This may take a few minutes.")
    tsne = TSNE(n_components=2, random_state=SEED, perplexity=30)
    X_tsne = tsne.fit_transform(X_scaled)

    # Plot results side by side, colored by class
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # PCA plot
    scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=y_sample, cmap='coolwarm', alpha=0.6, s=1)
    ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    ax1.set_title('PCA Visualization')
    plt.colorbar(scatter1, ax=ax1, label='Class')

    # t-SNE plot
    scatter2 = ax2.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='coolwarm', alpha=0.6, s=1)
    ax2.set_xlabel('t-SNE 1')
    ax2.set_ylabel('t-SNE 2')
    ax2.set_title('t-SNE Visualization')
    plt.colorbar(scatter2, ax=ax2, label='Class')

    plt.tight_layout()
    plt.show()

    print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}")

## 7. Time-based Analysis

In [None]:
if 'df' in locals():
    # New frame with an 'Hour' column: seconds -> hours, wrapped to a
    # 24-hour cycle. `df` itself is left untouched.
    df_time = df.assign(Hour=(df['Time'] / 3600) % 24)

    # Per-hour transaction count, fraud count and fraud rate, keyed by the
    # integer part of the hour
    hour_bucket = df_time['Hour'].astype(int)
    hourly_fraud = (
        df_time
        .groupby(hour_bucket)['Class']
        .agg(['count', 'sum', 'mean'])
        .round(4)
    )
    hourly_fraud.columns = ['Total_Transactions', 'Fraud_Count', 'Fraud_Rate']

    # Two stacked panels: volume on top, fraud rate below
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    ax1.bar(hourly_fraud.index, hourly_fraud['Total_Transactions'], alpha=0.7)
    ax1.set(
        xlabel='Hour of Day',
        ylabel='Number of Transactions',
        title='Transaction Volume by Hour',
    )
    ax1.grid(True, alpha=0.3)

    ax2.bar(hourly_fraud.index, hourly_fraud['Fraud_Rate'], color='red', alpha=0.7)
    ax2.set(
        xlabel='Hour of Day',
        ylabel='Fraud Rate',
        title='Fraud Rate by Hour',
    )
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("Hourly Statistics:")
    display(hourly_fraud.head(10))

## 8. Summary and Key Insights

In [None]:
if 'df' in locals():
    # Text summary of the findings above; every number is recomputed from `df`
    # so this cell does not depend on variables left over from plotting cells.
    print("=" * 60)
    print("EXPLORATORY DATA ANALYSIS SUMMARY")
    print("=" * 60)

    # Computed once and reused, so the overview and the modeling note below
    # can never disagree.
    missing_total = df.isnull().sum().sum()

    print("\n📊 Dataset Overview:")
    print(f"   • Total transactions: {len(df):,}")
    print(f"   • Features: {len(df.columns) - 1}")  # every column except the target
    print(f"   • Missing values: {missing_total}")

    print("\n🎯 Class Distribution:")
    class_counts = df['Class'].value_counts()
    print(f"   • Normal transactions: {class_counts[0]:,} ({class_counts[0]/len(df):.2%})")
    print(f"   • Fraud transactions: {class_counts[1]:,} ({class_counts[1]/len(df):.2%})")
    print(f"   • Imbalance ratio: {class_counts[0]/class_counts[1]:.0f}:1")

    print("\n💰 Transaction Amounts:")
    print(f"   • Normal transactions median: ${df[df['Class']==0]['Amount'].median():.2f}")
    print(f"   • Fraud transactions median: ${df[df['Class']==1]['Amount'].median():.2f}")
    print(f"   • Max amount: ${df['Amount'].max():.2f}")

    print("\n🔍 Key Correlations:")
    v_features = [col for col in df.columns if col.startswith('V')]
    correlations = df[v_features + ['Class']].corr()['Class'].drop('Class')
    top_positive = correlations.nlargest(3)
    top_negative = correlations.nsmallest(3)

    print("   • Strongest positive correlations:")
    for feature, corr in top_positive.items():
        print(f"     - {feature}: {corr:.3f}")

    print("   • Strongest negative correlations:")
    for feature, corr in top_negative.items():
        print(f"     - {feature}: {corr:.3f}")

    print("\n📈 Modeling Considerations:")
    print("   • Highly imbalanced dataset - consider SMOTE/class weights")
    print("   • PCA features suggest complex patterns")
    print("   • Time and amount show different distributions by class")
    # Only claim a clean dataset when the data actually supports it
    if missing_total == 0:
        print("   • No missing values - clean dataset")
    else:
        print(f"   • {missing_total:,} missing values - impute or drop before modeling")
    print("   • Focus on recall (catching fraud) over precision")

    print("\n" + "=" * 60)
else:
    print("Dataset not loaded. Please ensure creditcard.csv is in the data/ directory.")