# Feature Engineering
## Transaction Fraud Detection System

This notebook demonstrates the feature engineering process:
1. Balance inconsistency features (key fraud indicators)
2. Amount-based features
3. Transaction behavior flags
4. Categorical encoding
5. Feature correlation analysis (correlation of each feature with the fraud label)

In [None]:
# Import libraries
import os
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import load_raw_data, clean_data
from features import (
    engineer_features,
    get_feature_columns,
    create_balance_features,
    create_amount_features,
    create_transaction_features
)

# Set plotting style: seaborn's 'whitegrid' theme, plus a 12x6 default size for
# any figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# NOTE(review): the 'ignore' filter is installed with no category or module
# restriction, so it hides every warning raised from here on in this kernel
# session (deprecation notices included) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load and Clean Data

In [None]:
# Load data — nrows is the value to adjust for your system
df = load_raw_data(nrows=500_000)

raw_shape = df.shape
print(f"\nRaw data shape: {raw_shape}")

In [None]:
# Run the cleaning step, then report the resulting shape and the change in row count
df_clean = clean_data(df)

rows_before, rows_after = len(df), len(df_clean)
print(f"\nCleaned data shape: {df_clean.shape}")
print(f"Rows removed: {rows_before - rows_after:,}")

## 2. Balance Inconsistency Features

These are **critical fraud indicators**. Fraudulent transactions often have balance inconsistencies.

In [None]:
# Build the balance features on top of the cleaned frame
df_balance = create_balance_features(df_clean)

# Columns inspected in this section (they must exist on df_balance for the
# preview at the end of the cell to work)
balance_features = [
    'errorBalanceOrig',
    'errorBalanceDest',
    'absErrorBalanceOrig',
    'absErrorBalanceDest',
    'totalBalanceError',
    'hasBalanceErrorOrig',
    'hasBalanceErrorDest',
    'origBalanceZero',
    'destBalanceZero',
]

print("Balance Features Created:")
print("=" * 70)
print("\n".join(f"  ✓ {name}" for name in balance_features))

print("\nSample of balance features:")
# Last expression of the cell: rendered as a rich table next to the fraud label
df_balance[[*balance_features, 'isFraud']].head(10)

In [None]:
# Compare balance-error columns between the isFraud == 0 and isFraud == 1 rows
print("\nBalance Error Statistics by Fraud Status:")
print("=" * 70)
print("\nTotal Balance Error:")
error_summary = df_balance.groupby('isFraud')['totalBalanceError'].describe()
print(error_summary)

print("\nPercentage with Balance Errors:")
# Mean of hasBalanceErrorOrig within each class, scaled to a percentage
legit_rows = df_balance['isFraud'] == 0
fraud_rows = df_balance['isFraud'] == 1
legit_pct = df_balance.loc[legit_rows, 'hasBalanceErrorOrig'].mean() * 100
print(f"Legitimate: {legit_pct:.2f}%")
fraud_pct = df_balance.loc[fraud_rows, 'hasBalanceErrorOrig'].mean() * 100
print(f"Fraud:      {fraud_pct:.2f}%")

In [None]:
# Visualize balance error distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# --- Left panel: total balance error by fraud status ---
# Only rows with a strictly positive error are plotted (the y-axis below is
# log-scaled). The sample size is capped by the length of the *filtered* frame:
# capping by len(df_balance) asks DataFrame.sample for more rows than exist
# whenever fewer than 5000 rows survive the filter, which raises ValueError.
positive_errors = df_balance[df_balance['totalBalanceError'] > 0]
df_sample = positive_errors.sample(n=min(5000, len(positive_errors)), random_state=42)
sns.boxplot(data=df_sample, x='isFraud', y='totalBalanceError', ax=axes[0])
axes[0].set_title('Total Balance Error by Fraud Status', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Is Fraud')
axes[0].set_ylabel('Total Balance Error')
axes[0].set_xticklabels(['Legitimate', 'Fraud'])
axes[0].set_yscale('log')

# --- Right panel: mean of hasBalanceErrorOrig per isFraud class, as a percentage ---
error_pct = df_balance.groupby('isFraud')['hasBalanceErrorOrig'].mean() * 100
error_pct.plot(kind='bar', ax=axes[1], color=['#2ecc71', '#e74c3c'])
axes[1].set_title('Percentage with Balance Errors', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Is Fraud')
axes[1].set_ylabel('Percentage (%)')
axes[1].set_xticklabels(['Legitimate', 'Fraud'], rotation=0)

# Add value labels just above each bar; the offset is 2% of the tallest bar and
# does not change between iterations, so it is computed once.
label_offset = max(error_pct) * 0.02
for i, v in enumerate(error_pct):
    axes[1].text(i, v + label_offset, f'{v:.2f}%', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 3. Amount-Based Features

Features that capture transaction amount patterns.

In [None]:
# Build the amount features on top of the balance-feature frame
df_amount = create_amount_features(df_balance)

# Columns inspected in this section (they must exist on df_amount for the
# preview at the end of the cell to work)
amount_features = [
    'log_amount',
    'amount_to_orig_balance_ratio',
    'amount_to_dest_balance_ratio',
    'drains_origin_account',
    'is_large_transaction',
    'is_round_amount',
]

print("Amount Features Created:")
print("=" * 70)
print("\n".join(f"  ✓ {name}" for name in amount_features))

print("\nSample of amount features:")
# Last expression of the cell: raw amount, the derived columns, and the label
df_amount[['amount', *amount_features, 'isFraud']].head(10)

In [None]:
# Analyze amount features by fraud status
print("\nAmount Feature Statistics by Fraud Status:")
print("="*70)

# The overall fraud rate does not depend on the feature being inspected, so it is
# computed once instead of on every loop iteration.
overall_rate = df_amount['isFraud'].mean() * 100

for feat in ['drains_origin_account', 'is_large_transaction', 'is_round_amount']:
    # Fraud rate (in percent) among the rows where this flag equals 1
    flagged = df_amount[feat] == 1
    fraud_rate = df_amount.loc[flagged, 'isFraud'].mean() * 100
    print(f"\n{feat}:")
    print(f"  Fraud rate when True:  {fraud_rate:.4f}%")
    print(f"  Overall fraud rate:    {overall_rate:.4f}%")
    # Lift = fraud rate within the flagged rows relative to the overall rate
    print(f"  Lift:                  {fraud_rate/overall_rate:.2f}x")

In [None]:
# Visualize amount features
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# --- Top-left: log_amount histograms, one overlay per isFraud class ---
df_amount[df_amount['isFraud']==0]['log_amount'].hist(bins=50, alpha=0.5, label='Legitimate', ax=axes[0,0], color='green')
df_amount[df_amount['isFraud']==1]['log_amount'].hist(bins=50, alpha=0.5, label='Fraud', ax=axes[0,0], color='red')
axes[0,0].set_title('Log Amount Distribution', fontsize=12, fontweight='bold')
axes[0,0].set_xlabel('Log(Amount + 1)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].legend()

# --- Top-right: amount to origin balance ratio ---
# Only rows with a ratio below 10 are plotted (the comparison also excludes NaN
# and infinite ratios). The sample size is capped by the length of the *filtered*
# frame: capping by len(df_amount) asks DataFrame.sample for more rows than exist
# whenever fewer than 5000 rows survive the filter, which raises ValueError.
bounded_ratio = df_amount[df_amount['amount_to_orig_balance_ratio'] < 10]
df_sample = bounded_ratio.sample(n=min(5000, len(bounded_ratio)), random_state=42)
sns.boxplot(data=df_sample, x='isFraud', y='amount_to_orig_balance_ratio', ax=axes[0,1])
axes[0,1].set_title('Amount to Origin Balance Ratio', fontsize=12, fontweight='bold')
axes[0,1].set_xticklabels(['Legitimate', 'Fraud'])

# --- Bottom-left: fraud rate (percent) per value of drains_origin_account ---
drain_fraud = df_amount.groupby('drains_origin_account')['isFraud'].mean() * 100
drain_fraud.plot(kind='bar', ax=axes[1,0], color=['#2ecc71', '#e74c3c'])
axes[1,0].set_title('Fraud Rate: Drains Origin Account', fontsize=12, fontweight='bold')
axes[1,0].set_xlabel('Drains Origin Account')
axes[1,0].set_ylabel('Fraud Rate (%)')
axes[1,0].set_xticklabels(['No', 'Yes'], rotation=0)

# --- Bottom-right: fraud rate (percent) per value of is_large_transaction ---
large_fraud = df_amount.groupby('is_large_transaction')['isFraud'].mean() * 100
large_fraud.plot(kind='bar', ax=axes[1,1], color=['#2ecc71', '#e74c3c'])
axes[1,1].set_title('Fraud Rate: Large Transaction', fontsize=12, fontweight='bold')
axes[1,1].set_xlabel('Is Large Transaction')
axes[1,1].set_ylabel('Fraud Rate (%)')
axes[1,1].set_xticklabels(['No', 'Yes'], rotation=0)

plt.tight_layout()
plt.show()

## 4. Transaction Type Features

This section one-hot encodes the transaction type and adds time-based features (`hour`, `day`, and part-of-day flags).

In [None]:
# Build the transaction features on top of the amount-feature frame
df_trans = create_transaction_features(df_amount)

# Columns listed for this section: one-hot type columns, the can_be_fraud flag,
# and the time columns
trans_features = [
    'type_CASH_IN',
    'type_CASH_OUT',
    'type_DEBIT',
    'type_PAYMENT',
    'type_TRANSFER',
    'can_be_fraud',
    'hour',
    'day',
    'is_night',
    'is_morning',
    'is_afternoon',
    'is_evening',
]

print("Transaction Features Created:")
print("=" * 70)
print("\n".join(f"  ✓ {name}" for name in trans_features))

print("\nSample of transaction features:")
# Preview only the first six listed columns (the type columns and can_be_fraud)
# alongside the raw type and the label
preview_cols = ['type', *trans_features[:6], 'isFraud']
df_trans[preview_cols].head(10)

In [None]:
# Fraud rate per time-of-day flag. time_periods and time_labels are parallel
# lists: each flag column is paired with the label at the same position.
time_periods = ['is_night', 'is_morning', 'is_afternoon', 'is_evening']
time_labels = ['Night (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)']

print("\nFraud Rate by Time of Day:")
print("=" * 70)

for flag_col, label in zip(time_periods, time_labels):
    # Select the rows where the flag is set once; reuse them for rate and count
    in_period = df_trans[df_trans[flag_col] == 1]
    rate_pct = in_period['isFraud'].mean() * 100
    n_rows = len(in_period)
    print(f"{label:20s}: {rate_pct:.4f}% ({n_rows:,} transactions)")

In [None]:
# Two bar charts: fraud rate per one-hot transaction type, and per time-of-day flag
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
type_ax, time_ax = axes

# Fraud rate (percent) among rows where each one-hot type column equals 1
type_cols = ['type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
fraud_by_type = [
    df_trans[df_trans[col] == 1]['isFraud'].mean() * 100
    for col in type_cols
]

# Bar labels are the column names without their 'type_' prefix
type_names = [col.split('_', 1)[1] for col in type_cols]
bar_colors = ['#2ecc71', '#e74c3c', '#2ecc71', '#2ecc71', '#e74c3c']
type_ax.bar(type_names, fraud_by_type, color=bar_colors)
type_ax.set_title('Fraud Rate by Transaction Type', fontsize=12, fontweight='bold')
type_ax.set_xlabel('Transaction Type')
type_ax.set_ylabel('Fraud Rate (%)')
type_ax.tick_params(axis='x', rotation=45)

# Value labels sit above each bar, offset by 2% of the tallest bar
label_offset = max(fraud_by_type) * 0.02
for bar_pos, rate in enumerate(fraud_by_type):
    type_ax.text(bar_pos, rate + label_offset, f'{rate:.3f}%', ha='center', fontweight='bold')

# Fraud rate (percent) among rows where each time-of-day flag equals 1;
# time_periods and time_labels come from the preceding analysis cell
time_fraud_rates = [
    df_trans[df_trans[period] == 1]['isFraud'].mean() * 100
    for period in time_periods
]

time_ax.bar(time_labels, time_fraud_rates, color='#3498db')
time_ax.set_title('Fraud Rate by Time of Day', fontsize=12, fontweight='bold')
time_ax.set_xlabel('Time Period')
time_ax.set_ylabel('Fraud Rate (%)')
time_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Complete Feature Engineering Pipeline

In [None]:
# Run the end-to-end pipeline on the cleaned frame; it returns the featured frame
# together with an `encoders` object
df_featured, encoders = engineer_features(df_clean)

# Column counts before and after, and their difference
n_cols_before = len(df_clean.columns)
n_cols_after = len(df_featured.columns)
print(f"\nOriginal columns: {n_cols_before}")
print(f"After feature engineering: {n_cols_after}")
print(f"New features created: {n_cols_after - n_cols_before}")

In [None]:
# Ask the features module which columns of the featured frame are model inputs
feature_cols = get_feature_columns(df_featured)

print(f"\nFeature columns for modeling ({len(feature_cols)}):")
print("=" * 70)
# Numbered listing, starting at 1
for position, column in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {column}")

## 6. Feature Correlation Analysis

In [None]:
# Correlate every modeling column with the fraud label, ordered from the most
# positive coefficient downwards
X = df_featured[feature_cols]
y = df_featured['isFraud']

correlations = X.corrwith(y).sort_values(ascending=False)

strongest_positive = correlations.head(20)
print("\nTop 20 Features Correlated with Fraud:")
print("=" * 70)
print(strongest_positive)

# The last ten entries of the descending ordering
lowest_ranked = correlations.tail(10)
print("\nBottom 10 Features (Negative Correlation):")
print("=" * 70)
print(lowest_ranked)

In [None]:
# Visualize top feature correlations
fig, ax = plt.subplots(figsize=(12, 8))

top_features = correlations.abs().sort_values(ascending=False).head(20)
top_correlations = correlations[top_features.index]

colors = ['#e74c3c' if x > 0 else '#3498db' for x in top_correlations]
top_correlations.plot(kind='barh', ax=ax, color=colors)
ax.set_title('Top 20 Features by Correlation with Fraud', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Save Processed Data

In [None]:
# Save featured data for model training
output_path = '../data/processed/featured_data.csv'

# to_csv does not create missing parent directories; on a fresh checkout where
# ../data/processed does not exist yet the write would fail at the very end of
# the notebook, so make sure the directory is there first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_featured.to_csv(output_path, index=False)
print(f"\nFeatured data saved to: {output_path}")
print(f"Shape: {df_featured.shape}")
# Note: this is the DataFrame's in-memory footprint, not the size of the CSV on disk
print(f"Size: {df_featured.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 8. Key Findings

### Most Important Features
1. **Balance inconsistency features** - Strong correlation with fraud
2. **Transaction type** - Only TRANSFER and CASH_OUT can be fraud
3. **Amount ratios** - Unusual amount-to-balance ratios indicate fraud
4. **Account draining** - Transactions that empty accounts are suspicious

### Feature Engineering Success
- Created **new engineered features** from raw data (the exact count is printed in Section 5)
- Balance error features show **high correlation** with fraud
- Features capture both **statistical** and **behavioral** patterns

### Next Steps
1. Time-based train/test split
2. Handle class imbalance
3. Train baseline and advanced models
4. Evaluate with Precision@K and PR-AUC

In [None]:
# Closing banner and summary counts
print("\n" + "="*70)
print("FEATURE ENGINEERING COMPLETE!")
print("="*70)
# len(feature_cols) is the number of columns selected for modeling, which is not
# the same quantity as the number of columns added by feature engineering
# (reported in Section 5 as the column-count difference). Report both, each
# under an accurate label.
print(f"\nNew features created: {len(df_featured.columns) - len(df_clean.columns)}")
print(f"Feature columns for modeling: {len(feature_cols)}")
print("\nNext notebook: 03_model_training.ipynb")