In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [4]:
# ==================== LOAD DATA ====================
print("="*80)
print("FEATURE ENGINEERING FOR FRAUD DETECTION")
print("="*80)

# Load explored data

fraud_data = pd.read_csv('../data/processed/fraud_data_explored.csv')
creditcard = pd.read_csv('../data/processed/creditcard_explored.csv')
ip_data = pd.read_csv('../data/raw/ipAddress_to_Country.csv')

print(f"\n✓ Fraud data loaded: {fraud_data.shape}")
print(f"✓ Credit card data loaded: {creditcard.shape}")
print(f"✓ IP data loaded: {ip_data.shape}")

FEATURE ENGINEERING FOR FRAUD DETECTION

✓ Fraud data loaded: (151112, 14)
✓ Credit card data loaded: (284807, 32)
✓ IP data loaded: (138846, 3)


In [6]:
# ==================== FRAUD DATA PREPROCESSING ====================
print("\n" + "="*80)
print("PART 1: E-COMMERCE FRAUD DATA - FEATURE ENGINEERING")
print("="*80)

# Convert datetime
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])




PART 1: E-COMMERCE FRAUD DATA - FEATURE ENGINEERING


In [7]:
# ==================== 1. GEOLOCATION FEATURES ====================
print("\n1. GEOLOCATION INTEGRATION")
print("-" * 80)

def ip_to_integer(ip_str):
    """Convert IP address to integer"""
    try:
        parts = str(ip_str).split('.')
        return int(parts[0]) * 256**3 + int(parts[1]) * 256**2 + \
               int(parts[2]) * 256 + int(parts[3])
    except:
        return None

# Convert IP addresses
print("Converting IP addresses to integers...")
fraud_data['ip_integer'] = fraud_data['ip_address'].apply(ip_to_integer)

# CRITICAL FIX: Convert to numeric dtype for merge_asof
fraud_data['ip_integer'] = pd.to_numeric(fraud_data['ip_integer'], errors='coerce')

print(f"✓ Converted {fraud_data['ip_integer'].notna().sum():,} IP addresses")
print(f"  Data type: {fraud_data['ip_integer'].dtype}")

# Prepare IP data and ensure numeric types
print("\nPreparing IP lookup table...")

# Handle different possible column names
if 'lower_bound_ip_address' in ip_data.columns and 'upper_bound_ip_address' in ip_data.columns:
    print("  Found columns: lower_bound_ip_address, upper_bound_ip_address")
    
    # Check if they're strings (IP format) or already integers
    if ip_data['lower_bound_ip_address'].dtype == 'object':
        print("  Converting IP addresses from string format...")
        ip_data['lower_bound'] = ip_data['lower_bound_ip_address'].apply(ip_to_integer)
        ip_data['upper_bound'] = ip_data['upper_bound_ip_address'].apply(ip_to_integer)
    else:
        print("  Using existing numeric values...")
        ip_data['lower_bound'] = ip_data['lower_bound_ip_address']
        ip_data['upper_bound'] = ip_data['upper_bound_ip_address']
        
elif 'lower_bound' in ip_data.columns and 'upper_bound' in ip_data.columns:
    print("  Found columns: lower_bound, upper_bound")
    # They might already be correct
else:
    print("  ERROR: Could not find IP range columns!")
    print(f"  Available columns: {ip_data.columns.tolist()}")
    raise ValueError("IP range columns not found in ip_data")

# Convert to numeric (handles any remaining string issues)
ip_data['lower_bound'] = pd.to_numeric(ip_data['lower_bound'], errors='coerce')
ip_data['upper_bound'] = pd.to_numeric(ip_data['upper_bound'], errors='coerce')

print(f"✓ IP lookup table prepared")
print(f"  lower_bound dtype: {ip_data['lower_bound'].dtype}")
print(f"  upper_bound dtype: {ip_data['upper_bound'].dtype}")

# Remove invalid entries
initial_ip_count = len(ip_data)
ip_data = ip_data.dropna(subset=['lower_bound', 'upper_bound'])
if initial_ip_count > len(ip_data):
    print(f"  Removed {initial_ip_count - len(ip_data)} invalid IP ranges")

# Remove invalid fraud data IPs
initial_fraud_count = len(fraud_data)
fraud_data_valid = fraud_data.dropna(subset=['ip_integer']).copy()
if initial_fraud_count > len(fraud_data_valid):
    print(f"  Removed {initial_fraud_count - len(fraud_data_valid)} invalid IPs from fraud data")

print(f"✓ Valid IP integers: {len(fraud_data_valid):,}")

# Verify dtypes before merge
print("\nVerifying data types for merge:")
print(f"  Left key (ip_integer): {fraud_data_valid['ip_integer'].dtype}")
print(f"  Right key (lower_bound): {ip_data['lower_bound'].dtype}")

# Sort for merge
print("\nSorting datasets for merge...")
ip_data_sorted = ip_data.sort_values('lower_bound').copy()
fraud_data_sorted = fraud_data_valid.sort_values('ip_integer').copy()

# Merge using range lookup
print("\nPerforming range-based merge (this may take a moment)...")
try:
    merged = pd.merge_asof(
        fraud_data_sorted,
        ip_data_sorted[['lower_bound', 'upper_bound', 'country']],
        left_on='ip_integer',
        right_on='lower_bound',
        direction='backward'
    )
    print("✓ Merge successful!")
except Exception as e:
    print(f"✗ Merge failed with error: {e}")
    print("\nTroubleshooting info:")
    print(f"  Sample ip_integer values: {fraud_data_sorted['ip_integer'].head(3).tolist()}")
    print(f"  Sample lower_bound values: {ip_data_sorted['lower_bound'].head(3).tolist()}")
    raise

# Validate: keep only rows where IP is within range
print("\nValidating merge results...")
print(f"  Rows before validation: {len(merged):,}")
merged_valid = merged[merged['ip_integer'] <= merged['upper_bound']].copy()
print(f"  Rows after validation: {len(merged_valid):,}")

# Add back rows that couldn't be matched
unmatched_indices = fraud_data[~fraud_data.index.isin(merged_valid.index)].index
if len(unmatched_indices) > 0:
    unmatched = fraud_data.loc[unmatched_indices].copy()
    unmatched['country'] = 'Unknown'
    unmatched['lower_bound'] = np.nan
    unmatched['upper_bound'] = np.nan
    
    # Ensure all columns exist in unmatched
    for col in merged_valid.columns:
        if col not in unmatched.columns:
            unmatched[col] = np.nan
    
    # Combine
    merged_final = pd.concat([merged_valid, unmatched], ignore_index=True)
    print(f"  ⚠ Added {len(unmatched):,} unmatched transactions as 'Unknown'")
else:
    merged_final = merged_valid

# Fill any remaining nulls in country
merged_final['country'].fillna('Unknown', inplace=True)

# Statistics
matched_count = (merged_final['country'] != 'Unknown').sum()
unknown_count = (merged_final['country'] == 'Unknown').sum()

print(f"\n✓ Final statistics:")
print(f"  Total transactions: {len(merged_final):,}")
print(f"  Matched to countries: {matched_count:,} ({matched_count/len(merged_final)*100:.2f}%)")
print(f"  Unknown countries: {unknown_count:,} ({unknown_count/len(merged_final)*100:.2f}%)")

# Country-based features
print("\nCreating country-based features...")

# Country fraud rate
country_fraud_rate = merged_final.groupby('country')['class'].mean()
merged_final['country_fraud_rate'] = merged_final['country'].map(country_fraud_rate)

# Country transaction count
country_tx_count = merged_final.groupby('country')['class'].count()
merged_final['country_tx_count'] = merged_final['country'].map(country_tx_count)

# High-risk countries (fraud rate > average)
avg_fraud_rate = merged_final['class'].mean()
high_risk_countries = country_fraud_rate[country_fraud_rate > avg_fraud_rate].index
merged_final['is_high_risk_country'] = merged_final['country'].isin(high_risk_countries).astype(int)

print(f"✓ Country fraud rate feature created")
print(f"✓ Identified {len(high_risk_countries)} high-risk countries")
print(f"  Average fraud rate: {avg_fraud_rate:.4f}")

# Display top risky countries
print("\nTop 10 countries by fraud rate:")
top_risky = country_fraud_rate.sort_values(ascending=False).head(10)
for country, rate in top_risky.items():
    if country != 'Unknown':
        count = country_tx_count[country]
        print(f"  {country}: {rate:.4f} ({count:,} transactions)")

fraud_data = merged_final


1. GEOLOCATION INTEGRATION
--------------------------------------------------------------------------------
Converting IP addresses to integers...
✓ Converted 0 IP addresses
  Data type: float64

Preparing IP lookup table...
  Found columns: lower_bound_ip_address, upper_bound_ip_address
  Using existing numeric values...
✓ IP lookup table prepared
  lower_bound dtype: float64
  upper_bound dtype: int64
  Removed 151112 invalid IPs from fraud data
✓ Valid IP integers: 0

Verifying data types for merge:
  Left key (ip_integer): float64
  Right key (lower_bound): float64

Sorting datasets for merge...

Performing range-based merge (this may take a moment)...
✓ Merge successful!

Validating merge results...
  Rows before validation: 0
  Rows after validation: 0
  ⚠ Added 151,112 unmatched transactions as 'Unknown'

✓ Final statistics:
  Total transactions: 151,112
  Matched to countries: 0 (0.00%)
  Unknown countries: 151,112 (100.00%)

Creating country-based features...
✓ Country fraud 

In [8]:
# ==================== 2. TIME-BASED FEATURES ====================
print("\n2. TIME-BASED FEATURES")
print("-" * 80)

# Basic time features
fraud_data['hour_of_day'] = fraud_data['purchase_time'].dt.hour
fraud_data['day_of_week'] = fraud_data['purchase_time'].dt.dayofweek
fraud_data['day_of_month'] = fraud_data['purchase_time'].dt.day
fraud_data['month'] = fraud_data['purchase_time'].dt.month
fraud_data['year'] = fraud_data['purchase_time'].dt.year

print("✓ Created: hour_of_day, day_of_week, day_of_month, month")

# Time periods
fraud_data['is_night'] = fraud_data['hour_of_day'].between(0, 6).astype(int)
fraud_data['is_morning'] = fraud_data['hour_of_day'].between(6, 12).astype(int)
fraud_data['is_afternoon'] = fraud_data['hour_of_day'].between(12, 18).astype(int)
fraud_data['is_evening'] = fraud_data['hour_of_day'].between(18, 24).astype(int)
fraud_data['is_weekend'] = fraud_data['day_of_week'].isin([5, 6]).astype(int)

print("✓ Created: time period indicators (night, morning, afternoon, evening, weekend)")

# Cyclical encoding for hour and day
fraud_data['hour_sin'] = np.sin(2 * np.pi * fraud_data['hour_of_day'] / 24)
fraud_data['hour_cos'] = np.cos(2 * np.pi * fraud_data['hour_of_day'] / 24)
fraud_data['day_sin'] = np.sin(2 * np.pi * fraud_data['day_of_week'] / 7)
fraud_data['day_cos'] = np.cos(2 * np.pi * fraud_data['day_of_week'] / 7)

print("✓ Created: cyclical time encodings (hour_sin, hour_cos, day_sin, day_cos)")

# Time since signup
fraud_data['time_since_signup_hours'] = (fraud_data['purchase_time'] - 
                                         fraud_data['signup_time']).dt.total_seconds() / 3600
fraud_data['time_since_signup_days'] = fraud_data['time_since_signup_hours'] / 24

print("✓ Created: time_since_signup (hours and days)")

# Account age categories
fraud_data['account_age_category'] = pd.cut(
    fraud_data['time_since_signup_days'],
    bins=[-1, 1, 7, 30, 90, 180, 365, np.inf],
    labels=['<1day', '1-7days', '1-4weeks', '1-3months', '3-6months', '6-12months', '>1year']
)

# Quick purchase flag (purchased within 1 hour of signup)
fraud_data['is_quick_purchase'] = (fraud_data['time_since_signup_hours'] < 1).astype(int)
fraud_data['is_very_new_account'] = (fraud_data['time_since_signup_days'] < 1).astype(int)

print("✓ Created: account age categories and quick purchase flags")


2. TIME-BASED FEATURES
--------------------------------------------------------------------------------
✓ Created: hour_of_day, day_of_week, day_of_month, month
✓ Created: time period indicators (night, morning, afternoon, evening, weekend)
✓ Created: cyclical time encodings (hour_sin, hour_cos, day_sin, day_cos)
✓ Created: time_since_signup (hours and days)
✓ Created: account age categories and quick purchase flags


In [9]:
# ==================== 3. TRANSACTION FREQUENCY & VELOCITY ====================
print("\n3. TRANSACTION FREQUENCY & VELOCITY FEATURES")
print("-" * 80)

# Sort by user and time
fraud_data = fraud_data.sort_values(['user_id', 'purchase_time'])

# Transaction count per user
fraud_data['user_transaction_count'] = fraud_data.groupby('user_id').cumcount() + 1
print("✓ Created: user_transaction_count")

# Time between transactions
fraud_data['time_since_last_tx'] = fraud_data.groupby('user_id')['purchase_time'].diff().dt.total_seconds() / 3600
fraud_data['time_since_last_tx'].fillna(999, inplace=True)  # First transaction

# Rapid transaction flag
fraud_data['is_rapid_tx'] = (fraud_data['time_since_last_tx'] < 1).astype(int)

print("✓ Created: time_since_last_tx, is_rapid_tx")

# Purchase value patterns per user
fraud_data['user_avg_purchase'] = fraud_data.groupby('user_id')['purchase_value'].transform('mean')
fraud_data['user_std_purchase'] = fraud_data.groupby('user_id')['purchase_value'].transform('std')
fraud_data['user_max_purchase'] = fraud_data.groupby('user_id')['purchase_value'].transform('max')
fraud_data['user_min_purchase'] = fraud_data.groupby('user_id')['purchase_value'].transform('min')

# Purchase value ratio
fraud_data['purchase_value_ratio'] = fraud_data['purchase_value'] / (fraud_data['user_avg_purchase'] + 0.01)
fraud_data['is_unusual_amount'] = (abs(fraud_data['purchase_value_ratio']) > 2).astype(int)

print("✓ Created: user purchase statistics and anomaly flags")


3. TRANSACTION FREQUENCY & VELOCITY FEATURES
--------------------------------------------------------------------------------
✓ Created: user_transaction_count
✓ Created: time_since_last_tx, is_rapid_tx
✓ Created: user purchase statistics and anomaly flags


In [10]:
# ==================== 4. DEVICE & BROWSER FEATURES ====================
print("\n4. DEVICE & BROWSER DIVERSITY FEATURES")
print("-" * 80)

# Multiple devices per user
fraud_data['user_device_count'] = fraud_data.groupby('user_id')['device_id'].transform('nunique')
fraud_data['user_browser_count'] = fraud_data.groupby('user_id')['browser'].transform('nunique')
fraud_data['is_multiple_devices'] = (fraud_data['user_device_count'] > 1).astype(int)
fraud_data['is_multiple_browsers'] = (fraud_data['user_browser_count'] > 1).astype(int)

print("✓ Created: device/browser diversity features")

# Device and browser fraud rates
device_fraud_rate = fraud_data.groupby('device_id')['class'].mean()
browser_fraud_rate = fraud_data.groupby('browser')['class'].mean()
source_fraud_rate = fraud_data.groupby('source')['class'].mean()

fraud_data['device_fraud_rate'] = fraud_data['device_id'].map(device_fraud_rate)
fraud_data['browser_fraud_rate'] = fraud_data['browser'].map(browser_fraud_rate)
fraud_data['source_fraud_rate'] = fraud_data['source'].map(source_fraud_rate)

print("✓ Created: device/browser/source fraud rate features")

# ==================== 5. USER BEHAVIOR FEATURES ====================
print("\n5. USER BEHAVIOR FEATURES")
print("-" * 80)

# Multiple countries per user
fraud_data['user_country_count'] = fraud_data.groupby('user_id')['country'].transform('nunique')
fraud_data['is_multi_country'] = (fraud_data['user_country_count'] > 1).astype(int)

# IP changes
fraud_data['user_ip_count'] = fraud_data.groupby('user_id')['ip_address'].transform('nunique')
fraud_data['is_multi_ip'] = (fraud_data['user_ip_count'] > 1).astype(int)

print("✓ Created: geographic diversity features")

# ==================== 6. AGE-BASED FEATURES ====================
print("\n6. AGE-BASED FEATURES")
print("-" * 80)

# Age groups
fraud_data['age_group'] = pd.cut(
    fraud_data['age'],
    bins=[0, 25, 35, 45, 55, 65, 100],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
)

# Age-based risk
age_fraud_rate = fraud_data.groupby('age_group')['class'].mean()
fraud_data['age_group_fraud_rate'] = fraud_data['age_group'].map(age_fraud_rate)

print("✓ Created: age groups and age-based risk scores")



4. DEVICE & BROWSER DIVERSITY FEATURES
--------------------------------------------------------------------------------
✓ Created: device/browser diversity features
✓ Created: device/browser/source fraud rate features

5. USER BEHAVIOR FEATURES
--------------------------------------------------------------------------------
✓ Created: geographic diversity features

6. AGE-BASED FEATURES
--------------------------------------------------------------------------------
✓ Created: age groups and age-based risk scores


In [11]:
# ==================== 7. SUMMARY OF FRAUD FEATURES ====================
print("\n" + "="*80)
print("FRAUD DATA - FEATURE ENGINEERING SUMMARY")
print("="*80)

new_features = [
    # Geolocation
    'country', 'country_fraud_rate', 'country_tx_count', 'is_high_risk_country',
    # Time
    'hour_of_day', 'day_of_week', 'is_night', 'is_morning', 'is_afternoon', 
    'is_evening', 'is_weekend', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
    'time_since_signup_hours', 'time_since_signup_days', 'is_quick_purchase',
    # Transaction patterns
    'user_transaction_count', 'time_since_last_tx', 'is_rapid_tx',
    'user_avg_purchase', 'purchase_value_ratio', 'is_unusual_amount',
    # Device/Browser
    'user_device_count', 'user_browser_count', 'is_multiple_devices',
    'device_fraud_rate', 'browser_fraud_rate', 'source_fraud_rate',
    # User behavior
    'user_country_count', 'is_multi_country', 'user_ip_count', 'is_multi_ip',
    # Age
    'age_group', 'age_group_fraud_rate'
]

print(f"\nTotal new features created: {len(new_features)}")
print(f"Dataset shape: {fraud_data.shape}")
print(f"\nFeature categories:")
print(f"  • Geolocation: 4 features")
print(f"  • Time-based: 15 features")
print(f"  • Transaction patterns: 6 features")
print(f"  • Device/Browser: 6 features")
print(f"  • User behavior: 4 features")
print(f"  • Demographics: 2 features")

# ==================== CREDITCARD DATA FEATURE ENGINEERING ====================
print("\n" + "="*80)
print("PART 2: CREDIT CARD DATA - FEATURE ENGINEERING")
print("="*80)

# The credit card data is already PCA-transformed, so we'll create minimal new features

# ==================== 1. TIME FEATURES ====================
print("\n1. TIME-BASED FEATURES")
print("-" * 80)

# Convert time to hours
creditcard['Time_hours'] = creditcard['Time'] / 3600
creditcard['Time_days'] = creditcard['Time'] / (3600 * 24)

# Time periods (assuming 2-day period)
creditcard['hour_of_day'] = (creditcard['Time_hours'] % 24).astype(int)
creditcard['day_num'] = (creditcard['Time_days']).astype(int)

print("✓ Created: Time_hours, Time_days, hour_of_day, day_num")

# Time categories
creditcard['is_night'] = creditcard['hour_of_day'].between(0, 6).astype(int)
creditcard['is_morning'] = creditcard['hour_of_day'].between(6, 12).astype(int)
creditcard['is_afternoon'] = creditcard['hour_of_day'].between(12, 18).astype(int)
creditcard['is_evening'] = creditcard['hour_of_day'].between(18, 24).astype(int)

print("✓ Created: time period indicators")

# Cyclical encoding
creditcard['hour_sin'] = np.sin(2 * np.pi * creditcard['hour_of_day'] / 24)
creditcard['hour_cos'] = np.cos(2 * np.pi * creditcard['hour_of_day'] / 24)

print("✓ Created: cyclical time encodings")

# ==================== 2. AMOUNT FEATURES ====================
print("\n2. AMOUNT-BASED FEATURES")
print("-" * 80)

# Amount categories
creditcard['amount_category'] = pd.cut(
    creditcard['Amount'],
    bins=[-0.01, 10, 50, 100, 500, 1000, 10000, np.inf],
    labels=['<10', '10-50', '50-100', '100-500', '500-1K', '1K-10K', '>10K']
)

# Log amount (for high-value transactions)
creditcard['log_amount'] = np.log1p(creditcard['Amount'])

# Amount percentiles
creditcard['amount_percentile'] = creditcard['Amount'].rank(pct=True)

# High-value flag
amount_95 = creditcard['Amount'].quantile(0.95)
creditcard['is_high_value'] = (creditcard['Amount'] > amount_95).astype(int)

print("✓ Created: amount categories, log_amount, percentiles")

# ==================== 3. V-FEATURE INTERACTIONS ====================
print("\n3. V-FEATURE INTERACTIONS")
print("-" * 80)

# Key features identified from correlation analysis
key_features = ['V4', 'V11', 'V12', 'V14', 'V17']

# Create interaction terms for highly correlated features
creditcard['V4_V11'] = creditcard['V4'] * creditcard['V11']
creditcard['V4_V12'] = creditcard['V4'] * creditcard['V12']
creditcard['V12_V14'] = creditcard['V12'] * creditcard['V14']

print("✓ Created: V-feature interaction terms")


FRAUD DATA - FEATURE ENGINEERING SUMMARY

Total new features created: 36
Dataset shape: (151112, 60)

Feature categories:
  • Geolocation: 4 features
  • Time-based: 15 features
  • Transaction patterns: 6 features
  • Device/Browser: 6 features
  • User behavior: 4 features
  • Demographics: 2 features

PART 2: CREDIT CARD DATA - FEATURE ENGINEERING

1. TIME-BASED FEATURES
--------------------------------------------------------------------------------
✓ Created: Time_hours, Time_days, hour_of_day, day_num
✓ Created: time period indicators
✓ Created: cyclical time encodings

2. AMOUNT-BASED FEATURES
--------------------------------------------------------------------------------
✓ Created: amount categories, log_amount, percentiles

3. V-FEATURE INTERACTIONS
--------------------------------------------------------------------------------
✓ Created: V-feature interaction terms


In [12]:
# ==================== 4. AGGREGATE FEATURES ====================
print("\n4. AGGREGATE V-FEATURES")
print("-" * 80)

# Get all V columns
v_cols = [col for col in creditcard.columns if col.startswith('V') and len(col) <= 3]

# Statistical aggregations
creditcard['V_mean'] = creditcard[v_cols].mean(axis=1)
creditcard['V_std'] = creditcard[v_cols].std(axis=1)
creditcard['V_max'] = creditcard[v_cols].max(axis=1)
creditcard['V_min'] = creditcard[v_cols].min(axis=1)
creditcard['V_range'] = creditcard['V_max'] - creditcard['V_min']

print("✓ Created: V-feature statistical aggregations")

# ==================== 5. SUMMARY OF CREDIT CARD FEATURES ====================
print("\n" + "="*80)
print("CREDIT CARD DATA - FEATURE ENGINEERING SUMMARY")
print("="*80)

new_cc_features = [
    'Time_hours', 'Time_days', 'hour_of_day', 'is_night', 'is_morning',
    'hour_sin', 'hour_cos', 'log_amount', 'amount_percentile', 'is_high_value',
    'V4_V11', 'V4_V12', 'V12_V14', 'V_mean', 'V_std', 'V_range'
]

print(f"\nTotal new features created: {len(new_cc_features)}")
print(f"Dataset shape: {creditcard.shape}")
print(f"\nFeature categories:")
print(f"  • Time-based: 7 features")
print(f"  • Amount-based: 3 features")
print(f"  • V-interactions: 3 features")
print(f"  • V-aggregations: 3 features")



4. AGGREGATE V-FEATURES
--------------------------------------------------------------------------------
✓ Created: V-feature statistical aggregations

CREDIT CARD DATA - FEATURE ENGINEERING SUMMARY

Total new features created: 16
Dataset shape: (284807, 54)

Feature categories:
  • Time-based: 7 features
  • Amount-based: 3 features
  • V-interactions: 3 features
  • V-aggregations: 3 features


In [13]:
# ==================== SAVE ENGINEERED DATA ====================
print("\n" + "="*80)
print("SAVING ENGINEERED DATASETS")
print("="*80)

# Save fraud data
fraud_data.to_csv('../data/processed/fraud_data_engineered.csv', index=False)
print(f"✓ Saved: fraud_data_engineered.csv ({fraud_data.shape})")

# Save credit card data
creditcard.to_csv('../data/processed/creditcard_engineered.csv', index=False)
print(f"✓ Saved: creditcard_engineered.csv ({creditcard.shape})")


SAVING ENGINEERED DATASETS
✓ Saved: fraud_data_engineered.csv ((151112, 60))
✓ Saved: creditcard_engineered.csv ((284807, 54))


In [14]:
# ==================== FEATURE IMPORTANCE PREVIEW ====================
print("\n" + "="*80)
print("FEATURE IMPORTANCE PREVIEW")
print("="*80)

print("\nFraud Data - Top Features by Correlation with Target:")
fraud_numerical = fraud_data.select_dtypes(include=[np.number])
if 'class' in fraud_numerical.columns:
    fraud_corr = fraud_numerical.corr()['class'].abs().sort_values(ascending=False)
    print(fraud_corr.head(15))

print("\nCredit Card Data - Top Features by Correlation with Target:")
cc_numerical = creditcard.select_dtypes(include=[np.number])
if 'Class' in cc_numerical.columns:
    cc_corr = cc_numerical.corr()['Class'].abs().sort_values(ascending=False)
    print(cc_corr.head(15))

print("\n" + "="*80)
print("FEATURE ENGINEERING COMPLETE!")
print("="*80)
print("\n✓ Ready for data transformation and modeling")
print("✓ Next step: Data preprocessing (scaling, encoding, train-test split)")


FEATURE IMPORTANCE PREVIEW

Fraud Data - Top Features by Correlation with Target:
class                      1.000000
device_fraud_rate          0.932580
is_quick_purchase          0.714120
is_very_new_account        0.663749
month                      0.310112
time_since_signup_hours    0.257888
time_since_signup_days     0.257888
day_of_month               0.160319
day_sin                    0.026106
source_fraud_rate          0.020728
day_of_week                0.018939
browser_fraud_rate         0.017171
is_weekend                 0.014139
hour_cos                   0.010131
day_cos                    0.008301
Name: class, dtype: float64

Credit Card Data - Top Features by Correlation with Target:
Class      1.000000
V12_V14    0.582719
V4_V12     0.537021
V4_V11     0.445713
V17        0.326481
V_mean     0.316330
V14        0.302544
V12        0.260593
V_std      0.250839
V_min      0.220023
V10        0.216883
V_range    0.202200
V16        0.196539
V3         0.192961
V7      