In [1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipaddress
from datetime import datetime, timedelta
import os

# Notebook-wide display and plotting configuration.
# Let pandas show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Apply the matplotlib style sheet named 'seaborn-v0_8' to all figures.
plt.style.use('seaborn-v0_8')
# Use seaborn's 'husl' palette as the default colour palette.
# NOTE(review): presumably set after the style sheet so the palette is not
# overridden by it — confirm before reordering these two calls.
sns.set_palette('husl')

In [2]:
# Read the three cleaned CSV files from the processed-data folder
fraud_clean = pd.read_csv('../../data/processed/Fraud_Data_clean.csv')
ip_map_clean = pd.read_csv('../../data/processed/IpAddress_to_Country_clean.csv')
credit_clean = pd.read_csv('../../data/processed/creditcard_clean.csv')

# Report (rows, columns) for each loaded table
for shape_label, loaded_frame in (
    ('Fraud_Data clean shape:', fraud_clean),
    ('IpAddress_to_Country clean shape:', ip_map_clean),
    ('creditcard clean shape:', credit_clean),
):
    print(shape_label, loaded_frame.shape)

# Re-parse the timestamp columns into datetime values after the CSV round trip
for time_col in ('signup_time', 'purchase_time'):
    fraud_clean[time_col] = pd.to_datetime(fraud_clean[time_col])

Fraud_Data clean shape: (151112, 11)
IpAddress_to_Country clean shape: (138846, 3)
creditcard clean shape: (283726, 31)


In [3]:
def ip_to_int(ip_str):
    """Convert an IPv4 address to its integer value.

    Two representations are accepted:

    * dotted-quad text such as ``'192.168.0.1'`` (or anything else
      ``ipaddress.IPv4Address`` understands), and
    * an address that is already stored as a number, e.g. ``16778860``,
      ``16778860.0`` or the text ``'16778860.0'``. Fractional parts are
      truncated toward zero.

    Parameters
    ----------
    ip_str : str, int or float
        The address to convert.

    Returns
    -------
    int or None
        The address as an integer in ``[0, 2**32 - 1]``, or ``None`` when the
        value cannot be interpreted as an IPv4 address (including NaN/None).
    """
    try:
        return int(ipaddress.IPv4Address(ip_str))
    except (ValueError, TypeError):
        # AddressValueError is a ValueError: the input is not dotted-quad.
        # Fall through and try the numeric representation instead of
        # swallowing every exception with a bare ``except``.
        pass

    try:
        numeric_ip = float(ip_str)
    except (ValueError, TypeError, OverflowError):
        return None

    # NaN and +/-inf fail this comparison, so they are rejected here as well.
    if not 0 <= numeric_ip <= 0xFFFFFFFF:
        return None
    return int(numeric_ip)

# Fraud data: stringify every address first, then convert it to an integer
fraud_ip_text = fraud_clean['ip_address'].astype(str)
fraud_clean['ip_int'] = fraud_ip_text.apply(ip_to_int)

# IP mapping: convert the lower and upper range bounds the same way
for int_col, raw_col in (
    ('lower_bound_int', 'lower_bound_ip_address'),
    ('upper_bound_int', 'upper_bound_ip_address'),
):
    ip_map_clean[int_col] = ip_map_clean[raw_col].apply(ip_to_int)

print('IP conversion completed')

IP conversion completed


In [None]:
import pandas as pd
import numpy as np

# Re-read the processed fraud and IP-mapping tables from disk
fraud_clean = pd.read_csv('../../data/processed/Fraud_Data_clean.csv')
ip_map_clean = pd.read_csv('../../data/processed/IpAddress_to_Country_clean.csv')

# Coerce the IP columns to numeric dtype; values that cannot be parsed become NaN
fraud_clean = fraud_clean.assign(
    ip_int=lambda frame: pd.to_numeric(frame['ip_address'], errors='coerce'),
)
ip_map_clean = ip_map_clean.assign(
    lower_bound_int=lambda frame: pd.to_numeric(frame['lower_bound_ip_address'], errors='coerce'),
    upper_bound_int=lambda frame: pd.to_numeric(frame['upper_bound_ip_address'], errors='coerce'),
)

# Overall span covered by the mapping table: smallest lower bound to largest upper bound
min_valid_ip = ip_map_clean['lower_bound_int'].min()
max_valid_ip = ip_map_clean['upper_bound_int'].max()

print(f'Valid IP range: {min_valid_ip} to {max_valid_ip}')

# Keep only fraud rows whose IP lies inside that span (both ends inclusive;
# NaN never satisfies the comparison, so such rows are dropped here too)
ip_in_span = fraud_clean['ip_int'].between(min_valid_ip, max_valid_ip)
fraud_filtered = fraud_clean.loc[ip_in_span].copy()

print(f'Original fraud data: {fraud_clean.shape[0]} records')
print(f'Filtered fraud data: {fraud_filtered.shape[0]} records')
print(f'Records outside IP range: {fraud_clean.shape[0] - fraud_filtered.shape[0]}')

# Discard rows that still carry NaN in any of the numeric IP columns
fraud_filtered = fraud_filtered[fraud_filtered['ip_int'].notna()]
bounds_present = ip_map_clean[['lower_bound_int', 'upper_bound_int']].notna().all(axis=1)
ip_map_clean = ip_map_clean[bounds_present]

print('IP conversion and filtering completed')

Valid IP range: 16777216.0 to 3758096383
Original fraud data: 151112 records
Filtered fraud data: 131095 records
Records outside IP range: 20017
IP conversion and filtering completed


In [None]:
# Order each frame by the key that merge_asof will join on
fraud_sorted = fraud_filtered.sort_values(by='ip_int')
ip_sorted = ip_map_clean.sort_values(by='lower_bound_int')

# Show the key dtypes on both sides before merging
print('Data types before merge:')
print('fraud_sorted ip_int dtype:', fraud_sorted['ip_int'].dtype)
print('ip_sorted lower_bound_int dtype:', ip_sorted['lower_bound_int'].dtype)

# Backward as-of join: each fraud row picks up the mapping row with the
# largest lower bound that does not exceed its IP
range_columns = ['lower_bound_int', 'upper_bound_int', 'country']
fraud_with_country = pd.merge_asof(
    left=fraud_sorted,
    right=ip_sorted[range_columns],
    left_on='ip_int',
    right_on='lower_bound_int',
    direction='backward',
)

# The as-of join only checks the lower bound, so also require the IP to sit
# at or below the matched upper bound
matched_ip = fraud_with_country['ip_int']
inside_range = (
    matched_ip.ge(fraud_with_country['lower_bound_int'])
    & matched_ip.le(fraud_with_country['upper_bound_int'])
)
fraud_with_country = fraud_with_country[inside_range]

print('Merged dataset shape:', fraud_with_country.shape)
print('Countries found:', fraud_with_country['country'].nunique())
print('Sample merged data:')
preview_columns = ['user_id', 'ip_address', 'country', 'class']
display(fraud_with_country[preview_columns].head())

Data types before merge:
fraud_sorted ip_int dtype: float64
ip_sorted lower_bound_int dtype: float64
Merged dataset shape: (129146, 15)
Countries found: 181
Sample merged data:


Unnamed: 0,user_id,ip_address,country,class
0,247547,16778860.0,Australia,0
1,220737,16842050.0,Thailand,0
2,390400,16843660.0,China,0
3,69592,16938730.0,China,0
4,174987,16971980.0,Thailand,0


In [14]:
# Cell 5: Feature Engineering
import pandas as pd
from datetime import datetime

# Work on a copy so the merged frame itself is left untouched
fraud_with_features = fraud_with_country.copy()

# 1. Time-based features
# Make sure both timestamp columns are datetime values
for stamp_col in ('signup_time', 'purchase_time'):
    fraud_with_features[stamp_col] = pd.to_datetime(fraud_with_features[stamp_col])

purchase_stamp = fraud_with_features['purchase_time']
signup_to_purchase = purchase_stamp - fraud_with_features['signup_time']

# Elapsed time between signup and purchase, expressed in hours
fraud_with_features['time_since_signup'] = signup_to_purchase.dt.total_seconds().div(3600)
# Hour of the purchase timestamp
fraud_with_features['hour_of_day'] = purchase_stamp.dt.hour
# Day-of-week number of the purchase timestamp
fraud_with_features['day_of_week'] = purchase_stamp.dt.dayofweek

# 2. Transaction velocity features
# Number of rows per user_id, joined back onto every row of that user
user_transaction_counts = (
    fraud_with_features.groupby('user_id').size().rename('user_transaction_count').reset_index()
)
fraud_with_features = pd.merge(fraud_with_features, user_transaction_counts, how='left', on='user_id')

# 3. Device and IP usage patterns
# Number of rows per device_id
device_counts = (
    fraud_with_features.groupby('device_id').size().rename('device_usage_count').reset_index()
)
fraud_with_features = pd.merge(fraud_with_features, device_counts, how='left', on='device_id')

# Number of rows per ip_address
ip_counts = (
    fraud_with_features.groupby('ip_address').size().rename('ip_usage_count').reset_index()
)
fraud_with_features = pd.merge(fraud_with_features, ip_counts, how='left', on='ip_address')

# 4. Country-based features
# Number of rows per country
country_counts = (
    fraud_with_features.groupby('country').size().rename('country_transaction_count').reset_index()
)
fraud_with_features = pd.merge(fraud_with_features, country_counts, how='left', on='country')

# Summary of what this cell added
summary_lines = (
    'Feature engineering completed',
    'New features added:',
    '- time_since_signup',
    '- hour_of_day',
    '- day_of_week',
    '- user_transaction_count',
    '- device_usage_count',
    '- ip_usage_count',
    '- country_transaction_count',
)
for summary_line in summary_lines:
    print(summary_line)

print(f'Final dataset shape: {fraud_with_features.shape}')

Feature engineering completed
New features added:
- time_since_signup
- hour_of_day
- day_of_week
- user_transaction_count
- device_usage_count
- ip_usage_count
- country_transaction_count
Final dataset shape: (129146, 23)


In [15]:
# Cell 6: Data Transformation
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# 1. Categorical encoding
# Each listed column that is present gets its own LabelEncoder; the fitted
# encoder is kept in label_encoders under the original column name
categorical_columns = ['source', 'browser', 'sex', 'country']
label_encoders = {}

for col in categorical_columns:
    if col not in fraud_with_features.columns:
        continue
    le = LabelEncoder()
    values_as_text = fraud_with_features[col].astype(str)
    fraud_with_features[f'{col}_encoded'] = le.fit_transform(values_as_text)
    label_encoders[col] = le

# 2. Select features for modeling
feature_columns = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'day_of_week',
    'user_transaction_count',
    'device_usage_count',
    'ip_usage_count',
    'country_transaction_count',
    'source_encoded',
    'browser_encoded',
    'sex_encoded',
    'country_encoded',
]

# Drop every row that is missing a feature value or the target
required_columns = feature_columns + ['class']
fraud_with_features = fraud_with_features.dropna(subset=required_columns)

# 3. Prepare X and y
X, y = fraud_with_features[feature_columns], fraud_with_features['class']

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')
print(f'Class distribution:')
print(y.value_counts(normalize=True))

# 4. Train-test split: 20% held out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5. Scaling: fit the scaler on the training split only, then apply it to both
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set shape: {X_train_scaled.shape}')
print(f'Test set shape: {X_test_scaled.shape}')

Features shape: (129146, 13)
Target shape: (129146,)
Class distribution:
class
0    0.905007
1    0.094993
Name: proportion, dtype: float64
Training set shape: (103316, 13)
Test set shape: (25830, 13)


In [16]:
# Cell 7: Handle Class Imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Baseline: class proportions of the untouched training labels
print(
    'Original training set class distribution:',
    pd.Series(y_train).value_counts(normalize=True),
    sep='\n',
)

# Both resamplers use the same fixed seed
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)

# Option 1: SMOTE (Synthetic Minority Over-sampling Technique)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(
    '\nAfter SMOTE:',
    pd.Series(y_train_smote).value_counts(normalize=True),
    sep='\n',
)

# Option 2: Random Undersampling (for comparison)
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

print(
    '\nAfter Random Undersampling:',
    pd.Series(y_train_rus).value_counts(normalize=True),
    sep='\n',
)

# Write the engineered feature frame to disk without the index column
fraud_with_features.to_csv(
    path_or_buf='../../data/processed/fraud_data_with_features.csv',
    index=False,
)
print('\nProcessed data saved to: ../../data/processed/fraud_data_with_features.csv')

Original training set class distribution:
class
0    0.90501
1    0.09499
Name: proportion, dtype: float64

After SMOTE:
class
0    0.5
1    0.5
Name: proportion, dtype: float64

After Random Undersampling:
class
0    0.5
1    0.5
Name: proportion, dtype: float64

Processed data saved to: ../../data/processed/fraud_data_with_features.csv
