In [1]:
# Import libraries and configure environment
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Proportions of the dataset assigned to the train / validation / test splits
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.60, 0.20, 0.20

# Parquet file read by this notebook
INPUT_FILE = '../data/processed/transactions_enriched.parquet'

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None
print("Environment setup complete.")

Environment setup complete.


In [2]:
# Read the transaction table and make sure `timestamp` has a datetime dtype
df = pd.read_parquet(INPUT_FILE)

# Parse the column only when it did not already load as a datetime
timestamp_is_datetime = pd.api.types.is_datetime64_any_dtype(df['timestamp'])
if not timestamp_is_datetime:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

n_rows_cols = df.shape
timestamp_dtype = df['timestamp'].dtype
print(f"Data Shape: {n_rows_cols}")
print(f"Timestamp Type: {timestamp_dtype}")
df.head()

Data Shape: (284807, 46)
Timestamp Type: datetime64[ns]


Unnamed: 0,transaction_id,user_id,merchant_id,timestamp,card_network,card_issuer,card_tier,credit_limit,card_age,amount_inr,merchant_category,merchant_city,merchant_state,transaction_channel,entry_mode,is_international,is_recurring,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,is_fraud
0,txn_a4e5b0c4f0d0,user_00013,merchant_00005,2023-09-02 00:00:00,Visa,Axis Bank,Gold,187000,2,13166.56,Dining,Kochi,Kerala,POS,Chip,0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,txn_a0c090d4a0b9,user_00001,merchant_00001,2023-09-02 00:00:00,Visa,Axis Bank,Platinum,646000,15,236.72,Fuel,Pune,Maharashtra,POS,Tap,0,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,txn_e19e53312f8c,user_00001,merchant_00001,2023-09-02 00:00:01,Visa,ICICI Bank,Platinum,743000,39,33322.08,Hospital,Nagpur,Maharashtra,POS,Chip,0,0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,txn_3c590c659e28,user_00001,merchant_00060,2023-09-02 00:00:01,RuPay,Axis Bank,Classic,55000,39,10868.0,Fashion,Kolkata,West Bengal,POS,Chip,0,0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,txn_88b8192c985c,user_00035,merchant_00003,2023-09-02 00:00:02,RuPay,Axis Bank,Classic,46000,33,6159.12,Dining,Patna,Bihar,Online,CVC,0,0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [3]:
# Sort data chronologically to prevent time leakage.
# The timestamps contain repeated values, and the later split is purely
# positional, so the order of tied rows decides which split they land in.
# kind='stable' keeps tied rows in their original file order; the default
# sort algorithm makes no such guarantee.
df = df.sort_values('timestamp', ascending=True, kind='stable').reset_index(drop=True)

print(f"Start Date: {df['timestamp'].min()}")
print(f"End Date:   {df['timestamp'].max()}")
print("Data sorted by timestamp.")

Start Date: 2023-09-02 00:00:00
End Date:   2023-09-03 23:59:52
Data sorted by timestamp.


In [4]:
# Split data into train, validation and test sets based on time.
# The split is positional: the first TRAIN_RATIO of rows go to train, the next
# VAL_RATIO to validation and the remainder to test, so `df` must already be
# ordered by timestamp.

# TEST_RATIO is never used to compute a boundary (test takes whatever is left),
# so check that the three configured ratios are consistent before slicing.
assert min(TRAIN_RATIO, VAL_RATIO, TEST_RATIO) > 0, \
    "ERROR: Split ratios must all be positive!"
assert np.isclose(TRAIN_RATIO + VAL_RATIO + TEST_RATIO, 1.0), \
    "ERROR: TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1!"

n = len(df)
train_end = int(n * TRAIN_RATIO)
val_end = int(n * (TRAIN_RATIO + VAL_RATIO))

# Perform splits (copies, so later feature columns do not write back into df)
train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

# Fail early with a clear message instead of producing an empty split
assert len(train_df) > 0 and len(val_df) > 0 and len(test_df) > 0, \
    "ERROR: At least one split is empty; check the input size and split ratios!"

# Validation checks
print("="*60)
print("DATASET SPLIT SUMMARY")
print("="*60)
print(f"{'Set':<12} {'Samples':>10} {'Percentage':>12} {'Date Range'}")
print("-"*60)
print(f"{'Training':<12} {len(train_df):>10,} {len(train_df)/n*100:>11.1f}%  {train_df['timestamp'].min().date()} → {train_df['timestamp'].max().date()}")
print(f"{'Validation':<12} {len(val_df):>10,} {len(val_df)/n*100:>11.1f}%  {val_df['timestamp'].min().date()} → {val_df['timestamp'].max().date()}")
print(f"{'Test':<12} {len(test_df):>10,} {len(test_df)/n*100:>11.1f}%  {test_df['timestamp'].min().date()} → {test_df['timestamp'].max().date()}")
print("="*60)

# Time leakage assertions.
# `<=` tolerates rows that share the exact same timestamp on both sides of a
# boundary; it only rejects a later split starting before an earlier one ends.
assert train_df['timestamp'].max() <= val_df['timestamp'].min(), \
    "ERROR: Time leakage between train and validation!"
assert val_df['timestamp'].max() <= test_df['timestamp'].min(), \
    "ERROR: Time leakage between validation and test!"

print("✓ No time leakage detected.")

# Check fraud distribution across splits
if 'is_fraud' in df.columns:
    print("\nFraud Rate by Split:")
    print(f"  Training:   {train_df['is_fraud'].mean()*100:.3f}%")
    print(f"  Validation: {val_df['is_fraud'].mean()*100:.3f}%")
    print(f"  Test:       {test_df['is_fraud'].mean()*100:.3f}%")

DATASET SPLIT SUMMARY
Set             Samples   Percentage Date Range
------------------------------------------------------------
Training        170,884        60.0%  2023-09-02 → 2023-09-03
Validation       56,961        20.0%  2023-09-03 → 2023-09-03
Test             56,962        20.0%  2023-09-03 → 2023-09-03
✓ No time leakage detected.

Fraud Rate by Split:
  Training:   0.211%
  Validation: 0.100%
  Test:       0.132%


In [5]:
# Derive calendar features from `timestamp` on every split
for split in (train_df, val_df, test_df):
    ts = split['timestamp'].dt
    split['hour'] = ts.hour
    split['day_of_week'] = ts.dayofweek
    # 1 when the transaction hour is 2, 3 or 4, otherwise 0
    split['is_night'] = split['hour'].between(2, 4).astype(int)

print("Time features extracted.")
print("\nNight Transaction Distribution (Train):")
night_share = train_df['is_night'].value_counts(normalize=True)
print(night_share)

Time features extracted.

Night Transaction Distribution (Train):
is_night
0    0.947163
1    0.052837
Name: proportion, dtype: float64


In [6]:
# Add `log_amount` = log(1 + amount_inr) to every split
for split in (train_df, val_df, test_df):
    amounts = split['amount_inr']
    split['log_amount'] = np.log1p(amounts)

# Report the effect of the transform on the training split
train_max_amount = train_df['amount_inr'].max()
train_max_log_amount = train_df['log_amount'].max()

print("Log transformation applied to amount_inr.")
print(f"Original Max Amount: {train_max_amount:,.2f}")
print(f"Log Max Amount:      {train_max_log_amount:.4f}")
print("\nSample transformation:")
print(train_df[['amount_inr', 'log_amount']].head())

Log transformation applied to amount_inr.
Original Max Amount: 1,729,774.64
Log Max Amount:      14.3635

Sample transformation:
   amount_inr  log_amount
0    13166.56    9.485512
1      236.72    5.471094
2    33322.08   10.414006
3    10868.00    9.293670
4     6159.12    8.725852


In [7]:
# Flag transactions that are both international and made through the Online channel
for split in (train_df, val_df, test_df):
    is_intl = split['is_international'].eq(1)
    is_online = split['transaction_channel'].eq('Online')
    split['interaction_intl_online'] = (is_intl & is_online).astype(int)

# Prevalence of the flag in the training split
high_risk_count = train_df['interaction_intl_online'].sum()
high_risk_pct = 100 * high_risk_count / len(train_df)

print("Interaction feature created: international + online transactions")
print(f"Count in training set: {high_risk_count} ({high_risk_pct:.2f}%)")

Interaction feature created: international + online transactions
Count in training set: 1309 (0.77%)


In [8]:
# Target encode merchant category using fraud rates computed from training data only.
# NOTE(review): the map is an unsmoothed per-category mean, and training rows are
# encoded with rates derived from their own labels. Consider smoothing or
# out-of-fold encoding if a downstream model overfits this feature.
global_fraud_rate = train_df['is_fraud'].mean()
category_fraud_map = train_df.groupby('merchant_category')['is_fraud'].mean()

# Apply the same map and the same fallback to all three splits. Any category
# with no entry in the map falls back to the global training fraud rate, so no
# split (training included) can be left with NaN in the encoded column.
for dataset in [train_df, val_df, test_df]:
    dataset['merchant_category_encoded'] = (
        dataset['merchant_category'].map(category_fraud_map).fillna(global_fraud_rate)
    )

print("Target encoding applied to merchant_category")
print(f"Global fraud rate (baseline): {global_fraud_rate:.5f}")
print("\nTop 3 risky merchant categories:")
print(category_fraud_map.nlargest(3))
print("\nMissing values after encoding:")
print(f"  Train: {train_df['merchant_category_encoded'].isna().sum()}")
print(f"  Val:   {val_df['merchant_category_encoded'].isna().sum()}")
print(f"  Test:  {test_df['merchant_category_encoded'].isna().sum()}")

Target encoding applied to merchant_category
Global fraud rate (baseline): 0.00211

Top 3 risky merchant categories:
merchant_category
Fast Food    0.005256
Jewelry      0.004872
Grocery      0.004073
Name: is_fraud, dtype: float64

Missing values after encoding:
  Train: 0
  Val:   0
  Test:  0


In [9]:
# One-hot encode categorical features.
# Each split is encoded independently, so a split only gets indicator columns
# for the category values it actually contains.
dummy_kwargs = dict(columns=['entry_mode', 'card_network'],
                    prefix=['mode', 'net'], drop_first=False)
train_df = pd.get_dummies(train_df, **dummy_kwargs)
val_df = pd.get_dummies(val_df, **dummy_kwargs)
test_df = pd.get_dummies(test_df, **dummy_kwargs)

# Align BOTH validation and test to the training columns: indicator columns
# missing from a split are added filled with 0, and columns that do not exist
# in training are dropped. Training defines the schema.
train_columns = train_df.columns.tolist()
val_df = val_df.reindex(columns=train_columns, fill_value=0)
test_df = test_df.reindex(columns=train_columns, fill_value=0)

print("One-hot encoding applied to entry_mode and card_network")
print(f"Train columns: {len(train_df.columns)}")
print(f"Val columns:  {len(val_df.columns)}")
print(f"Test columns:  {len(test_df.columns)}")

One-hot encoding applied to entry_mode and card_network
Train columns: 58
Val columns:  58
Test columns:  58


In [10]:
# Drop redundant and high-cardinality columns from every split
candidate_columns = [
    'merchant_category',
    'merchant_city',
    'merchant_state',
    'card_issuer',
    'first_name',
    'last_name',
    'date',
    'date_hour'
]

# Keep only the candidates that are actually present in the training frame
present_in_train = set(train_df.columns)
columns_to_remove = [name for name in candidate_columns if name in present_in_train]

for split in (train_df, val_df, test_df):
    split.drop(columns=columns_to_remove, inplace=True)

print(f"Removed {len(columns_to_remove)} redundant columns : {columns_to_remove}")

Removed 4 redundant columns : ['merchant_category', 'merchant_city', 'merchant_state', 'card_issuer']


In [11]:
# Define final feature set for model training: every training column except
# identifiers, the raw timestamp and the target.
excluded_columns = ['transaction_id', 'user_id', 'merchant_id', 'timestamp', 'is_fraud']
feature_columns = [col for col in train_df.columns if col not in excluded_columns]

print(f"Final feature set contains {len(feature_columns)} features:")
print(feature_columns)
print("\nExcluded columns (metadata and target):")
print(excluded_columns)

# Surface any feature that is neither numeric nor boolean, so raw categorical
# columns are not handed to the training step unnoticed.
non_numeric_features = (
    train_df[feature_columns]
    .select_dtypes(exclude=['number', 'bool'])
    .columns.tolist()
)
if non_numeric_features:
    print(f"\nWARNING: {len(non_numeric_features)} non-numeric feature(s) remain; "
          f"encode them or use a model with native categorical support: {non_numeric_features}")

Final feature set contains 49 features:
['card_tier', 'credit_limit', 'card_age', 'amount_inr', 'transaction_channel', 'is_international', 'is_recurring', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'hour', 'day_of_week', 'is_night', 'log_amount', 'interaction_intl_online', 'merchant_category_encoded', 'mode_CVC', 'mode_Chip', 'mode_Swipe', 'mode_Tap', 'net_Amex', 'net_Mastercard', 'net_RuPay', 'net_Visa']

Excluded columns (metadata and target):
['transaction_id', 'user_id', 'merchant_id', 'timestamp', 'is_fraud']


In [12]:
# Write each processed split to its own parquet file, without the index
outputs = [
    (train_df, '../data/processed/train_processed.parquet'),
    (val_df, '../data/processed/val_processed.parquet'),
    (test_df, '../data/processed/test_processed.parquet'),
]
for frame, destination in outputs:
    frame.to_parquet(destination, index=False)

for message in (
    "Data processing complete.",
    "Saved: train_processed.parquet",
    "Saved: val_processed.parquet",
    "Saved: test_processed.parquet",
    "Ready for model training phase.",
):
    print(message)

Data processing complete.
Saved: train_processed.parquet
Saved: val_processed.parquet
Saved: test_processed.parquet
Ready for model training phase.
