In [None]:
# Libraries & Setup
import os
import uuid
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global generator so the np.random.* draws in later cells repeat
# across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Display settings: never truncate columns, cap printed rows at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Libraries imported successfully
Pandas version: 2.3.3
NumPy version: 2.3.5


In [None]:
# Load Data
# Read the raw CSV into `df`; the following cells add columns to this frame.
df = pd.read_csv('../data/raw/creditcard.csv')

# Compute the summary figures once, then report them.
banner = "=" * 60
column_names = df.columns.tolist()
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
class_label = df['Class']
fraud_pct = class_label.mean() * 100
fraud_count = class_label.sum()
legit_count = (class_label == 0).sum()

print(banner)
print("ORIGINAL KAGGLE DATASET")
print(banner)
print(f"Shape: {df.shape}")
print(f"Columns ({len(column_names)}): {column_names}")
print(f"Memory usage: {memory_mb:.2f} MB")
print(f"Fraud rate: {fraud_pct:.3f}%")
print(f"Fraud transactions: {fraud_count:,}")
print(f"Legitimate transactions: {legit_count:,}")
print(f"\nFirst 3 rows:\n{df.head(3)}")

ORIGINAL KAGGLE DATASET
Shape: (284807, 31)
Columns (31): ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
Memory usage: 67.36 MB
Fraud rate: 0.173%
Fraud transactions: 492
Legitimate transactions: 284,315

First 3 rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   

         V8        V9       V10       V11       V12       V13       V14  \
0  0.098698  0.363787  0.090794 -0.551600 -0.617801 -0.991390 -0.311169   
1  0.085102 -0.255425 -0.166974  1.612727  1.065235  0.489095 -0.143772   
2  0.247676 -1.514654  0.207643  0.624501  0.066084  0.717293 -

In [None]:
# Transaction IDs
# Each ID is 'txn_' followed by 12 hex characters (48 random bits).
# The bits come from a dedicated generator seeded with RANDOM_SEED, so the IDs
# are identical on every run (uuid.uuid4() ignores the NumPy seed) and drawing
# them does not advance the global np.random stream used by later cells.
id_rng = np.random.default_rng(RANDOM_SEED)
txn_codes = id_rng.integers(0, 1 << 48, size=len(df), dtype=np.uint64)
df['transaction_id'] = [f'txn_{code:012x}' for code in txn_codes.tolist()]

# 48 bits makes a collision unlikely but not impossible; fail loudly rather
# than silently emit duplicate keys.
assert df['transaction_id'].is_unique, "transaction_id collision: widen the ID or change RANDOM_SEED"

print("Transaction IDs generated")
print(f"Sample IDs: {df['transaction_id'].head(3).tolist()}")
print(f"Unique IDs: {df['transaction_id'].nunique():,}")

Transaction IDs generated
Sample IDs: ['txn_66741a9edac3', 'txn_c60411a5aefd', 'txn_6074fbafebe0']
Unique IDs: 284,807


In [None]:
# User IDs with Power-Law Distribution
# Draw one Zipf(a=1.5) index per transaction and limit it to 1..10000.
# Note: np.clip maps every draw above 10000 onto 10000 itself, so 'user_10000'
# receives the entire upper tail of the distribution.
user_indices = np.clip(np.random.zipf(a=1.5, size=len(df)), 1, 10000)
df['user_id'] = pd.Series(user_indices, index=df.index).map('user_{:05d}'.format)

# Transactions per user, most active first.
user_txn_counts = df['user_id'].value_counts()
top5_share_pct = user_txn_counts.head(5).sum() / len(df) * 100

print("User IDs generated")
print(f"Unique users: {df['user_id'].nunique():,}")
print(f"Most active user: {user_txn_counts.iloc[0]} transactions")
print(f"Median user: {user_txn_counts.median():.0f} transactions")
print(f"Top 5 users account for: {top5_share_pct:.1f}% of transactions")

User IDs generated
Unique users: 3,958
Most active user: 108809 transactions
Median user: 2 transactions
Top 5 users account for: 67.4% of transactions


In [None]:
# Merchant IDs with Power-Law Distribution
# Draw one Zipf(a=1.3) index per transaction and limit it to 1..5000.
# Note: np.clip maps every draw above 5000 onto 5000 itself, so 'merchant_05000'
# receives the entire upper tail of the distribution.
merchant_indices = np.clip(np.random.zipf(a=1.3, size=len(df)), 1, 5000)
df['merchant_id'] = pd.Series(merchant_indices, index=df.index).map('merchant_{:05d}'.format)

# Transactions per merchant, highest volume first.
merchant_txn_counts = df['merchant_id'].value_counts()

print("Merchant IDs generated")
print(f"Unique merchants: {df['merchant_id'].nunique():,}")
print(f"Highest volume merchant: {merchant_txn_counts.iloc[0]} transactions")
print(f"Median merchant: {merchant_txn_counts.median():.0f} transactions")

Merchant IDs generated
Unique merchants: 4,491
Highest volume merchant: 72422 transactions
Median merchant: 3 transactions


In [None]:
# Timestamps
# 'Time' is treated as a second offset and anchored to START_DATE.
START_DATE = datetime(2023, 9, 2, 0, 0, 0)

# Vectorized conversion: cast to whole seconds (truncating, exactly as int(x)
# did per row) and add the offsets in one shot instead of calling a Python
# lambda once per transaction.
seconds_offset = pd.to_timedelta(df['Time'].astype('int64'), unit='s')
df['timestamp'] = pd.Timestamp(START_DATE) + seconds_offset

# Hour-of-day profile for the first 24 hours after START_DATE.
df_temp = df[df['timestamp'] < (START_DATE + timedelta(hours=24))].copy()
df_temp['hour'] = df_temp['timestamp'].dt.hour
hourly = df_temp.groupby('hour').size()

print(f"Start Time: {df['timestamp'].min()}")
print(f"End Time: {df['timestamp'].max()}")
print(f"\nHourly Distribution (First 24h):\n{hourly}")

Start Time: 2023-09-02 00:00:00
End Time: 2023-09-03 23:59:52

Hourly Distribution (First 24h):
hour
0     3963
1     2217
2     1576
3     1821
4     1082
5     1681
6     1831
7     3368
8     5179
9     7878
10    8288
11    8517
12    7732
13    7585
14    8029
15    7836
16    7786
17    7882
18    8607
19    7994
20    8980
21    9895
22    8977
23    6082
dtype: int64


In [None]:
# Amount Conversion to INR
# Multiplier applied to the raw 'Amount' column to produce 'amount_inr'.
EXCHANGE_RATE = 88
df['amount_inr'] = df['Amount'].mul(EXCHANGE_RATE).round(2)

amount_summary = df['amount_inr'].describe()
largest_txns = (
    df[['amount_inr', 'Class']]
    .sort_values(by='amount_inr', ascending=False)
    .head(5)
)

print(f"Exchange Rate Applied: {EXCHANGE_RATE}")
print(f"\nAmount Statistics (INR):\n{amount_summary}")
print(f"\nTop 5 Highest Transactions:\n{largest_txns}")

Exchange Rate Applied: 88

Amount Statistics (INR):
count    2.848070e+05
mean     7.774766e+03
std      2.201057e+04
min      0.000000e+00
25%      4.928000e+02
50%      1.936000e+03
75%      6.790520e+03
max      2.260822e+06
Name: amount_inr, dtype: float64

Top 5 Highest Transactions:
        amount_inr  Class
274771  2260822.08      0
58465   1729774.64      0
151296  1664080.00      0
46841   1136161.84      0
54018   1047031.92      0


In [None]:
# Card Networks
# Amount bands: low (<= 25th percentile), mid (between), high (> 90th percentile).
low_limit = df['amount_inr'].quantile(0.25)
high_limit = df['amount_inr'].quantile(0.90)

networks = ['Visa', 'Mastercard', 'RuPay', 'Amex']
df['card_network'] = 'Unknown'

mask_low = df['amount_inr'].le(low_limit)
mask_mid = df['amount_inr'].between(low_limit, high_limit, inclusive='right')
mask_high = df['amount_inr'].gt(high_limit)

# Network probabilities per band, aligned with `networks`. The bands are
# sampled in the order low -> mid -> high so the global RNG stream is consumed
# in a fixed sequence. Amex has zero weight outside the high band.
network_mix_by_band = (
    (mask_low, [0.30, 0.30, 0.40, 0.00]),
    (mask_mid, [0.45, 0.45, 0.10, 0.00]),
    (mask_high, [0.45, 0.45, 0.02, 0.08]),
)
for band_mask, band_probs in network_mix_by_band:
    df.loc[band_mask, 'card_network'] = np.random.choice(
        networks, size=band_mask.sum(), p=band_probs
    )

print(f"Low Limit: {low_limit:.2f}, High Limit: {high_limit:.2f}")
print(f"\nNetwork Distribution:\n{df['card_network'].value_counts(normalize=True)}")
print(f"\nAverage Amount by Network:\n{df.groupby('card_network')['amount_inr'].mean().sort_values()}")

Low Limit: 492.80, High Limit: 17864.00

Network Distribution:
card_network
Visa          0.412816
Mastercard    0.411981
RuPay         0.167243
Amex          0.007960
Name: proportion, dtype: float64

Average Amount by Network:
card_network
RuPay          2481.723191
Mastercard     8423.936644
Visa           8498.950445
Amex          47829.190543
Name: amount_inr, dtype: float64


In [None]:
# Card Tiers
# Amex rows get their own tier mix (never 'Classic'); all other networks are
# tiered by the same low/mid/high amount bands used for the network assignment.
tiers = ['Classic', 'Gold', 'Platinum', 'Signature']
df['card_tier'] = 'Unknown'

mask_amex = df['card_network'] == 'Amex'
mask_others = df['card_network'] != 'Amex'
mask_low_tier = mask_others & df['amount_inr'].le(low_limit)
mask_mid_tier = mask_others & df['amount_inr'].gt(low_limit) & df['amount_inr'].le(high_limit)
mask_high_tier = mask_others & df['amount_inr'].gt(high_limit)

# Tier probabilities aligned with `tiers`; sampled Amex first, then
# low -> mid -> high, which fixes the order of draws from the global RNG.
tier_mix_by_segment = (
    (mask_amex, [0.00, 0.30, 0.40, 0.30]),
    (mask_low_tier, [0.60, 0.30, 0.08, 0.02]),
    (mask_mid_tier, [0.30, 0.40, 0.20, 0.10]),
    (mask_high_tier, [0.10, 0.20, 0.40, 0.30]),
)
for segment_mask, segment_probs in tier_mix_by_segment:
    df.loc[segment_mask, 'card_tier'] = np.random.choice(
        tiers, size=segment_mask.sum(), p=segment_probs
    )

print(f"Card Tier vs Network:\n{pd.crosstab(df['card_network'], df['card_tier'])}")
print(f"\nAverage Spending by Tier:\n{df.groupby('card_tier')['amount_inr'].mean().sort_values()}")

Card Tier vs Network:
card_tier     Classic   Gold  Platinum  Signature
card_network                                     
Amex                0    657       921        689
Mastercard      39212  42227     23309      12587
RuPay           22766  16068      6227       2571
Visa            39067  42456     23370      12680

Average Spending by Tier:
card_tier
Classic       3778.832181
Gold          6081.448667
Platinum     13249.125570
Signature    17618.662836
Name: amount_inr, dtype: float64


In [None]:
# Card Issuers
# Sampling weight per issuing bank; the weights are used as-is for every row.
issuer_weights = {
    'HDFC Bank': 0.27,
    'SBI Card': 0.19,
    'ICICI Bank': 0.17,
    'Axis Bank': 0.14,
    'Kotak Mahindra': 0.05,
    'IndusInd Bank': 0.04,
    'RBL Bank': 0.04,
    'IDFC First': 0.04,
    'Yes Bank': 0.03,
    'Standard Chartered': 0.03,
}
banks = list(issuer_weights.keys())
probs = list(issuer_weights.values())

df['card_issuer'] = np.random.choice(banks, size=len(df), p=probs)

# Amex-network rows are overridden so their issuer is always American Express.
is_amex_row = df['card_network'] == 'Amex'
df.loc[is_amex_row, 'card_issuer'] = 'American Express'

print(f"Issuer Market Share:\n{df['card_issuer'].value_counts(normalize=True)}")
print(f"\nAmex Issuer Check: {df.loc[is_amex_row, 'card_issuer'].unique()}")

Issuer Market Share:
card_issuer
HDFC Bank             0.268375
SBI Card              0.188331
ICICI Bank            0.168177
Axis Bank             0.139470
Kotak Mahindra        0.049465
RBL Bank              0.039711
IndusInd Bank         0.039595
IDFC First            0.039191
Yes Bank              0.030062
Standard Chartered    0.029662
American Express      0.007960
Name: proportion, dtype: float64

Amex Issuer Check: ['American Express']


In [None]:
# Merchant Categories
# Zero-amount rows are bumped to 1.0 INR before categories are assigned.
df.loc[df['amount_inr'].eq(0), 'amount_inr'] = 1.0

# Category pools and their sampling weights for each amount band.
low_cats = ['Grocery', 'Fast Food', 'Public Transport', 'Fuel', 'Digital Services', 'Pharmacy', 'Entertainment']
low_probs = [0.20, 0.20, 0.20, 0.15, 0.15, 0.05, 0.05]

mid_cats = ['Dining', 'Fashion', 'Supermarket', 'Pharmacy', 'Utility', 'Fuel', 'Entertainment', 'Auto', 'Personal Care']
mid_probs = [0.25, 0.20, 0.15, 0.10, 0.10, 0.05, 0.05, 0.05, 0.05]

high_cats = ['Airline', 'Hotel', 'Electronics', 'Jewelry', 'Furniture', 'Hospital']
high_probs = [0.30, 0.20, 0.20, 0.10, 0.10, 0.10]

df['merchant_category'] = 'Other'

# mask_low / mask_mid / mask_high are the amount-band masks built in the
# Card Networks cell; bands are sampled low -> mid -> high.
category_plan = (
    (mask_low, low_cats, low_probs),
    (mask_mid, mid_cats, mid_probs),
    (mask_high, high_cats, high_probs),
)
for band_mask, band_cats, band_probs in category_plan:
    df.loc[band_mask, 'merchant_category'] = np.random.choice(
        band_cats, size=band_mask.sum(), p=band_probs
    )

fuel_amounts = df.loc[df['merchant_category'] == 'Fuel', 'amount_inr']

print(f"Merchant Category Distribution:\n{df['merchant_category'].value_counts().head(8)}")
print(f"\nFuel Amount Range:\n{fuel_amounts.describe()[['min', 'mean', 'max']]}")

Merchant Category Distribution:
merchant_category
Dining         46085
Fashion        37114
Supermarket    27778
Pharmacy       22076
Fuel           19920
Utility        18453
Fast Food      14409
Grocery        14227
Name: count, dtype: int64

Fuel Amount Range:
min         0.880000
mean     2186.306616
max     17864.000000
Name: amount_inr, dtype: float64


In [None]:
# Geography (City & State)
# Three city groups; every city inside a group shares the same sampling weight.
tier1 = ['Mumbai', 'Delhi', 'Bengaluru', 'Chennai', 'Hyderabad', 'Kolkata', 'Pune', 'Ahmedabad']
tier2 = ['Jaipur', 'Lucknow', 'Chandigarh', 'Indore', 'Kochi', 'Surat', 'Nagpur', 'Coimbatore', 'Bhopal', 'Patna']
tier3 = ['Varanasi', 'Agra', 'Nashik', 'Vadodara', 'Ludhiana', 'Madurai', 'Vizag', 'Guwahati', 'Bhubaneswar', 'Raipur']

all_cities = tier1 + tier2 + tier3
weights = [0.075] * len(tier1) + [0.030] * len(tier2) + [0.010] * len(tier3)

df['merchant_city'] = np.random.choice(all_cities, size=len(df), p=weights)

# State lookup, written state -> cities and then inverted to city -> state.
cities_by_state = {
    'Maharashtra': ['Mumbai', 'Pune', 'Nagpur', 'Nashik'],
    'Delhi': ['Delhi'],
    'Karnataka': ['Bengaluru'],
    'Tamil Nadu': ['Chennai', 'Coimbatore', 'Madurai'],
    'Telangana': ['Hyderabad'],
    'West Bengal': ['Kolkata'],
    'Gujarat': ['Ahmedabad', 'Surat', 'Vadodara'],
    'Rajasthan': ['Jaipur'],
    'Uttar Pradesh': ['Lucknow', 'Agra', 'Varanasi'],
    'Chandigarh': ['Chandigarh'],
    'Madhya Pradesh': ['Indore', 'Bhopal'],
    'Kerala': ['Kochi'],
    'Bihar': ['Patna'],
    'Punjab': ['Ludhiana'],
    'Andhra Pradesh': ['Vizag'],
    'Assam': ['Guwahati'],
    'Odisha': ['Bhubaneswar'],
    'Chhattisgarh': ['Raipur'],
}
city_state_map = {
    city: state
    for state, state_cities in cities_by_state.items()
    for city in state_cities
}

df['merchant_state'] = df['merchant_city'].map(city_state_map)

print(f"Top Cities:\n{df['merchant_city'].value_counts(normalize=True).head(5)}")
print(f"\nTop States:\n{df['merchant_state'].value_counts().head(5)}")

Top Cities:
merchant_city
Ahmedabad    0.075714
Mumbai       0.075672
Chennai      0.075174
Bengaluru    0.074816
Delhi        0.074787
Name: proportion, dtype: float64

Top States:
merchant_state
Maharashtra    54194
Gujarat        33024
Tamil Nadu     32864
Karnataka      21308
Delhi          21300
Name: count, dtype: int64


In [None]:
# Transaction Channel
# Every row starts as 'POS'; categories are then overridden group by group.
df['transaction_channel'] = 'POS'

online_cats = ['Digital Services', 'Airline', 'Entertainment', 'Public Transport', 'Hotel']
pos_cats = ['Fuel', 'Hospital']

category = df['merchant_category']
df.loc[category.isin(online_cats), 'transaction_channel'] = 'Online'
df.loc[category.isin(pos_cats), 'transaction_channel'] = 'POS'

# Categories with a random Online/POS split: (categories, [P(Online), P(POS)]).
# The groups are sampled in this exact order so the global RNG stream is
# consumed in a fixed sequence.
split_groups = (
    (['Dining'], [0.30, 0.70]),
    (['Grocery', 'Supermarket'], [0.30, 0.70]),
    (['Fashion', 'Electronics', 'Furniture', 'Jewelry'], [0.50, 0.50]),
    (['Utility', 'Pharmacy'], [0.60, 0.40]),
)
for group_cats, channel_probs in split_groups:
    group_mask = category.isin(group_cats)
    df.loc[group_mask, 'transaction_channel'] = np.random.choice(
        ['Online', 'POS'], size=group_mask.sum(), p=channel_probs
    )

dining_channels = df.loc[df['merchant_category'] == 'Dining', 'transaction_channel']

print(f"Channel Distribution:\n{df['transaction_channel'].value_counts(normalize=True)}")
print(f"\nDining Channel Split:\n{dining_channels.value_counts(normalize=True)}")

Channel Distribution:
transaction_channel
POS       0.556131
Online    0.443869
Name: proportion, dtype: float64

Dining Channel Split:
transaction_channel
POS       0.701183
Online    0.298817
Name: proportion, dtype: float64


In [None]:
# Entry Mode
df['entry_mode'] = 'Unknown'

# Online transactions are always 'CVC', regardless of class.
is_online = df['transaction_channel'] == 'Online'
is_pos = df['transaction_channel'] == 'POS'
df.loc[is_online, 'entry_mode'] = 'CVC'

# Non-fraud POS: amounts up to 5000 may be Tap, Chip or Swipe; above 5000 Tap
# has zero weight.
legit_pos = (df['Class'] == 0) & is_pos
legit_pos_small = legit_pos & df['amount_inr'].le(5000)
legit_pos_large = legit_pos & df['amount_inr'].gt(5000)

df.loc[legit_pos_small, 'entry_mode'] = np.random.choice(
    ['Tap', 'Chip', 'Swipe'], size=legit_pos_small.sum(), p=[0.585, 0.40, 0.015]
)
df.loc[legit_pos_large, 'entry_mode'] = np.random.choice(
    ['Tap', 'Chip', 'Swipe'], size=legit_pos_large.sum(), p=[0.00, 0.985, 0.015]
)

# Fraud POS rows use a separate mix that is weighted towards Swipe.
fraud_pos = (df['Class'] == 1) & is_pos
df.loc[fraud_pos, 'entry_mode'] = np.random.choice(
    ['Swipe', 'Tap', 'Chip'], size=fraud_pos.sum(), p=[0.50, 0.40, 0.10]
)

# Per-mode row count, fraud count and fraud rate.
stats = df.groupby('entry_mode')['Class'].agg(
    Total_Txns='count', Fraud_Count='sum', Fraud_Rate='mean'
)
print(f"Entry Mode Statistics:\n{stats}")

Entry Mode Statistics:
            Total_Txns  Fraud_Count  Fraud_Rate
entry_mode                                     
CVC             126417          231    0.001827
Chip             89513           20    0.000223
Swipe             2491          134    0.053794
Tap              66386          107    0.001612


In [None]:
# Boolean Flags
# is_recurring: 1 for every 'Utility' row, a random 80/20 draw for
# 'Digital Services', 0 everywhere else.
df['is_recurring'] = 0
df.loc[df['merchant_category'] == 'Utility', 'is_recurring'] = 1

digital_rows = df['merchant_category'] == 'Digital Services'
df.loc[digital_rows, 'is_recurring'] = np.random.choice(
    [1, 0], size=digital_rows.sum(), p=[0.8, 0.2]
)

# is_international: drawn separately per class (non-fraud first, then fraud),
# with a higher probability of 1 for fraud rows.
df['is_international'] = 0
international_mix = (
    (df['Class'] == 0, [0.98, 0.02]),
    (df['Class'] == 1, [0.75, 0.25]),
)
for class_mask, flag_probs in international_mix:
    df.loc[class_mask, 'is_international'] = np.random.choice(
        [0, 1], size=class_mask.sum(), p=flag_probs
    )

# RuPay rows are forced back to 0 after the draws.
df.loc[df['card_network'] == 'RuPay', 'is_international'] = 0

print(f"Recurring Distribution:\n{df['is_recurring'].value_counts()}")
print(f"\nInternational by Class:\n{df.groupby('Class')['is_international'].mean()}")

Recurring Distribution:
is_recurring
0    257814
1     26993
Name: count, dtype: int64

International by Class:
Class
0    0.016717
1    0.213415
Name: is_international, dtype: float64


In [None]:
# Card Profile Details
# card_age: uniform integer in 1..60 (unit not stated here; presumably months, TODO confirm).
df['card_age'] = np.random.randint(1, 61, size=len(df))

# Credit-limit sampling range per tier as (inclusive low, exclusive high).
limits = {
    'Classic': (25000, 100000),
    'Gold': (100000, 300000),
    'Platinum': (300000, 750000),
    'Signature': (750000, 1500000)
}

df['credit_limit'] = 0
for tier_name, (range_low, range_high) in limits.items():
    tier_rows = df['card_tier'] == tier_name
    raw_limits = np.random.randint(range_low, range_high, size=tier_rows.sum())
    # Round each drawn limit down to a multiple of 1000.
    df.loc[tier_rows, 'credit_limit'] = (raw_limits // 1000) * 1000

# Where a transaction exceeds the sampled limit, raise the limit to 1.2x the
# amount (truncated to an integer) so no row is over its limit.
exceeds_limit = df['amount_inr'] > df['credit_limit']
df.loc[exceeds_limit, 'credit_limit'] = df.loc[exceeds_limit, 'amount_inr'].mul(1.2).astype(int)

print(f"Average Credit Limit by Tier:\n{df.groupby('card_tier')['credit_limit'].mean().sort_values()}")
print(f"\nCard Age Distribution (Sample):\n{df['card_age'].value_counts().sort_index().head(5)}")

Average Credit Limit by Tier:
card_tier
Classic      6.244113e+04
Gold         1.996596e+05
Platinum     5.250720e+05
Signature    1.123149e+06
Name: credit_limit, dtype: float64

Card Age Distribution (Sample):
card_age
1    4695
2    4774
3    4801
4    4696
5    4797
Name: count, dtype: int64


In [None]:
# Final Cleanup & Reordering
# Rename the label column and drop the raw columns that were replaced by
# 'timestamp' and 'amount_inr' (missing columns are ignored).
df = (
    df
    .rename(columns={'Class': 'is_fraud'})
    .drop(columns=['Time', 'Amount'], errors='ignore')
)

# Output column layout: identifiers, card attributes, transaction attributes,
# the V1..V28 features, then the label.
cols_id = ['transaction_id', 'user_id', 'merchant_id', 'timestamp']
cols_card = ['card_network', 'card_issuer', 'card_tier', 'credit_limit', 'card_age']
cols_txn = ['amount_inr', 'merchant_category', 'merchant_city', 'merchant_state',
            'transaction_channel', 'entry_mode', 'is_international', 'is_recurring']
cols_pca = ['V' + str(n) for n in range(1, 29)]
cols_target = ['is_fraud']

# Keep only the listed columns that are actually present; anything not listed
# is dropped from the frame.
final_order = [
    col
    for group in (cols_id, cols_card, cols_txn, cols_pca, cols_target)
    for col in group
    if col in df.columns
]
df = df[final_order]

print(f"Final Shape: {df.shape}")
print(f"Columns: {len(df.columns)}")
print(f"Column List: {df.columns.tolist()}")
print(f"\nFraud Distribution:\n{df['is_fraud'].value_counts()}")

Final Shape: (284807, 46)
Columns: 46
Column List: ['transaction_id', 'user_id', 'merchant_id', 'timestamp', 'card_network', 'card_issuer', 'card_tier', 'credit_limit', 'card_age', 'amount_inr', 'merchant_category', 'merchant_city', 'merchant_state', 'transaction_channel', 'entry_mode', 'is_international', 'is_recurring', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'is_fraud']

Fraud Distribution:
is_fraud
0    284315
1       492
Name: count, dtype: int64


In [None]:
# Save Data
# Write the enriched frame as Parquet (without the index), creating the output
# directory first if it does not exist. The directory is derived from
# save_path so the two cannot drift apart.
save_path = '../data/processed/transactions_enriched.parquet'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df.to_parquet(save_path, index=False)

print(f"Dataset saved to: {save_path}")
print(f"File Size: {os.path.getsize(save_path) / 1024 / 1024:.2f} MB")

Dataset saved to: ../data/processed/transactions_enriched.parquet
File Size: 76.14 MB
