Automated Data Leakage Detection System



In [18]:
import pandas as pd
import numpy as np

In [19]:
# Build a synthetic fraud dataset for the leakage demonstration.
# NumPy's global RNG is seeded first so that this cell, and any later cell that
# draws from np.random in the same run order, yields the same values after
# Restart Kernel -> Run All.
RANDOM_SEED = 42
N_SAMPLES = 100  # number of rows in the demo frame
np.random.seed(RANDOM_SEED)

data = {
    # Sequential integers 0..N_SAMPLES-1
    'feature_id': range(N_SAMPLES),
    'user_category': np.random.choice(['A', 'B', 'C'], N_SAMPLES),
    # Uniform draw scaled to [0, 1000)
    'transaction_amount': np.random.rand(N_SAMPLES) * 1000,
    # Binary target drawn with P(0) = 0.8 and P(1) = 0.2
    'is_fraud': np.random.choice([0, 1], N_SAMPLES, p=[0.8, 0.2]),
    # Uniform draw scaled to [0, 5000)
    'account_balance_at_fraud': np.random.rand(N_SAMPLES) * 5000
}
df_sample = pd.DataFrame(data)

In [20]:
# Introduce leakage: overwrite 'account_balance_at_fraud' so that its value
# range is determined by the label. Fraud rows land in [5000, 5100) and
# non-fraud rows in [0, 1000), so the two ranges never overlap.
# Row counts come from boolean masks rather than value_counts()[0], which
# raises KeyError when no row carries label 0.
fraud_mask = df_sample['is_fraud'] == 1
non_fraud_mask = df_sample['is_fraud'] == 0
n_fraud = int(fraud_mask.sum())
n_non_fraud = int(non_fraud_mask.sum())

df_sample.loc[fraud_mask, 'account_balance_at_fraud'] = np.random.rand(n_fraud) * 100 + 5000
df_sample.loc[non_fraud_mask, 'account_balance_at_fraud'] = np.random.rand(n_non_fraud) * 1000


In [21]:
# Add a second label-dependent feature: 'spending_habit' is the transaction
# amount plus Gaussian noise (std 50), shifted up by 1000 for fraud rows.
# The noise vector is sized from the frame itself so this cell keeps working
# if the sample size changes.
# NOTE(review): transaction_amount is drawn from [0, 1000) and the noise std is
# only 50, so a +1000 shift leaves the two classes almost fully separable.
# If only a mild correlation is intended, reduce the shift.
noise = np.random.randn(len(df_sample)) * 50
df_sample['spending_habit'] = df_sample['transaction_amount'] + noise
df_sample.loc[df_sample['is_fraud'] == 1, 'spending_habit'] += 1000



In [22]:
# Preview the first rows of the synthetic frame under a text caption.
sample_preview = df_sample.head()
print("Sample DataFrame head:")
display(sample_preview)

Sample DataFrame head:


Unnamed: 0,feature_id,user_category,transaction_amount,is_fraud,account_balance_at_fraud,spending_habit
0,0,A,389.446649,0,392.634548,315.626861
1,1,A,413.180436,0,818.708586,406.208751
2,2,C,512.934217,0,614.225478,574.096198
3,3,B,185.688865,0,366.792499,136.443615
4,4,C,52.833489,0,970.110307,-5.766245


In [23]:
# Run the leakage analysis on the synthetic frame with 'is_fraud' as the target.
# NOTE(review): `analyze` is not defined in any cell visible in this notebook.
# Confirm it is defined or imported before this cell, otherwise a fresh
# Restart & Run All will fail here with NameError.
target_column = 'is_fraud'
print("\nRunning leakage analysis...")
leakage_report = analyze(df_sample, target_column)


Running leakage analysis...


In [24]:
# Derive a 'risk' label for each row of the report from its 'risk_score'.
# NOTE(review): `risk_category` is not defined in any cell visible in this
# notebook. Confirm it is defined or imported before this cell.
risk_labels = leakage_report['risk_score'].apply(risk_category)
leakage_report['risk'] = risk_labels

In [25]:
# Print a banner, then render the leakage report table.
# NOTE(review): in the recorded output the deliberately leaked column
# 'account_balance_at_fraud' scores 0.4988 and is rated LOW, as is every other
# feature. That is surprising for a column whose range is fixed by the label;
# the scoring or the risk thresholds may need review.
report_banner = "\n===== FEATURE LEAKAGE RISK REPORT =====\n"
print(report_banner)
display(leakage_report)



===== FEATURE LEAKAGE RISK REPORT =====



Unnamed: 0,feature,risk_score,risk
3,account_balance_at_fraud,0.4988,LOW
4,spending_habit,0.4491,LOW
2,transaction_amount,0.2122,LOW
0,feature_id,0.2103,LOW
1,user_category,0.0661,LOW
