In [66]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import tensorflow as tf
from tensorflow.keras import layers, models

In [67]:
# ---- STEP 1: DATA GENERATION ----
fake = Faker()

# Generate merchant profiles
num_merchants = 1000
merchants = pd.DataFrame({
    'MerchantID': range(1, num_merchants + 1),
    'Name': [fake.company() for _ in range(num_merchants)],
    'BusinessType': [fake.random_element(['Retail', 'E-commerce', 'Food', 'Services']) for _ in range(num_merchants)],
    'RegistrationDate': [fake.date_this_decade() for _ in range(num_merchants)],
    'GSTStatus': [fake.random_element(['Active', 'Inactive']) for _ in range(num_merchants)]
})

In [68]:
# Generate transaction data
num_transactions = 50000
transactions = pd.DataFrame({
    'TransactionID': range(1, num_transactions + 1),
    'MerchantID': np.random.choice(merchants['MerchantID'], num_transactions),
    'Amount': np.random.exponential(scale=50, size=num_transactions),
    'Hour': np.random.randint(0, 24, num_transactions),
    'CustomerID': np.random.randint(1, 10000, num_transactions),
    'IsFraud': np.random.choice([0, 1], num_transactions, p=[0.95, 0.05])
})


In [69]:
# ---- STEP 2: FEATURE ENGINEERING ----
# Calculate transaction velocity and customer concentration
transaction_count = transactions.groupby('MerchantID').size().reset_index(name='TransactionCount')
customer_concentration = transactions.groupby(['MerchantID', 'CustomerID'])['Amount'].sum()
customer_concentration = customer_concentration.groupby('MerchantID').apply(lambda x: (x / x.sum()).max())

# Aggregate features at the merchant level
merchant_features = transactions.groupby('MerchantID').agg({
    'Amount': ['mean', 'std'],
    'Hour': 'std'
}).reset_index()
merchant_features.columns = ['MerchantID', 'AmountMean', 'AmountStd', 'HourStd']
merchant_features = merchant_features.merge(transaction_count, on='MerchantID', how='left')
merchant_features['CustomerConcentration'] = merchant_features['MerchantID'].map(customer_concentration).fillna(0)

In [70]:
# Normalize features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(merchant_features.drop('MerchantID', axis=1))

In [71]:
# ---- STEP 3: AUTOENCODER MODEL ----
# Define autoencoder
input_dim = normalized_features.shape[1]
autoencoder = models.Sequential([
    layers.InputLayer(input_shape=(input_dim,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')



In [72]:
# Prepare data for training
merchant_fraud_status = transactions.groupby('MerchantID')['IsFraud'].max().reset_index()
merchant_fraud_status['IsFraud'] = merchant_fraud_status['IsFraud'].apply(lambda x: 0 if x == 0 else 1)

normal_data = normalized_features[merchant_fraud_status['IsFraud'] == 0]

# Train the autoencoder
autoencoder.fit(normal_data, normal_data, epochs=50, batch_size=32, shuffle=True)

Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 1.3781
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.3281 
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2882 
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2541 
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.2385 
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2567 
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.2232 
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.1509 
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0417 
Epoch 10/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.0743 
Epoch 11/50
[1m3/3

<keras.src.callbacks.history.History at 0x2a05deda1b0>

In [73]:
# ---- STEP 4: ANOMALY DETECTION ----
# Predict reconstruction error
reconstructed = autoencoder.predict(normalized_features)
reconstruction_error = np.mean(np.square(normalized_features - reconstructed), axis=1)

# Set threshold and identify anomalies
threshold = np.percentile(reconstruction_error, 95)
merchant_features['ReconstructionError'] = reconstruction_error
merchant_features['IsAnomaly'] = merchant_features['ReconstructionError'] > threshold

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [74]:
# ---- STEP 5: FRAUD PATTERN DETECTION ----
# Odd-hour pattern detection
odd_hour_transactions = transactions[(transactions['Hour'] >= 22) | (transactions['Hour'] <= 6)]
odd_hour_flags = odd_hour_transactions.groupby('MerchantID').size()
merchant_features['OddHourFlag'] = merchant_features['MerchantID'].map(odd_hour_flags).fillna(0).astype(int)

# High customer concentration detection
merchant_features['HighConcentrationFlag'] = merchant_features['CustomerConcentration'] > 0.6


# Combine scores into final risk score
merchant_features['FinalRiskScore'] = (
    0.6 * merchant_features['ReconstructionError'] / threshold +
    0.2 * merchant_features['OddHourFlag'] +
    0.2 * merchant_features['HighConcentrationFlag']
)
merchant_features['FraudRisk'] = merchant_features['FinalRiskScore'] > 1.0

In [75]:
# ---- STEP 6: EVALUATION ----
# Evaluate the model
y_true = merchant_fraud_status['IsFraud']
y_pred = merchant_features['FraudRisk']

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

roc_auc = roc_auc_score(y_true, merchant_features['FinalRiskScore'])
print(f"ROC-AUC: {roc_auc}")

[[  0  75]
 [  0 925]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        75
           1       0.93      1.00      0.96       925

    accuracy                           0.93      1000
   macro avg       0.46      0.50      0.48      1000
weighted avg       0.86      0.93      0.89      1000

ROC-AUC: 0.4941405405405406


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
