# Modus AI ML Engineer Task

## Data Generation Script

In [2]:
import pandas as pd
import numpy as np
import random
import string
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Constants
BUSINESS_TYPES = ["Electronics", "Fashion", "Food", "Travel", "Healthcare"]
FRAUD_PATTERNS = ["late_night_trading", "sudden_activity_spike", "customer_concentration"]
NORMAL_HOURS = range(9, 21)  # Business hours

# Generate merchant profiles
def generate_merchant_id() -> str:
    return "M" + ''.join(random.choices(string.digits, k=8))

def generate_business_name() -> str:
    return "Business_" + ''.join(random.choices(string.ascii_uppercase, k=5))

def generate_random_date(start_year=2015) -> datetime:
    start_date = datetime(start_year, 1, 1)
    end_date = datetime.today()
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

def generate_merchant_base(count: int) -> pd.DataFrame:
    merchants = []
    for _ in range(count):
        merchant = {
            "merchant_id": generate_merchant_id(),
            "business_name": generate_business_name(),
            "business_type": random.choice(BUSINESS_TYPES),
            "registration_date": generate_random_date(),
        }
        merchants.append(merchant)
    return pd.DataFrame(merchants)


In [4]:

# Generate normal transaction patterns
def generate_normal_transactions(
    merchant_id: str, days: int, daily_volume=(10, 20), amount_range=(100, 5000)
) -> pd.DataFrame:
    transactions = []
    for _ in range(days):
        daily_txns = random.randint(*daily_volume)
        for _ in range(daily_txns):
            txn_time = datetime.now() - timedelta(days=random.randint(0, 365))
            txn_time = txn_time.replace(hour=random.choice(NORMAL_HOURS), minute=random.randint(0, 59))
            transaction = {
                "transaction_id": ''.join(random.choices(string.ascii_uppercase + string.digits, k=12)),
                "merchant_id": merchant_id,
                "timestamp": txn_time,
                "amount": round(random.uniform(*amount_range), 2),
                "velocity_flag": False,
                "time_flag": False,
                "customer_id": ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)),
                "fraud": 0  # Label as normal
            }
            transactions.append(transaction)
    return pd.DataFrame(transactions)

# Inject fraud patterns
def inject_fraud_pattern(transactions: pd.DataFrame, pattern: str) -> pd.DataFrame:
    if pattern == "late_night_trading":
        for _ in range(random.randint(10, 20)):
            txn_time = datetime.now() - timedelta(days=random.randint(0, 30))
            hour = random.choice(list(range(23, 24)) + list(range(0, 5)))  # Late night hours: 23:00-04:00
            txn_time = txn_time.replace(hour=hour, minute=random.randint(0, 59))
            transactions.loc[len(transactions)] = {
                "transaction_id": ''.join(random.choices(string.ascii_uppercase + string.digits, k=12)),
                "merchant_id": random.choice(transactions["merchant_id"].unique()),
                "timestamp": txn_time,
                "amount": round(random.uniform(5000, 10000), 2),
                "velocity_flag": True,
                "time_flag": True,
                "customer_id": ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)),
                "fraud": 1  # Label as fraud
            }
    return transactions

# Feature Engineering
def engineer_features(transactions: pd.DataFrame) -> pd.DataFrame:
    transactions["hour"] = transactions["timestamp"].dt.hour
    transactions["is_night"] = transactions["hour"].apply(lambda x: 1 if 23 <= x or x < 5 else 0)
    features = transactions.groupby("merchant_id").agg(
        total_txns=("transaction_id", "count"),
        avg_amount=("amount", "mean"),
        max_amount=("amount", "max"),
        min_amount=("amount", "min"),
        night_txn_ratio=("is_night", "mean"),
    ).reset_index()
    features = features.merge(
        transactions.groupby("merchant_id").agg(fraud=("fraud", "max")).reset_index(),
        on="merchant_id",
        how="left"
    )
    return features


## Autoencoder structure

In [5]:

# Build autoencoder
def build_autoencoder(input_dim: int) -> Sequential:
    model = Sequential([
        Dense(128, activation="relu", input_dim=input_dim),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(64, activation="relu"),
        Dense(128, activation="relu"),
        Dense(input_dim, activation="sigmoid")
    ])
    model.compile(optimizer=Adam(), loss="mse")
    return model

# Train autoencoder
def train_autoencoder(features: pd.DataFrame, normal_data: pd.DataFrame) -> (Sequential, StandardScaler, list):
    numeric_cols = normal_data.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    normal_scaled = scaler.fit_transform(normal_data[numeric_cols])
    model = build_autoencoder(input_dim=normal_scaled.shape[1])
    model.fit(normal_scaled, normal_scaled, epochs=50, batch_size=32, verbose=1)
    return model, scaler, list(numeric_cols)

# Anomaly scoring
def calculate_anomaly_scores(model: Sequential, scaler: StandardScaler, features: pd.DataFrame) -> pd.Series:
    features_numeric = features[feature_names]  # Align columns
    scaled_features = scaler.transform(features_numeric)
    reconstructed = model.predict(scaled_features)
    mse = np.mean(np.power(scaled_features - reconstructed, 2), axis=1)
    return pd.Series(mse, index=features.index)


In [6]:

# Main process
merchant_count = 1000  # Set merchant count to 1000
merchants = generate_merchant_base(merchant_count)
all_transactions = pd.DataFrame()

for _, merchant in merchants.iterrows():
    txns = generate_normal_transactions(merchant["merchant_id"], days=30)
    if random.random() < 0.2:  # 20% chance of fraud
        txns = inject_fraud_pattern(txns, random.choice(FRAUD_PATTERNS))
    all_transactions = pd.concat([all_transactions, txns], ignore_index=True)

all_transactions["timestamp"] = pd.to_datetime(all_transactions["timestamp"])
features = engineer_features(all_transactions)

# Train-test split
normal_data = features[features["fraud"] == 0].sample(frac=0.8, random_state=42)
test_data = features.drop(normal_data.index)

# Autoencoder training
autoencoder, scaler, feature_names = train_autoencoder(features, normal_data)

# Anomaly detection
test_scores = calculate_anomaly_scores(autoencoder, scaler, test_data[feature_names])
test_data["anomaly_score"] = test_scores
test_data["predicted_fraud"] = (test_scores > test_scores.quantile(0.90)).astype(int)

# Metrics
accuracy = accuracy_score(test_data["fraud"], test_data["predicted_fraud"])
precision = precision_score(test_data["fraud"], test_data["predicted_fraud"])
recall = recall_score(test_data["fraud"], test_data["predicted_fraud"])
f1 = f1_score(test_data["fraud"], test_data["predicted_fraud"])


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.8897
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5996
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4349 
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4373
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step - loss: 0.4138
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4207 
Epoch 7/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 876us/step - loss: 0.4254
Epoch 8/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 985us/step - loss: 0.4133
Epoch 9/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 628us/step - loss: 0.4196
Epoch 10/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 969us/step - loss: 0.4057
Epoch 11

In [8]:

print(f"Accuracy: {accuracy:.2f}")

# Save datasets
all_transactions.to_csv("all_transactions.csv", index=False)
test_data.to_csv("test_results.csv", index=False)


Accuracy: 0.82
