<a href="https://colab.research.google.com/github/mjr2010/Hypothetical_AML_alert_prototype/blob/main/AML_alert_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AML Alert Triage + Case Summarizer
Exploring AI in Simulated AI Scenarios


The notebook includes:
1. Generate synthetic AML data (customers, transactions, alerts)
2. Train a simple alert-priority model (High/Medium/Low)
3. Build an LLM-style prompt you can send to GPT / Azure / local LLM

In [49]:
'''Install core libraries for data + ML
- pandas/ numpy: dataframes and number work
- scikit-learn: to train the alert-priority model
- joblib: to save the model on disk so that we can reuse it
- faker: to generate fake data
'''
!pip install pandas numpy scikit-learn joblib faker --quiet
print('✅ Packages installed')

✅ Packages installed


Import base libraries, fix random seeds for reproducibility and create a data folder where all CSV are saved

In [50]:
import os, random
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
os.makedirs('data', exist_ok=True)
np.random.seed(42)
random.seed(42)
print(' Environment ready, /data created')

 Environment ready, /data created


## 1. Generate synthetic data
Simulate a bank-like dataset with following data:
*   customers.csv
*   transactions.csv
*   alerts.csv



In [51]:
HIGH_RISK_COUNTRIES = ['IR', 'SY', 'RU', 'AF', 'UA']
CHANNELS = ['wire', 'eft', 'cash', 'cheque']
CURRENCIES = ['CAD', 'USD', 'EUR']

'''define customers function generates 300 fake customers. '''

def make_customers(n=300):
    customers = []
    for i in range(1, n+1):
        risk = np.random.choice(['Low', 'Medium', 'High'], p=[0.55, 0.3, 0.15])
        #Each gets a risk rating (55% Low, 30% medium, 15*High)

        pep = 1 if (risk == 'High' and np.random.rand() < 0.35) else 0
        #pep flag is randomly applied to some high-risk customers
        #metada includes country, occupation, onboarding_date

        customers.append({
            'customer_id': f'C{i:04d}',
            'name': f'Customer {i}',
            'country': np.random.choice(['CA', 'US', 'IN', 'PK', 'AE', 'GB']),
            'occupation': np.random.choice(['Consultant', 'Trader', 'IT', 'Restaurant', 'Student']),
            'risk_rating': risk,
            'pep_flag': pep,
            'onboarding_date': (datetime(2020,1,1) + timedelta(days=np.random.randint(0, 1600))).date(),
        })
    return pd.DataFrame(customers)

#For easy identification: High risk & pep flag combines, helps the model learn 'suspicous activity'


# **#Make transations**
1. Each customer gets 10-20 transactions.
2. Each transaction holds amount in range of 200 to 15000
3. Payment modes are either wire, EFT, cash or cheque
4. Flags would be:
*   **is_structured:** when amount is just under 10k
*   **is_unusual_country:** when counterparty country is in HIGH_RISK_COUNTRIES i.e. risky regions (like cross-border)

In [52]:

def make_transactions(customers_df, avg_tx_per_customer=15):
    txs = []
    tx_id = 1
    for _, row in customers_df.iterrows():
        n_txs = np.random.randint(avg_tx_per_customer-5, avg_tx_per_customer+5)
        for _ in range(n_txs):
            amt = round(np.random.uniform(200, 15000), 2)
            country_pool = ["CA","US","IN","PK","AE","GB"] + HIGH_RISK_COUNTRIES
            counterparty_country = np.random.choice(
                country_pool
            )
            is_high_risk_geo = 1 if counterparty_country in HIGH_RISK_COUNTRIES else 0
            ts = datetime(2025, 10, 1) + timedelta(days=np.random.randint(0, 30))
            txs.append({
                'tx_id': f'T{tx_id:06d}',
                'customer_id': row['customer_id'],
                'counterparty_country': counterparty_country,
                'channel': np.random.choice(CHANNELS, p=[0.45,0.3,0.2,0.05]),
                'amount': amt,
                'currency': np.random.choice(CURRENCIES, p=[0.7,0.25,0.05]),
                'timestamp': ts,
                'is_structured': 1 if (8000 < amt < 10000) else 0,
                'is_unusual_country': is_high_risk_geo,
            })
            tx_id += 1
    return pd.DataFrame(txs)



# **Make Alert**
The function would pick andom customer and group few of their transactions together.
Assign a trigger reason rule when to be fired


In [53]:
def make_alerts(customers_df, tx_df, n_alerts=250):
    alerts = []
    alert_id = 1
    for _ in range(n_alerts):
        cust = customers_df.sample(1).iloc[0]
        cust_txs = tx_df[tx_df['customer_id'] == cust['customer_id']].sample(np.random.randint(1,4))
        trigger = np.random.choice([
            'High cash activity',
            'Rapid movement of funds',
            'High-risk country counterparty',
            'Structuring pattern',
            'Unusual channel usage',
        ])
        pep = cust['pep_flag']
        high_risk_geo = 1 if (cust_txs['is_unusual_country'].sum() > 0) else 0
        large_amt = 1 if (cust_txs['amount'].max() > 10000) else 0

     #define label priority(High/ Medium/ Low)
        if pep and high_risk_geo and large_amt:
            label = 'High'
        elif high_risk_geo or large_amt or cust['risk_rating'] == 'High':
            label = 'Medium'
        else:
            label = 'Low'

        alerts.append({
            'alert_id': f'A{alert_id:05d}',
            'customer_id': cust['customer_id'],
            'trigger_rule': trigger,
            'tx_ids': ';'.join(cust_txs['tx_id'].tolist()),
            'generated_at': datetime(2025, 10, 25) + timedelta(minutes=np.random.randint(0,1440)),
            'label_priority': label,
        })
        alert_id += 1
    return pd.DataFrame(alerts)

customers = make_customers(300)
transactions = make_transactions(customers)
alerts = make_alerts(customers, transactions, 250)

customers.to_csv('data/customers.csv', index=False)
transactions.to_csv('data/transactions.csv', index=False)
alerts.to_csv('data/alerts.csv', index=False)

print('✅ Data generated: customers, transactions, alerts')
customers.head()

✅ Data generated: customers, transactions, alerts


Unnamed: 0,customer_id,name,country,occupation,risk_rating,pep_flag,onboarding_date
0,C0001,Customer 1,AE,IT,Low,0,2022-12-31
1,C0002,Customer 2,US,IT,Medium,0,2023-05-23
2,C0003,Customer 3,AE,Restaurant,Low,0,2022-05-21
3,C0004,Customer 4,GB,Student,Medium,0,2022-02-08
4,C0005,Customer 5,GB,Trader,Medium,0,2023-04-30


## 2. Train model &  Build LLM prompt
Model should learn *pep + high risk + large_amount* = "**HIGH ALert**"

**Purpose:** To create every alert into a single ML-friendly row(features), and later to train a classifier to predict alert priority(High/Medium/ Low)

In [54]:
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

alerts = pd.read_csv('data/alerts.csv')
customers = pd.read_csv('data/customers.csv')
txs = pd.read_csv('data/transactions.csv')

def build_features(alerts, customers, txs):
    rows = []
    for _, a in alerts.iterrows():
        cust = customers[customers['customer_id'] == a['customer_id']].iloc[0]
        tx_list = a['tx_ids'].split(';')
        tx_sub = txs[txs['tx_id'].isin(tx_list)]
        rows.append({
            'alert_id': a['alert_id'],
            'customer_id': a['customer_id'],
            'trigger_rule': a['trigger_rule'],
            'customer_risk_rating': cust['risk_rating'],
            'pep_flag': cust['pep_flag'],
            'tx_count': len(tx_sub),
            'max_tx_amount': tx_sub['amount'].max(),
            'has_high_risk_country': 1 if (tx_sub['is_unusual_country'].sum() > 0) else 0,
            'label_priority': a['label_priority'],
        })
    return pd.DataFrame(rows)

df = build_features(alerts, customers, txs)

X = df.drop(columns=['label_priority', 'alert_id', 'customer_id'])
y = df['label_priority']

cat_cols = ['trigger_rule', 'customer_risk_rating']
num_cols = ['pep_flag', 'tx_count', 'max_tx_amount', 'has_high_risk_country']

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
pipe = Pipeline(steps=[('preprocess', preprocess), ('model', clf)])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

joblib.dump(pipe, 'alert_priority_model.joblib')
print('✅ Model trained and saved as alert_priority_model.joblib')

              precision    recall  f1-score   support

        High       1.00      1.00      1.00         1
         Low       1.00      0.90      0.95        10
      Medium       0.98      1.00      0.99        52

    accuracy                           0.98        63
   macro avg       0.99      0.97      0.98        63
weighted avg       0.98      0.98      0.98        63

✅ Model trained and saved as alert_priority_model.joblib


## 3. Add an Alert context using LLM prompt

Add 1 *Alert*, Pull a *Customer *+ *Transaction*, run the model to get an alert priority. Build a prompt send to a in LLM function to auto-write the investigator note .



In [55]:
import joblib
model = joblib.load('alert_priority_model.joblib')

customers = pd.read_csv('data/customers.csv')
transactions = pd.read_csv('data/transactions.csv')
alerts = pd.read_csv('data/alerts.csv')

def build_alert_context(alert_id, customers, transactions, alerts):
    a = alerts[alerts['alert_id'] == alert_id].iloc[0]
    c = customers[customers['customer_id'] == a['customer_id']].iloc[0]
    tx_ids = a['tx_ids'].split(';')
    tx_sub = transactions[transactions['tx_id'].isin(tx_ids)].sort_values('timestamp')
    tx_lines = []
    for _, tx in tx_sub.iterrows():
        tx_lines.append(
            f"- {tx['timestamp']} | {tx['channel']} | {tx['amount']} {tx['currency']} -> {tx['counterparty_country']} (high_risk={bool(tx['is_unusual_country'])})"
        )
    return {
        'alert_id': a['alert_id'],
        'customer': {
            'id': c['customer_id'],
            'risk_rating': c['risk_rating'],
            'pep_flag': int(c['pep_flag']),
            'country': c['country'],
            'occupation': c['occupation'],
        },
        'trigger_rule': a['trigger_rule'],
        'transactions': tx_lines,
    }

def make_prompt(alert_context, predicted_priority):
    prompt = f"""You are an AML investigator at a Canadian bank.

Write an analyst note for alert **{alert_context['alert_id']}**.

Customer:
- ID: {alert_context['customer']['id']}
- Risk rating: {alert_context['customer']['risk_rating']}
- PEP: {alert_context['customer']['pep_flag']}
- Country: {alert_context['customer']['country']}
- Occupation: {alert_context['customer']['occupation']}

Alert details:
- Trigger rule: {alert_context['trigger_rule']}
- System priority: {predicted_priority}

Transactions involved:
{chr(10).join(alert_context['transactions'])}

Write 130-180 words. Structure:
1. Why the alert was generated
2. What transactions look unusual (mention dates, amounts, countries)
3. Why the customer risk/PEP matters
4. Recommendation (monitor / request EDD / escalate)

Use professional compliance language.
"""
    return prompt

demo_alert_id = alerts.sample(1).iloc[0]['alert_id']
a = alerts[alerts['alert_id'] == demo_alert_id].iloc[0]
c = customers[customers['customer_id'] == a['customer_id']].iloc[0]
tx_ids = a['tx_ids'].split(';')
tx_sub = transactions[transactions['tx_id'].isin(tx_ids)]
row = {
    'trigger_rule': a['trigger_rule'],
    'customer_risk_rating': c['risk_rating'],
    'pep_flag': c['pep_flag'],
    'tx_count': len(tx_sub),
    'max_tx_amount': tx_sub['amount'].max(),
    'has_high_risk_country': 1 if (tx_sub['is_unusual_country'].sum() > 0) else 0,
}
X_one = pd.DataFrame([row])
pred_priority = model.predict(X_one)[0]

ctx = build_alert_context(demo_alert_id, customers, transactions, alerts)
prompt = make_prompt(ctx, pred_priority)

print(f'######### Demo alert: {demo_alert_id}')
print(f'Predicted priority: {pred_priority}\n')
print('***Prompt to send to LLM:****\n')
print(prompt)


######### Demo alert: A00077
Predicted priority: Low

***Prompt to send to LLM:****

You are an AML investigator at a Canadian bank.

Write an analyst note for alert **A00077**.

Customer:
- ID: C0144
- Risk rating: Low
- PEP: 0
- Country: US
- Occupation: Restaurant

Alert details:
- Trigger rule: High cash activity
- System priority: Low

Transactions involved:
- 2025-10-29 | eft | 503.06 CAD -> CA (high_risk=False)

Write 130-180 words. Structure:
1. Why the alert was generated
2. What transactions look unusual (mention dates, amounts, countries)
3. Why the customer risk/PEP matters
4. Recommendation (monitor / request EDD / escalate)

Use professional compliance language.



# RAG KNOWLEDGE BASE


In [56]:
# === RAG KNOWLEDGE BASE (FINTRAC / FATF mini set) ===

rag_docs = [
    {
        "id": "fintrac_structuring_1",
        "topic": "structuring",
        "text": "Multiple cash deposits below reporting thresholds within a short period, especially across different branches, may indicate structuring to avoid detection."
    },
    {
        "id": "fintrac_highrisk_geo_1",
        "topic": "high-risk-country",
        "text": "Transactions involving jurisdictions with weak AML/CFT controls or sanctioned regions should be subject to enhanced due diligence."
    },
   {
        "id": "fatf_pep_1",
        "topic": "pep",
        "text": "Politically exposed persons (PEPs) present higher risk due to potential misuse of public office; financial institutions should conduct enhanced ongoing monitoring."
    },

    {
        "id": "fatf_thirdparty_1",
        "topic": "third-party",
        "text": "Use of third-party cash deposits or unexplained intermediaries may indicate attempts to obscure the source of funds."
    },

    {
        "id": "fintrac_rapid_movement_1",
        "topic": "rapid movement",
        "text": "Rapid in-and-out movement of funds, particularly with no clear economic purpose, is a potential money laundering indicator."
    },

    {
       "id": "fatf_trade_1",
        "topic": "trade-based",
        "text": "Over/under invoicing, unusual shipping routes, or mismatch between goods and payments may indicate trade-based money laundering."
    }
]

# super simple similarity over keywords
def retrieve_relevant_passages(trigger_rule, alert_features):
    candidates = []
    text_to_match = trigger_rule.lower()
    if alert_features.get("has_high_risk_country") == 1:
        text_to_match += " high-risk-country"
    if alert_features.get("pep_flag") == 1:
        text_to_match += " pep"

   # naive scoring
    for d in rag_docs:
        score = 0
        for token in d["topic"].split("-"):
            if token in text_to_match:
                score += 1
        if d["text"].lower() in text_to_match:
            score += 2
        if score > 0:
            candidates.append((score, d))
    # sort by score desc
    candidates = sorted(candidates, key=lambda x: x[0], reverse=True)
    # return top 2

    return [c[1] for c in candidates[:2]]

In [57]:
def apply_policy_overrides(pred_priority, alert_row, customer_row, tx_sub):
    """
    pred_priority: model output ('High'/'Medium'/'Low')
    alert_row: row from alerts df
    customer_row: row from customers df
    tx_sub: dataframe of transactions in alert
    """
    reasons = []
    final_priority = pred_priority

    # 1) PEP + high-risk country → force High
    has_high_risk_country = 1 if (tx_sub["is_unusual_country"].sum() > 0) else 0
    if int(customer_row["pep_flag"]) == 1 and has_high_risk_country:
        final_priority = "High"
        reasons.append("policy: PEP + high-risk country")

    # 2) Very large tx → at least Medium
    if tx_sub["amount"].max() > 20000 and final_priority == "Low":
        final_priority = "Medium"
        reasons.append("policy: large transaction > 20k")

    # 3) Trigger is sanctions / high-risk → bump 1 level
    if "High-risk country" in alert_row["trigger_rule"] and final_priority == "Medium":
        final_priority = "High"
        reasons.append("policy: high-risk trigger escalated")

    return final_priority, reasons


In [58]:
#Post RAG model
pred_priority_model = model.predict(X_one)[0]
final_priority, override_readons= apply_policy_overrides(
    pred_priority_model, a, c, tx_sub)


# Inject RAG into the LLMK prompt

In [59]:
def make_prompt(alert_context, predicted_priority, regulatory_snippets):
  reg_text=""
  if regulatory_snippets:
    reg_text = "Relevant regulatory guidance:\n" + "\n".join(
        [f"- {s['text']}" for s in regulatory_snippets]
    ) + "\n\n"

    prompt = f"""You are an AML investigator at a Canadian bank.
{reg_text} Write an analyst note for alert **{alert_context['alert_id']}**.

Customer:
- ID: {alert_context['customer']['id']}
- Risk rating: {alert_context['customer']['risk_rating']}
- PEP: {alert_context['customer']['pep_flag']}
- Country: {alert_context['customer']['country']}
- Occupation: {alert_context['customer']['occupation']}

Alert details:
- Trigger rule: {alert_context['trigger_rule']}
- System priority: {predicted_priority}

Transactions involved:
{chr(10).join(alert_context['transactions'])}

Write 130-180 words. Structure:
1. Why the alert was generated
2. What transactions look unusual (mention dates, amounts, countries)
3. Why the customer risk/PEP matters
4. Recommendation (monitor / request EDD / escalate)

Use professional compliance language.
"""
    return prompt



# Audit JSON (wrapping into a function)

In [60]:
import json

from datetime import datetime
def build_audit_record(
    alert_id,
    model_priority,
    final_priority,
    override_reasons,
    features_dict,
    regulatory_snippets,
    llm_prompt,
    llm_response=None,
):
    # Convert NumPy integers to standard integers for JSON serialization
    for key, value in features_dict.items():
        if isinstance(value, np.integer):
            features_dict[key] = int(value)

    return {
        "alert_id": alert_id,
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "model_priority": model_priority,
        "final_priority": final_priority,
        "overrides_applied": override_reasons,
        "features_used": features_dict,
        "regulatory_snippets": [s["id"] for s in regulatory_snippets],
        "llm_prompt_preview": llm_prompt[:300],
        "llm_response": llm_response,
        "version": "aml-alert-poc-v1"
    }

In [61]:
# DEMO RUN
demo_alert_id = alerts.sample(1).iloc[0]["alert_id"]
a = alerts[alerts["alert_id"] == demo_alert_id].iloc[0]
c = customers[customers["customer_id"] == a["customer_id"]].iloc[0]
tx_sub = transactions[transactions["tx_id"].isin(a["tx_ids"].split(";"))]


# build feature row like before
row = {
    "trigger_rule": a["trigger_rule"],
    "customer_risk_rating": c["risk_rating"],
    "pep_flag": c["pep_flag"],
    "tx_count": len(tx_sub),
    "max_tx_amount": tx_sub["amount"].max(),
    "has_high_risk_country": 1 if (tx_sub["is_unusual_country"].sum() > 0) else 0
}

X_one = pd.DataFrame([row])
pred_priority_model = model.predict(X_one)[0]

# apply policy overrides
final_priority, override_reasons = apply_policy_overrides(pred_priority_model, a, c, tx_sub)

# retrieve RAG
snippets = retrieve_relevant_passages(a["trigger_rule"], row)

# build context + prompt
ctx = build_alert_context(demo_alert_id, customers, transactions, alerts)
prompt = make_prompt(ctx, final_priority, snippets)


# (optional) call LLM here → get `llm_text`
llm_text = None  # put model output here
audit_record = build_audit_record(
    alert_id=demo_alert_id,
    model_priority=pred_priority_model,
    final_priority=final_priority,
    override_reasons=override_reasons,
    features_dict=row,
    regulatory_snippets=snippets,
    llm_prompt=prompt,
    llm_response=llm_text
)

print(json.dumps(audit_record, indent=2))

{
  "alert_id": "A00159",
  "timestamp": "2025-11-02T21:05:40.224938Z",
  "model_priority": "Medium",
  "final_priority": "Medium",
  "overrides_applied": [],
  "features_used": {
    "trigger_rule": "Structuring pattern",
    "customer_risk_rating": "Low",
    "pep_flag": 0,
    "tx_count": 3,
    "max_tx_amount": 9240.41,
    "has_high_risk_country": 1
  },
  "regulatory_snippets": [
    "fintrac_highrisk_geo_1",
    "fintrac_structuring_1"
  ],
  "llm_prompt_preview": "You are an AML investigator at a Canadian bank.\nRelevant regulatory guidance:\n- Transactions involving jurisdictions with weak AML/CFT controls or sanctioned regions should be subject to enhanced due diligence.\n- Multiple cash deposits below reporting thresholds within a short period, especially acr",
  "llm_response": null,
  "version": "aml-alert-poc-v1"
}


  "timestamp": datetime.utcnow().isoformat() + "Z",
