In [None]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from itertools import combinations
from datetime import timedelta
from glob import glob

import leidenalg as la
import igraph as ig
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.ensemble import IsolationForest

import settings as s
from common import get_processes

%load_ext autoreload
%autoreload 2

In [None]:
# Apply seaborn's "white" style and "talk" context to all subsequent plots.
sns.set_theme(style="white", context="talk")

In [None]:
# Refuse to run on any interpreter other than the exact (major, minor, micro)
# version named in the error message below.
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [None]:
# Every memory-related Spark setting below receives the same limit.
memory_limit = "16g"
config = [
    (setting, memory_limit)
    for setting in (
        "spark.driver.memory",
        "spark.worker.memory",
        "spark.driver.maxResultSize",
    )
]
spark_conf = SparkConf().setAll(config)
session_builder = SparkSession.builder.appName("testing").config(conf=spark_conf)
# Session shared by all Spark cells in this notebook.
spark = session_builder.getOrCreate()

In [None]:
# Wall-clock start time; the last cell reports the elapsed time relative to it.
start_script = time.time()

In [None]:
MAX_DEGREE_PER_ACCOUNT = 100
MAX_TRANSACTIONS_PER_ACCOUNT = 1_000


def _oversized_accounts(transactions, account_column, counterparty_column):
    """Return the accounts in `account_column` that exceed either activity limit.

    An account is returned when it appears in more than
    MAX_TRANSACTIONS_PER_ACCOUNT rows, or is paired with more than
    MAX_DEGREE_PER_ACCOUNT distinct values of `counterparty_column`.
    Both statistics are computed in one aggregation, so each call triggers a
    single Spark job.

    Parameters
    ----------
    transactions : pyspark.sql.DataFrame
        Frame containing both columns.
    account_column : str
        Column to group by ("source" or "target").
    counterparty_column : str
        Column whose distinct values are counted per account.

    Returns
    -------
    set
        The offending account identifiers.
    """
    offenders = (
        transactions.groupby(account_column)
        .agg(
            sf.count(sf.lit(1)).alias("n_transactions"),
            sf.countDistinct(counterparty_column).alias("n_counterparties"),
        )
        .where(
            (sf.col("n_transactions") > MAX_TRANSACTIONS_PER_ACCOUNT)
            | (sf.col("n_counterparties") > MAX_DEGREE_PER_ACCOUNT)
        )
        .select(account_column)
        .toPandas()
    )
    return set(offenders[account_column].tolist())


data = spark.read.parquet(s.STAGED_DATA_LOCATION)

#### [START] Seed selection ####
# Drop self-transfers and keep only the three payment formats of interest.
data = data.where(sf.col("source") != sf.col("target"))
data = data.where(sf.col("format").isin(["ACH", "Wire", "Bitcoin"]))

# Limits are evaluated on the filtered data, separately for the sending and
# the receiving side.
large_sources = _oversized_accounts(data, "source", "target")
large_targets = _oversized_accounts(data, "target", "source")

data = data.where(~sf.col("source").isin(large_sources))
data = data.where(~sf.col("target").isin(large_targets))
#### [END] Seed selection ####

In [None]:
def aggregate_edges(data_input):
    """Collapse transactions into one weighted edge per (source, target) pair.

    The Spark frame is grouped by ("source", "target"), the two amount columns
    are summed, and the result is collected into pandas. Each edge's weight is

        source_amount / total sent by its source
        + target_amount / total received by its target

    where the totals are taken over the aggregated edges. Account identifiers
    are then truncated to their first 8 characters and edges whose truncated
    endpoints coincide are dropped.

    Parameters
    ----------
    data_input : pyspark.sql.DataFrame
        Must provide "source", "target", "source_amount" and "target_amount".

    Returns
    -------
    pandas.DataFrame
        Columns "source", "target", "weight", with a fresh 0..n-1 index.
    """
    data_aggregated = (
        data_input.groupby(["source", "target"])
        .agg(
            sf.sum("source_amount").alias("source_amount"),
            sf.sum("target_amount").alias("target_amount"),
        )
        .toPandas()
    )

    # Per-row group totals, computed column-wise instead of with a Python
    # callback per row.
    total_sent_by_source = data_aggregated.groupby("source")[
        "source_amount"
    ].transform("sum")
    total_received_by_target = data_aggregated.groupby("target")[
        "target_amount"
    ].transform("sum")
    data_aggregated["weight"] = (
        data_aggregated["source_amount"] / total_sent_by_source
        + data_aggregated["target_amount"] / total_received_by_target
    )

    # Truncation happens after the weights are computed, so the totals above
    # refer to the full identifiers.
    data_aggregated["source"] = data_aggregated["source"].str.slice(0, 8)
    data_aggregated["target"] = data_aggregated["target"].str.slice(0, 8)
    filter_self = data_aggregated["source"] != data_aggregated["target"]
    return data_aggregated.loc[filter_self, ["source", "target", "weight"]].reset_index(
        drop=True
    )

In [None]:
%%time

# Weighted edge list -> directed igraph whose vertices are identified by name
# (use_vids=False) rather than by integer vertex id.
edges = aggregate_edges(data)
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)

# Vertex names in random order; the shuffle is not seeded.
nodes = [vertex["name"] for vertex in graph.vs()]
random.shuffle(nodes)

In [None]:
# Lookup from igraph's integer vertex index to the vertex name.
nodes_mapping = {vertex.index: vertex["name"] for vertex in graph.vs()}

In [None]:
%%time

# Read the staged files in sorted order: the keys built below depend on the
# order in which communities are encountered, so an arbitrary glob order would
# produce different keys from run to run.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
staged_files = sorted(glob("./staging/*.pickle"))
if not staged_files:
    raise FileNotFoundError("No community pickles found under ./staging/")

communities = []
for staged_file in staged_files:
    with open(staged_file, "rb") as f:
        communities += pickle.load(f)

original_size = len(communities)

# Persist the combined (not yet de-duplicated) list.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)

# De-duplicate by member set: sorted member tuple -> community id. When several
# entries share the same members, the id of the last one read is kept.
communities_unique_rev = {}
for comm_id, comm in communities:
    communities_unique_rev[tuple(sorted(comm))] = comm_id

# Unique key "<community id>-<running number>" -> sorted member tuple.
communities_unique = {
    f"{v}-{i}": k for i, (k, v) in enumerate(communities_unique_rev.items())
}
communities = list(communities_unique.values())
new_size = len(communities)

# Guard the ratio so that empty pickles do not end in a ZeroDivisionError.
ratio = round(new_size / original_size, 2) if original_size else float("nan")
print(original_size, new_size, ratio)
print()

In [None]:
# Community sizes, displayed as (mean, median, max, total); the first three
# are passed through round().
sizes = list(map(len, communities))
round(np.mean(sizes)), round(np.median(sizes)), round(np.max(sizes)), sum(sizes)

In [None]:
# Multiplier per lower-case currency code, stored as float32; "usd" is 1.0 and
# amounts are multiplied by these values to obtain the *_usd columns.
# NOTE(review): the source and date of these rates are not recorded - confirm.
currency_rates = {
    code: np.float32(rate)
    for code, rate in (
        ("jpy", 0.009487665410827868),
        ("cny", 0.14930721887033868),
        ("cad", 0.7579775434031815),
        ("sar", 0.2665884611958837),
        ("aud", 0.7078143121927827),
        ("ils", 0.29612081311363503),
        ("chf", 1.0928961554056371),
        ("usd", 1.0),
        ("eur", 1.171783425225877),
        ("rub", 0.012852809604990688),
        ("gbp", 1.2916554735187644),
        ("btc", 11879.132698717296),
        ("inr", 0.013615817231245796),
        ("mxn", 0.047296753463246695),
        ("brl", 0.1771008654705292),
    )
}

In [None]:
flows = pd.read_parquet("flows.parquet")
flow_stats = pd.read_parquet("flow_stats.parquet")

# "turnover_weight" is stored as a JSON string; decode it into Python objects.
flow_stats["turnover_weight"] = flow_stats["turnover_weight"].apply(json.loads)

# Column-wise USD conversion. A currency without a rate fails loudly (as a
# dictionary lookup would) instead of silently producing NaN amounts.
flow_rates = flows["source_currency"].map(currency_rates)
if flow_rates.isna().any():
    unknown_currencies = flows.loc[flow_rates.isna(), "source_currency"].unique().tolist()
    raise KeyError(f"No USD rate for currencies: {unknown_currencies}")
flows["amount_usd"] = flows["source_amount"].to_numpy() * flow_rates.to_numpy()

ml_nodes = set(pd.read_parquet("ml_nodes.parquet")["ml"].tolist())

# Flows whose decoded "turnover_weight" has exactly one entry are removed from
# both tables.
is_single_node = flow_stats["turnover_weight"].apply(len) == 1
single_node_flows = flow_stats.loc[is_single_node, "id"].tolist()
flows = flows[~flows["id"].isin(single_node_flows)].reset_index(drop=True)
flow_stats = flow_stats[~flow_stats["id"].isin(single_node_flows)].reset_index(
    drop=True
)
print(flow_stats.shape[0], len(single_node_flows))

In [None]:
# Parquet location the edge-labelled transactions are written to and read back from.
location_transactions = "transactions_with_edges"

In [None]:
%%time

# Projection applied before writing: account ids cut to their first 8
# characters, the timestamp passed through unix_timestamp, and the source-side
# amount/currency renamed to "amount"/"currency".
columns = [
    sf.substring("source", 1, 8).alias("source"),
    sf.substring("target", 1, 8).alias("target"),
    "source_bank",
    "target_bank",
    sf.unix_timestamp("timestamp").alias("timestamp"),
    sf.col("source_amount").alias("amount"),
    sf.col("source_currency").alias("currency"),
]

# Row filter, evaluated on the original (untruncated) columns.
same_currency = sf.col("source_currency") == sf.col("target_currency")
not_self_transfer = sf.col("source") != sf.col("target")

# Direction-independent edge label "<smaller id>-<larger id>"; it is attached
# after the projection, so it is built from the truncated ids.
source_first = sf.concat(sf.col("source"), sf.lit("-"), sf.col("target"))
target_first = sf.concat(sf.col("target"), sf.lit("-"), sf.col("source"))
edge_label = sf.when(sf.col("source") < sf.col("target"), source_first).otherwise(
    target_first
)

(
    data.where(same_currency & not_self_transfer)
    .select(*columns)
    .withColumn("edge", edge_label)
    .repartition(1)
    .write.parquet(location_transactions, mode="overwrite")
)

In [None]:
# Transactions indexed by their edge label (the index is not unique: one row
# per transaction).
transactions = pd.read_parquet(location_transactions).set_index("edge")

# Column-wise USD conversion instead of a Python callback per row. A currency
# without a rate fails loudly (as a dictionary lookup would) instead of
# silently producing NaN amounts.
transaction_rates = transactions["currency"].map(currency_rates)
if transaction_rates.isna().any():
    unknown_currencies = (
        transactions.loc[transaction_rates.isna().to_numpy(), "currency"].unique().tolist()
    )
    raise KeyError(f"No USD rate for currencies: {unknown_currencies}")
# Plain arrays are multiplied so that the duplicated index plays no role.
transactions["amount_usd"] = (
    transaction_rates.to_numpy() * transactions["amount"].to_numpy()
)

In [None]:
# Directory that receives one parquet part per chunk of communities.
location = "transactions_communities"

In [None]:
%%time

# Upper bound on the number of communities handled per output part.
COMMUNITIES_PER_CHUNK = 50_000

# Start from an empty output directory.
shutil.rmtree(location, ignore_errors=True)
os.mkdir(location)

communities_keys = list(communities_unique.keys())

# At least one chunk, because np.array_split rejects zero sections.
number_of_chunks = max(
    1, int(np.ceil(len(communities_keys) / COMMUNITIES_PER_CHUNK))
)
chunks = np.array_split(communities_keys, number_of_chunks)
for index, chunk in enumerate(chunks):
    # One (community key, edge label) row for every unordered pair of members.
    comm_inner = []
    for key in chunk:
        comm_node = communities_unique[key]
        for pair in combinations(comm_node, 2):
            first, second = sorted(pair)
            comm_inner.append((key, f"{first}-{second}"))
    edge_combinations = pd.DataFrame(comm_inner, columns=["id", "edge"]).set_index(
        "edge"
    )
    # The inner join keeps only pairs that have matching rows in `transactions`.
    edge_combinations.join(transactions, how="inner").reset_index(drop=True).to_parquet(
        f"{location}/part-{index}.parquet"
    )
    if not (index % 20):
        print(index, len(chunks))

In [None]:
# Output directory handed to each features.py worker.
location_features_global = "features_global"

In [None]:
%%time

NUMBER_OF_PROCESSES = 10

# Input parts ordered by the integer in "part-<n>.parquet".
parts = sorted(
    glob(f"{location}/*.parquet"),
    key=lambda x: int(x.split("-")[-1].split(".")[0]),
)

# Start from an empty output directory.
shutil.rmtree(location_features_global, ignore_errors=True)
os.mkdir(location_features_global)

# Each worker is started in the background with a fresh UUID as its last
# argument. NOTE(review): get_processes presumably finds the still-running
# workers through these ids - confirm against common.get_processes.
process_ids = set()
process_name = "features.py"
while parts:
    if len(get_processes(process_ids)) < NUMBER_OF_PROCESSES:
        process_id = str(uuid.uuid4())
        process_ids.add(process_id)
        os.system(
            f"{sys.executable} {process_name} {parts.pop()} {location_features_global} {process_id} &"
        )
    else:
        # All slots are taken: wait before polling again instead of spinning.
        time.sleep(1)

# Block until every launched worker has finished.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Number of lowest-scoring (most anomalous) communities kept for evaluation.
TOP_N_ANOMALIES = 25_000
# Alternative shortlist sizes (uncomment one to use it):
# TOP_N_ANOMALIES = 50_000
# TOP_N_ANOMALIES = 75_000

In [None]:
features_global = pd.read_parquet("features_global")
# The text before the first "-" of a key is treated as the parent key.
features_global["key_parent"] = features_global["key"].map(
    lambda key: key.split("-")[0]
)

# Identifiers (plus a copy of "turnover") move to `predictions`; the model
# matrix keeps every remaining column, "turnover" included.
identifier_columns = ["key", "key_parent"]
predictions = features_global[identifier_columns + ["turnover"]].copy(deep=True)
features_global = features_global.drop(columns=identifier_columns)

In [None]:
def add_turnover_score(df_input, turnover_cap=100_000):
    """Add a "turnover_score" column: turnover / turnover_cap, capped at 1.

    The frame is modified in place and also returned, so both
    `df = add_turnover_score(df)` and a bare call work.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain a numeric "turnover" column.
    turnover_cap : int or float, default 100_000
        Turnover at (or above) which the score reaches 1.

    Returns
    -------
    pandas.DataFrame
        The same object as `df_input`, with "turnover_score" added/overwritten.
    """
    df_input["turnover_score"] = (df_input["turnover"] / turnover_cap).clip(upper=1)
    return df_input

In [None]:
%%time

# Fixed seed so that repeated runs produce the same scores and therefore the
# same shortlist.
RANDOM_STATE = 42

model = IsolationForest(random_state=RANDOM_STATE)
model.fit(features_global)
# decision_function: the lower the value, the more abnormal the sample.
predictions.loc[:, "anomaly_score"] = model.decision_function(features_global)
predictions = add_turnover_score(predictions)

In [None]:
# For every parent keep its single most anomalous community, i.e. the row with
# the lowest anomaly score (not the smallest key string).
most_anomalous_per_parent = predictions.groupby("key_parent")["anomaly_score"].idxmin()
top_anomalies = predictions.loc[most_anomalous_per_parent, :]
# Shortlist the TOP_N_ANOMALIES lowest scores overall.
top_anomalies = top_anomalies.sort_values("anomaly_score").head(TOP_N_ANOMALIES)
communities_shortlisted = {x: communities_unique[x] for x in top_anomalies["key"]}
print(f"Number of commmununities: {len(communities_shortlisted):,}")
# One more than the largest shortlisted community or flow; used as the
# per-item total of the confusion-matrix cells further below.
max_comm_size = max(
    [len(x) for x in communities_shortlisted.values()] + [len(x) for x in flow_stats["turnover_weight"]]
) + 1
print(f"Max community size: {max_comm_size}")

In [None]:
# Turnover per flow id: for every node, (amount received - amount sent) inside
# the flow; the positive differences are summed and rounded up.
turnover_by_flow = {}
for key, df in flows.groupby("id"):
    left = (
        df.loc[:, ["target", "amount_usd"]]
        .rename(columns={"target": "source"})
        .groupby("source")
        .agg({"amount_usd": "sum"})
    )
    right = df.groupby("source").agg({"amount_usd": "sum"})
    result = left.join(right, how="outer", lsuffix="_left").fillna(0).reset_index()
    result.loc[:, "delta"] = result["amount_usd_left"] - result["amount_usd"]
    turnover = float(result[result["delta"] > 0]["delta"].sum())
    turnover_by_flow[key] = int(np.ceil(turnover))

# A single lookup replaces one boolean-mask assignment per flow; ids without
# any rows in `flows` get 0.
flow_stats["turnover"] = (
    flow_stats["id"].map(turnover_by_flow).fillna(0).astype(int)
)
# add_turnover_score returns the whole frame with "turnover_score" added, so
# the frame itself (not a single column) is what gets rebound.
flow_stats = add_turnover_score(flow_stats)

## Context-Weighted Confusion Matrix

In [None]:
%%time

# Inverted index node -> ids of the shortlisted communities containing it, so
# that each flow is only compared with communities it actually overlaps.
comm_position = {
    comm_id: position for position, comm_id in enumerate(communities_shortlisted)
}
node_to_comm_ids = {}
for comm_id, comm in communities_shortlisted.items():
    for node in set(comm):
        node_to_comm_ids.setdefault(node, []).append(comm_id)

matches = []
not_found = []
for index, stats in flow_stats.iterrows():
    key = stats["id"]
    turnover_weight = stats["turnover_weight"]
    flow_nodes = set(turnover_weight.keys())
    overlapping_ids = {
        comm_id
        for node in flow_nodes
        for comm_id in node_to_comm_ids.get(node, ())
    }
    candidates = []
    # Visit candidates in the order of `communities_shortlisted`, so that ties
    # between equally good candidates resolve to the earliest community.
    for comm_id in sorted(overlapping_ids, key=comm_position.__getitem__):
        comm = communities_shortlisted[comm_id]
        matched = flow_nodes.intersection(comm)
        matched_score = sum(turnover_weight[x] for x in matched)
        # We are not counting the isolated "is_laundering" transactions as true-positives,
        # - though they should not be counted towards false positives either
        non_matched = set(comm) - ml_nodes
        candidates.append((matched_score, non_matched, comm_id))
    to_score = float(stats["turnover_score"])
    if candidates:
        # Best candidate: highest matched weight, then fewest non-laundering members.
        best_score, best_non_matched, best_comm_id = max(
            candidates, key=lambda x: (x[0], -len(x[1]))
        )
        true_positives = best_score
        false_negatives = len(flow_nodes) - true_positives
        false_positives = len(best_non_matched)
        true_negatives = max_comm_size - false_positives - len(flow_nodes)
        # The four cells must add up to max_comm_size before normalisation.
        total = int(round(sum((true_positives, false_negatives, false_positives, true_negatives))))
        assert total == max_comm_size
        true_positives /= max_comm_size
        false_positives /= max_comm_size
        true_negatives /= max_comm_size
        false_negatives /= max_comm_size
        # Each cell is weighted by the flow's turnover score.
        matches.append(
            (
                true_positives * to_score,
                false_positives * to_score,
                true_negatives * to_score,
                false_negatives * to_score,
                best_comm_id,
            )
        )
    else:
        # No shortlisted community touches this flow: all of it is a false negative.
        not_found.append((0, 0, 0, to_score, key))
    if not (index % 5_000):
        print(index)

In [None]:
# Shortlisted communities that were not chosen as the best match of any flow,
# paired with their size, and the corresponding rows of `predictions`.
unmatched_ids = set(communities_shortlisted).difference(match[4] for match in matches)
non_matches = [
    (comm_id, len(communities_shortlisted[comm_id])) for comm_id in unmatched_ids
]
unmatched_mask = predictions["key"].isin([comm_id for comm_id, _ in non_matches])
non_matches_pd = predictions.loc[unmatched_mask, :].reset_index(drop=True)

In [None]:
# Confusion-matrix cells for shortlisted communities that matched no flow:
# they contribute only false positives and true negatives.
non_matches_cm = []
for index, row in non_matches_pd.iterrows():
    # Weight and members come from the current row, not from variables left
    # over from an earlier loop.
    to_score = float(row["turnover_score"])
    comm = communities_shortlisted[row["key"]]
    # We are not counting the isolated "is_laundering" transactions as true-positives,
    # - though they should not be counted towards false positives either
    non_matched = set(comm) - ml_nodes
    false_positives = len(non_matched)
    true_negatives = max_comm_size - false_positives
    false_positives /= max_comm_size
    true_negatives /= max_comm_size
    non_matches_cm.append((0, false_positives * to_score, true_negatives * to_score, 0, row["key"]))

In [None]:
# Column-wise totals of the (tp, fp, tn, fn) cells over all three result
# lists; the trailing identifier of each tuple is ignored.
totals = [0, 0, 0, 0]
for *cells, _ in matches + non_matches_cm + not_found:
    for position, value in enumerate(cells):
        totals[position] += value
tp_all, fp_all, tn_all, fn_all = totals

In [None]:
# Display the weighted totals truncated to integers, in the order (TP, FP, TN, FN).
int(tp_all), int(fp_all), int(tn_all), int(fn_all)

In [None]:
# F1 = 2*TP / (2*TP + FP + FN), computed on the weighted totals.
f1 = (2 * tp_all) / ((2 * tp_all) + fp_all + fn_all)
f1

In [None]:
# Recall = TP / (TP + FN), computed on the weighted totals.
recall = tp_all / (tp_all + fn_all)
recall
# Previously noted values: 0.42, 0.55
# NOTE(review): the runs these two numbers belong to are not recorded here - confirm.

In [None]:
# Precision = TP / (TP + FP), computed on the weighted totals.
precision = tp_all / (tp_all + fp_all)
precision

In [None]:
# Whole seconds elapsed since `start_script` was set, printed as a timedelta.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")