In [2]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd

import settings as s

assert s.FILE_SIZE == "Large"
assert s.HIGH_ILLICIT == False

os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights
from communities import get_communities_spark
from evaluation import cw_confusion_matrix, cw_recall, cw_f1
from features import generate_features_spark, generate_features_udf_wrapper, SCHEMA_FEAT_UDF, CURRENCY_RATES

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
start_script = time.time()

In [3]:
if (
    sys.version_info.major,
    sys.version_info.minor,
    sys.version_info.micro,
) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [4]:
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1)
]

shutil.rmtree("artifacts", ignore_errors=True)
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/24 20:53:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
FLOWS_FORMAT_SCOPE = ["ACH", "Bitcoin", "Wire"]
MAX_DEGREE_PER_ACCOUNT = 100
MAX_TRANSACTIONS_PER_ACCOUNT = 1_000

location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

location_communities_leiden = f"{location_main}{os.sep}communities_leiden.parquet"

location_features_leiden = f"{location_main}{os.sep}features_leiden.parquet"
location_features_ego = f"{location_main}{os.sep}features_ego.parquet"
location_features_2_hop = f"{location_main}{os.sep}features_2_hop.parquet"
location_features_2_hop_out = f"{location_main}{os.sep}features_2_hop_out.parquet"
location_features_2_hop_in = f"{location_main}{os.sep}features_2_hop_in.parquet"
location_features_2_hop_combined = f"{location_main}{os.sep}features_2_hop_combined.parquet"
location_features_source = f"{location_main}{os.sep}features_source.parquet"
location_features_target = f"{location_main}{os.sep}features_target.parquet"

location_flow_dispense = f"{location_main}{os.sep}location_flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}location_flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}location_flow_sink.parquet"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"

try:
    os.makedirs(location_main)
except FileExistsError:
    pass

In [6]:
%%time

data = spark.read.parquet(s.STAGED_DATA_LOCATION)

#### [START] Seed selection ####

data = data.where(sf.col("source") != sf.col("target"))
data = data.where(sf.col("format").isin(FLOWS_FORMAT_SCOPE))

large_sources = (
    data.groupby("source")
    .count()
    .where(sf.col("count") > MAX_TRANSACTIONS_PER_ACCOUNT)
    .select("source")
    .toPandas()["source"]
    .tolist()
)
large_targets = (
    data.groupby("target")
    .count()
    .where(sf.col("count") > MAX_TRANSACTIONS_PER_ACCOUNT)
    .select("target")
    .toPandas()["target"]
    .tolist()
)

large_sources = set(large_sources).union(
    data.groupby("source")
    .agg(sf.countDistinct("target").alias("count"))
    .where(sf.col("count") > MAX_DEGREE_PER_ACCOUNT)
    .select("source")
    .toPandas()["source"]
    .tolist()
)
large_targets = set(large_targets).union(
    data.groupby("target")
    .agg(sf.countDistinct("source").alias("count"))
    .where(sf.col("count") > MAX_DEGREE_PER_ACCOUNT)
    .select("target")
    .toPandas()["target"]
    .tolist()
)

data = data.where(~sf.col("source").isin(large_sources))
data = data.where(~sf.col("target").isin(large_targets))

graph = ig.Graph.DataFrame(data.select("source", "target").toPandas(), use_vids=False, directed=True)
nodes_mapping = {x.index: x["name"] for x in graph.vs()}
cc = graph.connected_components("weak")
small_components = [x for x in cc if len(x) < 3]
small_nodes = [nodes_mapping[x] for y in small_components for x in y]

data = data.where(~sf.col("source").isin(small_nodes))
data = data.where(~sf.col("target").isin(small_nodes))

data = data.repartition(os.cpu_count() * 5).persist(StorageLevel.DISK_ONLY)
print(data.count())

#### [END] Seed selection ####

25/07/24 20:55:28 WARN DAGScheduler: Broadcasting large task binary with size 6.2 MiB
                                                                                

26446978
CPU times: user 41 s, sys: 12.6 s, total: 53.6 s
Wall time: 2min 25s


In [7]:
%%time

flows = pd.read_parquet(s.STAGED_CASES_DATA_LOCATION)
flows.loc[:, "src"] = flows["source"].str.slice(0, 8).tolist()
flows.loc[:, "tgt"] = flows["target"].str.slice(0, 8).tolist()
flows_nodes = set(flows["src"].unique()).union(flows["tgt"].unique())

valid_flows = []
flows_hash = defaultdict(list)
for flow_id, flow in flows.groupby("id"):
    flow_nodes = set(flow["src"].unique()).union(flow["tgt"].unique())
    # Any flow with less than 2 source/target does not qualify as a "flow"
    if len(flow_nodes) > 2:
        flow_graph = ig.Graph.DataFrame(flow.loc[:, ["src", "tgt"]], use_vids=False, directed=True)
        num_components = len(flow_graph.connected_components("weak"))
        if num_components == 1:
            valid_flows.append(flow_id)
            for f_n in flow_nodes:
                flows_hash[f_n].append(flow_id)
flows = flows.loc[flows["id"].isin(valid_flows), :].reset_index(drop=True)
flows_nodes_filtered = set(flows["src"].unique()).union(flows["tgt"].unique())
print(len(valid_flows), len(flows_nodes_filtered))

1360 12473
CPU times: user 1.57 s, sys: 61.4 ms, total: 1.63 s
Wall time: 1.66 s


In [8]:
nodes_filtered = data.select(
    sf.col("source").alias("x")
).union(data.select(sf.col("target").alias("x"))).distinct().toPandas()["x"].tolist()

assert (len(set(nodes_filtered).intersection(flows_nodes_filtered)) / len(flows_nodes_filtered)) == 1
print(len(nodes_filtered))

                                                                                

1346071


In [9]:
%%time

data = data.withColumn("is_laundering", sf.col("is_laundering").cast("boolean"))
edges = data.groupby(["source", "target"]).agg(
    sf.sum("amount").alias("amount")
).toPandas()
weights = get_weights(edges)
edges_agg = edges.set_index(["source", "target"]).join(
    weights.set_index(["source", "target"]), how="left"
).reset_index()
edges_agg.loc[:, "amount_weighted"] = (
    edges_agg.loc[:, "amount"] * 
    (edges_agg.loc[:, "weight"] / edges_agg.loc[:, "weight"].max())
)

                                                                                

CPU times: user 20.5 s, sys: 547 ms, total: 21.1 s
Wall time: 30 s


In [10]:
# Later on, we will reset the variables (to free up memory), while still keeping these intact
to_keep = %who_ls
to_keep = list(to_keep)

In [11]:
%%time

TOP_N = 50
NUM_HOPS = 5

data_input = spark.createDataFrame(edges_agg)
nodes_source = set(edges_agg["source"].unique())
nodes_target = set(edges_agg["target"].unique())
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 3,419,956 | 1,188,542
Processed hop #2 | 3,958,383 | 998,337
Processed hop #3 | 4,514,218 | 944,260
Processed hop #4 | 4,946,552 | 932,800
Processed hop #5 | 5,214,606 | 930,260

Processing comm_as_target

Processed hop #1 | 3,301,246 | 758,923
Processed hop #2 | 3,714,122 | 567,766
Processed hop #3 | 4,061,439 | 473,050
Processed hop #4 | 4,552,930 | 437,574
Processed hop #5 | 5,006,711 | 425,609

Processing comm_as_passthrough

Processed hop #1 | 1,583,527 | 601,394
Processed hop #2 | 1,694,361 | 457,529
Processed hop #3 | 1,975,855 | 424,371
Processed hop #4 | 2,197,564 | 417,108
Processed hop #5 | 2,341,535 | 415,442

Processing comm_as_passthrough_reverse

Processed hop #1 | 3,070,214 | 601,394
Processed hop #2 | 3,431,506 | 454,413
Processed hop #3 | 3,677,663 | 387,820
Processed hop #4 | 4,052,239 | 362,380
Processed hop #5 | 4,406,899 | 353,952


comm_as_source_features

CPU times: user 3min 2s, sys: 9.18 s, total: 3min 12s
Wall ti

In [12]:
%%time

print("Constructing Leiden communities")

graph = ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True)
nodes_mapping = {x.index: x["name"] for x in graph.vs()}
communities_leiden = la.find_partition(
    graph, la.ModularityVertexPartition, n_iterations=100, weights="amount_weighted"
)
communities_leiden = [[nodes_mapping[_] for _ in x] for x in communities_leiden]
communities_leiden = [(str(uuid.uuid4()), set(x)) for x in communities_leiden]
sizes_leiden = [len(x[1]) for x in communities_leiden]

with open(location_communities_leiden, "wb") as fl:
    pickle.dump(communities_leiden, fl)

Constructing Leiden communities


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


CPU times: user 23min 50s, sys: 13.3 s, total: 24min 4s
Wall time: 24min 7s


In [13]:
# with open(location_communities_leiden, "rb") as fl:
#     communities_leiden = pickle.load(fl)

In [14]:
%%time

print("Constructing 2-hop communities")

communities_2_hop = get_communities_spark(
    [(x, [x]) for x in nodes_filtered],
    ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "all", 0.01, "amount_weighted"
)
sizes_2_hop = [len(x[1]) for x in communities_2_hop]

Constructing 2-hop communities


                                                                                

CPU times: user 11.8 s, sys: 540 ms, total: 12.3 s
Wall time: 3min 55s


In [15]:
%%time

ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()
index = ["source", "target"]
edges_agg.loc[:, index + ["weight"]].set_index(index)
data_graph_agg = data_graph_agg.set_index(index).join(
    edges_agg.loc[:, index + ["weight"]].set_index(index), how="left"
).reset_index()
data_graph_agg.loc[:, "amount_weighted"] = (
    data_graph_agg.loc[:, "amount"] * 
    (data_graph_agg.loc[:, "weight"] / data_graph_agg.loc[:, "weight"].max())
)

                                                                                

3421971


                                                                                

CPU times: user 25.9 s, sys: 1.94 s, total: 27.8 s
Wall time: 51.2 s


In [16]:
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [17]:
%%time

print("Leiden communitites features creation")

features_leiden = generate_features_spark(communities_leiden, graph, spark)
features_leiden = features_leiden.rename(columns={"key": "key_fake"})
communities_leiden_dict = dict(communities_leiden)
features_leiden.loc[:, "key"] = features_leiden.loc[:, "key_fake"].apply(lambda x: communities_leiden_dict[x])
features_leiden = features_leiden.explode("key")
del features_leiden["key_fake"]
features_leiden.set_index("key").to_parquet(location_features_leiden)

Leiden communitites features creation


                                                                                

CPU times: user 1min 8s, sys: 23.9 s, total: 1min 31s
Wall time: 3min 20s


In [18]:
%%time

print("2-hop communitites features creation")

features_2_hop = generate_features_spark(communities_2_hop, graph, spark)
features_2_hop.set_index("key").to_parquet(location_features_2_hop)

2-hop communitites features creation


                                                                                

CPU times: user 12min 7s, sys: 21.8 s, total: 12min 29s
Wall time: 33min 4s


In [19]:
del graph

In [20]:
%%time

print("Temporal flows features creation")

edges_totals = data.select("source", "target", "amount").groupby(
    ["source", "target"]
).agg(sf.count("amount").alias("amount")).toPandas()
edges_totals = edges_totals.sort_values("amount", ascending=False).reset_index(drop=True)
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount"]

left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)
select = []
for column in left.columns:
    select.append(sf.col(column).alias(f"left_{column}"))
left = left.select(*select)
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())

flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
)

aggregate = [
    sf.sum("amount").alias("amount_sum"),
    sf.mean("amount").alias("amount_mean"),
    sf.median("amount").alias("amount_median"),
    sf.max("amount").alias("amount_max"),
    sf.stddev("amount").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

Temporal flows features creation


                                                                                

26323795 26446862
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 6.44 s, sys: 1.33 s, total: 7.77 s
Wall time: 8min 55s


In [21]:
%%time

print("1-hop-source features creation")

features_source = spark.createDataFrame(data_graph_agg).withColumn(
    "key", sf.col("source")
).repartition(os.cpu_count() * 5, "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_source = pd.DataFrame(features_source["features"].apply(json.loads).tolist())
features_source.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_source.columns]
features_source.set_index("key").to_parquet(location_features_source)

1-hop-source features creation


25/07/24 22:25:56 WARN TaskSetManager: Stage 311 contains a task of very large size (2701 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 26s, sys: 6.6 s, total: 1min 33s
Wall time: 15min 48s


In [22]:
%%time

print("1-hop-target features creation")

features_target = spark.createDataFrame(data_graph_agg).withColumn(
    "key", sf.col("target")
).repartition(os.cpu_count() * 5, "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_target = pd.DataFrame(features_target["features"].apply(json.loads).tolist())
features_target.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_target.columns]
features_target.set_index("key").to_parquet(location_features_target)

1-hop-target features creation


25/07/24 22:41:42 WARN TaskSetManager: Stage 314 contains a task of very large size (2701 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 21s, sys: 3.85 s, total: 1min 25s
Wall time: 10min 58s


In [23]:
del data_graph_agg

In [24]:
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    ("flow_dispense", location_flow_dispense),
    ("flow_passthrough", location_flow_passthrough),
    ("flow_sink", location_flow_sink),   
]

In [25]:
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES:
    all_features = all_features.join(
        pd.read_parquet(location), how="outer", rsuffix=f"_{feature_group}"
    )

all_features.to_parquet(location_features_node_level)
print("Features:", all_features.shape)
del all_features

Features: (1346071, 344)


In [26]:
all_features = pd.read_parquet(location_features_node_level)

In [27]:
constants = []
for column in all_features.columns:
    if all_features[column].nunique(dropna=True) <= 1:
        del all_features[column]
        constants.append(column)
print(f"Deleted {len(constants)} constant columns")

Deleted 18 constant columns


In [28]:
medians = {}
for column in all_features.columns:
    medians[column] = np.nanmedian(all_features[column])

In [29]:
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 1:58:55


In [30]:
def add_turnover_score(df_input):
    df_input.loc[:, "turnover_score"] = df_input["turnover"] / 100_000
    df_input.loc[df_input["turnover_score"] > 1, "turnover_score"] = 1
    return df_input

In [31]:
flows.loc[:, "amount"] = flows.apply(
    lambda x: x["source_amount"] * CURRENCY_RATES[x["source_currency"]], axis=1
)

flow_stats = []
for key, df in flows.groupby("id"):
    left = (
        df.loc[:, ["tgt", "amount"]]
        .rename(columns={"tgt": "src"})
        .groupby("src")
        .agg({"amount": "sum"})
    )
    right = df.groupby("src").agg({"amount": "sum"})
    result = left.join(right, how="outer", lsuffix="_left").fillna(0).reset_index()
    result.loc[:, "delta"] = result["amount_left"] - result["amount"]
    turnover = float(result[result["delta"] > 0]["delta"].sum())
    turnover_weight = (
        result.set_index("src").apply(
            lambda x: max([x["amount_left"], x["amount"]]), axis=1
        )
        / turnover
    ).to_dict()
    turnover_weight = {
        k: v / (sum(turnover_weight.values()) / len(turnover_weight)) for k, v in turnover_weight.items()
    }
    nodes = sorted(set(df["src"]).union(df["tgt"]))
    flow_stats.append(
        {"id": key, "nodes": nodes, "turnover": int(np.ceil(turnover)), "turnover_weight": turnover_weight}
    )
flow_stats = pd.DataFrame(flow_stats)
flow_stats = add_turnover_score(flow_stats)

In [32]:
%%time

print("Training the model")

anomalies = all_features.loc[:, ["turnover"]].fillna(0)
model = IsolationForest(n_estimators=10_000)
anomalies.loc[:, "anomaly_score"] = model.fit(
    all_features.fillna(medians)
).decision_function(all_features.fillna(medians))
anomalies = anomalies.sort_values("anomaly_score", ascending=True)
anomalies = add_turnover_score(anomalies)

Training the model
CPU times: user 3min 53s, sys: 21.2 s, total: 4min 14s
Wall time: 4min 15s


In [33]:
count_50_perc = round(anomalies.shape[0] * 0.5)
top_50_perc_normal = anomalies.sort_values("anomaly_score", ascending=False).head(count_50_perc)
top_50_perc_normal = set(top_50_perc_normal.index)

In [34]:
communities_2_hop_dict = dict(communities_2_hop)
communities_filtered = defaultdict(list)
for node, comm in communities_2_hop_dict.items():
    comm = tuple(sorted(comm - top_50_perc_normal))
    communities_filtered[comm].append(node)

In [35]:
communities_filtered_rev = {}
for key, value in communities_filtered.items():
    for x in value:
        communities_filtered_rev[x] = key

In [36]:
libra_ml_accounts = 600
libra_anomalies_counts = [385, 770, 1925, 3851]
libra_anomalies_actuals_ratio = [x / libra_ml_accounts for x in libra_anomalies_counts]
ibm_anomalies_counts = [round(flows["id"].nunique() * x) for x in libra_anomalies_actuals_ratio]
ibm_anomalies_counts

[873, 1745, 4363, 8729]

In [37]:
%%time

results = []
for anomalies_count in ibm_anomalies_counts:
    processed = set()
    top_communities = {}
    for index, row in anomalies.iterrows():
        comm = set(communities_filtered_rev[index]) - processed
        if len(comm) > 2:
            processed = processed.union(comm)
            top_communities[index] = comm
        if len(top_communities) >= anomalies_count:
            break
    communities_shortlisted = {
        k: v for k, v in top_communities.items()
    }
    max_comm_size = max(
        [len(x) for x in communities_shortlisted.values()] + [len(x) for x in flow_stats["turnover_weight"]]
    ) + 1
    assert len(top_communities) == len(communities_shortlisted)
    tp, fp, tn, fn = cw_confusion_matrix(flow_stats, communities_shortlisted, max_comm_size, anomalies, flows_hash)
    recall, f1 = cw_recall(tp, fn), cw_f1(tp, fp, fn)
    print(f"{recall=} | {max_comm_size=} | {anomalies_count=}")
    anom_comm_sizes = [len(x) for x in communities_shortlisted.values()]
    results.append({
        "anomalies_count": anomalies_count,
        "recall": recall,
        "max_comm_size": max_comm_size,
        "mean_comm_size": np.mean(anom_comm_sizes),
        "median_comm_size": np.median(anom_comm_sizes),
        "confusion_matrix": [tp, fp, tn, fn],
    })
results = pd.DataFrame(results)
results.to_parquet(f".{os.sep}results{os.sep}{s.OUTPUT_POSTFIX[1:]}.parquet")

recall=0.0343 | max_comm_size=104 | anomalies_count=873
recall=0.0747 | max_comm_size=104 | anomalies_count=1745
recall=0.1904 | max_comm_size=104 | anomalies_count=4363
recall=0.3506 | max_comm_size=104 | anomalies_count=8729
CPU times: user 7.21 s, sys: 421 ms, total: 7.63 s
Wall time: 7.64 s


In [5]:
results = pd.read_parquet(f".{os.sep}results{os.sep}{s.OUTPUT_POSTFIX[1:]}.parquet")