In [None]:
from glob import glob

import numpy as np
import pandas as pd

In [None]:
# Load every flow together with the per-flow statistics table.
flows = pd.read_parquet("flows.parquet")
flow_stats = pd.read_parquet("flow_stats.parquet")

# Distinct flow ids before any filtering; denominator of the
# "Remaining ...%" figures printed below.
original_size = flows["id"].nunique()

# Filter 1: keep only the stats rows whose number_components equals 1.
flow_stats = flow_stats[flow_stats["number_components"] == 1].reset_index(drop=True)
reduction = round(len(flow_stats) / original_size * 100, 2)
print(f"Remaining {reduction}%")

# Filter 2: among rows of type "random" or "cycle", drop every id whose
# sub_type is not one of "max 1 hops" .. "max 4 hops".
cycles = flow_stats.loc[
    flow_stats["type"].isin(["random", "cycle"]),
    ["id", "sub_type", "max_distance"],
]
hops_allowed = [f"max {x} hops" for x in range(1, 5)]
exclude = cycles.loc[~cycles["sub_type"].isin(hops_allowed), "id"].tolist()
flow_stats = flow_stats[~flow_stats["id"].isin(exclude)].reset_index(drop=True)
reduction = round(len(flow_stats) / original_size * 100, 2)
print(f"Remaining {reduction}%")

# Restrict the flows themselves to the ids that survived both filters.
flows = flows[flows["id"].isin(flow_stats["id"].unique())].reset_index(drop=True)

# Sanity check: the last value printed is the set of ids present in only one
# of the two frames.
print(original_size, flows["id"].nunique(), set(flows["id"]).symmetric_difference(flow_stats["id"]))

In [None]:
# Location handed to pandas' parquet reader for the predictions table.
location_predictions = "predictions"
predictions = pd.read_parquet(path=location_predictions)

In [None]:
# Summed anomaly_score per id, ordered from the highest total downwards.
predictions_aggregated = (
    predictions.groupby("id")
    .agg({"anomaly_score": "sum"})
    .reset_index()
    .sort_values("anomaly_score", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Largest single anomaly_score per id, ordered from the highest downwards.
predictions_aggregated_max = (
    predictions.groupby("id")
    .agg({"anomaly_score": "max"})
    .reset_index()
    .sort_values("anomaly_score", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Ids of the first 25000 rows of predictions_aggregated (ranked by summed
# anomaly_score). The set is built in this cell so that displaying it does not
# depend on a later cell having been executed first; on a fresh kernel the bare
# name would otherwise raise NameError under Restart & Run All.
anomalous_comunities_agg = set(predictions_aggregated.head(25000)["id"].unique())
anomalous_comunities_agg

In [None]:
# Every id that appears in the predictions table.
anomalous_comunities_all = predictions["id"].unique()

# Ids of the first 25000 rows under each ranking (summed score, max score).
anomalous_comunities_agg = set(predictions_aggregated["id"].head(25000).unique())
anomalous_comunities_max = set(predictions_aggregated_max["id"].head(25000).unique())

# Working set: ids selected by either ranking.
anomalous_comunities = anomalous_comunities_agg | anomalous_comunities_max
# Alternative (disabled): use every predicted id instead.
# anomalous_comunities = anomalous_comunities_all

print(len(anomalous_comunities))

In [None]:
location = "transactions_communities"

# Order the part files by the integer found between the last "-" of the path
# and the first "." that follows it.
parts = sorted(
    glob(f"{location}/*.parquet"),
    key=lambda path: int(path.rsplit("-", 1)[-1].split(".")[0]),
)

# Read one part at a time and keep only rows whose id is in
# anomalous_comunities; the filtered pieces are concatenated at the end.
predictions_communities = []
for part in parts:
    part_comm_trxs = pd.read_parquet(part)
    predictions_communities.append(
        part_comm_trxs[part_comm_trxs["id"].isin(anomalous_comunities)].copy()
    )
predictions_communities = pd.concat(predictions_communities, ignore_index=True)

In [None]:
# Multiplier applied to an amount in the given currency to obtain amount_usd
# (see the conversion cell below); every rate is stored as a numpy float32.
currency_rates = {
    code: np.float32(rate)
    for code, rate in (
        ("jpy", 0.009487665410827868),
        ("cny", 0.14930721887033868),
        ("cad", 0.7579775434031815),
        ("sar", 0.2665884611958837),
        ("aud", 0.7078143121927827),
        ("ils", 0.29612081311363503),
        ("chf", 1.0928961554056371),
        ("usd", 1.0),
        ("eur", 1.171783425225877),
        ("rub", 0.012852809604990688),
        ("gbp", 1.2916554735187644),
        ("btc", 11879.132698717296),
        ("inr", 0.013615817231245796),
        ("mxn", 0.047296753463246695),
        ("brl", 0.1771008654705292),
    )
}

In [None]:
%%time

def _amounts_in_usd(amounts, currencies, rates):
    """Convert a column of amounts to USD with one vectorized multiplication.

    Replaces a row-by-row ``DataFrame.apply(..., axis=1)``: the rate is looked
    up once per row through ``Series.map`` and the product is computed on whole
    columns. It also works on an empty frame, where the row-wise ``apply`` does
    not yield a Series to assign.

    Parameters
    ----------
    amounts : pandas.Series
        Amounts expressed in the currency given by ``currencies``.
    currencies : pandas.Series
        Currency code of each row; expected to share the index of ``amounts``.
    rates : dict
        Maps a currency code to the multiplier that yields USD.

    Returns
    -------
    pandas.Series
        ``amounts * rate``, with the rate cast to float64 before multiplying.

    Raises
    ------
    KeyError
        If any row carries a currency (including a missing value) that has no
        entry in ``rates`` -- the same failure the per-row dict lookup produced.
    """
    rate_per_row = currencies.map(rates)
    # Series.map leaves NaN where the dict has no entry; surface that loudly
    # instead of letting NaN amounts flow into later cells.
    missing = currencies[rate_per_row.isna()].unique()
    if len(missing) > 0:
        raise KeyError(f"no conversion rate for currency codes: {list(missing)}")
    return amounts * rate_per_row.astype("float64")


predictions_communities.loc[:, "amount_usd"] = _amounts_in_usd(
    predictions_communities["amount"], predictions_communities["currency"], currency_rates
)
flows.loc[:, "amount_usd"] = _amounts_in_usd(
    flows["source_amount"], flows["source_currency"], currency_rates
)

In [None]:
# Total retained balance (in amount_usd units) at which the score saturates at 1.
HIGH_LAUNDER_AMOUNT = 1e6


def risk_score(sample):
    """Score one flow by the funds left on its accounts after replaying it.

    The transactions are replayed in ``timestamp`` order against running
    per-account balances that all start at 0. Each transaction debits the
    source (a balance never drops below 0) and then credits the target. The
    score is the sum of all final balances divided by ``HIGH_LAUNDER_AMOUNT``,
    capped at 1.

    Parameters
    ----------
    sample : pandas.DataFrame
        Transactions of a single flow. Must contain the columns
        ``timestamp``, ``amount_usd``, ``source`` and ``target``.

    Returns
    -------
    float or int
        ``score`` when it is below 1, otherwise the integer ``1``. An empty
        ``sample`` yields ``0.0``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    balances = {}
    ordered = sample.sort_values("timestamp")
    # Iterate the three needed columns directly instead of building a Series
    # and a dict for every row via iterrows().
    for source, target, amount in zip(
        ordered["source"], ordered["target"], ordered["amount_usd"]
    ):
        # Store the debit before reading the target balance. When source and
        # target are the same account, reading both balances up front would
        # let the credit overwrite the debit and grow the balance by the full
        # amount on every self-transfer.
        balances[source] = max(balances.get(source, 0) - amount, 0)
        balances[target] = balances.get(target, 0) + amount
    score = sum(balances.values()) / HIGH_LAUNDER_AMOUNT
    return score if score < 1 else 1

In [None]:
# Apply risk_score to every group of rows sharing the same "id", passing only
# the four columns the function reads, and convert the result to a dict.
# The dict is expected to be keyed by flow id (the groupby key).
risk_score_per_flow = flows.groupby(
    "id"
)[["timestamp", "amount_usd", "source", "target"]].apply(risk_score).to_dict()

In [None]:
# Sum of the per-flow scores over all flows.
total_risk = sum(risk_score_per_flow.values())
# Give every row the score of the flow it belongs to; an id that is absent
# from risk_score_per_flow raises KeyError.
flows.loc[:, "risk_score"] = flows["id"].apply(
    lambda flow_id: risk_score_per_flow[flow_id]
)

In [None]:
# with open("synthetic-data/HI-Large_Patterns.txt") as fl:
#     for line in fl.readlines():
#         if "806A1707" in line and "8095A93C" in line:
#             print(line)

In [None]:
# Distinct (source, target) pairs present in each table.
flows_edges = {
    (src, dst) for src, dst in flows[["source", "target"]].values
}
predictions_edges = {
    (src, dst) for src, dst in predictions_communities[["source", "target"]].values
}

In [None]:
# Number of distinct edges in the flows and in the selected communities.
(len(flows_edges), len(predictions_edges))

In [None]:
# Fraction of the flow edges that also appear among the community edges.
len(flows_edges & predictions_edges) / len(flows_edges)