In [None]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from itertools import combinations
from datetime import timedelta
from glob import glob

import igraph as ig
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

import settings as s
from common import get_processes

%load_ext autoreload
%autoreload 2

In [None]:
# Refuse to run on anything but the exact interpreter release this notebook
# was tested with.
required_version = (3, 11, 8)
if sys.version_info[:3] != required_version:
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [None]:
# Wall-clock start of the run; the last cell subtracts it from time.time()
# to report the total execution time.
start_script = time.time()

In [None]:
# Mapping from the column names used in the CSV to the names used in the
# rest of this notebook.
rename = {
    "id_source": "source",
    "id_destination": "target",
    "cum_amount": "amount",
    "nr_transactions": "transactions_count",
    "nr_alerts": "alerts_count",
    "nr_reports": "reports_count",
}

# Read the edge list, normalise the column names, turn both endpoint ids into
# "nid-<int>" string labels and keep only the columns used downstream.
data = (
    pd.read_csv("./open-data/Libra_bank_3months_graph/data.csv")
    .rename(columns=rename)
    .assign(
        source=lambda frame: frame["source"].astype(str).map(lambda raw: f"nid-{int(raw)}"),
        target=lambda frame: frame["target"].astype(str).map(lambda raw: f"nid-{int(raw)}"),
    )
    .loc[:, ["source", "target", "amount", "transactions_count", "alerts_count", "reports_count"]]
)

In [None]:
# One row per node id seen on either end of an edge. dict.fromkeys removes
# duplicates while keeping first-seen order, so the row order is identical on
# every run (iterating a set of strings depends on the per-process hash seed).
node_ids = list(dict.fromkeys(data["source"].tolist() + data["target"].tolist()))
nodes_data = pd.DataFrame(node_ids, columns=["id"])

# Every edge's count is credited to both of its endpoints below, so the
# per-node sums add up to twice the edge total; dividing by 2x the total makes
# the node weights sum to 1.
w_alerts = int(data["alerts_count"].sum() * 2)
w_reports = int(data["reports_count"].sum() * 2)

# Per-node totals, split by the role (source / target) the node plays on the edge.
w_alerts_source = data[data["alerts_count"] > 0].groupby("source")["alerts_count"].sum().to_dict()
w_alerts_target = data[data["alerts_count"] > 0].groupby("target")["alerts_count"].sum().to_dict()

w_reports_source = data[data["reports_count"] > 0].groupby("source")["reports_count"].sum().to_dict()
w_reports_target = data[data["reports_count"] > 0].groupby("target")["reports_count"].sum().to_dict()

# Share of all alerts / reports attributable to each node. When the data set
# contains no alerts (or reports) at all the denominator is 0; every node then
# gets weight 0.0 instead of the cell failing with a division by zero.
nodes_data["alert_weight"] = nodes_data["id"].map(
    lambda x: ((w_alerts_source.get(x, 0) + w_alerts_target.get(x, 0)) / w_alerts) if w_alerts else 0.0
)
nodes_data["report_weight"] = nodes_data["id"].map(
    lambda x: ((w_reports_source.get(x, 0) + w_reports_target.get(x, 0)) / w_reports) if w_reports else 0.0
)

In [None]:
%%time

# Gather the (community id, members) pairs written by the staging step.
# Files are read in sorted order so that the result - and the "<id>-<i>" keys
# derived from it below - does not depend on file-system listing order.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# staging files produced by this pipeline.
communities = []
for filename in sorted(glob("./staging/*.pickle")):
    with open(filename, "rb") as f:
        communities += pickle.load(f)

original_size = len(communities)

# Keep a copy of the merged, not yet de-duplicated list on disk.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)

# De-duplicate by member set: communities with the same members collapse onto
# one sorted tuple; the id of the last one seen is kept.
communities_unique_rev = {}
for comm_id, comm in communities:
    communities_unique_rev[tuple(sorted(comm))] = comm_id

# Key each unique community as "<community id>-<running index>".
communities_unique = {
    f"{v}-{i}": k for i, (k, v) in enumerate(communities_unique_rev.items())
}
# The values are already unique (they were dict keys above), so no further
# set() pass is needed - and skipping it keeps the order deterministic.
communities = list(communities_unique.values())
new_size = len(communities)

# Sizes before / after de-duplication and the retained fraction
# (reported as 0.0 when nothing was loaded, instead of dividing by zero).
print(original_size, new_size, round(new_size / original_size, 2) if original_size else 0.0)
print()

In [None]:
# Community size summary, displayed as (mean, median, max, total memberships).
sizes = list(map(len, communities))
(
    round(np.mean(sizes)),
    round(np.median(sizes)),
    round(np.max(sizes)),
    sum(sizes),
)

In [None]:
# Index every transaction row by an undirected edge label: the two endpoint
# ids joined with "-", smaller id (string comparison) first, so A->B and B->A
# share one label. Built column-wise with np.where rather than a Python
# function call per row; the labels are the same as
# "-".join(sorted([source, target])).
data = data.assign(
    edge=lambda frame: np.where(
        frame["source"] <= frame["target"],
        frame["source"] + "-" + frame["target"],
        frame["target"] + "-" + frame["source"],
    )
).set_index("edge")

In [None]:
# Directory that receives the per-chunk community/edge parquet parts
# (part-<index>.parquet); it is wiped and recreated before they are written.
location = "transactions_communities_libra"

In [None]:
%%time

# Start from an empty output directory so parts from an earlier run cannot
# be picked up again.
shutil.rmtree(location, ignore_errors=True)
os.mkdir(location)

communities_keys = list(communities_unique)

# Process the communities in roughly equal chunks of at most 50,000 keys so
# that the pair expansion below stays bounded in memory.
number_of_chunks = int(np.ceil(len(communities_keys) / 50_000))
chunks = np.array_split(communities_keys, number_of_chunks)
for index, chunk in enumerate(chunks):
    # Every unordered pair of members of a community is a candidate edge.
    # The label is built the same way as the "edge" index of `data`
    # (ids sorted, joined with "-") so the two can be joined directly.
    comm_inner = [
        [key, "-".join(sorted(pair))]
        for key in chunk
        for pair in combinations(communities_unique[key], 2)
    ]
    edge_combinations = pd.DataFrame(comm_inner, columns=["id", "edge"]).set_index("edge")
    # The inner join keeps only pairs that actually transact with each other
    # and attaches the transaction columns; the edge label itself is dropped.
    edge_combinations.join(data, how="inner").reset_index(drop=True).to_parquet(
        f"{location}/part-{index}.parquet"
    )
    # Progress line every 20 chunks.
    if not (index % 20):
        print(index, len(chunks))

In [None]:
# Directory passed to every features_libra.py worker and later read back as a
# single parquet dataset (presumably the workers write their feature files
# here - confirm in features_libra.py).
location_features_global = "features_global_libra"

In [None]:
%%time

# Upper bound on concurrently running feature workers.
NUMBER_OF_PROCESSES = 10

# Part files ordered by their numeric suffix (part-<n>.parquet).
parts = sorted(
    glob(f"{location}/*.parquet"),
    key=lambda x: int(x.split("-")[-1].split(".")[0]),
)

# Fresh output directory for this run's features.
shutil.rmtree(location_features_global, ignore_errors=True)
os.mkdir(location_features_global)

process_ids = set()
process_name = "features_libra.py"
while parts:
    # get_processes comes from common.py; presumably it returns the workers
    # among `process_ids` that are still running - verify against common.py.
    if len(get_processes(process_ids)) < NUMBER_OF_PROCESSES:
        process_id = str(uuid.uuid4())
        process_ids.add(process_id)
        # Launched in the background ("&") through the shell; parts are taken
        # from the end of the list, i.e. highest index first.
        os.system(
            f"{sys.executable} {process_name} {parts.pop()} {location_features_global} {process_id} &"
        )
    else:
        # All slots are busy: wait briefly instead of polling the process
        # table in a tight loop.
        time.sleep(1)

# Block until every launched worker has finished.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Load every feature file written by the workers as one frame.
features_global = pd.read_parquet(location_features_global)

# `predictions` carries the identifiers: the community key and its parent,
# i.e. the first two "-"-separated tokens of the key.
predictions = (
    features_global.loc[:, ["key"]]
    .assign(
        key_parent=lambda frame: frame["key"].map(
            lambda key: "-".join(key.split("-")[:2])
        )
    )
    .copy(deep=True)
)

# The model matrix keeps everything except the identifier column.
features_global = features_global.drop(columns=["key"])

In [None]:
%%time

# Isolation forests are built from random sub-samples and random splits, so
# the seed is pinned: without it the anomaly scores - and therefore the
# shortlist and the recall reported below - change on every run.
model = IsolationForest(random_state=42)
model.fit(features_global)
# decision_function: the lower the score, the more abnormal the community.
predictions.loc[:, "anomaly_score"] = model.decision_function(features_global)

In [None]:
# Number of most-anomalous communities kept in the shortlist.
# Alternative cut-offs, kept for reference:
# TOP_N_ANOMALIES = 5000
# TOP_N_ANOMALIES = 10000
TOP_N_ANOMALIES = 15000

In [None]:
# For every parent keep its single most anomalous community, i.e. the row with
# the lowest anomaly score. (The minimum has to be taken over "anomaly_score";
# taking it over the string column "key" would select a community by name,
# unrelated to how anomalous it is.)
top_anomalies = predictions.loc[predictions.groupby("key_parent")["anomaly_score"].idxmin(), :]
# Then keep the TOP_N_ANOMALIES lowest-scoring of those.
top_anomalies = top_anomalies.sort_values("anomaly_score").head(TOP_N_ANOMALIES)
communities_shortlisted = {x: communities_unique[x] for x in top_anomalies["key"]}
print(f"Number of communities: {len(communities_shortlisted):,}")
# Note: this is the largest member count plus one; default=0 keeps the
# expression valid when the shortlist is empty.
max_comm_size = max((len(x) for x in communities_shortlisted.values()), default=0) + 1
print(f"Max community size: {max_comm_size}")

In [None]:
# Ids of all nodes that carry a share of the alerts (weight above zero).
alert_nodes = set(nodes_data.loc[nodes_data["alert_weight"] > 0, "id"])

In [None]:
# Fraction of alerted nodes that appear as the parent of a shortlisted
# community, rounded to two decimals.
shortlisted_parents = set(top_anomalies["key_parent"])
recall = round(len(alert_nodes & shortlisted_parents) / len(alert_nodes), 2)
recall

In [None]:
# Total wall-clock run time in whole seconds, measured from start_script and
# printed in timedelta (H:MM:SS) form.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")