In [None]:
import json
import math
import os
import pickle
import shutil
import subprocess
import sys
import time
import uuid
from collections import defaultdict, Counter
from datetime import datetime, timedelta, date
from glob import glob
from itertools import combinations, product

import igraph as ig
import leidenalg as la
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf, types as st
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest

import settings as s

%load_ext autoreload
%autoreload 2

In [None]:
# Notebook-wide plotting defaults: white background and the larger "talk" context.
sns.set_theme(style="white", context="talk")

In [None]:
# Abort immediately unless the interpreter is exactly CPython 3.11.8
# (major, minor and micro must all match).
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently on Python 3.11.8 | conda 24.1.2 | Apple M3 Pro")

In [None]:
# Spark properties as (key, value) pairs, applied when the session is created.
# NOTE(review): `spark.worker.memory` does not appear among Spark's documented
# application properties - confirm it has any effect here.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
spark_conf = SparkConf().setAll(config)
spark = (
    SparkSession.builder
    .appName("testing")
    .config(conf=spark_conf)
    .getOrCreate()
)

In [None]:
# Wall-clock start; the last cell subtracts it to report the total run time.
start_script = time.time()

In [None]:
# Accounts with more distinct counterparties / more transactions than these
# caps are removed from the data set below.
MAX_DEGREE_PER_ACCOUNT = 100
MAX_TRANSACTIONS_PER_ACCOUNT = 1_000


def _oversized_accounts(frame, account_column, counterparty_column):
    """Return the set of accounts in `account_column` that exceed either cap.

    An account is oversized when it appears in more than
    MAX_TRANSACTIONS_PER_ACCOUNT rows or is paired with more than
    MAX_DEGREE_PER_ACCOUNT distinct values of `counterparty_column`.
    Both statistics are computed in a single Spark aggregation.
    """
    per_account = frame.groupby(account_column).agg(
        sf.count(sf.lit(1)).alias("transactions"),
        sf.countDistinct(counterparty_column).alias("degree"),
    )
    oversized = per_account.where(
        (sf.col("transactions") > MAX_TRANSACTIONS_PER_ACCOUNT)
        | (sf.col("degree") > MAX_DEGREE_PER_ACCOUNT)
    ).select(account_column).toPandas()
    return set(oversized[account_column])


data = spark.read.parquet(s.STAGED_DATA_LOCATION)

#### [START] Seed selection ####
# Drop self-transfers and keep only the three payment formats of interest.
data = data.where(sf.col("source") != sf.col("target"))
data = data.where(sf.col("format").isin(["ACH", "Wire", "Bitcoin"]))

large_sources = _oversized_accounts(data, "source", "target")
large_targets = _oversized_accounts(data, "target", "source")

# Remove every transaction sent by an oversized source or received by an
# oversized target.
data = data.where(~sf.col("source").isin(large_sources))
data = data.where(~sf.col("target").isin(large_targets))
#### [END] Seed selection ####

In [None]:
def aggregate_edges(data_input):
    """Collapse transactions into one weighted, directed edge per account pair.

    Parameters
    ----------
    data_input : Spark DataFrame with `source`, `target`, `source_amount`
        and `target_amount` columns.

    Returns
    -------
    pandas.DataFrame with columns `source`, `target`, `weight` and a fresh
    0..n-1 index. `weight` is the edge's share of everything its source sent
    plus its share of everything its target received. Account identifiers are
    cut to their first 8 characters *after* the totals are computed, and edges
    whose shortened endpoints coincide are dropped.
    """
    edges = data_input.groupby(["source", "target"]).agg(
        sf.sum("source_amount").alias("source_amount"),
        sf.sum("target_amount").alias("target_amount"),
    ).toPandas()

    # Per-account totals, looked up for every edge with a vectorised map
    # instead of a Python-level lambda per row.
    sent_per_source = edges.groupby("source")["source_amount"].sum()
    received_per_target = edges.groupby("target")["target_amount"].sum()
    edges["total_sent_by_source"] = edges["source"].map(sent_per_source)
    edges["total_received_by_target"] = edges["target"].map(received_per_target)

    # NOTE(review): an account whose total is exactly 0 produces NaN/inf here;
    # confirm amounts are strictly positive upstream.
    edges["weight"] = (
        edges["source_amount"] / edges["total_sent_by_source"]
        + edges["target_amount"] / edges["total_received_by_target"]
    )

    # NOTE(review): presumably the 8-character prefix is still unique per
    # account - otherwise distinct accounts are merged here; confirm.
    edges["source"] = edges["source"].str.slice(0, 8)
    edges["target"] = edges["target"].str.slice(0, 8)

    not_self_loop = edges["source"] != edges["target"]
    return edges.loc[not_self_loop, ["source", "target", "weight"]].reset_index(drop=True)

In [None]:
%%time

# Weighted edge list -> directed igraph graph; vertices are identified by the
# account names found in the first two columns rather than by integer ids.
edges = aggregate_edges(data)
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
# Vertex names, in the graph's own vertex order.
nodes = [vertex["name"] for vertex in graph.vs]

In [None]:
def get_processes(ids):
    """Return the running processes whose command line contains any of `ids`.

    Parameters
    ----------
    ids : iterable of str
        Tokens to look for among each process's command-line arguments
        (exact match against a whole argument).

    Returns
    -------
    list of psutil.Process
        Every process returned by `psutil.process_iter()` with at least one
        command-line argument equal to one of `ids`.
    """
    wanted = set(ids)
    matching = []
    for process in psutil.process_iter():
        try:
            cmdline = process.cmdline()
        except (psutil.Error, OSError):
            # The command line could not be read (psutil/OS error): such a
            # process is treated as not matching.
            continue
        if wanted.intersection(cmdline):
            matching.append(process)
    return matching

In [None]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty `staging` directory.
shutil.rmtree("staging", ignore_errors=True)
os.mkdir("staging")
# One slice of the vertex names per worker.
chunks = np.array_split(nodes, NUMBER_OF_PROCESSES)

# Persist the graph and the node chunks in the working directory.
# NOTE(review): presumably communities.py reads these two files and writes its
# results into `staging` - confirm against that script.
filename = "graph.pickle"
with open(filename, "wb") as f:
    pickle.dump(graph, f, protocol=pickle.HIGHEST_PROTOCOL)

filename = "nodes.pickle"
with open(filename, "wb") as f:
    pickle.dump(chunks, f, protocol=pickle.HIGHEST_PROTOCOL)

process_ids = set()
process_name = "communities.py"
workers = []
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    # Same command line as before (script, chunk number, unique id), but started
    # without a shell so that we keep a handle on the child process.
    workers.append(
        subprocess.Popen([sys.executable, process_name, str(chunk_number), process_id])
    )

# Wait for every worker and fail loudly if one of them did not exit cleanly;
# otherwise its communities would be missing without any notice.
exit_codes = [worker.wait() for worker in workers]
failed_chunks = [number for number, code in enumerate(exit_codes) if code != 0]
if failed_chunks:
    raise RuntimeError(f"{process_name} failed for chunk(s) {failed_chunks}")

In [None]:
# for proc in get_processes(process_ids):
#     try:
#         proc.kill()
#     except psutil.NoSuchProcess:
#         pass

communities = []
# sorted(): glob returns files in arbitrary order, and the position of a
# community in this list is later used as its id, so the order must be stable.
for filename in sorted(glob("./staging/*.pickle")):
    with open(filename, "rb") as f:
        # pickle.load executes arbitrary code: only load files that were
        # produced by this pipeline's own workers.
        communities += pickle.load(f)

# Keep the combined raw worker output on disk.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)
# Each loaded entry is indexable; only its second element is kept from here on.
communities = [x[1] for x in communities]

In [None]:
# Number of communities loaded from the staging files.
len(communities)

In [None]:
# Community sizes, then (mean, median, max, total membership) as a tuple.
sizes = list(map(len, communities))
(
    round(np.mean(sizes)),
    round(np.median(sizes)),
    round(np.max(sizes)),
    sum(sizes),
)

In [None]:
# set_theme() falls back to seaborn's default style/context for every argument
# that is not passed, so the notebook-wide white/"talk" theme is restated here
# together with the figure size.
sns.set_theme(style="white", context="talk", rc={"figure.figsize": (12.7, 7.27)})
# Histogram of community sizes with a kernel density estimate overlaid.
ax = sns.histplot(data=pd.DataFrame(sizes, columns=["Size"]), x="Size", kde=True)
ax.set_title("Community size distribution");

In [None]:
# set_theme() falls back to seaborn's default style/context for every argument
# that is not passed, so the notebook-wide white/"talk" theme is restated here
# together with the figure size.
sns.set_theme(style="white", context="talk", rc={"figure.figsize": (10.7, 5.27)})
# Box plot of community sizes.
ax = sns.boxplot(x=sizes)
ax.set(title="Community sizes", xlabel="Size");

In [None]:
# Flow edges (`id`, `source`, `target` are used below) and one row of
# statistics per flow `id`, both read from the working directory.
# NOTE(review): presumably these are the labelled reference flows the
# communities are evaluated against - confirm.
flows = pd.read_parquet("flows.parquet")
flow_stats = pd.read_parquet("flow_stats.parquet")

In [None]:
%%time

# Inverted index: node -> positions (in `communities`) of every community
# that contains the node.
search_hash = defaultdict(list)
for community_position, members in enumerate(communities):
    for member in members:
        search_hash[member].append(community_position)

In [None]:
%%time

# One statistics row per flow id, indexed once so that the loop does not scan
# `flow_stats` for every flow. `drop=False` keeps `id` inside the row, and
# `drop_duplicates` keeps the first row per id, as the per-flow lookup did.
flow_stats_by_id = flow_stats.drop_duplicates("id").set_index("id", drop=False)

percentages = []
start = time.time()
for index, (group, grouped) in enumerate(flows.groupby("id")):
    flow_nodes = set(grouped["source"]).union(grouped["target"])
    size = len(flow_nodes)
    matches = []
    scored_communities = set()
    for node in flow_nodes:
        # .get(): indexing the defaultdict would insert an empty list for
        # every node that belongs to no community.
        for i in search_hash.get(node, ()):
            # A community reached through several flow nodes always yields the
            # same scores; only its first occurrence can win, so score it once.
            if i in scored_communities:
                continue
            scored_communities.add(i)
            in_scope = set(communities[i])
            matched_size = len(in_scope.intersection(flow_nodes))
            matches.append((
                node,
                matched_size / size,           # share of the flow covered
                matched_size / len(in_scope),  # share of the community used
                len(in_scope),
            ))
    if matches:
        # Best coverage of the flow first, then the tightest community;
        # the earliest candidate wins ties.
        matched_node_comm, perc, perc_matched, matched_comm_size = max(
            matches, key=lambda x: (x[1], x[2])
        )
    else:
        # None of the flow's nodes is in any community: score it 0 instead of
        # failing on an empty candidate list.
        matched_node_comm, perc, perc_matched, matched_comm_size = None, 0, 0, 0
    # Raises KeyError when the flow id has no row in `flow_stats`.
    stats = flow_stats_by_id.loc[group].to_dict()
    stats["score"] = perc
    stats["matched_node_comm"] = matched_node_comm
    stats["matched_comm_size"] = matched_comm_size
    percentages.append(stats)
    # Progress: flow counter and seconds spent on the last 2,000 flows.
    if not (index % 2_000):
        print(index, round(time.time() - start))
        start = time.time()

percentages = pd.DataFrame(percentages)

In [None]:
# Largest community among those that contain all nodes of a flow (score == 1).
percentages[percentages["score"] == 1]["matched_comm_size"].max()

In [None]:
# percentages[percentages["score"] == 1][~percentages["type"].isin(
#     ["random", "cycle", "stack", "bipartite"]
# )].sort_values("matched_comm_size")

In [None]:
# Mean score as a whole-number percentage. Scaling before rounding avoids
# float artefacts such as 56.99999999999999 from `round(x, 2) * 100`.
round(percentages["score"].mean() * 100, 0)

In [None]:
# Share of flows fully contained in one community, as a whole-number
# percentage. Scaling before rounding avoids float artefacts from
# `round(x, 2) * 100`.
round(percentages["score"].eq(1).mean() * 100, 0)

In [None]:
# Mean score per flow `type`, sorted ascending, drawn as a bar chart.
percentages.groupby("type").agg({"score": "mean"}).sort_values("score").plot.bar()

In [None]:
# Restrict to flows whose `number_components` equals 1, then redraw the
# mean-score-per-type bar chart on that subset.
filter_ = percentages["number_components"].eq(1)
percentages_scope = percentages[filter_].reset_index(drop=True)
(
    percentages_scope
    .groupby("type")
    .agg({"score": "mean"})
    .sort_values("score")
    .plot.bar()
)

In [None]:
# Number of in-scope flows that no single community covers completely.
percentages_scope[percentages_scope["score"] < 1].shape[0]

In [None]:
# The same incompletely covered flows, counted per `sub_type`.
percentages_scope[percentages_scope["score"] < 1].groupby("sub_type")["type"].count()

In [None]:
# Parquet output path for the transaction table that carries an `edge` key.
location_transactions = "transactions_with_edges"

In [None]:
%%time

# Output columns: 8-character account ids, epoch-seconds timestamp, and the
# source-side amount and currency under shorter names.
columns = [
    sf.substring("source", 1, 8).alias("source"),
    sf.substring("target", 1, 8).alias("target"),
    sf.unix_timestamp("timestamp").alias("timestamp"),
    sf.col("source_amount").alias("amount"),
    sf.col("source_currency").alias("currency"),
]

# Row filter, evaluated on the original (untruncated) columns.
same_currency = sf.col("source_currency") == sf.col("target_currency")
not_self_transfer = sf.col("source") != sf.col("target")

# Direction-free edge key "<smaller id>-<larger id>", evaluated on the
# already shortened ids because it is added after the select.
source_first = sf.concat(sf.col("source"), sf.lit("-"), sf.col("target"))
target_first = sf.concat(sf.col("target"), sf.lit("-"), sf.col("source"))
edge_key = sf.when(sf.col("source") < sf.col("target"), source_first).otherwise(target_first)

transactions_with_edges = (
    data.where(same_currency & not_self_transfer)
    .select(*columns)
    .withColumn("edge", edge_key)
    .repartition(1)
)
transactions_with_edges.write.parquet(location_transactions, mode="overwrite")

In [None]:
# Load the table written above into pandas, indexed by the `edge` key so that
# community edges can be joined on it.
transactions = pd.read_parquet(location_transactions).set_index("edge")

In [None]:
# Directory that receives the per-chunk parquet parts of community transactions.
location = "transactions_communities"

In [None]:
%%time

# Start from an empty output directory.
shutil.rmtree(location, ignore_errors=True)
os.mkdir(location)

COMMUNITIES_PER_PART = 100_000
# At least one section, so that an empty community list does not make
# array_split fail.
number_of_chunks = max(1, math.ceil(len(communities) / COMMUNITIES_PER_PART))
# Split the *positions*, not the communities themselves. Turning a ragged list
# of communities into an array fails on recent NumPy releases, whereas
# splitting the index range yields the same near-equal, contiguous chunks.
position_chunks = np.array_split(np.arange(len(communities)), number_of_chunks)
for index, positions in enumerate(position_chunks):
    if not len(positions):
        continue
    first, last = int(positions[0]), int(positions[-1])
    # Every unordered pair of members inside a community, keyed exactly like
    # the `edge` index of `transactions` ("<smaller>-<larger>"); the community's
    # position in `communities` serves as its id.
    edge_combinations = pd.DataFrame(
        [
            (community_id, f"{left}-{right}")
            for community_id in range(first, last + 1)
            for left, right in combinations(sorted(communities[community_id]), 2)
        ],
        columns=["id", "edge"],
    ).set_index("edge")
    # Keep only the pairs that actually transacted and attach the transactions.
    edge_combinations.join(transactions, how="inner").reset_index(drop=True).to_parquet(
        f"{location}/part-{index}.parquet"
    )
    print(index)

In [None]:
# Sorted distinct calendar dates on which transactions occur; the first and
# last entries are handed to features.py as command-line arguments.
dates = sorted(data.select(sf.to_date("timestamp").alias("date")).distinct().toPandas()["date"].values)

In [None]:
# Output directory handed to features.py; its `window-*` sub-directories are
# read back by the anomaly-scoring cell.
location_features = "features"

In [None]:
%%time

# Deliberate guard: this cell is expensive and is kept disabled.
raise Exception("Do not run!")

NUMBER_OF_PROCESSES = 8

# Part files ordered by the integer in "part-<n>.parquet".
parts = sorted(glob(f"{location}/*.parquet"), key=lambda x: int(x.split("-")[-1].split(".")[0]))

# Start from an empty feature directory.
shutil.rmtree(location_features, ignore_errors=True)
os.mkdir(location_features)

process_ids = set()
process_name = "features.py"


def _launch_next_part():
    """Start a background features.py run for the next queued part file."""
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    os.system(f"{sys.executable} {process_name} {parts.pop()} {location_features} {dates[0]} {dates[-1]} {process_id} &")


# Never start more workers than there are parts (pop() on an empty list raises).
for _ in range(min(NUMBER_OF_PROCESSES, len(parts))):
    _launch_next_part()

# Poll every 5 seconds and start one queued part whenever a slot is free.
# Stop only once nothing is running AND nothing is queued, so that parts are
# not dropped when all workers happen to finish between two polls.
while True:
    running = len(get_processes(process_ids))
    if not running and not parts:
        break
    if parts and running < NUMBER_OF_PROCESSES:
        _launch_next_part()
    time.sleep(5)

In [None]:
# Directory that holds one parquet file of anomaly scores per feature window.
location_predictions = "predictions"

In [None]:
%%time

# Deliberate guard: this cell is expensive and is kept disabled.
raise Exception("Do not run!")

# Window directories ordered by the integer after the last "-".
parts = sorted(glob(f"{location_features}/window-*"), key=lambda x: int(x.split("-")[-1]))

# Start from an empty predictions directory.
shutil.rmtree(location_predictions, ignore_errors=True)
os.mkdir(location_predictions)

# Columns that identify a row and must not be fed to the model.
ID_COLUMNS = ["id", "window_size", "day_number"]
# Fixed seed so that repeated runs produce the same scores.
RANDOM_STATE = 42

for part in parts:
    frames = [pd.read_parquet(part_input) for part_input in glob(f"{part}/*")]
    if not frames:
        # pd.concat refuses an empty list; nothing to score for this window.
        print(part, "no feature files, skipped")
        continue
    input_features = pd.concat(frames, ignore_index=True)
    print(part, input_features.shape)
    predictions = input_features.loc[:, ID_COLUMNS].copy(deep=True)
    model_input = input_features.drop(columns=ID_COLUMNS)
    model = IsolationForest(random_state=RANDOM_STATE)
    model.fit(model_input)
    # decision_function is negative for outliers: flip the sign and clip the
    # inliers to 0, so that a larger score means more anomalous.
    anomaly_score = np.clip(-model.decision_function(model_input), 0, None)
    peak = anomaly_score.max()
    if peak > 0:
        # Scale to (0, 1] within this window; skipped when nothing is
        # anomalous, to avoid dividing 0 by 0.
        anomaly_score = anomaly_score / peak
    # Plain column assignment so that the column really is stored as float32.
    predictions["anomaly_score"] = anomaly_score.astype(np.float32)
    # Keep only the rows that were flagged at all.
    predictions = predictions.loc[predictions["anomaly_score"] > 0, :].reset_index(drop=True)
    part_number = int(part.split("-")[-1])
    predictions.to_parquet(f"{location_predictions}/part-{part_number}.parquet")

In [None]:
# Read the predictions directory as a single frame.
predictions = pd.read_parquet(location_predictions)

In [None]:
# Total anomaly score per community id, highest first.
predictions_aggregated = (
    predictions
    .groupby("id")
    .agg({"anomaly_score": "sum"})
    .reset_index()
    .sort_values("anomaly_score", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Highest single anomaly score per community id, highest first.
predictions_aggregated_max = (
    predictions
    .groupby("id")
    .agg({"anomaly_score": "max"})
    .reset_index()
    .sort_values("anomaly_score", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# A community counts as anomalous when its highest score exceeds this cut-off.
ANOMALY_SCORE_THRESHOLD = 0.3
# (An earlier assignment from `predictions["id"].unique()` was overwritten
# right away and has been removed.)
anomalous_comunities = predictions_aggregated_max.loc[
    predictions_aggregated_max["anomaly_score"] > ANOMALY_SCORE_THRESHOLD, "id"
].unique()
len(anomalous_comunities)

In [None]:
# Part files ordered by the integer in "part-<n>.parquet".
parts = sorted(
    glob(f"{location}/*.parquet"),
    key=lambda path: int(path.rsplit("-", 1)[-1].split(".")[0]),
)
# From every part keep only the transactions of anomalous communities, then
# stack the pieces into one frame.
predictions_communities = pd.concat(
    [
        part_frame.loc[part_frame["id"].isin(anomalous_comunities), :].copy(deep=True)
        for part_frame in map(pd.read_parquet, parts)
    ],
    ignore_index=True,
)

In [None]:
# Every account that appears as source or target in a transaction of an
# anomalous community.
predicted_nodes = set(predictions_communities["source"]).union(
    predictions_communities["target"]
)

In [None]:
# Fraction of the accounts appearing in `flows` that are among the predicted nodes.
flow_node_set = set(flows["source"].unique()).union(flows["target"].unique())
len(predicted_nodes.intersection(flow_node_set)) / len(flow_node_set)

In [None]:
# Number of distinct flow accounts relative to the number of predicted accounts.
len(set(flows["source"].unique()).union(flows["target"].unique())) / len(predicted_nodes)

In [None]:
# Predicted accounts as a fraction of all accounts present in `transactions`.
len(predicted_nodes) / len(set(transactions["source"].unique()).union(transactions["target"].unique()))

In [None]:
# Absolute number of predicted accounts.
len(predicted_nodes)

In [None]:
# Whole seconds elapsed since `start_script` was set, printed as H:MM:SS.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")