### For the community detection step that runs BEFORE this notebook, set the following in `communities.py`:
* THRESHOLD_RANK = 1e-2
* NEIGHBORS_ONLY = False
* PPR_THRESHOLD_ONLY = True

In [None]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from itertools import combinations
from datetime import timedelta
from glob import glob

import leidenalg as la
import igraph as ig
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import recall_score, f1_score
from xgboost import XGBClassifier

import settings as s
from common import get_processes

%load_ext autoreload
%autoreload 2

In [None]:
# Apply seaborn's "white" style with the larger "talk" scaling context as the global plot theme.
sns.set_theme(style="white", context="talk")

In [None]:
# Abort early unless the interpreter is exactly the (major, minor, micro)
# version this notebook was tested with.
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [None]:
# Spark properties as (key, value) pairs.
# NOTE(review): "spark.worker.memory" does not appear to be a documented Spark
# property (executor memory is "spark.executor.memory") -- confirm this key has
# the intended effect.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
spark_conf = SparkConf().setAll(config)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

In [None]:
# Wall-clock start time; the final cell subtracts it from the current time to report the run duration.
start_script = time.time()

In [None]:
# Accounts exceeding either limit (distinct counterparties or number of
# transactions, counted separately as sender and as receiver) are removed below.
MAX_DEGREE_PER_ACCOUNT = 100
MAX_TRANSACTIONS_PER_ACCOUNT = 1_000

data = spark.read.parquet(s.STAGED_DATA_LOCATION)

#### [START] Seed selection ####
# Keep only transfers between two different accounts in the selected formats.
is_not_self_transfer = sf.col("source") != sf.col("target")
has_selected_format = sf.col("format").isin(["ACH", "Wire", "Bitcoin"])
data = data.where(is_not_self_transfer).where(has_selected_format)


def _accounts_over_limits(frame, account_col, counterparty_col):
    """Return the set of `account_col` values that exceed either per-account limit.

    An account is flagged when it has more than MAX_TRANSACTIONS_PER_ACCOUNT rows
    in `frame`, or more than MAX_DEGREE_PER_ACCOUNT distinct `counterparty_col`
    values.
    """
    too_many_transactions = (
        frame.groupby(account_col)
        .count()
        .where(sf.col("count") > MAX_TRANSACTIONS_PER_ACCOUNT)
        .select(account_col)
    )
    too_many_counterparties = (
        frame.groupby(account_col)
        .agg(sf.countDistinct(counterparty_col).alias("count"))
        .where(sf.col("count") > MAX_DEGREE_PER_ACCOUNT)
        .select(account_col)
    )
    flagged = set(too_many_transactions.toPandas()[account_col].tolist())
    flagged.update(too_many_counterparties.toPandas()[account_col].tolist())
    return flagged


large_sources = _accounts_over_limits(data, "source", "target")
large_targets = _accounts_over_limits(data, "target", "source")

# Drop every transaction sent by a flagged source or received by a flagged target.
data = data.where(~sf.col("source").isin(large_sources))
data = data.where(~sf.col("target").isin(large_targets))
#### [END] Seed selection ####

In [None]:
def aggregate_edges(data_input):
    """Collapse transactions into one weighted, directed edge per (source, target).

    Args:
        data_input: A Spark DataFrame with at least the columns "source",
            "target", "source_amount" and "target_amount".

    Returns:
        A pandas DataFrame with the columns ["source", "target", "weight"] and a
        fresh 0..n-1 index. For each aggregated pair,

            weight = source_amount / (total sent by that source)
                   + target_amount / (total received by that target)

        where the totals are taken over all aggregated pairs. Account ids are
        then cut to their first 8 characters and rows whose shortened source and
        target coincide are dropped. Pairs that become identical only after
        shortening are NOT merged again.

    Note:
        The weights are computed with column arithmetic, so for float amounts a
        zero total produces a non-finite weight (inf/NaN) rather than raising.
    """
    data_aggregated = (
        data_input.groupby(["source", "target"])
        .agg(
            sf.sum("source_amount").alias("source_amount"),
            sf.sum("target_amount").alias("target_amount"),
        )
        .toPandas()
    )

    # Per-account totals, looked up for every row by account id.
    total_sent_by_source = data_aggregated["source"].map(
        data_aggregated.groupby("source")["source_amount"].sum()
    )
    total_received_by_target = data_aggregated["target"].map(
        data_aggregated.groupby("target")["target_amount"].sum()
    )

    # Vectorised instead of a per-row apply: same formula, one pass per column.
    weight = (data_aggregated["source_amount"] / total_sent_by_source) + (
        data_aggregated["target_amount"] / total_received_by_target
    )

    edges = data_aggregated.assign(
        source=data_aggregated["source"].str.slice(0, 8),
        target=data_aggregated["target"].str.slice(0, 8),
        weight=weight,
    )
    # Shortening the ids can turn an edge into a self-loop; remove those.
    filter_self = edges["source"] != edges["target"]
    return edges.loc[filter_self, ["source", "target", "weight"]].reset_index(drop=True)

In [None]:
%%time

# Build the directed graph from the aggregated edge list (vertices are keyed by
# name, not by integer id), then collect the vertex names in random order.
edges = aggregate_edges(data)
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
nodes = [vertex["name"] for vertex in graph.vs]
random.shuffle(nodes)

In [None]:
# Lookup from igraph's integer vertex index to the vertex name.
nodes_mapping = {vertex.index: vertex["name"] for vertex in graph.vs}

In [None]:
%%time

# Concatenate the community lists from every pickle in ./staging.
# NOTE(review): unpickling can execute arbitrary code -- only load staging files
# produced by a trusted run.
communities = []
for staged_path in glob("./staging/*.pickle"):
    with open(staged_path, "rb") as staged_file:
        communities.extend(pickle.load(staged_file))

# Number of communities before any filtering; reported later as a baseline.
original_size = len(communities)

# Persist the merged list as a single file.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)

In [None]:
# Multipliers that convert an amount in the given currency (lower-case code)
# into USD ("usd" maps to 1.0); used below to derive the "amount_usd" columns.
# NOTE(review): the rates are fixed constants stored as float32 -- confirm the
# source/date of these values and that float32 precision is sufficient.
currency_rates = {
    "jpy": np.float32(0.009487665410827868),
    "cny": np.float32(0.14930721887033868),
    "cad": np.float32(0.7579775434031815),
    "sar": np.float32(0.2665884611958837),
    "aud": np.float32(0.7078143121927827),
    "ils": np.float32(0.29612081311363503),
    "chf": np.float32(1.0928961554056371),
    "usd": np.float32(1.0),
    "eur": np.float32(1.171783425225877),
    "rub": np.float32(0.012852809604990688),
    "gbp": np.float32(1.2916554735187644),
    "btc": np.float32(11879.132698717296),
    "inr": np.float32(0.013615817231245796),
    "mxn": np.float32(0.047296753463246695),
    "brl": np.float32(0.1771008654705292),
}

In [None]:
flows = pd.read_parquet("flows.parquet")
flow_stats = pd.read_parquet("flow_stats.parquet")

# "turnover_ranks" is stored as JSON text; decode it into Python objects.
flow_stats.loc[:, "turnover_ranks"] = flow_stats.loc[:, "turnover_ranks"].apply(
    json.loads
)


def _flow_amount_in_usd(row):
    """Convert one flow row's source amount into USD via `currency_rates`."""
    return row["source_amount"] * currency_rates[row["source_currency"]]


flows.loc[:, "amount_usd"] = flows.apply(_flow_amount_in_usd, axis=1)
ml_nodes = set(pd.read_parquet("ml_nodes.parquet")["ml"].tolist())

# Flows whose decoded "turnover_ranks" holds exactly one entry are removed from
# both frames.
has_single_entry = flow_stats["turnover_ranks"].apply(len) == 1
single_node_flows = flow_stats.loc[has_single_entry, "id"].tolist()
flows = flows.loc[~flows["id"].isin(single_node_flows)].reset_index(drop=True)
flow_stats = flow_stats.loc[~flow_stats["id"].isin(single_node_flows)].reset_index(
    drop=True
)
print(flow_stats.shape[0], len(single_node_flows))

In [None]:
%%time

# Negative candidates: communities sharing no node with `ml_nodes`.
communities_negative = [
    (key, members)
    for key, members in communities
    if not members.intersection(ml_nodes)
]

communities = list(communities_negative)

# De-duplicate by node set: the sorted node tuple is the key, and the id of the
# last community with that node set is the one kept.
communities_unique_rev = {
    tuple(sorted(members)): comm_id for comm_id, members in communities
}

# Re-key as "<community id>-<position>" -> sorted node tuple.
communities_unique = {
    f"{comm_id}-{position}": members
    for position, (members, comm_id) in enumerate(communities_unique_rev.items())
}
communities = list(set(communities_unique.values()))
new_size = len(communities)

# Sizes before/after, and the fraction of communities that survived.
print(original_size, new_size, round(new_size / original_size, 2))
print()

In [None]:
# Community size summary: (mean, median, max, total), the first three rounded.
sizes = list(map(len, communities))
round(np.mean(sizes)), round(np.median(sizes)), round(np.max(sizes)), sum(sizes)

In [None]:
# Fraction of the rows of `flow_stats` (the positive examples) sampled into the training set.
PERC_POS_TRAINING = 0.4
# Fraction of the negative communities assigned to the training set.
PERC_NEG_TRAINING = 0.5

In [None]:
# Shuffle all flows, then draw the training fraction from the shuffled frame.
# NOTE(review): no random seed is set, so the split differs from run to run.
shuffled_flows = flow_stats.sample(frac=1)
train_flows = shuffled_flows.sample(frac=PERC_POS_TRAINING).reset_index(drop=True)

# Every flow that was not drawn for training becomes a test flow.
in_training = flow_stats["id"].isin(train_flows["id"])
test_flows = flow_stats.loc[~in_training, :].reset_index(drop=True)

# Flatten the per-flow "turnover_ranks" entries into one list per split.
train_nodes = [node for ranks in train_flows["turnover_ranks"] for node in ranks]
test_nodes = [node for ranks in test_flows["turnover_ranks"] for node in ranks]

In [None]:
# One positive community per training flow: the keys of its "turnover_ranks".
communities_train_pos = [
    (f"train-pos-{index}", set(turnover_rank.keys()))
    for index, turnover_rank in enumerate(train_flows["turnover_ranks"])
]

# NOTE: For now we assume that the community detection part returns perfectly matching flows
communities_test_pos = [
    (f"test-pos-{index}", set(turnover_rank.keys()))
    for index, turnover_rank in enumerate(test_flows["turnover_ranks"])
]

In [None]:
# Shuffle the de-duplicated negative communities and cut the list once: the
# first PERC_NEG_TRAINING share is used for training, the remainder for testing.
communities_negative = list(communities_unique.values())
random.shuffle(communities_negative)
number_of_neg_training = int(len(communities_negative) * PERC_NEG_TRAINING)

negatives_for_training = communities_negative[:number_of_neg_training]
negatives_for_testing = communities_negative[number_of_neg_training:]

communities_train_neg = [
    (f"train-neg-{index}", members)
    for index, members in enumerate(negatives_for_training)
]
communities_test_neg = [
    (f"test-neg-{index}", members)
    for index, members in enumerate(negatives_for_testing)
]

In [None]:
# Number of communities per group: (train pos, train neg, test pos, test neg).
tuple(
    len(group)
    for group in (
        communities_train_pos,
        communities_train_neg,
        communities_test_pos,
        communities_test_neg,
    )
)

In [None]:
# Merge all four groups into a single key -> node collection lookup; the keys
# carry the "train"/"test" and "pos"/"neg" prefixes.
communities_unique = dict(
    [
        *communities_train_pos,
        *communities_train_neg,
        *communities_test_pos,
        *communities_test_neg,
    ]
)

In [None]:
# Total number of keyed communities across all train/test, positive/negative groups.
len(communities_unique)

In [None]:
# Parquet location where the per-transaction table (with its "edge" key) is written and read back.
location_transactions = "transactions_with_edges"

In [None]:
%%time

# Output columns: account ids cut to their first 8 characters, the banks, the
# timestamp as Unix seconds, and the source-side amount and currency.
columns = [
    sf.substring("source", 1, 8).alias("source"),
    sf.substring("target", 1, 8).alias("target"),
    "source_bank",
    "target_bank",
    sf.unix_timestamp("timestamp").alias("timestamp"),
    sf.col("source_amount").alias("amount"),
    sf.col("source_currency").alias("currency"),
]

# Row filter, evaluated on the full ids before they are shortened by the select.
same_currency = sf.col("source_currency") == sf.col("target_currency")
different_accounts = sf.col("source") != sf.col("target")

# Direction-independent edge key "<smaller id>-<larger id>", built from the
# shortened ids produced by the select.
source_first = sf.concat(sf.col("source"), sf.lit("-"), sf.col("target"))
target_first = sf.concat(sf.col("target"), sf.lit("-"), sf.col("source"))
edge_key = sf.when(sf.col("source") < sf.col("target"), source_first).otherwise(
    target_first
)

selected = data.where(same_currency & different_accounts).select(*columns)
with_edge = selected.withColumn("edge", edge_key)
# A single partition, so the result lands in one parquet part file.
with_edge.repartition(1).write.parquet(location_transactions, mode="overwrite")

In [None]:
# Load the transactions written above, indexed by their "edge" key.
transactions = pd.read_parquet(location_transactions).set_index("edge")


def _transaction_amount_in_usd(row):
    """Convert one transaction row's amount into USD via `currency_rates`."""
    return currency_rates[row["currency"]] * row["amount"]


transactions.loc[:, "amount_usd"] = transactions.apply(
    _transaction_amount_in_usd, axis=1
)

In [None]:
# Directory that receives one parquet part per chunk of communities joined with their transactions.
location = "transactions_communities"

In [None]:
%%time

# Recreate the output directory so no parts from an earlier run survive.
# (The original cell performed this rmtree/mkdir pair twice; once is enough.)
shutil.rmtree(location, ignore_errors=True)
os.mkdir(location)

communities_keys = list(communities_unique)

# Work through the communities in chunks of roughly 50,000 so each
# intermediate pair table stays bounded.
number_of_chunks = int(np.ceil(len(communities_keys) / 50_000))
chunks = np.array_split(communities_keys, number_of_chunks)
for index, chunk in enumerate(chunks):
    comm_inner = []
    for key in chunk:
        comm_node = communities_unique[key]
        # Every unordered node pair of the community, labelled
        # "<smaller>-<larger>" so it can be matched against the "edge" index
        # of `transactions`.
        for first, second in map(sorted, combinations(comm_node, 2)):
            comm_inner.append([key, f"{first}-{second}"])
    edge_combinations = pd.DataFrame(comm_inner, columns=["id", "edge"])
    edge_combinations.set_index("edge", inplace=True)
    # The inner join keeps only node pairs that actually have transactions.
    edge_combinations.join(transactions, how="inner").reset_index(drop=True).to_parquet(
        f"{location}/part-{index}.parquet"
    )
    # Progress report every 20 chunks.
    if not (index % 20):
        print(index, len(chunks))

In [None]:
# Output directory handed to each features.py worker process.
location_features_global = "features_global"

In [None]:
%%time

# Maximum number of features.py workers allowed to run at the same time.
NUMBER_OF_PROCESSES = 10

# Part files ordered by the integer in "part-<n>.parquet".
parts = sorted(
    glob(f"{location}/*.parquet"),
    key=lambda x: int(x.split("-")[-1].split(".")[0]),
)

shutil.rmtree(location_features_global, ignore_errors=True)
os.mkdir(location_features_global)

process_ids = set()
process_name = "features.py"
while parts:
    # NOTE(review): get_processes is assumed to return the still-running
    # workers among `process_ids` -- confirm against common.py.
    if len(get_processes(process_ids)) >= NUMBER_OF_PROCESSES:
        # All slots are busy: wait briefly instead of re-polling in a tight
        # loop that would keep a CPU core fully occupied.
        time.sleep(1)
        continue
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    # The trailing "&" starts the worker in the background, so os.system
    # returns immediately. parts.pop() takes from the end of the sorted list.
    os.system(
        f"{sys.executable} {process_name} {parts.pop()} {location_features_global} {process_id} &"
    )

# Block until every launched worker has finished.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Read the worker output from the same location variable that was passed to
# the workers, rather than repeating the directory name as a literal.
features_global = pd.read_parquet(location_features_global)

# Keys have the form "<train|test>-<pos|neg>-<n>": the first token selects the
# split and the second token is the label.
features_global.loc[:, "is_train"] = features_global.loc[:, "key"].apply(
    lambda x: x.split("-")[0] == "train"
)
features_global.loc[:, "label"] = features_global.loc[:, "key"].apply(
    lambda x: x.split("-")[1] == "pos"
)
del features_global["key"]

In [None]:
# Sanity check: the positives across both splits, printed next to the number
# of rows in `flow_stats` for comparison.
in_train_split = features_global["is_train"]
positives_in_train = features_global[in_train_split]["label"].sum()
positives_in_test = features_global[~in_train_split]["label"].sum()
print(positives_in_train + positives_in_test)
print(flow_stats.shape[0])

In [None]:
# Split the feature table by the "is_train" flag. The x_* frames hold the
# features only, the y_* frames hold the single "label" column.
train_rows = features_global[features_global["is_train"]]
test_rows = features_global[~features_global["is_train"]]
non_feature_columns = ["label", "is_train"]

x_train = train_rows.drop(columns=non_feature_columns).reset_index(drop=True)
y_train = train_rows[["label"]].reset_index(drop=True)
x_test = test_rows.drop(columns=non_feature_columns).reset_index(drop=True)
y_test = test_rows[["label"]].reset_index(drop=True)

In [None]:
%%time

# Fit a binary XGBoost classifier on the training split.
model = XGBClassifier(objective="binary:logistic")
model.fit(x_train, y_train)

# Collect the true test labels (as integers, column "y") next to the model's
# predictions (column "y_predicted").
predictions = y_test.astype(int).rename(columns={"label": "y"})
predictions["y_predicted"] = model.predict(x_test)

In [None]:
# Recall of the predicted labels against the true labels of the test split.
recall_score(predictions["y"], predictions["y_predicted"])

In [None]:
# F1 score of the predicted labels against the true labels of the test split.
f1_score(predictions["y"], predictions["y_predicted"])

In [None]:
# Elapsed wall-clock time since `start_script` was recorded, rounded to whole
# seconds and printed as H:MM:SS via timedelta.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")