### https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9762926

In [51]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

import igraph as ig
import numpy as np
import pandas as pd

import settings as s
from common import get_weights
from communities import get_communities_spark
from features import generate_features_spark, generate_features_udf_wrapper, SCHEMA_FEAT_UDF

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
if (
    sys.version_info.major,
    sys.version_info.minor,
    sys.version_info.micro,
) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [3]:
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1)
]

shutil.rmtree("artifacts", ignore_errors=True)
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/20 13:24:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
start_script = time.time()

In [5]:
location_main = os.path.join("features", "libra")
# shutil.rmtree(location_main, ignore_errors=True)

location_nodes_neighborhoods = f"{location_main}{os.sep}nodes_neighborhoods.pickle"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"

try:
    os.makedirs(location_main)
except FileExistsError:
    pass

In [26]:
data = pd.read_csv("./data/Libra_bank_3months_graph/data.csv")
rename = {
    "id_source": "source",
    "id_destination": "target",
    "cum_amount": "amount",
    "nr_transactions": "num_transactions",
    "nr_alerts": "alerts_count",
    "nr_reports": "reports_count",
}
data = data.rename(columns=rename)
data.loc[:, "source_"] = (
    data.loc[:, "source"].astype(str).apply(lambda x: f"nid-{int(x)}")
)
data.loc[:, "target_"] = (
    data.loc[:, "target"].astype(str).apply(lambda x: f"nid-{int(x)}")
)
del data["source"]
del data["target"]
data = data.rename(
    columns={
        "source_": "source",
        "target_": "target",
    }
).loc[:, ["source", "target", "amount", "num_transactions", "alerts_count", "reports_count"]]

data = data.set_index(["source", "target"]).join(
    get_weights(data).set_index(["source", "target"]), how="left"
).reset_index()
data.loc[:, "amount_weighted"] = data.loc[:, "amount"] * data.loc[:, "weight"]

In [7]:
nodes_data = pd.DataFrame(index=sorted(set(data["source"].tolist() + data["target"].tolist())))
nodes_data.index.name = "key"

w_alerts = int(data["alerts_count"].sum() * 2)
w_reports = int(data["reports_count"].sum() * 2)

w_alerts_source = data[data["alerts_count"] > 0].groupby("source").agg({"alerts_count": "sum"}).to_dict()["alerts_count"]
w_alerts_target = data[data["alerts_count"] > 0].groupby("target").agg({"alerts_count": "sum"}).to_dict()["alerts_count"]

w_reports_source = data[data["reports_count"] > 0].groupby("source").agg({"reports_count": "sum"}).to_dict()["reports_count"]
w_reports_target = data[data["reports_count"] > 0].groupby("target").agg({"reports_count": "sum"}).to_dict()["reports_count"]

nodes_data.loc[:, "alert_weight"] = nodes_data.index.map(
    lambda x: (w_alerts_source.get(x, 0) + w_alerts_target.get(x, 0)) / w_alerts
)
nodes_data.loc[:, "report_weight"] = nodes_data.index.map(
    lambda x: (w_reports_source.get(x, 0) + w_reports_target.get(x, 0)) / w_reports
)


In [8]:
# %%time

# data_input = spark.createDataFrame(data)
# nodes_source = set(data["source"].unique())
# nodes_target = set(data["target"].unique())
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run generate_flow_features.ipynb

# comm_as_source_features.to_parquet(location_comm_as_source_features)
# comm_as_target_features.to_parquet(location_comm_as_target_features)
# comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
# comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

# del comm_as_source_features
# del comm_as_target_features
# del comm_as_passthrough_features
# del comm_as_passthrough_features_reverse

In [9]:
%%time

print("Constructing ego-net communities")

communities_ego = get_communities_spark(
    [(x, [x]) for x in nodes_data.index], 
    ig.Graph.DataFrame(data.loc[:, ["source", "target"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 1, "all", 1e-100, None
)
sizes = [len(x[1]) for x in communities_ego]
print(len(sizes), round(np.mean(sizes), 4), np.median(sizes), np.percentile(sizes, 99), np.max(sizes))

Constructing ego-net communities


                                                                                

385100 4.0201 2.0 28.0 49596


In [10]:
%%time

print("Constructing 1-hop communities")

communities_1_hop = get_communities_spark(
    [(x, [x]) for x in nodes_data.index], 
    ig.Graph.DataFrame(data.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 1, "all", 0.01, "amount_weighted"
)
sizes = [len(x[1]) for x in communities_1_hop]
print(len(sizes), round(np.mean(sizes), 4), np.median(sizes), np.percentile(sizes, 99), np.max(sizes))

Constructing 1-hop communities


                                                                                

385100 2.7259 2.0 15.0 312


In [13]:
%%time

print("Constructing 2-hop-out communities")

communities_2_hop_out = get_communities_spark(
    [(x, [x]) for x in nodes_data.index], 
    ig.Graph.DataFrame(data.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "out", 0.01, "amount_weighted"
)
sizes = [len(x[1]) for x in communities_2_hop_out]
print(len(sizes), round(np.mean(sizes), 4), np.median(sizes), np.percentile(sizes, 99), np.max(sizes))

Constructing 2-hop-out communities


                                                                                

385100 7.3424 3.0 44.0 1033
CPU times: user 2.47 s, sys: 90.6 ms, total: 2.56 s
Wall time: 34.6 s


In [15]:
%%time

print("Constructing 2-hop-in communities")

communities_2_hop_in = get_communities_spark(
    [(x, [x]) for x in nodes_data.index], 
    ig.Graph.DataFrame(data.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "in", 0.01, "amount_weighted"
)
sizes = [len(x[1]) for x in communities_2_hop_in]
print(len(sizes), round(np.mean(sizes), 4), np.median(sizes), np.percentile(sizes, 99), np.max(sizes))

Constructing 2-hop-in communities


                                                                                

385100 5.7006 3.0 33.0 772
CPU times: user 2.98 s, sys: 86.6 ms, total: 3.06 s
Wall time: 40 s


In [20]:
communities_2_hop_in_dict = dict(communities_2_hop_in)
communities_2_hop_combined = [
    (x, y.union(communities_2_hop_in_dict[x])) for x, y in communities_2_hop_out
]
sizes = [len(x[1]) for x in communities_2_hop_combined]
print(len(sizes), round(np.mean(sizes), 4), np.median(sizes), np.percentile(sizes, 99), np.max(sizes))

385100 12.3355 8.0 58.0 1066


In [22]:
graph = ig.Graph.DataFrame(data.loc[:, ["source", "target", "amount"]], use_vids=False, directed=True)
nodes = [x["name"] for x in graph.vs()]
page_rank_d_uw = graph.personalized_pagerank(vertices=None, directed=True, weights=None)
page_rank_d_uw = {k: v for k, v in zip(nodes, page_rank_d_uw)}
page_rank_ud_uw = graph.personalized_pagerank(vertices=None, directed=False, weights=None)
page_rank_ud_uw = {k: v for k, v in zip(nodes, page_rank_ud_uw)}
page_rank_d_amounts = graph.personalized_pagerank(vertices=None, directed=True, weights="amount")
page_rank_d_amounts = {k: v for k, v in zip(nodes, page_rank_d_amounts)}
page_rank_ud_amounts = graph.personalized_pagerank(vertices=None, directed=False, weights="amount")
page_rank_ud_amounts = {k: v for k, v in zip(nodes, page_rank_ud_amounts)}
del graph

In [33]:
graph = ig.Graph.DataFrame(data, use_vids=False, directed=True)

In [38]:
%%time

features_ego = generate_features_spark(communities_ego, graph, spark)

                                                                                

CPU times: user 2min 11s, sys: 20 s, total: 2min 31s
Wall time: 5min 52s


In [52]:
%%time

features_1_hop = generate_features_spark(communities_1_hop, graph, spark)

24


                                                                                

CPU times: user 2min 11s, sys: 17.9 s, total: 2min 29s
Wall time: 5min 30s


In [41]:
%%time

features_2_hop_out = generate_features_spark(communities_2_hop_out, graph, spark)

                                                                                

CPU times: user 1min 44s, sys: 18.5 s, total: 2min 2s
Wall time: 4min


In [42]:
%%time

features_2_hop_in = generate_features_spark(communities_2_hop_in, graph, spark)

                                                                                

CPU times: user 1min 47s, sys: 22.7 s, total: 2min 9s
Wall time: 4min 6s


In [43]:
%%time

features_2_hop_combined = generate_features_spark(communities_2_hop_combined, graph, spark)

                                                                                

CPU times: user 2min 23s, sys: 17.7 s, total: 2min 41s
Wall time: 6min 9s


In [55]:
%%time

print("1-hop-source features creation")

features_source = spark.createDataFrame(data).withColumn(
    "key", sf.col("source")
).repartition(os.cpu_count(), "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_source = pd.DataFrame(features_source["features"].apply(json.loads).tolist())
features_source.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_source.columns]

1-hop-source features creation


25/07/20 14:49:31 WARN TaskSetManager: Stage 58 contains a task of very large size (6867 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.62 s, sys: 213 ms, total: 1.83 s
Wall time: 1min 14s


In [56]:
%%time

print("1-hop-target features creation")

features_target = spark.createDataFrame(data).withColumn(
    "key", sf.col("target")
).repartition(os.cpu_count(), "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_target = pd.DataFrame(features_target["features"].apply(json.loads).tolist())
features_target.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_target.columns]

1-hop-target features creation


25/07/20 14:50:46 WARN TaskSetManager: Stage 61 contains a task of very large size (6961 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.62 s, sys: 203 ms, total: 1.83 s
Wall time: 1min 14s


In [59]:
features_ego.loc[:, "page_rank_d_uw"] = features_ego.loc[:, "key"].apply(lambda x: page_rank_d_uw[x])
features_ego.loc[:, "page_rank_ud_uw"] = features_ego.loc[:, "key"].apply(lambda x: page_rank_ud_uw[x])
features_ego.loc[:, "page_rank_d_amounts"] = features_ego.loc[:, "key"].apply(lambda x: page_rank_d_amounts[x])
features_ego.loc[:, "page_rank_ud_amounts"] = features_ego.loc[:, "key"].apply(lambda x: page_rank_ud_amounts[x])

features_1_hop
features_2_hop_out
features_2_hop_in
features_2_hop_combined

all_features = features_ego.set_index("key").join(
    features_1_hop.set_index("key"), how="outer", rsuffix=f"_1_hop"
).join(
    features_2_hop_out.set_index("key"), how="outer", rsuffix=f"_2_hop_out"
).join(
    features_2_hop_in.set_index("key"), how="outer", rsuffix=f"_2_hop_in"
).join(
    features_2_hop_combined.set_index("key"), how="outer", rsuffix=f"_2_hop_combined"
).join(
    features_source.set_index("key"), how="outer", rsuffix=f"_as_source"
).join(
    features_target.set_index("key"), how="outer", rsuffix=f"_as_target"
)

all_features = all_features.join(
    pd.read_parquet(location_comm_as_source_features), how="left", rsuffix="_dispense"
).join(
    pd.read_parquet(location_comm_as_target_features), how="left", rsuffix="_sink"
).join(
    pd.read_parquet(location_comm_as_passthrough_features), how="left", rsuffix="_passthrough"
).join(
    pd.read_parquet(location_comm_as_passthrough_features_reverse), how="left", rsuffix="_passthrough_rev"
)

all_features.to_parquet(location_features_node_level)
del all_features

In [60]:
all_features = pd.read_parquet(location_features_node_level)

In [62]:
constants = []
for column in all_features.columns:
    if all_features[column].nunique(dropna=True) <= 1:
        print("Deleting", column)
        del all_features[column]
        constants.append(column)

Deleting graph_1_hop_feat_num_sources
Deleting graph_1_hop_feat_num_source_and_target
Deleting graph_1_hop_feat_num_source_only
Deleting graph_1_hop_feat_std_debit_edges
Deleting graph_1_hop_feat_std_debit_edges_weighted
Deleting graph_1_hop_feat_num_targets_as_target
Deleting graph_1_hop_feat_num_source_and_target_as_target
Deleting graph_1_hop_feat_num_target_only_as_target
Deleting graph_1_hop_feat_std_credit_edges_as_target
Deleting graph_1_hop_feat_std_credit_edges_weighted_as_target


In [64]:
medians = {}
for column in all_features.columns:
    medians[column] = np.nanmedian(all_features[column])

In [81]:
# from sklearn.preprocessing import MinMaxScaler

# outliers = all_features.loc[nodes_data[nodes_data["alert_weight"] > 0].index, :]
# rest = all_features.loc[nodes_data[nodes_data["alert_weight"] == 0].index, :]

# outliers_means = {}
# for column in outliers.columns:
#     scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(
#         outliers[column].astype(np.float64).values.reshape(-1, 1)
#     ).flatten()
#     outliers_means[column] = np.nanmedian(scaled)

# rest_means = {}
# for column in rest.columns:
#     scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(
#         rest[column].astype(np.float64).values.reshape(-1, 1)
#     ).flatten()
#     rest_means[column] = np.nanmedian(scaled)

# diffs = []
# for column in all_features.columns:
#     x, y = outliers_means[column], rest_means[column]
#     x_y = (x+y) or 1
#     diffs.append((column, abs(x-y) / x_y))
# diffs = pd.DataFrame(diffs, columns=["feat", "diff"])

# selec_cols = diffs.sort_values("diff").tail(100)["feat"].tolist()

In [99]:
%%time

anomalies = all_features.loc[:, []]
model = IsolationForest(n_estimators=10_000, max_features=1.0)
anomalies.loc[:, "anomaly_score"] = model.fit(
    all_features.fillna(0)
).decision_function(all_features.fillna(medians))
anomalies = anomalies.sort_values("anomaly_score", ascending=True)
anomalies = anomalies.join(nodes_data)

CPU times: user 5min 20s, sys: 22.5 s, total: 5min 43s
Wall time: 5min 43s


In [100]:
size = anomalies.shape[0]
perc_point_1 = round(size * (0.1 / 100))
perc_point_2 = round(size * (0.2 / 100))
perc_point_5 = round(size * (0.5 / 100))
perc_1 = round(size * (1 / 100))
print(size, perc_point_1, perc_point_2, perc_point_5, perc_1)

385100 385 770 1926 3851


In [101]:
def add_predicted_alert_weight(anomalies_input, perc_count):
    anomalies_perc_x = anomalies_input.copy(deep=True)
    index = anomalies_perc_x.head(perc_count).index.tolist()
    anomalies_perc_x = anomalies_perc_x.loc[index, :]
    anomalies_perc_x.loc[:, "predicted_alert_weight"] = anomalies_perc_x.loc[:, "alert_weight"]
    anomalies_perc_x.loc[:, "predicted_report_weight"] = anomalies_perc_x.loc[:, "report_weight"]
    return anomalies_perc_x

In [102]:
anomalies_perc_point_1 = add_predicted_alert_weight(anomalies, perc_point_1)
anomalies_perc_point_2 = add_predicted_alert_weight(anomalies, perc_point_2)
anomalies_perc_point_5 = add_predicted_alert_weight(anomalies, perc_point_5)
anomalies_perc_1 = add_predicted_alert_weight(anomalies, perc_1)

In [103]:
print(round(anomalies_perc_point_1["predicted_alert_weight"].sum(), 4))
print(round(anomalies_perc_point_2["predicted_alert_weight"].sum(), 4))
print(round(anomalies_perc_point_5["predicted_alert_weight"].sum(), 4))
print(round(anomalies_perc_1["predicted_alert_weight"].sum(), 4))
# 0.2834
# 0.4265
# 0.6431
# 0.7544

0.2776
0.4197
0.6412
0.7553


In [104]:
tpr = np.cumsum(anomalies_perc_1["predicted_alert_weight"])
print()
print(round(np.mean(tpr), 4))
print()
# 0.5725


0.5711



In [105]:
print(round(anomalies_perc_point_1["predicted_report_weight"].sum(), 4))
print(round(anomalies_perc_point_2["predicted_report_weight"].sum(), 4))
print(round(anomalies_perc_point_5["predicted_report_weight"].sum(), 4))
print(round(anomalies_perc_1["predicted_report_weight"].sum(), 4))
# 0.2273
# 0.4545
# 0.6818
# 0.9091

0.2273
0.4545
0.6818
0.9091


In [106]:
tpr = np.cumsum(anomalies_perc_1["predicted_report_weight"])
print()
print(round(np.mean(tpr), 4))
print()
# 0.614


0.6154



In [None]:
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")