### https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9762926

In [1]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd

import settings as s
from common import get_weights
from communities import get_communities_spark
from features import generate_features_spark, generate_features_udf_wrapper, SCHEMA_FEAT_UDF

%load_ext autoreload
%autoreload 2

In [2]:
# Guard: refuse to continue on any interpreter other than exactly Python 3.11.8.
# The first three fields of sys.version_info are (major, minor, micro).
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [3]:
# Spark settings for this session, as (property, value) pairs handed to SparkConf.setAll.
# NOTE(review): `spark.worker.memory` does not look like a documented Spark application
# property (executor memory is normally `spark.executor.memory`) -- confirm it has any effect.
# NOTE(review): per the Spark SQL docs, a threshold of -1 turns automatic broadcast joins off.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
]

# Start from a clean slate: delete any "artifacts" directory from an earlier run.
# ignore_errors=True means a missing directory (or a failed delete) is silently skipped.
shutil.rmtree("artifacts", ignore_errors=True)

# Create the session named "testing" with the settings above, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 16:09:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Raw CSV header -> column name used throughout the rest of the notebook.
rename = {
    "id_source": "source",
    "id_destination": "target",
    "cum_amount": "amount",
    "nr_transactions": "num_transactions",
    "nr_alerts": "alerts_count",
    "nr_reports": "reports_count",
}

# Load the edge list and switch to the notebook's column names in one step.
data = pd.read_csv("./data/Libra_bank_3months_graph/data.csv").rename(columns=rename)

# Re-label both endpoints as "nid-<integer id>" strings and keep only the six columns
# the notebook works with, in a fixed order.
data = data.assign(
    source=data["source"].astype(str).map(lambda raw_id: f"nid-{int(raw_id)}"),
    target=data["target"].astype(str).map(lambda raw_id: f"nid-{int(raw_id)}"),
)[["source", "target", "amount", "num_transactions", "alerts_count", "reports_count"]]

In [5]:
# One row per distinct node id that appears as either endpoint of an edge, sorted, indexed by "key".
nodes_data = pd.DataFrame(index=sorted(set(data["source"]).union(data["target"])))
nodes_data.index.name = "key"

# Denominators: twice the edge-level totals, because every edge's count is credited to
# both its source and its target when the per-node shares are computed below.
w_alerts = int(2 * data["alerts_count"].sum())
w_reports = int(2 * data["reports_count"].sum())

# Per-node alert totals, split by the role the node plays on the edge.
w_alerts_source = data[data["alerts_count"] > 0].groupby("source")["alerts_count"].sum().to_dict()
w_alerts_target = data[data["alerts_count"] > 0].groupby("target")["alerts_count"].sum().to_dict()

# Same for reports.
w_reports_source = data[data["reports_count"] > 0].groupby("source")["reports_count"].sum().to_dict()
w_reports_target = data[data["reports_count"] > 0].groupby("target")["reports_count"].sum().to_dict()

# Share of all alerts / reports attached to each node; nodes with none get 0.
nodes_data.loc[:, "alert_weight"] = nodes_data.index.map(
    lambda node_id: (w_alerts_source.get(node_id, 0) + w_alerts_target.get(node_id, 0)) / w_alerts
)
nodes_data.loc[:, "report_weight"] = nodes_data.index.map(
    lambda node_id: (w_reports_source.get(node_id, 0) + w_reports_target.get(node_id, 0)) / w_reports
)

In [6]:
# Node counts corresponding to fixed percentages of the full node set, rounded with round().
# The four smallest are the review budgets reported in the results; the larger ones are the
# rectification cut-offs used by the loop further down.
total_nodes = len(nodes_data)
(
    perc_point_1_cnt,
    perc_point_2_cnt,
    perc_point_5_cnt,
    perc_1_cnt,
    perc_10_cnt,
    perc_20_cnt,
    perc_30_cnt,
    perc_40_cnt,
    perc_50_cnt,
    perc_75_cnt,
) = (
    round(total_nodes * (pct / 100))
    for pct in (0.1, 0.2, 0.5, 1, 10, 20, 30, 40, 50, 75)
)
print(total_nodes, perc_point_1_cnt, perc_point_2_cnt, perc_point_5_cnt, perc_1_cnt)

385100 385 770 1926 3851


In [7]:
%%time

# Baseline run on the full graph: every node is a candidate and every edge is in scope.
candidates = nodes_data.index.tolist()
data_in_scope = data.copy(deep=True)
# Attach the columns returned by get_weights to each edge (left join on source/target).
# The code below reads a "weight" column from the joined result.
data_in_scope = data_in_scope.set_index(["source", "target"]).join(
    get_weights(data_in_scope).set_index(["source", "target"]), how="left"
).reset_index()
# Scale each amount by its weight relative to the largest weight in scope.
# NOTE(review): the rectification loop further down computes amount_weighted as
# amount * weight, without dividing by weight.max() -- confirm the difference is intended.
data_in_scope.loc[:, "amount_weighted"] = (
    data_in_scope.loc[:, "amount"] * 
    (data_in_scope.loc[:, "weight"] / data_in_scope.loc[:, "weight"].max())
)

# Run the model notebook in this session. Presumably it consumes `candidates` and
# `data_in_scope` and defines `anomalies` (as well as communities_1_hop, sizes_leiden and
# sizes_1_hop, which later cells read) -- verify against model.ipynb.
%run model.ipynb

# Keep the baseline ranking under its own name: the loop further down re-runs the model
# notebook and reads `anomalies` again after each run.
anomalies_main = anomalies.copy(deep=True)

25/07/21 16:09:47 WARN TaskSetManager: Stage 0 contains a task of very large size (3851 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 472,592 | 213,511
Processed hop #2 | 6,730,058 | 194,847
Processed hop #3 | 7,805,533 | 186,398
Processed hop #4 | 8,894,931 | 185,538
Processed hop #5 | 8,889,368 | 185,321

Processing comm_as_target

Processed hop #1 | 414,376 | 221,458
Processed hop #2 | 6,111,806 | 201,104
Processed hop #3 | 7,163,788 | 188,404
Processed hop #4 | 8,469,276 | 186,909
Processed hop #5 | 8,208,835 | 183,360

Processing comm_as_passthrough

Processed hop #1 | 251,417 | 49,869
Processed hop #2 | 1,310,227 | 42,290
Processed hop #3 | 1,725,981 | 40,671
Processed hop #4 | 1,918,895 | 40,412
Processed hop #5 | 1,952,010 | 40,363

Processing comm_as_passthrough_reverse

Processed hop #1 | 197,559 | 49,869
Processed hop #2 | 1,081,845 | 41,267
Processed hop #3 | 1,434,805 | 38,874
Processed hop #4 | 1,719,403 | 38,462
Processed hop #5 | 1,722,034 | 38,030


comm_as_source_features

CPU times: user 37.4 s, sys: 363 ms, total: 37.8 s
Wall time: 37.7 s

comm_as_tar

                                                                                

CPU times: user 2.04 s, sys: 69.8 ms, total: 2.11 s
Wall time: 24.8 s


                                                                                

CPU times: user 4.49 s, sys: 273 ms, total: 4.76 s
Wall time: 37.8 s


                                                                                

CPU times: user 2min 7s, sys: 32.9 s, total: 2min 39s
Wall time: 5min 40s
1-hop-source features creation


25/07/21 16:28:57 WARN TaskSetManager: Stage 12 contains a task of very large size (6867 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.69 s, sys: 240 ms, total: 1.93 s
Wall time: 1min 13s
1-hop-target features creation


25/07/21 16:30:10 WARN TaskSetManager: Stage 15 contains a task of very large size (6961 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.72 s, sys: 263 ms, total: 1.98 s
Wall time: 1min 15s
Features: (385100, 176)
Deleted 10 constant columns
Script executed in 0:21:44
Training the model
CPU times: user 11min 4s, sys: 58.1 s, total: 12min 2s
Wall time: 12min 2s
CPU times: user 25min 10s, sys: 2min 13s, total: 27min 24s
Wall time: 33min 51s


In [None]:
# Directed graph over every edge in `data`. With use_vids=False igraph reads the two columns
# as vertex names rather than integer vertex ids (see the igraph Graph.DataFrame docs).
# NOTE(review): this cell has no execution count in the saved notebook, yet later cells call
# graph_global.induced_subgraph -- make sure it runs before them on a fresh kernel.
graph_global = ig.Graph.DataFrame(data.loc[:, ["source", "target"]], directed=True, use_vids=False)

In [8]:
def add_predicted_alert_weight(anomalies_input, perc_count_in):
    """Return the first ``perc_count_in`` rows of a ranking with two extra columns.

    The result carries ``predicted_alert_weight`` and ``predicted_report_weight``,
    which are copies of the ``alert_weight`` and ``report_weight`` columns of the
    selected rows. The input frame is never modified.

    Rows are taken positionally with ``head()`` and only that slice is copied.
    Selecting by position (instead of re-selecting the head's labels with ``.loc``)
    guarantees exactly ``perc_count_in`` rows even when an index label repeats, and
    avoids deep-copying the whole ranking on every call.

    Parameters
    ----------
    anomalies_input : pandas.DataFrame
        Ranking frame with ``alert_weight`` and ``report_weight`` columns.
        Presumably already ordered from most to least anomalous -- this function
        does not sort, so verify against the code that builds it.
    perc_count_in : int
        Number of leading rows to keep (passed straight to ``DataFrame.head``).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows with the two ``predicted_*``
        columns added.

    Raises
    ------
    KeyError
        If ``alert_weight`` or ``report_weight`` is missing from the input.
    """
    top_rows = anomalies_input.head(perc_count_in).copy(deep=True)
    top_rows["predicted_alert_weight"] = top_rows["alert_weight"]
    top_rows["predicted_report_weight"] = top_rows["report_weight"]
    return top_rows

In [9]:
# Baseline summary row (rectify_perc = 1.0), built from the full-graph model run.
# communities_1_hop, sizes_leiden and sizes_1_hop are not defined in this notebook's own
# cells; presumably they are produced by `%run model.ipynb` -- verify there.
communities_1_hop_dict = dict(communities_1_hop)

# Slice the head of the baseline ranking once per review budget and reuse the slices below,
# instead of re-slicing for every metric.
top_0_1_perc = add_predicted_alert_weight(anomalies_main, perc_point_1_cnt)
top_0_2_perc = add_predicted_alert_weight(anomalies_main, perc_point_2_cnt)
top_0_5_perc = add_predicted_alert_weight(anomalies_main, perc_point_5_cnt)
top_1_perc = add_predicted_alert_weight(anomalies_main, perc_1_cnt)

# 1-hop community of every node inside each budget.
comms_1_hop_0_1_perc = [communities_1_hop_dict[node] for node in top_0_1_perc.index]
comms_1_hop_0_2_perc = [communities_1_hop_dict[node] for node in top_0_2_perc.index]
comms_1_hop_0_5_perc = [communities_1_hop_dict[node] for node in top_0_5_perc.index]
comms_1_hop_1_perc = [communities_1_hop_dict[node] for node in top_1_perc.index]

# Community sizes measured in nodes ...
sizes_1_hop_0_1_perc = [len(comm) for comm in comms_1_hop_0_1_perc]
sizes_1_hop_0_2_perc = [len(comm) for comm in comms_1_hop_0_2_perc]
sizes_1_hop_0_5_perc = [len(comm) for comm in comms_1_hop_0_5_perc]
sizes_1_hop_1_perc = [len(comm) for comm in comms_1_hop_1_perc]

# ... and in edges of the full graph that have both endpoints inside the community.
sizes_edge_1_hop_0_1_perc = [graph_global.induced_subgraph(comm).ecount() for comm in comms_1_hop_0_1_perc]
sizes_edge_1_hop_0_2_perc = [graph_global.induced_subgraph(comm).ecount() for comm in comms_1_hop_0_2_perc]
sizes_edge_1_hop_0_5_perc = [graph_global.induced_subgraph(comm).ecount() for comm in comms_1_hop_0_5_perc]
sizes_edge_1_hop_1_perc = [graph_global.induced_subgraph(comm).ecount() for comm in comms_1_hop_1_perc]

results = [{
    "rectify_perc": 1.0,
    "leiden_size_max": np.max(sizes_leiden),
    "leiden_size_mean": np.mean(sizes_leiden),
    "leiden_size_median": np.median(sizes_leiden),
    "1_hop_size_max": np.max(sizes_1_hop),
    "1_hop_size_mean": np.mean(sizes_1_hop),
    "1_hop_size_median": np.median(sizes_1_hop),
    # Share of all alert weight captured within each budget.
    "0.1%": top_0_1_perc["predicted_alert_weight"].sum(),
    "0.2%": top_0_2_perc["predicted_alert_weight"].sum(),
    "0.5%": top_0_5_perc["predicted_alert_weight"].sum(),
    "1%": top_1_perc["predicted_alert_weight"].sum(),
    # Mean of the running total over the top-1% slice.
    "auc_1%": np.mean(np.cumsum(top_1_perc["predicted_alert_weight"])),
    # Same metrics for report weight.
    "report_0.1%": top_0_1_perc["predicted_report_weight"].sum(),
    "report_0.2%": top_0_2_perc["predicted_report_weight"].sum(),
    "report_0.5%": top_0_5_perc["predicted_report_weight"].sum(),
    "report_1%": top_1_perc["predicted_report_weight"].sum(),
    "report_auc_1%": np.mean(np.cumsum(top_1_perc["predicted_report_weight"])),
}]

# max / mean / median of the community sizes per budget, first in nodes ("..._1_hop_...")
# and then in edges ("..._edges_1_hop_..."). The generated key order is: all max_* keys,
# then mean_*, then median_* for node sizes, followed by the same three groups for edges.
results[0].update({
    f"{stat_name}_{family}_{suffix}_perc": stat_fn(sizes)
    for family, sizes_by_suffix in (
        ("1_hop", {
            "0_1": sizes_1_hop_0_1_perc,
            "0_2": sizes_1_hop_0_2_perc,
            "0_5": sizes_1_hop_0_5_perc,
            "1": sizes_1_hop_1_perc,
        }),
        ("edges_1_hop", {
            "0_1": sizes_edge_1_hop_0_1_perc,
            "0_2": sizes_edge_1_hop_0_2_perc,
            "0_5": sizes_edge_1_hop_0_5_perc,
            "1": sizes_edge_1_hop_1_perc,
        }),
    )
    for stat_name, stat_fn in (("max", np.max), ("mean", np.mean), ("median", np.median))
    for suffix, sizes in sizes_by_suffix.items()
})

In [10]:
%%time

for perc_count in [perc_75_cnt, perc_50_cnt, perc_40_cnt, perc_30_cnt, perc_20_cnt, perc_10_cnt]:
    rectify_perc = round(perc_count / total_nodes, 2)
    candidates = add_predicted_alert_weight(anomalies_main, perc_count).index.tolist()
    filter_ = data["source"].isin(candidates) & data["target"].isin(candidates)
    data_in_scope = data.loc[filter_, :]
    candidates = sorted(set(data_in_scope["source"].unique()).union(data_in_scope["target"].unique()))
    print("=" * 100)
    print(rectify_perc, perc_count, len(candidates), data_in_scope.shape)
    print("=" * 100)
    data_in_scope = data_in_scope.set_index(["source", "target"]).join(
        get_weights(data_in_scope).set_index(["source", "target"]), how="left"
    ).reset_index()
    data_in_scope.loc[:, "amount_weighted"] = data_in_scope.loc[:, "amount"] * data_in_scope.loc[:, "weight"]
    
    %run model.ipynb

    communities_1_hop_dict = dict(communities_1_hop)

    comms_1_hop_0_1_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_1_cnt).index]
    comms_1_hop_0_2_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_2_cnt).index]
    comms_1_hop_0_5_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_5_cnt).index]
    comms_1_hop_1_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_1_cnt).index]
    
    sizes_1_hop_0_1_perc = [len(x) for x in comms_1_hop_0_1_perc]
    sizes_1_hop_0_2_perc = [len(x) for x in comms_1_hop_0_2_perc]
    sizes_1_hop_0_5_perc = [len(x) for x in comms_1_hop_0_5_perc]
    sizes_1_hop_1_perc = [len(x) for x in comms_1_hop_1_perc]
    
    sizes_edge_1_hop_0_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_1_perc]
    sizes_edge_1_hop_0_2_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_2_perc]
    sizes_edge_1_hop_0_5_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_5_perc]
    sizes_edge_1_hop_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_1_perc]

    results.append({
        "rectify_perc": rectify_perc,
        "leiden_size_max": np.max(sizes_leiden),
        "leiden_size_mean": np.mean(sizes_leiden),
        "leiden_size_median": np.median(sizes_leiden),
        "1_hop_size_max": np.max(sizes_1_hop),
        "1_hop_size_mean": np.mean(sizes_1_hop),
        "1_hop_size_median": np.median(sizes_1_hop),
        "0.1%": add_predicted_alert_weight(anomalies, perc_point_1_cnt)["predicted_alert_weight"].sum(),
        "0.2%": add_predicted_alert_weight(anomalies, perc_point_2_cnt)["predicted_alert_weight"].sum(),
        "0.5%": add_predicted_alert_weight(anomalies, perc_point_5_cnt)["predicted_alert_weight"].sum(),
        "1%": add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_alert_weight"].sum(),
        "auc_1%": np.mean(np.cumsum(add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_alert_weight"])),
        "report_0.1%": add_predicted_alert_weight(anomalies, perc_point_1_cnt)["predicted_report_weight"].sum(),
        "report_0.2%": add_predicted_alert_weight(anomalies, perc_point_2_cnt)["predicted_report_weight"].sum(),
        "report_0.5%": add_predicted_alert_weight(anomalies, perc_point_5_cnt)["predicted_report_weight"].sum(),
        "report_1%": add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_report_weight"].sum(),
        "report_auc_1%": np.mean(np.cumsum(add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_report_weight"])),
        "max_1_hop_0_1_perc": np.max(sizes_1_hop_0_1_perc),
        "max_1_hop_0_2_perc": np.max(sizes_1_hop_0_2_perc),
        "max_1_hop_0_5_perc": np.max(sizes_1_hop_0_5_perc),
        "max_1_hop_1_perc": np.max(sizes_1_hop_1_perc),
        "mean_1_hop_0_1_perc": np.mean(sizes_1_hop_0_1_perc),
        "mean_1_hop_0_2_perc": np.mean(sizes_1_hop_0_2_perc),
        "mean_1_hop_0_5_perc": np.mean(sizes_1_hop_0_5_perc),
        "mean_1_hop_1_perc": np.mean(sizes_1_hop_1_perc),
        "median_1_hop_0_1_perc": np.median(sizes_1_hop_0_1_perc),
        "median_1_hop_0_2_perc": np.median(sizes_1_hop_0_2_perc),
        "median_1_hop_0_5_perc": np.median(sizes_1_hop_0_5_perc),
        "median_1_hop_1_perc": np.median(sizes_1_hop_1_perc),
        "max_edges_1_hop_0_1_perc": np.max(sizes_edge_1_hop_0_1_perc),
        "max_edges_1_hop_0_2_perc": np.max(sizes_edge_1_hop_0_2_perc),
        "max_edges_1_hop_0_5_perc": np.max(sizes_edge_1_hop_0_5_perc),
        "max_edges_1_hop_1_perc": np.max(sizes_edge_1_hop_1_perc),
        "mean_edges_1_hop_0_1_perc": np.mean(sizes_edge_1_hop_0_1_perc),
        "mean_edges_1_hop_0_2_perc": np.mean(sizes_edge_1_hop_0_2_perc),
        "mean_edges_1_hop_0_5_perc": np.mean(sizes_edge_1_hop_0_5_perc),
        "mean_edges_1_hop_1_perc": np.mean(sizes_edge_1_hop_1_perc),
        "median_edges_1_hop_0_1_perc": np.median(sizes_edge_1_hop_0_1_perc),
        "median_edges_1_hop_0_2_perc": np.median(sizes_edge_1_hop_0_2_perc),
        "median_edges_1_hop_0_5_perc": np.median(sizes_edge_1_hop_0_5_perc),
        "median_edges_1_hop_1_perc": np.median(sizes_edge_1_hop_1_perc),
    })

results = pd.DataFrame(results)
results.to_parquet("results.parquet")

0.75 288825 288825 (500887, 6)


25/07/21 16:43:36 WARN TaskSetManager: Stage 18 contains a task of very large size (3244 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 396,140 | 158,111
Processed hop #2 | 4,888,476 | 141,918
Processed hop #3 | 5,747,791 | 137,011
Processed hop #4 | 6,526,808 | 136,522
Processed hop #5 | 6,594,056 | 136,395

Processing comm_as_target

Processed hop #1 | 352,172 | 179,118
Processed hop #2 | 5,092,778 | 162,026
Processed hop #3 | 5,986,628 | 154,615
Processed hop #4 | 7,028,505 | 153,656
Processed hop #5 | 6,949,320 | 152,123

Processing comm_as_passthrough

Processed hop #1 | 230,637 | 48,404
Processed hop #2 | 1,276,613 | 41,708
Processed hop #3 | 1,701,713 | 40,271
Processed hop #4 | 1,900,589 | 40,040
Processed hop #5 | 1,938,006 | 40,001

Processing comm_as_passthrough_reverse

Processed hop #1 | 177,583 | 48,404
Processed hop #2 | 1,037,030 | 40,558
Processed hop #3 | 1,406,900 | 38,357
Processed hop #4 | 1,693,442 | 37,975
Processed hop #5 | 1,710,164 | 37,614


comm_as_source_features

CPU times: user 27.6 s, sys: 231 ms, total: 27.8 s
Wall time: 27.8 s

comm_as_tar

                                                                                

CPU times: user 2.25 s, sys: 46.8 ms, total: 2.3 s
Wall time: 12 s


                                                                                

CPU times: user 4.31 s, sys: 511 ms, total: 4.82 s
Wall time: 36.1 s


                                                                                

CPU times: user 1min 34s, sys: 10.4 s, total: 1min 44s
Wall time: 4min 14s
1-hop-source features creation


25/07/21 16:58:38 WARN TaskSetManager: Stage 30 contains a task of very large size (5785 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.37 s, sys: 194 ms, total: 1.56 s
Wall time: 54.1 s
1-hop-target features creation


25/07/21 16:59:32 WARN TaskSetManager: Stage 33 contains a task of very large size (5852 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.48 s, sys: 222 ms, total: 1.7 s
Wall time: 1min 1s
Features: (288825, 176)
Deleted 10 constant columns
Script executed in 0:17:01
Training the model
CPU times: user 8min 57s, sys: 37.5 s, total: 9min 35s
Wall time: 9min 35s
0.5 192550 190883 (399396, 6)


25/07/21 17:10:14 WARN TaskSetManager: Stage 36 contains a task of very large size (2600 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 312,358 | 107,851
Processed hop #2 | 3,675,487 | 100,049
Processed hop #3 | 4,335,147 | 98,180
Processed hop #4 | 4,727,743 | 97,911
Processed hop #5 | 4,788,164 | 97,863

Processing comm_as_target

Processed hop #1 | 272,663 | 128,056
Processed hop #2 | 4,180,373 | 119,094
Processed hop #3 | 4,826,942 | 115,924
Processed hop #4 | 5,399,060 | 115,417
Processed hop #5 | 5,385,781 | 114,717

Processing comm_as_passthrough

Processed hop #1 | 199,675 | 45,024
Processed hop #2 | 1,217,236 | 40,202
Processed hop #3 | 1,647,051 | 39,269
Processed hop #4 | 1,854,669 | 39,097
Processed hop #5 | 1,897,340 | 39,067

Processing comm_as_passthrough_reverse

Processed hop #1 | 150,941 | 45,024
Processed hop #2 | 968,960 | 39,046
Processed hop #3 | 1,362,423 | 37,416
Processed hop #4 | 1,647,956 | 37,119
Processed hop #5 | 1,689,262 | 36,896


comm_as_source_features

CPU times: user 19.5 s, sys: 198 ms, total: 19.7 s
Wall time: 19.7 s

comm_as_target_f

                                                                                

CPU times: user 1.4 s, sys: 31.3 ms, total: 1.43 s
Wall time: 7.48 s


                                                                                

CPU times: user 2.12 s, sys: 51.5 ms, total: 2.17 s
Wall time: 28.2 s


                                                                                

CPU times: user 59.4 s, sys: 1.2 s, total: 1min
Wall time: 2min 30s
1-hop-source features creation


25/07/21 17:20:56 WARN TaskSetManager: Stage 48 contains a task of very large size (4638 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 910 ms, sys: 146 ms, total: 1.06 s
Wall time: 37.5 s
1-hop-target features creation


25/07/21 17:21:33 WARN TaskSetManager: Stage 51 contains a task of very large size (4680 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.08 s, sys: 146 ms, total: 1.23 s
Wall time: 43.7 s
Features: (190883, 176)
Deleted 10 constant columns
Script executed in 0:12:05
Training the model
CPU times: user 6min 28s, sys: 16.4 s, total: 6min 44s
Wall time: 6min 44s
0.4 154040 152747 (354693, 6)


25/07/21 17:29:06 WARN TaskSetManager: Stage 54 contains a task of very large size (2321 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 272,071 | 77,729
Processed hop #2 | 2,361,260 | 71,374
Processed hop #3 | 2,988,970 | 69,875
Processed hop #4 | 3,333,275 | 69,647
Processed hop #5 | 3,393,947 | 69,601

Processing comm_as_target

Processed hop #1 | 255,291 | 118,637
Processed hop #2 | 3,960,404 | 111,017
Processed hop #3 | 4,564,945 | 108,438
Processed hop #4 | 5,066,632 | 108,006
Processed hop #5 | 5,067,354 | 107,436

Processing comm_as_passthrough

Processed hop #1 | 191,965 | 43,619
Processed hop #2 | 1,198,151 | 39,635
Processed hop #3 | 1,628,464 | 38,834
Processed hop #4 | 1,835,073 | 38,680
Processed hop #5 | 1,878,240 | 38,653

Processing comm_as_passthrough_reverse

Processed hop #1 | 145,105 | 43,619
Processed hop #2 | 949,766 | 38,439
Processed hop #3 | 1,345,950 | 37,010
Processed hop #4 | 1,630,207 | 36,745
Processed hop #5 | 1,677,167 | 36,567


comm_as_source_features

CPU times: user 13.9 s, sys: 140 ms, total: 14.1 s
Wall time: 14.1 s

comm_as_target_fea

                                                                                

CPU times: user 1.12 s, sys: 29.3 ms, total: 1.15 s
Wall time: 6.39 s


                                                                                

CPU times: user 1.38 s, sys: 36.5 ms, total: 1.42 s
Wall time: 23.5 s


                                                                                

CPU times: user 47.9 s, sys: 477 ms, total: 48.4 s
Wall time: 2min
1-hop-source features creation


25/07/21 17:38:01 WARN TaskSetManager: Stage 66 contains a task of very large size (4142 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 670 ms, sys: 117 ms, total: 787 ms
Wall time: 27.1 s
1-hop-target features creation


25/07/21 17:38:28 WARN TaskSetManager: Stage 69 contains a task of very large size (4168 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.01 s, sys: 143 ms, total: 1.15 s
Wall time: 41.4 s
Features: (152747, 176)
Deleted 10 constant columns
Script executed in 0:10:05
Training the model
CPU times: user 5min 27s, sys: 11 s, total: 5min 38s
Wall time: 5min 38s
0.3 115530 114523 (306716, 6)


25/07/21 17:44:51 WARN TaskSetManager: Stage 72 contains a task of very large size (2011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 246,426 | 65,322
Processed hop #2 | 1,922,643 | 60,256
Processed hop #3 | 2,518,073 | 59,048
Processed hop #4 | 2,812,400 | 58,864
Processed hop #5 | 2,870,465 | 58,824

Processing comm_as_target

Processed hop #1 | 217,457 | 90,132
Processed hop #2 | 2,822,552 | 83,887
Processed hop #3 | 3,364,734 | 81,907
Processed hop #4 | 3,790,510 | 81,549
Processed hop #5 | 3,850,631 | 81,366

Processing comm_as_passthrough

Processed hop #1 | 180,661 | 40,931
Processed hop #2 | 1,133,796 | 37,781
Processed hop #3 | 1,558,435 | 37,136
Processed hop #4 | 1,756,934 | 37,010
Processed hop #5 | 1,794,993 | 36,989

Processing comm_as_passthrough_reverse

Processed hop #1 | 137,527 | 40,931
Processed hop #2 | 897,432 | 36,652
Processed hop #3 | 1,289,924 | 35,495
Processed hop #4 | 1,567,589 | 35,268
Processed hop #5 | 1,622,151 | 35,136


comm_as_source_features

CPU times: user 11.7 s, sys: 77.9 ms, total: 11.7 s
Wall time: 11.7 s

comm_as_target_feature

                                                                                

CPU times: user 935 ms, sys: 25.5 ms, total: 960 ms
Wall time: 5.47 s


                                                                                

CPU times: user 1.48 s, sys: 25.1 ms, total: 1.5 s
Wall time: 13.3 s


                                                                                

CPU times: user 34.8 s, sys: 1.8 s, total: 36.6 s
Wall time: 1min 31s
1-hop-source features creation


25/07/21 17:52:00 WARN TaskSetManager: Stage 84 contains a task of very large size (3589 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 570 ms, sys: 86.5 ms, total: 656 ms
Wall time: 22.9 s
1-hop-target features creation


25/07/21 17:52:23 WARN TaskSetManager: Stage 87 contains a task of very large size (3608 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 767 ms, sys: 114 ms, total: 880 ms
Wall time: 31.5 s
Features: (114523, 176)
Deleted 10 constant columns
Script executed in 0:08:05
Training the model
CPU times: user 4min 24s, sys: 13.8 s, total: 4min 37s
Wall time: 4min 37s
0.2 77020 76228 (254086, 6)


25/07/21 17:57:35 WARN TaskSetManager: Stage 90 contains a task of very large size (1670 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 218,974 | 53,211
Processed hop #2 | 1,522,982 | 49,394
Processed hop #3 | 2,070,687 | 48,520
Processed hop #4 | 2,309,479 | 48,379
Processed hop #5 | 2,363,897 | 48,345

Processing comm_as_target

Processed hop #1 | 175,948 | 58,852
Processed hop #2 | 1,537,957 | 53,939
Processed hop #3 | 2,018,635 | 52,404
Processed hop #4 | 2,373,355 | 52,112
Processed hop #5 | 2,448,841 | 52,026

Processing comm_as_passthrough

Processed hop #1 | 166,394 | 35,835
Processed hop #2 | 988,775 | 33,451
Processed hop #3 | 1,393,947 | 32,960
Processed hop #4 | 1,561,664 | 32,871
Processed hop #5 | 1,601,137 | 32,851

Processing comm_as_passthrough_reverse

Processed hop #1 | 126,441 | 35,835
Processed hop #2 | 772,665 | 32,396
Processed hop #3 | 1,142,152 | 31,478
Processed hop #4 | 1,393,023 | 31,286
Processed hop #5 | 1,453,054 | 31,219


comm_as_source_features

CPU times: user 9.61 s, sys: 71.4 ms, total: 9.68 s
Wall time: 9.67 s

comm_as_target_features


                                                                                

CPU times: user 521 ms, sys: 14.7 ms, total: 536 ms
Wall time: 4.78 s


                                                                                

CPU times: user 1.18 s, sys: 31 ms, total: 1.21 s
Wall time: 4.96 s


                                                                                

CPU times: user 23.1 s, sys: 190 ms, total: 23.3 s
Wall time: 1min 6s
1-hop-source features creation


25/07/21 18:03:04 WARN TaskSetManager: Stage 102 contains a task of very large size (2979 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 459 ms, sys: 78 ms, total: 537 ms
Wall time: 19.3 s
1-hop-target features creation


25/07/21 18:03:24 WARN TaskSetManager: Stage 105 contains a task of very large size (2993 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 508 ms, sys: 84.7 ms, total: 593 ms
Wall time: 20.5 s
Features: (76228, 176)
Deleted 10 constant columns
Script executed in 0:06:10
Training the model
CPU times: user 3min 16s, sys: 3.12 s, total: 3min 19s
Wall time: 3min 19s
0.1 38510 38053 (172282, 6)


25/07/21 18:07:05 WARN TaskSetManager: Stage 108 contains a task of very large size (1139 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 153,263 | 30,374
Processed hop #2 | 841,287 | 28,455
Processed hop #3 | 1,216,177 | 28,022
Processed hop #4 | 1,344,135 | 27,960
Processed hop #5 | 1,375,353 | 27,946

Processing comm_as_target

Processed hop #1 | 119,348 | 30,256
Processed hop #2 | 693,467 | 27,495
Processed hop #3 | 999,834 | 26,695
Processed hop #4 | 1,198,997 | 26,544
Processed hop #5 | 1,250,920 | 26,507

Processing comm_as_passthrough

Processed hop #1 | 125,086 | 22,577
Processed hop #2 | 640,370 | 21,402
Processed hop #3 | 923,907 | 21,153
Processed hop #4 | 1,015,528 | 21,109
Processed hop #5 | 1,037,948 | 21,101

Processing comm_as_passthrough_reverse

Processed hop #1 | 94,470 | 22,577
Processed hop #2 | 497,392 | 20,573
Processed hop #3 | 749,719 | 20,064
Processed hop #4 | 900,486 | 19,962
Processed hop #5 | 942,680 | 19,941


comm_as_source_features

CPU times: user 5.53 s, sys: 25.4 ms, total: 5.55 s
Wall time: 5.55 s

comm_as_target_features

CPU times: use

                                                                                

CPU times: user 378 ms, sys: 13.5 ms, total: 392 ms
Wall time: 1.66 s


                                                                                

CPU times: user 779 ms, sys: 22.9 ms, total: 802 ms
Wall time: 3.22 s


                                                                                

CPU times: user 11.2 s, sys: 97 ms, total: 11.3 s
Wall time: 33.4 s
1-hop-source features creation


25/07/21 18:10:10 WARN TaskSetManager: Stage 120 contains a task of very large size (2030 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 282 ms, sys: 47.7 ms, total: 330 ms
Wall time: 11.7 s
1-hop-target features creation


25/07/21 18:10:22 WARN TaskSetManager: Stage 123 contains a task of very large size (2036 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 266 ms, sys: 44.7 ms, total: 311 ms
Wall time: 11 s
Features: (38053, 176)
Deleted 10 constant columns
Script executed in 0:03:28
Training the model
CPU times: user 2min 8s, sys: 2 s, total: 2min 10s
Wall time: 2min 10s
CPU times: user 1h 9min 56s, sys: 3min 33s, total: 1h 13min 30s
Wall time: 1h 29min 10s


In [11]:
# Show the summary table: one row per rectify_perc value, the baseline (1.0) first.
results

Unnamed: 0,rectify_perc,leiden_size_max,leiden_size_mean,leiden_size_median,1_hop_size_max,1_hop_size_mean,1_hop_size_median,0.1%,0.2%,0.5%,...,max_1_hop_0_5_perc,max_1_hop_1_perc,mean_1_hop_0_1_perc,mean_1_hop_0_2_perc,mean_1_hop_0_5_perc,mean_1_hop_1_perc,median_1_hop_0_1_perc,median_1_hop_0_2_perc,median_1_hop_0_5_perc,median_1_hop_1_perc
0,1.0,48422,35.957049,2.0,312,2.725913,2.0,0.360735,0.520309,0.660542,...,312,312,23.25974,23.448052,21.009346,17.516489,13.0,13.0,11.0,8.0
1,0.75,48442,27.109536,2.0,259,2.867692,2.0,0.361702,0.527079,0.670213,...,259,259,23.428571,23.487013,20.241952,16.637497,13.0,13.0,11.0,8.0
2,0.5,48585,43.244903,2.0,263,3.057554,2.0,0.370406,0.525145,0.687621,...,263,263,22.431169,21.996104,18.59865,14.332641,13.0,12.5,10.0,7.0
3,0.4,48552,40.689132,2.0,260,3.228725,2.0,0.375242,0.525145,0.700193,...,260,260,21.545455,21.35974,17.45379,13.655934,12.0,12.0,9.0,7.0
4,0.3,33397,36.115736,2.0,256,3.489517,2.0,0.376209,0.517408,0.705029,...,256,256,20.61039,19.964935,16.214953,12.820826,12.0,11.5,9.0,6.0
5,0.2,11970,28.918058,3.0,255,3.983038,3.0,0.375242,0.520309,0.700193,...,255,255,19.148052,18.294805,14.911215,12.122046,11.0,10.0,8.0,6.0
6,0.1,3086,22.450147,3.0,257,4.684703,3.0,0.367505,0.510638,0.681818,...,257,257,16.94026,15.744156,13.553479,10.942612,9.0,8.0,6.0,5.0
