### Reference: <https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9762926>

In [1]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd

import settings as s
from common import get_weights
from communities import get_communities_spark
from features import generate_features_spark, generate_features_udf_wrapper, SCHEMA_FEAT_UDF

%load_ext autoreload
%autoreload 2

In [2]:
# Refuse to run on any interpreter other than the exact build this notebook
# was tested on (major, minor and micro must all match).
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [3]:
# Settings for the local Spark session, as (key, value) pairs.
SPARK_CONF = [
    # Memory budget for the driver and for results collected back to it.
    ("spark.driver.memory", "32g"),
    # NOTE(review): "spark.worker.memory" does not appear among Spark's
    # documented application properties - confirm it has any effect.
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    # Bind the driver to the loopback interface.
    ("spark.driver.bindAddress", "127.0.0.1"),
    # Arrow-based conversion between Spark and pandas.
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    # -1 turns automatic broadcast joins off.
    ("spark.sql.autoBroadcastJoinThreshold", -1)
]

# Start from a clean slate: drop any "artifacts" directory left by a
# previous run (a missing directory is not an error).
shutil.rmtree("artifacts", ignore_errors=True)

spark_conf = SparkConf().setAll(SPARK_CONF)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/24 08:58:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the edge list and map the dataset's column names onto the names used
# in the rest of this notebook.
data = pd.read_csv("./data/Libra_bank_3months_graph/data.csv")
rename = {
    "id_source": "source",
    "id_destination": "target",
    "cum_amount": "amount",
    "nr_transactions": "num_transactions",
    "nr_alerts": "alerts_count",
    "nr_reports": "reports_count",
}
data = data.rename(columns=rename)

# Turn the numeric node ids into string keys of the form "nid-<int>".
for endpoint in ("source", "target"):
    data[endpoint] = data[endpoint].astype(str).apply(lambda raw_id: f"nid-{int(raw_id)}")

# Keep only the columns used downstream, in a fixed order.
data = data.loc[
    :,
    ["source", "target", "amount", "num_transactions", "alerts_count", "reports_count"],
]

In [5]:
# One row per node: every key that appears as a source or a target of an edge.
nodes_data = pd.DataFrame(index=sorted(set(data["source"]).union(data["target"])))
nodes_data.index.name = "key"

# Normalisers: each edge's count is credited to both of its endpoints below,
# so the grand total is twice the column sum.
w_alerts = int(2 * data["alerts_count"].sum())
w_reports = int(2 * data["reports_count"].sum())

# Per-node totals, split by the role the node plays on the flagged edges.
flagged_alerts = data.loc[data["alerts_count"] > 0]
w_alerts_source = flagged_alerts.groupby("source")["alerts_count"].sum().to_dict()
w_alerts_target = flagged_alerts.groupby("target")["alerts_count"].sum().to_dict()

flagged_reports = data.loc[data["reports_count"] > 0]
w_reports_source = flagged_reports.groupby("source")["reports_count"].sum().to_dict()
w_reports_target = flagged_reports.groupby("target")["reports_count"].sum().to_dict()

# Share of all alerts / reports attributable to each node (0 when the node
# touches no flagged edge).
nodes_data["alert_weight"] = nodes_data.index.map(
    lambda key: (w_alerts_source.get(key, 0) + w_alerts_target.get(key, 0)) / w_alerts
)
nodes_data["report_weight"] = nodes_data.index.map(
    lambda key: (w_reports_source.get(key, 0) + w_reports_target.get(key, 0)) / w_reports
)

In [54]:
total_nodes = len(nodes_data)


def _count_at(percent):
    """Return the number of nodes that make up `percent` % of `total_nodes`, rounded."""
    return round(total_nodes * (percent / 100))


# Node counts for each "top x %" cut-off.
perc_point_1_cnt = _count_at(0.1)
perc_point_2_cnt = _count_at(0.2)
perc_point_5_cnt = _count_at(0.5)
perc_1_cnt = _count_at(1)
perc_10_cnt = _count_at(10)
perc_20_cnt = _count_at(20)
perc_30_cnt = _count_at(30)
perc_40_cnt = _count_at(40)
perc_50_cnt = _count_at(50)
perc_75_cnt = _count_at(75)
print(total_nodes, perc_point_1_cnt, perc_point_2_cnt, perc_point_5_cnt, perc_1_cnt)

385100 385 770 1926 3851


In [7]:
%%time

candidates = nodes_data.index.tolist()
data_in_scope = data.copy(deep=True)
data_in_scope = data_in_scope.set_index(["source", "target"]).join(
    get_weights(data_in_scope).set_index(["source", "target"]), how="left"
).reset_index()
data_in_scope.loc[:, "amount_weighted"] = (
    data_in_scope.loc[:, "amount"] * 
    (data_in_scope.loc[:, "weight"] / data_in_scope.loc[:, "weight"].max())
)

%run model.ipynb

anomalies_main = anomalies.copy(deep=True)

25/07/24 08:58:20 WARN TaskSetManager: Stage 0 contains a task of very large size (3851 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 472,592 | 213,511
Processed hop #2 | 6,730,058 | 194,847
Processed hop #3 | 7,805,533 | 186,398
Processed hop #4 | 8,894,931 | 185,538
Processed hop #5 | 8,889,368 | 185,321

Processing comm_as_target

Processed hop #1 | 414,376 | 221,458
Processed hop #2 | 6,111,806 | 201,104
Processed hop #3 | 7,163,788 | 188,404
Processed hop #4 | 8,469,276 | 186,909
Processed hop #5 | 8,208,835 | 183,360

Processing comm_as_passthrough

Processed hop #1 | 251,417 | 49,869
Processed hop #2 | 1,310,227 | 42,290
Processed hop #3 | 1,725,981 | 40,671
Processed hop #4 | 1,918,895 | 40,412
Processed hop #5 | 1,952,010 | 40,363

Processing comm_as_passthrough_reverse

Processed hop #1 | 197,559 | 49,869
Processed hop #2 | 1,081,845 | 41,267
Processed hop #3 | 1,434,805 | 38,874
Processed hop #4 | 1,719,403 | 38,462
Processed hop #5 | 1,722,034 | 38,030


comm_as_source_features

CPU times: user 37.3 s, sys: 375 ms, total: 37.7 s
Wall time: 37.7 s

comm_as_tar

                                                                                

CPU times: user 2.16 s, sys: 83.6 ms, total: 2.24 s
Wall time: 26.4 s


                                                                                

CPU times: user 4.62 s, sys: 425 ms, total: 5.04 s
Wall time: 41.4 s


                                                                                

CPU times: user 2min 9s, sys: 24.1 s, total: 2min 33s
Wall time: 5min 34s
1-hop-source features creation


25/07/24 09:17:44 WARN TaskSetManager: Stage 12 contains a task of very large size (6867 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.69 s, sys: 242 ms, total: 1.93 s
Wall time: 1min 13s
1-hop-target features creation


25/07/24 09:18:58 WARN TaskSetManager: Stage 15 contains a task of very large size (6961 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.71 s, sys: 227 ms, total: 1.94 s
Wall time: 1min 16s
Features: (385100, 176)
Deleted 10 constant columns
Script executed in 0:22:00
Training the model
CPU times: user 11min 7s, sys: 46.4 s, total: 11min 54s
Wall time: 12min 26s
CPU times: user 25min 33s, sys: 1min 52s, total: 27min 26s
Wall time: 34min 31s


In [None]:
# Directed graph over the full edge list; vertices are identified by the
# string keys found in the "source" / "target" columns rather than by
# integer vertex ids.
graph_global = ig.Graph.DataFrame(
    data.loc[:, ["source", "target"]],
    directed=True,
    use_vids=False,
)

In [22]:
def add_predicted_alert_weight(anomalies_input, perc_count_in):
    """Return the first `perc_count_in` rows of `anomalies_input` with two extra columns.

    The returned frame is an independent copy; `anomalies_input` is never
    modified. Only the selected rows are copied, instead of deep-copying the
    whole frame on every call, and rows are taken by position so repeated
    index labels cannot multiply rows.

    Parameters
    ----------
    anomalies_input : pandas.DataFrame
        Must contain the columns "alert_weight" and "report_weight".
        Presumably ordered most-anomalous first - confirm against the
        producer of this frame.
    perc_count_in : int
        Number of leading rows to keep (passed straight to `DataFrame.head`).

    Returns
    -------
    pandas.DataFrame
        The selected rows plus "predicted_alert_weight" and
        "predicted_report_weight", which mirror "alert_weight" and
        "report_weight" respectively.
    """
    anomalies_perc_x = anomalies_input.head(perc_count_in).copy()
    anomalies_perc_x["predicted_alert_weight"] = anomalies_perc_x["alert_weight"]
    anomalies_perc_x["predicted_report_weight"] = anomalies_perc_x["report_weight"]
    return anomalies_perc_x

In [57]:
# Summary row for the baseline run (rectify_perc = 1.0).
# `communities_1_hop`, `sizes_leiden` and `sizes_1_hop` are not defined in
# this notebook; presumably they are left behind by `%run model.ipynb`.
communities_1_hop_dict = dict(communities_1_hop)

# Slice the baseline ranking once per cut-off and reuse the slice everywhere
# below, instead of rebuilding it for every statistic.
_top_nodes = {
    "0.1%": add_predicted_alert_weight(anomalies_main, perc_point_1_cnt),
    "0.2%": add_predicted_alert_weight(anomalies_main, perc_point_2_cnt),
    "0.5%": add_predicted_alert_weight(anomalies_main, perc_point_5_cnt),
    "1%": add_predicted_alert_weight(anomalies_main, perc_1_cnt),
}

# 1-hop community of every node inside each cut-off.
comms_1_hop_0_1_perc = [communities_1_hop_dict[x] for x in _top_nodes["0.1%"].index]
comms_1_hop_0_2_perc = [communities_1_hop_dict[x] for x in _top_nodes["0.2%"].index]
comms_1_hop_0_5_perc = [communities_1_hop_dict[x] for x in _top_nodes["0.5%"].index]
comms_1_hop_1_perc = [communities_1_hop_dict[x] for x in _top_nodes["1%"].index]

# Community sizes in nodes.
sizes_1_hop_0_1_perc = [len(x) for x in comms_1_hop_0_1_perc]
sizes_1_hop_0_2_perc = [len(x) for x in comms_1_hop_0_2_perc]
sizes_1_hop_0_5_perc = [len(x) for x in comms_1_hop_0_5_perc]
sizes_1_hop_1_perc = [len(x) for x in comms_1_hop_1_perc]

# Community sizes in edges of the sub-graph each community induces on the full graph.
sizes_edge_1_hop_0_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_1_perc]
sizes_edge_1_hop_0_2_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_2_perc]
sizes_edge_1_hop_0_5_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_5_perc]
sizes_edge_1_hop_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_1_perc]

_node_sizes = {
    "0_1_perc": sizes_1_hop_0_1_perc,
    "0_2_perc": sizes_1_hop_0_2_perc,
    "0_5_perc": sizes_1_hop_0_5_perc,
    "1_perc": sizes_1_hop_1_perc,
}
_edge_sizes = {
    "0_1_perc": sizes_edge_1_hop_0_1_perc,
    "0_2_perc": sizes_edge_1_hop_0_2_perc,
    "0_5_perc": sizes_edge_1_hop_0_5_perc,
    "1_perc": sizes_edge_1_hop_1_perc,
}
_stats = (("max", np.max), ("mean", np.mean), ("median", np.median))

# Keys are inserted in a fixed order because that order becomes the column
# order of the results table.
_row = {
    "rectify_perc": 1.0,
    "leiden_size_max": np.max(sizes_leiden),
    "leiden_size_mean": np.mean(sizes_leiden),
    "leiden_size_median": np.median(sizes_leiden),
    "1_hop_size_max": np.max(sizes_1_hop),
    "1_hop_size_mean": np.mean(sizes_1_hop),
    "1_hop_size_median": np.median(sizes_1_hop),
}
# Alert weight captured inside each cut-off, then the mean of the running
# total over the top 1 %.
for _label, _top in _top_nodes.items():
    _row[_label] = _top["predicted_alert_weight"].sum()
_row["auc_1%"] = np.mean(np.cumsum(_top_nodes["1%"]["predicted_alert_weight"]))
# Same two measures for the report weight.
for _label, _top in _top_nodes.items():
    _row[f"report_{_label}"] = _top["predicted_report_weight"].sum()
_row["report_auc_1%"] = np.mean(np.cumsum(_top_nodes["1%"]["predicted_report_weight"]))
# max / mean / median community size per cut-off: nodes first, then edges.
for _stat_name, _stat_fn in _stats:
    for _suffix, _sizes in _node_sizes.items():
        _row[f"{_stat_name}_1_hop_{_suffix}"] = _stat_fn(_sizes)
for _stat_name, _stat_fn in _stats:
    for _suffix, _sizes in _edge_sizes.items():
        _row[f"{_stat_name}_edges_1_hop_{_suffix}"] = _stat_fn(_sizes)

results = [_row]

In [23]:
%%time

anomalies = anomalies_main.copy(deep=True)
for perc in [0.5, 0.5, 0.5]:
    total_nodes = len(anomalies)
    perc_count = round(perc * total_nodes)
    rectify_perc = round(perc_count / len(anomalies_main), 2)
    candidates = add_predicted_alert_weight(anomalies, perc_count).index.tolist()
    filter_ = data["source"].isin(candidates) & data["target"].isin(candidates)
    data_in_scope = data.loc[filter_, :]
    candidates = sorted(set(data_in_scope["source"].unique()).union(data_in_scope["target"].unique()))
    print("=" * 100)
    print(rectify_perc, perc_count, len(candidates), len(anomalies_main), data_in_scope.shape)
    print("=" * 100)
    data_in_scope = data_in_scope.set_index(["source", "target"]).join(
        get_weights(data_in_scope).set_index(["source", "target"]), how="left"
    ).reset_index()
    data_in_scope.loc[:, "amount_weighted"] = data_in_scope.loc[:, "amount"] * data_in_scope.loc[:, "weight"]
    
    %run model.ipynb

    communities_1_hop_dict = dict(communities_1_hop)

    comms_1_hop_0_1_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_1_cnt).index]
    comms_1_hop_0_2_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_2_cnt).index]
    comms_1_hop_0_5_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_point_5_cnt).index]
    comms_1_hop_1_perc = [communities_1_hop_dict[x] for x in add_predicted_alert_weight(anomalies, perc_1_cnt).index]
    
    sizes_1_hop_0_1_perc = [len(x) for x in comms_1_hop_0_1_perc]
    sizes_1_hop_0_2_perc = [len(x) for x in comms_1_hop_0_2_perc]
    sizes_1_hop_0_5_perc = [len(x) for x in comms_1_hop_0_5_perc]
    sizes_1_hop_1_perc = [len(x) for x in comms_1_hop_1_perc]
    
    sizes_edge_1_hop_0_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_1_perc]
    sizes_edge_1_hop_0_2_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_2_perc]
    sizes_edge_1_hop_0_5_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_0_5_perc]
    sizes_edge_1_hop_1_perc = [graph_global.induced_subgraph(x).ecount() for x in comms_1_hop_1_perc]

    results.append({
        "rectify_perc": rectify_perc,
        "leiden_size_max": np.max(sizes_leiden),
        "leiden_size_mean": np.mean(sizes_leiden),
        "leiden_size_median": np.median(sizes_leiden),
        "1_hop_size_max": np.max(sizes_1_hop),
        "1_hop_size_mean": np.mean(sizes_1_hop),
        "1_hop_size_median": np.median(sizes_1_hop),
        "0.1%": add_predicted_alert_weight(anomalies, perc_point_1_cnt)["predicted_alert_weight"].sum(),
        "0.2%": add_predicted_alert_weight(anomalies, perc_point_2_cnt)["predicted_alert_weight"].sum(),
        "0.5%": add_predicted_alert_weight(anomalies, perc_point_5_cnt)["predicted_alert_weight"].sum(),
        "1%": add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_alert_weight"].sum(),
        "auc_1%": np.mean(np.cumsum(add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_alert_weight"])),
        "report_0.1%": add_predicted_alert_weight(anomalies, perc_point_1_cnt)["predicted_report_weight"].sum(),
        "report_0.2%": add_predicted_alert_weight(anomalies, perc_point_2_cnt)["predicted_report_weight"].sum(),
        "report_0.5%": add_predicted_alert_weight(anomalies, perc_point_5_cnt)["predicted_report_weight"].sum(),
        "report_1%": add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_report_weight"].sum(),
        "report_auc_1%": np.mean(np.cumsum(add_predicted_alert_weight(anomalies, perc_1_cnt)["predicted_report_weight"])),
        "max_1_hop_0_1_perc": np.max(sizes_1_hop_0_1_perc),
        "max_1_hop_0_2_perc": np.max(sizes_1_hop_0_2_perc),
        "max_1_hop_0_5_perc": np.max(sizes_1_hop_0_5_perc),
        "max_1_hop_1_perc": np.max(sizes_1_hop_1_perc),
        "mean_1_hop_0_1_perc": np.mean(sizes_1_hop_0_1_perc),
        "mean_1_hop_0_2_perc": np.mean(sizes_1_hop_0_2_perc),
        "mean_1_hop_0_5_perc": np.mean(sizes_1_hop_0_5_perc),
        "mean_1_hop_1_perc": np.mean(sizes_1_hop_1_perc),
        "median_1_hop_0_1_perc": np.median(sizes_1_hop_0_1_perc),
        "median_1_hop_0_2_perc": np.median(sizes_1_hop_0_2_perc),
        "median_1_hop_0_5_perc": np.median(sizes_1_hop_0_5_perc),
        "median_1_hop_1_perc": np.median(sizes_1_hop_1_perc),
        "max_edges_1_hop_0_1_perc": np.max(sizes_edge_1_hop_0_1_perc),
        "max_edges_1_hop_0_2_perc": np.max(sizes_edge_1_hop_0_2_perc),
        "max_edges_1_hop_0_5_perc": np.max(sizes_edge_1_hop_0_5_perc),
        "max_edges_1_hop_1_perc": np.max(sizes_edge_1_hop_1_perc),
        "mean_edges_1_hop_0_1_perc": np.mean(sizes_edge_1_hop_0_1_perc),
        "mean_edges_1_hop_0_2_perc": np.mean(sizes_edge_1_hop_0_2_perc),
        "mean_edges_1_hop_0_5_perc": np.mean(sizes_edge_1_hop_0_5_perc),
        "mean_edges_1_hop_1_perc": np.mean(sizes_edge_1_hop_1_perc),
        "median_edges_1_hop_0_1_perc": np.median(sizes_edge_1_hop_0_1_perc),
        "median_edges_1_hop_0_2_perc": np.median(sizes_edge_1_hop_0_2_perc),
        "median_edges_1_hop_0_5_perc": np.median(sizes_edge_1_hop_0_5_perc),
        "median_edges_1_hop_1_perc": np.median(sizes_edge_1_hop_1_perc),
    })

results = pd.DataFrame(results)
results.to_parquet("results-rec-ver.parquet")

0.5 192550 190877 385100 (399355, 6)


25/07/24 10:09:55 WARN TaskSetManager: Stage 45 contains a task of very large size (2599 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 312,064 | 107,474
Processed hop #2 | 3,653,958 | 99,646
Processed hop #3 | 4,314,776 | 97,782
Processed hop #4 | 4,708,516 | 97,513
Processed hop #5 | 4,770,342 | 97,465

Processing comm_as_target

Processed hop #1 | 273,195 | 128,438
Processed hop #2 | 4,193,500 | 119,453
Processed hop #3 | 4,841,506 | 116,285
Processed hop #4 | 5,415,980 | 115,780
Processed hop #5 | 5,401,775 | 115,046

Processing comm_as_passthrough

Processed hop #1 | 199,876 | 45,035
Processed hop #2 | 1,217,622 | 40,209
Processed hop #3 | 1,647,027 | 39,269
Processed hop #4 | 1,854,648 | 39,097
Processed hop #5 | 1,897,203 | 39,067

Processing comm_as_passthrough_reverse

Processed hop #1 | 150,970 | 45,035
Processed hop #2 | 969,388 | 39,053
Processed hop #3 | 1,362,601 | 37,423
Processed hop #4 | 1,648,516 | 37,124
Processed hop #5 | 1,690,671 | 36,914


comm_as_source_features

CPU times: user 19.6 s, sys: 192 ms, total: 19.8 s
Wall time: 19.8 s

comm_as_target_fe

                                                                                

CPU times: user 1.43 s, sys: 28.2 ms, total: 1.46 s
Wall time: 6.05 s


                                                                                

CPU times: user 2.28 s, sys: 47.9 ms, total: 2.33 s
Wall time: 28.5 s


                                                                                

CPU times: user 1min, sys: 485 ms, total: 1min 1s
Wall time: 2min 32s
1-hop-source features creation


25/07/24 10:20:53 WARN TaskSetManager: Stage 57 contains a task of very large size (4637 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 934 ms, sys: 139 ms, total: 1.07 s
Wall time: 38.3 s
1-hop-target features creation


25/07/24 10:21:31 WARN TaskSetManager: Stage 60 contains a task of very large size (4679 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1.11 s, sys: 152 ms, total: 1.26 s
Wall time: 45.9 s
Features: (190877, 176)
Deleted 10 constant columns
Script executed in 0:12:25
Training the model
CPU times: user 6min 27s, sys: 9.14 s, total: 6min 36s
Wall time: 6min 36s
0.25 95438 95286 385100 (290730, 6)


25/07/24 10:28:58 WARN TaskSetManager: Stage 63 contains a task of very large size (1905 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 251,781 | 66,913
Processed hop #2 | 1,981,864 | 61,601
Processed hop #3 | 2,571,844 | 60,391
Processed hop #4 | 2,872,748 | 60,212
Processed hop #5 | 2,934,594 | 60,175

Processing comm_as_target

Processed hop #1 | 200,623 | 70,206
Processed hop #2 | 1,743,199 | 63,710
Processed hop #3 | 2,290,611 | 61,666
Processed hop #4 | 2,759,919 | 61,302
Processed hop #5 | 2,838,178 | 61,127

Processing comm_as_passthrough

Processed hop #1 | 184,804 | 41,833
Processed hop #2 | 1,148,458 | 38,162
Processed hop #3 | 1,577,373 | 37,475
Processed hop #4 | 1,773,189 | 37,351
Processed hop #5 | 1,814,461 | 37,330

Processing comm_as_passthrough_reverse

Processed hop #1 | 139,512 | 41,833
Processed hop #2 | 907,318 | 37,004
Processed hop #3 | 1,299,913 | 35,792
Processed hop #4 | 1,579,957 | 35,561
Processed hop #5 | 1,634,149 | 35,419


comm_as_source_features

CPU times: user 12.1 s, sys: 128 ms, total: 12.2 s
Wall time: 12.2 s

comm_as_target_features

                                                                                

CPU times: user 768 ms, sys: 15.5 ms, total: 784 ms
Wall time: 2.42 s


                                                                                

CPU times: user 1.19 s, sys: 38.2 ms, total: 1.23 s
Wall time: 6.05 s


                                                                                

CPU times: user 28.6 s, sys: 222 ms, total: 28.9 s
Wall time: 1min 25s
1-hop-source features creation


25/07/24 10:35:28 WARN TaskSetManager: Stage 75 contains a task of very large size (3398 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 577 ms, sys: 84.8 ms, total: 662 ms
Wall time: 24.1 s
1-hop-target features creation


25/07/24 10:35:52 WARN TaskSetManager: Stage 78 contains a task of very large size (3419 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 600 ms, sys: 92.1 ms, total: 692 ms
Wall time: 24.4 s
Features: (95286, 176)
Deleted 10 constant columns
Script executed in 0:07:19
Training the model
CPU times: user 3min 49s, sys: 2.42 s, total: 3min 51s
Wall time: 3min 51s
0.12 47643 47412 385100 (185085, 6)


25/07/24 10:40:10 WARN TaskSetManager: Stage 81 contains a task of very large size (1221 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Processing comm_as_source

Processed hop #1 | 158,083 | 33,076
Processed hop #2 | 1,005,986 | 31,430
Processed hop #3 | 1,352,794 | 30,984
Processed hop #4 | 1,483,752 | 30,928
Processed hop #5 | 1,513,903 | 30,914

Processing comm_as_target

Processed hop #1 | 126,670 | 38,202
Processed hop #2 | 1,105,919 | 35,918
Processed hop #3 | 1,410,338 | 35,201
Processed hop #4 | 1,614,997 | 35,078
Processed hop #5 | 1,668,112 | 35,038

Processing comm_as_passthrough

Processed hop #1 | 125,941 | 23,866
Processed hop #2 | 704,898 | 22,790
Processed hop #3 | 978,626 | 22,530
Processed hop #4 | 1,075,292 | 22,496
Processed hop #5 | 1,098,903 | 22,489

Processing comm_as_passthrough_reverse

Processed hop #1 | 94,466 | 23,866
Processed hop #2 | 553,503 | 22,126
Processed hop #3 | 807,473 | 21,659
Processed hop #4 | 966,887 | 21,574
Processed hop #5 | 1,012,571 | 21,557


comm_as_source_features

CPU times: user 6.1 s, sys: 25.4 ms, total: 6.13 s
Wall time: 6.13 s

comm_as_target_features

CPU tim

                                                                                

CPU times: user 210 ms, sys: 10.4 ms, total: 220 ms
Wall time: 1.59 s


                                                                                

CPU times: user 583 ms, sys: 26 ms, total: 609 ms
Wall time: 3.31 s


                                                                                

CPU times: user 14.9 s, sys: 132 ms, total: 15.1 s
Wall time: 43 s
1-hop-source features creation


25/07/24 10:43:43 WARN TaskSetManager: Stage 93 contains a task of very large size (2177 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 290 ms, sys: 49.6 ms, total: 339 ms
Wall time: 12.7 s
1-hop-target features creation


25/07/24 10:43:55 WARN TaskSetManager: Stage 96 contains a task of very large size (2185 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 339 ms, sys: 55.8 ms, total: 395 ms
Wall time: 13.7 s
Features: (47412, 176)
Deleted 10 constant columns
Script executed in 0:04:00
Training the model
CPU times: user 2min 27s, sys: 1.26 s, total: 2min 29s
Wall time: 2min 29s
CPU times: user 29min 32s, sys: 59.1 s, total: 30min 31s
Wall time: 36min 46s


In [42]:
# Reload the summary table from the parquet file written at the end of the
# rectification loop, so the cells below can run without repeating the
# (roughly 35-minute) model rounds.
results = pd.read_parquet("results-rec-ver.parquet")