In [None]:
# Wall-clock start of the run; a later cell subtracts this from time.time()
# to report how long the notebook took.
start_script = time.time()

In [None]:
# Root directory under which every intermediate feature file of this run is stored.
location_main = os.path.join("features", "libra")
# Uncomment to discard previously generated feature files before a fresh run.
# shutil.rmtree(location_main, ignore_errors=True)

# Feature files, one per feature group.
location_features_leiden = os.path.join(location_main, "features_leiden.parquet")
location_features_ego = os.path.join(location_main, "features_ego.parquet")
location_features_1_hop = os.path.join(location_main, "features_1_hop.parquet")
location_features_2_hop_out = os.path.join(location_main, "features_2_hop_out.parquet")
location_features_2_hop_in = os.path.join(location_main, "features_2_hop_in.parquet")
location_features_2_hop_combined = os.path.join(location_main, "features_2_hop_combined.parquet")
location_features_source = os.path.join(location_main, "features_source.parquet")
location_features_target = os.path.join(location_main, "features_target.parquet")

# Flow-based community feature files.
location_comm_as_source_features = os.path.join(location_main, "comm_as_source_features.parquet")
location_comm_as_target_features = os.path.join(location_main, "comm_as_target_features.parquet")
location_comm_as_passthrough_features = os.path.join(location_main, "comm_as_passthrough_features.parquet")
location_comm_as_passthrough_features_reverse = os.path.join(
    location_main, "comm_as_passthrough_features_reverse.parquet"
)

# Final table: all enabled feature groups joined on the node key.
location_features_node_level = os.path.join(location_main, "features_node_level.parquet")

# exist_ok=True tolerates a directory that is already there but, unlike
# swallowing FileExistsError, still fails loudly when the path is occupied by
# something that is not a directory (e.g. a regular file).
os.makedirs(location_main, exist_ok=True)

In [None]:
%%time

TOP_N = 50
NUM_HOPS = 5

data_input = spark.createDataFrame(data_in_scope)
nodes_source = set(data_in_scope["source"].unique())
nodes_target = set(data_in_scope["target"].unique())
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

In [None]:
%%time

print("Constructing Leiden communities")

# Leiden is a randomized algorithm. Without a fixed seed the partition -- and
# every feature later derived from it and written to parquet -- differs
# between otherwise identical runs.
LEIDEN_SEED = 42

# Directed graph whose only edge attribute is the weight used for clustering.
graph = ig.Graph.DataFrame(
    data_in_scope.loc[:, ["source", "target", "amount_weighted"]],
    use_vids=False,
    directed=True,
)
# igraph vertex id -> original node name (stored in the "name" vertex attribute).
nodes_mapping = {x.index: x["name"] for x in graph.vs()}
communities_leiden = la.find_partition(
    graph,
    la.ModularityVertexPartition,
    n_iterations=100,
    weights="amount_weighted",
    seed=LEIDEN_SEED,
)
# Turn every community (a list of vertex ids) into a (random id, set of node
# names) pair. The random id is only a temporary handle for the community.
communities_leiden = [
    (str(uuid.uuid4()), {nodes_mapping[vertex_id] for vertex_id in members})
    for members in communities_leiden
]
sizes_leiden = [len(x[1]) for x in communities_leiden]

In [None]:
%%time

print("Constructing 1-hop communities")

# One seed entry per candidate node, shaped as (key, [start nodes]).
# The graph is rebuilt here from the same three columns used for the Leiden
# graph and handed straight to get_communities_spark.
# NOTE(review): the positional arguments 1, "all", 0.01 presumably mean the
# number of hops, the neighbourhood direction and a threshold -- confirm
# against get_communities_spark's signature.
communities_1_hop = get_communities_spark(
    [(candidate, [candidate]) for candidate in candidates],
    ig.Graph.DataFrame(
        data_in_scope.loc[:, ["source", "target", "amount_weighted"]],
        use_vids=False,
        directed=True,
    ),
    os.cpu_count(),
    spark,
    1,
    "all",
    0.01,
    "amount_weighted",
)
# Community sizes: for all communities, and for those keyed by an alerted node.
sizes_1_hop = [len(community[1]) for community in communities_1_hop]
sizes_1_hop_alerted = [
    len(community[1])
    for community in communities_1_hop
    if alerted_nodes.intersection({community[0]})
]

In [None]:
# Rebuild the graph for feature generation. Unlike the graphs built for
# community detection (restricted to source, target and amount_weighted),
# this one is constructed from every column of data_in_scope. It is the graph
# passed to generate_features_spark in the cells below.
graph = ig.Graph.DataFrame(data_in_scope, use_vids=False, directed=True)

In [None]:
%%time

print("Leiden communitites features creation")

# Temporary community id -> set of member node names.
communities_leiden_dict = dict(communities_leiden)

# Features come back with one row per community, keyed by the temporary id;
# park that id under "key_fake" so "key" can hold node names instead.
features_leiden = generate_features_spark(communities_leiden, graph, spark).rename(
    columns={"key": "key_fake"}
)
features_leiden["key"] = features_leiden["key_fake"].apply(
    lambda community_id: communities_leiden_dict[community_id]
)
# One row per member node: every node receives the features of its community.
features_leiden = features_leiden.explode("key").drop(columns="key_fake")
features_leiden.set_index("key").to_parquet(location_features_leiden)

In [None]:
%%time

print("1-hop communitites features creation")

# Unlike the Leiden features, no re-keying is done here: the "key" column
# returned by generate_features_spark is used directly as the index of the
# persisted table.
features_1_hop = generate_features_spark(communities_1_hop, graph, spark)
features_1_hop.set_index("key").to_parquet(location_features_1_hop)

In [None]:
# The graph is not referenced by the remaining visible cells; drop the name so
# the object can be garbage-collected if nothing else holds a reference to it.
del graph

In [None]:
%%time

print("1-hop-source features creation")

# Group the edge table by its "source" column (copied into "key") and let the
# pandas UDF compute one feature record per group.
features_source = (
    spark.createDataFrame(data_in_scope)
    .withColumn("key", sf.col("source"))
    .repartition(os.cpu_count(), "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# The "features" column holds JSON strings; decode them into one row each.
features_source = pd.DataFrame(
    [json.loads(encoded) for encoded in features_source["features"]]
)
# Prefix every feature column, leaving the join key untouched.
features_source = features_source.rename(
    columns=lambda name: name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
)
features_source.set_index("key").to_parquet(location_features_source)

In [None]:
%%time

print("1-hop-target features creation")

# Group the edge table by its "target" column (copied into "key") and let the
# pandas UDF compute one feature record per group.
features_target = (
    spark.createDataFrame(data_in_scope)
    .withColumn("key", sf.col("target"))
    .repartition(os.cpu_count(), "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# The "features" column holds JSON strings; decode them into one row each.
features_target = pd.DataFrame(
    [json.loads(encoded) for encoded in features_target["features"]]
)
# Prefix every feature column, leaving the join key untouched.
features_target = features_target.rename(
    columns=lambda name: name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
)
features_target.set_index("key").to_parquet(location_features_target)

In [None]:
# (label, parquet path) pairs for the feature groups that go into the final
# node-level table. The list order is the join order, and the label becomes
# the column suffix applied to a group's columns when their names collide
# with columns already joined.
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("1_hop", location_features_1_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
]

In [None]:
# Start from an empty frame whose index is named "key" and outer-join every
# enabled feature group onto it, so a node present in any group gets a row.
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES:
    # rsuffix only applies to incoming columns whose names clash with columns
    # already present; the earlier group keeps the unsuffixed name.
    all_features = all_features.join(
        pd.read_parquet(location), how="outer", rsuffix=f"_{feature_group}"
    )

# Persist the combined table, report its shape, then drop the in-memory copy
# (the next cell reads it back from disk).
all_features.to_parquet(location_features_node_level)
print("Features:", all_features.shape)
del all_features

In [None]:
# Reload the combined node-level feature table from the parquet file written
# by the previous cell.
all_features = pd.read_parquet(location_features_node_level)

In [None]:
# Columns with at most one distinct non-null value carry no signal for the
# model: collect their names first, then remove them from the frame in one go.
constants = [
    name
    for name in all_features.columns
    if all_features[name].nunique(dropna=True) <= 1
]
all_features.drop(columns=constants, inplace=True)
print(f"Deleted {len(constants)} constant columns")

In [None]:
# Per-column median, ignoring NaNs; used as the fill value for missing
# entries when the model is trained.
medians = {
    name: np.nanmedian(all_features[name])
    for name in all_features.columns
}

In [None]:
# Elapsed wall-clock time since start_script was set, rounded to whole seconds.
# This cell runs before the model-training cell that follows, so the reported
# duration does not include training.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

In [None]:
%%time

print("Training the model")

# IsolationForest draws random sub-samples and random splits. Without a fixed
# seed the anomaly ranking changes between otherwise identical runs.
MODEL_SEED = 42

# Impute missing values once and reuse the result for both fitting and
# scoring, instead of filling the whole frame twice.
features_imputed = all_features.fillna(medians)

model = IsolationForest(n_estimators=100_000, random_state=MODEL_SEED)
model.fit(features_imputed)

# One score per node, aligned on the feature table's index.
anomalies = pd.DataFrame(
    {"anomaly_score": model.decision_function(features_imputed)},
    index=all_features.index,
)
del features_imputed

# decision_function: the lower the score, the more abnormal the sample, so an
# ascending sort puts the most anomalous nodes first.
anomalies = anomalies.sort_values("anomaly_score", ascending=True)
anomalies = anomalies.join(nodes_data)