In [None]:
def aggregate_edges(data_input):
    """Collapse transactions into one weighted edge per (source, target) pair.

    The weight of an edge is the sum of two shares:

    * the edge's summed ``amount_usd`` divided by the total summed amount
      leaving its ``source``, and
    * the same amount divided by the total summed amount arriving at its
      ``target``.

    A total of exactly zero is replaced by 1 before dividing, so such an edge
    contributes its raw amount instead of triggering a division by zero.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain the columns ``source``, ``target`` and ``amount_usd``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` and ``weight``, one row per distinct
        (source, target) pair, ordered by the group keys. An empty input
        yields an empty frame with the same three columns.
    """
    edges = (
        data_input.groupby(["source", "target"])
        .agg(amount_usd=("amount_usd", "sum"))
        .reset_index()
    )

    amount = edges["amount_usd"]
    # transform("sum") broadcasts each group's total back onto its rows, which
    # replaces the dict lookup + row-wise apply with a single vectorised step.
    sent_by_source = edges.groupby("source")["amount_usd"].transform("sum")
    received_by_target = edges.groupby("target")["amount_usd"].transform("sum")

    # Zero totals fall back to 1 (NaN totals are left untouched and propagate).
    sent_by_source = sent_by_source.where(sent_by_source != 0, 1)
    received_by_target = received_by_target.where(received_by_target != 0, 1)

    edges["weight"] = amount / sent_by_source + amount / received_by_target
    return edges.loc[:, ["source", "target", "weight"]]

In [None]:
# One weighted edge per (source, target) pair found in the in-scope window.
window_edges = aggregate_edges(data_input=in_scope_window)

In [None]:
# Directed graph built from the aggregated edge list. With use_vids=False the
# first two columns are read as vertex names rather than integer vertex ids.
graph = ig.Graph.DataFrame(
    edges=window_edges,
    directed=True,
    use_vids=False,
)

In [None]:
# Detect communities for the in-scope nodes using NUM_PROCS worker processes.
communities = get_communities_multi_proc(in_scope_nodes, graph, NUM_PROCS)

# The second item of every entry is the member collection; report the
# rounded mean size, rounded max size and the total membership count.
sizes = [len(entry[1]) for entry in communities]
print(
    round(np.mean(sizes)),
    round(np.max(sizes)),
    sum(sizes),
)

In [None]:
# Seconds elapsed since first_trx_ts, shifted by one
# (presumably so the earliest transaction gets 1 rather than 0 - TODO confirm).
in_scope_window.loc[:, "window_delta"] = (
    in_scope_window["timestamp"]
    .sub(first_trx_ts)
    .dt.total_seconds()
    .add(1)
)

In [None]:
# Edge endpoints first, then the per-transaction columns to carry along.
columns = [
    "source",
    "target",
    "is_zero_transaction",
    "num_transactions",
    "amount_usd",
    "window_delta",
]
# NOTE: this rebinds `graph`, replacing the weighted window graph built from
# `window_edges` earlier in the notebook; cells below use this transaction-level graph.
graph = ig.Graph.DataFrame(
    edges=in_scope_window.loc[:, columns],
    directed=True,
    use_vids=False,
)

In [None]:
# Split the communities into shuffled work chunks for NUM_PROCS workers; the
# graph is handed over as well, and its staged location comes back in
# `param_locations`.
iterator_chunk_as_pickles, param_locations = create_workload_for_multi_proc(
    len(communities), communities, NUM_PROCS, graph, shuffle=True
)

# Community-level features, computed by "features.get_features_chunk_with_gf".
features = get_features_multi_proc(
    iterator_chunk_as_pickles,
    param_locations[0],
    "features.get_features_chunk_with_gf",
    reset_staging=False,
)

# Prefix every feature column with the community prefix; "key" stays as is.
features.columns = [
    name if name == "key" else f"{s.G_COMM_PREFIX}{name}"
    for name in features.columns
]

In [None]:
# One work item per distinct source: the rows grouped by "source", unshuffled.
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    in_scope_window["source"].nunique(),
    in_scope_window.groupby("source"),
    NUM_PROCS,
    shuffle=False,
)

# Features per source, computed by "features.get_features_chunk_without_gf"
# (no staged parameter location is passed).
features_source = get_features_multi_proc(
    iterator_chunk_as_pickles,
    None,
    "features.get_features_chunk_without_gf",
    reset_staging=False,
)

# Prefix every feature column with the 1-hop prefix; "key" stays as is.
features_source.columns = [
    name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
    for name in features_source.columns
]

In [None]:
# One work item per distinct target: the rows grouped by "target", unshuffled.
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    in_scope_window["target"].nunique(),
    in_scope_window.groupby("target"),
    NUM_PROCS,
    shuffle=False,
)

# Features per target, computed by "features.get_features_chunk_without_gf"
# (no staged parameter location is passed).
features_target = get_features_multi_proc(
    iterator_chunk_as_pickles,
    None,
    "features.get_features_chunk_without_gf",
    reset_staging=False,
)

# Same 1-hop prefix as the source-side features; "key" stays as is.
features_target.columns = [
    name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
    for name in features_target.columns
]

In [None]:
def reset_index(df_input, index_name):
    """Return a copy of ``df_input`` with its index moved into a named column.

    The index is labelled ``index_name`` and then turned into a regular
    column via ``DataFrame.reset_index``. The frame passed in is left
    untouched: its index name is not modified.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose index should become a column.
    index_name : str
        Name given to the column created from the index.

    Returns
    -------
    pandas.DataFrame
        New frame with a default integer index and ``index_name`` as a column.
    """
    # rename_axis returns a new object, so the caller's index name is preserved
    # (assigning to `df_input.index.name` would change it in place).
    return df_input.rename_axis(index_name).reset_index()

In [None]:
# Left-join the source-side and target-side features onto the community
# features, matching rows on "key". Both joins run on the "key" index, so the
# index is turned back into a column only once, at the end.
# rsuffix is applied only to right-hand columns whose name already exists on
# the left-hand side of that join.
features_all = reset_index(
    features.set_index("key")
    .join(features_source.set_index("key"), how="left", rsuffix="_1_hop_as_source")
    .join(features_target.set_index("key"), how="left", rsuffix="_1_hop_as_target"),
    "key",
)

In [None]:
# Look up each key in `active_for`; subscripting (rather than Series.map)
# keeps a missing key an error instead of silently producing NaN.
features_all.loc[:, "active_for"] = features_all["key"].apply(
    lambda key: active_for[key]
)