In [41]:
# Every node id that occurs as a source or a target in data_graph_agg, de-duplicated.
# The order of the resulting list is whatever the set iteration yields.
in_scope_nodes = [
    *set(data_graph_agg["source"].unique()).union(data_graph_agg["target"].unique())
]

In [42]:
def aggregate_edges(data_input):
    """Collapse parallel edges and compute a relative weight per edge.

    Rows of ``data_input`` are grouped by ``(source, target)`` and their
    ``amount`` values are summed. Each aggregated edge then receives::

        weight = amount / (sum of aggregated amounts sharing its source)
               + amount / (sum of aggregated amounts sharing its target)

    The computation is fully vectorized (no per-row Python callbacks).
    Totals equal to zero follow pandas division semantics (NaN / inf).

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain the columns ``source``, ``target`` and ``amount``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``source``, ``target`` and ``weight``,
        one row per distinct ``(source, target)`` pair, ordered by the
        (sorted) group keys.
    """
    edges = (
        data_input.groupby(["source", "target"])
        .agg(amount=("amount", "sum"))
        .reset_index()
    )

    # Broadcast each group's total back onto its rows; this replaces the
    # dict lookups that were previously applied element by element.
    total_sent_by_source = edges.groupby("source")["amount"].transform("sum")
    total_received_by_target = edges.groupby("target")["amount"].transform("sum")

    # Share of the source's outflow plus share of the target's inflow.
    edges["weight"] = (
        edges["amount"] / total_sent_by_source
        + edges["amount"] / total_received_by_target
    )
    return edges.loc[:, ["source", "target", "weight"]]

In [43]:
# Weighted, de-duplicated edge list built from the three columns aggregate_edges reads.
window_edges = aggregate_edges(
    data_graph_agg[["source", "target", "amount"]]
)

In [44]:
%%time

print("Constructing communities")
graph = ig.Graph.DataFrame(window_edges, use_vids=False, directed=True)
communities = get_communities_multi_proc(in_scope_nodes, graph, NUM_PROCS)
del graph
del window_edges

In [None]:
%%time

print("Loading graph")
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [None]:
# Value passed to create_workload_for_multi_proc for the community-feature step;
# presumably the number of worker processes - confirm against that helper.
# NOTE(review): the community-construction cell earlier in the notebook also reads
# NUM_PROCS, so on a top-to-bottom run it must be defined before that cell as well.
NUM_PROCS = 4

In [None]:
%%time

print("Constructing features")
iterator_chunk_as_pickles, param_locations = create_workload_for_multi_proc(
    len(communities),
    communities, 
    NUM_PROCS, 
    graph,
    shuffle=True
)
features = get_features_multi_proc(
    iterator_chunk_as_pickles, param_locations[0], "features.get_features_chunk_with_gf",
    reset_staging=False
)
features.columns = [f"{s.G_COMM_PREFIX}{x}" if x != "key" else x for x in features.columns]

In [None]:
# Rebinds NUM_PROCS (4 in the earlier cell) for the per-source / per-target
# feature cells that follow.
NUM_PROCS = 10

In [52]:
# Features computed per "source" group of data_graph_agg; every resulting
# column except "key" is prefixed with s.G_1HOP_PREFIX.
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    data_graph_agg["source"].nunique(),
    data_graph_agg.groupby("source"),
    NUM_PROCS,
    shuffle=False,
)
features_source = get_features_multi_proc(
    iterator_chunk_as_pickles,
    None,
    "features.get_features_chunk_without_gf",
    reset_staging=False,
)
features_source.columns = [
    col if col == "key" else f"{s.G_1HOP_PREFIX}{col}"
    for col in features_source.columns
]

CPU times: user 2.86 s, sys: 221 ms, total: 3.08 s
Wall time: 38.3 s


In [None]:
# Features computed per "target" group of data_graph_agg; every resulting
# column except "key" is prefixed with s.G_1HOP_PREFIX.
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    data_graph_agg["target"].nunique(),
    data_graph_agg.groupby("target"),
    NUM_PROCS,
    shuffle=False,
)
features_target = get_features_multi_proc(
    iterator_chunk_as_pickles,
    None,
    "features.get_features_chunk_without_gf",
    reset_staging=False,
)
features_target.columns = [
    col if col == "key" else f"{s.G_1HOP_PREFIX}{col}"
    for col in features_target.columns
]

In [None]:
def reset_index(df_input, index_name):
    """Return ``df_input`` with its index moved into a column named ``index_name``.

    The caller's frame is left untouched: the index is renamed on a new object
    via ``rename_axis`` instead of assigning to ``df_input.index.name``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose index should become a regular column.
    index_name : hashable
        Name given to the index before it is reset, i.e. the label of the
        resulting column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh default index and the former index as a column.
    """
    return df_input.rename_axis(index_name).reset_index()

In [None]:
# Left-join the per-source and then the per-target features onto the community
# features by "key"; rsuffix disambiguates column names that collide.
# NOTE(review): the reset_index / set_index("key") round trip between the two
# joins looks redundant, since the first join appears to keep "key" as the
# index - confirm before removing.
all_features = reset_index(
    features.set_index("key").join(
        features_source.set_index("key"),
        how="left",
        rsuffix="_1_hop_as_source",
    ),
    "key",
)
all_features = all_features.set_index("key").join(
    features_target.set_index("key"),
    how="left",
    rsuffix="_1_hop_as_target",
)