In [None]:
def aggregate_edges(data_input):
    """Collapse transactions into one weighted edge per (source, target) pair.

    The weight of an edge is the share of the source's total outgoing amount
    carried by that edge plus the share of the target's total incoming amount
    carried by that edge:

        weight = amount / total_sent_by_source + amount / total_received_by_target

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain the columns "source", "target" and "amount_usd".

    Returns
    -------
    pandas.DataFrame
        Columns "source", "target", "weight"; one row per distinct
        (source, target) pair, ordered by the groupby keys.
    """
    edges = (
        data_input.groupby(["source", "target"])
        .agg(amount_usd=("amount_usd", "sum"))
        .reset_index()
    )

    amount = edges["amount_usd"]
    # transform("sum") broadcasts each group's total back onto its rows, which
    # replaces the dict lookup + row-wise apply with a vectorised operation.
    total_sent = edges.groupby("source")["amount_usd"].transform("sum")
    total_received = edges.groupby("target")["amount_usd"].transform("sum")

    # A zero total would divide by zero; fall back to a denominator of 1 so
    # the corresponding share is simply amount / 1.
    total_sent = total_sent.where(total_sent != 0, 1)
    total_received = total_received.where(total_received != 0, 1)

    edges["weight"] = amount / total_sent + amount / total_received
    return edges.loc[:, ["source", "target", "weight"]]

In [None]:
# Collapse the windowed transactions into one weighted edge per
# (source, target) pair; the result has columns source, target, weight.
window_edges = aggregate_edges(in_scope_window)

In [None]:
# Build a directed graph from the weighted edge list. With use_vids=False the
# first two columns (source, target) are treated as vertex names rather than
# integer vertex ids (assuming `ig` is python-igraph — confirm against imports).
graph = ig.Graph.DataFrame(window_edges, use_vids=False, directed=True)

In [None]:
# Each entry of `communities` is a (key, members) pair; the printed summary is
# the rounded mean community size, the largest size, and the total member count.
communities = get_communities_multi_proc(in_scope_nodes, graph, NUM_PROCS)
sizes = [len(members) for _, members in communities]
print(round(np.mean(sizes)), round(np.max(sizes)), sum(sizes))

In [None]:
def _edge_key(row):
    """Direction-independent key for a transaction: the two endpoints in sorted order, joined by "-"."""
    first, second = sorted([row["source"], row["target"]])
    return f"{first}-{second}"


# Index the transactions by their undirected edge key so they can later be
# joined against node pairs on the index.
in_scope_window.loc[:, "edge"] = in_scope_window.apply(_edge_key, axis=1)
in_scope_window.set_index("edge", inplace=True)

In [None]:
# Directory (relative to the working directory) that receives the per-chunk
# parquet files of community transactions.
location_trx_comm = "transactions_communities"

In [None]:
# This cell repeats the edge-key construction from the earlier cell. Once the
# frame is already indexed by "edge", rebuilding the key from the unchanged
# source/target columns yields exactly the same index, so the two row-wise
# passes are skipped. The rebuild still runs if this cell is executed on a
# frame that has not been indexed yet.
if in_scope_window.index.name != "edge":
    in_scope_window.loc[:, "edge"] = in_scope_window.apply(
        lambda x: tuple(sorted([x["source"], x["target"]])), axis=1
    )
    in_scope_window.loc[:, "edge"] = in_scope_window.loc[:, "edge"].apply(
        lambda x: f"{x[0]}-{x[1]}"
    )
    in_scope_window.set_index("edge", inplace=True)

In [None]:
# Seconds elapsed between each transaction's timestamp and first_trx_ts,
# shifted by +1. first_trx_ts is defined elsewhere — presumably the earliest
# timestamp in the window, which would make the smallest value 1 rather than 0
# (TODO confirm).
in_scope_window.loc[:, "window_delta"] = (
    in_scope_window["timestamp"] - first_trx_ts
).dt.total_seconds() + 1

In [None]:
# Start from an empty output directory so part files left over from a previous
# run cannot be picked up later.
shutil.rmtree(location_trx_comm, ignore_errors=True)
os.mkdir(location_trx_comm)

communities = dict(communities)
communities_keys = list(communities)

# Number of communities written per parquet part. An alternative is to derive
# it from the worker count: ceil(len(communities_keys) / NUM_PROCS).
chunk_size = 10_000

# NOTE: np.array_split raises ValueError when number_of_chunks is 0, i.e. when
# there are no communities at all.
number_of_chunks = int(np.ceil(len(communities_keys) / chunk_size))
chunks = np.array_split(communities_keys, number_of_chunks)
for index, chunk in enumerate(chunks):
    comm_inner = []
    for key in chunk:
        comm_node = communities[key]
        # Every unordered pair of community members, keyed like the "edge"
        # index of in_scope_window: the two node ids in sorted order, joined
        # by "-". Building the string here avoids a second pass over the frame.
        for pair in combinations(comm_node, 2):
            first, second = sorted(pair)
            comm_inner.append([key, f"{first}-{second}"])
    edge_combinations = pd.DataFrame(comm_inner, columns=["id", "edge"])
    edge_combinations.set_index("edge", inplace=True)
    # The inner join keeps only the pairs that actually transacted in the
    # window; the edge key is dropped before writing.
    (
        edge_combinations.join(in_scope_window, how="inner")
        .reset_index(drop=True)
        .to_parquet(f"{location_trx_comm}/part-{index}.parquet")
    )
    # Lightweight progress report every 20 parts.
    if not (index % 20):
        print(index, len(chunks))

In [None]:
# Feed the exported parquet parts to get_features_multi_proc in batches of at
# most NUM_PROCS files and stack the per-batch results into one frame.
trx_files = sorted(glob(f"{location_trx_comm}{os.sep}*"))
number_of_chunks = int(np.ceil(len(trx_files) / NUM_PROCS))
trx_chunks = np.array_split(trx_files, number_of_chunks)
features_all = pd.concat(
    [
        get_features_multi_proc(
            trx_files_chunk,
            "features.get_features_chunk_with_gf",
            reset_staging=True,
        )
        for trx_files_chunk in trx_chunks
    ],
    ignore_index=True,
)

In [None]:
# Features computed per distinct "source": the workload is the transactions
# grouped by source, split for NUM_PROCS workers without shuffling.
source_count = in_scope_window["source"].nunique()
source_groups = in_scope_window.groupby("source")
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    source_count, source_groups, NUM_PROCS, shuffle=False
)
features_source = get_features_multi_proc(
    iterator_chunk_as_pickles,
    "features.get_features_chunk_without_gf",
)

In [None]:
# Features computed per distinct "target": the workload is the transactions
# grouped by target, split for NUM_PROCS workers without shuffling.
target_count = in_scope_window["target"].nunique()
target_groups = in_scope_window.groupby("target")
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    target_count, target_groups, NUM_PROCS, shuffle=False
)
features_target = get_features_multi_proc(
    iterator_chunk_as_pickles,
    "features.get_features_chunk_without_gf",
)

In [None]:
def reset_index(df_input, index_name):
    """Return a copy of ``df_input`` with its index moved into a column.

    The index is named ``index_name`` and then reset, so the former index
    values end up in a column called ``index_name`` and the result carries a
    fresh default integer index.

    Unlike assigning to ``df_input.index.name``, ``rename_axis`` works on a
    new object, so the caller's frame (and anything sharing its Index) keeps
    its original index name.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose index should become a regular column.
    index_name : str
        Name of the column that receives the former index values.

    Returns
    -------
    pandas.DataFrame
        New frame with ``index_name`` as its first column.
    """
    return df_input.rename_axis(index_name).reset_index()

In [None]:
# Attach the per-source and per-target feature sets to the features keyed by
# "key". Both are left joins, so every row of features_all is kept; columns of
# the right-hand frame that clash with existing names receive the given suffix.
features_all = (
    features_all.set_index("key")
    .join(
        features_source.set_index("key"),
        how="left",
        rsuffix="_1_hop_as_source",
    )
    .join(
        features_target.set_index("key"),
        how="left",
        rsuffix="_1_hop_as_target",
    )
)
features_all = reset_index(features_all, "key")

In [None]:
# Look up every key in active_for (defined elsewhere in the notebook). The
# subscript lookup means a key missing from a plain dict/Series raises
# KeyError instead of producing NaN.
features_all.loc[:, "active_for"] = features_all.loc[:, "key"].apply(lambda x: active_for[x])