In [None]:
# Sum "amount" per (source, target) pair in Spark, then pull the result into pandas.
data_agg = data_input.groupby(["source", "target"]).agg(sf.sum("amount").alias("amount")).toPandas()
# Round amounts up to whole units and store them as unsigned 64-bit integers.
data_agg["amount"] = np.ceil(data_agg["amount"]).astype(np.uint64)
# Largest edges first; get_communities takes the first rows per node via groupby().head().
data_agg = data_agg.sort_values("amount", ascending=False).reset_index(drop=True)

In [None]:
# Per-node totals of the aggregated amounts: keyed by "source" (sent) and by "target" (received).
totals_sent = data_agg.groupby("source")["amount"].sum().to_dict()
totals_received = data_agg.groupby("target")["amount"].sum().to_dict()

In [None]:
def _cap_amounts_by_totals(edges, pov, totals):
    """Return ``edges["amount"]`` capped element-wise at ``totals[node]`` of each row's ``pov`` node.

    The result is a ``uint64`` array. Raises ``KeyError`` if a ``pov`` node is missing from ``totals``.
    """
    caps = np.array([totals[node] for node in edges[pov]], dtype=np.uint64)
    return np.minimum(caps, edges["amount"].to_numpy(dtype=np.uint64))


def _edges_to_nested_dict(edges, pov, cp):
    """Build ``{pov_node: {cp_node: amount}}``, keeping the row order of ``edges`` inside each node."""
    return {
        node: dict(zip(group[cp], group["amount"]))
        for node, group in edges.groupby(pov)
    }


def get_communities(top_n, n_hops, data_input, pov, cp, totals, to_check_in):
    """Collect, hop by hop, the strongest counterparties reachable from each node.

    Parameters
    ----------
    top_n : int
        Maximum number of counterparties kept per ``pov`` node at every hop (>= 1).
    n_hops : int
        Number of hops to compute; must satisfy ``0 < n_hops < 11``.
    data_input : pandas.DataFrame
        Edge list with columns ``"source"``, ``"target"`` and ``"amount"``.
        Hop 1 keeps the *first* ``top_n`` rows per ``pov`` node in the existing
        row order, so pass the frame sorted by ``"amount"`` descending to keep
        the largest edges. The frame itself is not modified.
    pov : str
        Column holding the node each community is built for
        (``"source"`` or ``"target"``).
    cp : str
        Column holding the counterparty (the other one of the two).
    totals : dict
        Per-node cap: an edge amount is never allowed to exceed
        ``totals[pov_node]``.
    to_check_in : collection
        Nodes for which communities are computed (matched against ``pov``).

    Returns
    -------
    list of dict
        One entry per hop (``n_hops`` entries); entry ``i`` maps every ``pov``
        node to ``{counterparty: amount}`` for hop ``i + 1``.

    Raises
    ------
    NotImplementedError
        If ``n_hops`` is outside ``1..10``.
    ValueError
        If ``top_n`` is smaller than 1.
    KeyError
        If a ``pov`` node has no entry in ``totals``.
    """
    if not (0 < n_hops < 11):
        raise NotImplementedError(f"n_hops must be between 1 and 10, got {n_hops}")
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Self-loops are dropped before any hop is computed.
    data_input = data_input.loc[data_input["source"] != data_input["target"], :].reset_index(drop=True)

    level_1st = (
        data_input[data_input[pov].isin(to_check_in)]
        .groupby(pov)
        .head(top_n)
        .reset_index(drop=True)
    )
    level_1st["amount"] = _cap_amounts_by_totals(level_1st, pov, totals)
    level_1st = level_1st.sort_values("amount", ascending=False)
    level_1st_comms = _edges_to_nested_dict(level_1st, pov, cp)
    print(f"Processed hop #1 | {level_1st.shape[0]:,} | {len(level_1st_comms):,}")

    result = [level_1st_comms]

    # Every further hop extends the previous level by one first-level edge, so
    # the right-hand side of the join never changes and is built only once.
    first_by_pov = level_1st.set_index(pov)
    previous = level_1st
    for n_hop in range(1, n_hops):
        # Match the counterparty of the previous hop with first-level edges
        # that start at that node; "amount_left" is the flow that reached it.
        level_nth = (
            previous.set_index(cp)
            .join(first_by_pov, lsuffix="_left", how="inner")
            .reset_index(drop=True)
        )
        # Paths that lead back to their own origin are discarded.
        level_nth = level_nth.loc[level_nth["source"] != level_nth["target"], :].reset_index(drop=True)
        # A path carries at most the smaller of its two legs.
        level_nth["amount"] = level_nth[["amount_left", "amount"]].min(axis=1)
        # Several intermediate nodes may lead to the same counterparty: add them up.
        level_nth = level_nth.groupby([pov, cp]).agg(amount=("amount", "sum")).reset_index()
        level_nth["amount"] = _cap_amounts_by_totals(level_nth, pov, totals)
        level_nth = level_nth.sort_values("amount", ascending=False).reset_index(drop=True)
        level_nth = level_nth.groupby(pov).head(top_n).reset_index(drop=True)
        level_nth_comms = _edges_to_nested_dict(level_nth, pov, cp)
        print(f"Processed hop #{n_hop + 1} | {level_nth.shape[0]:,} | {len(level_nth_comms):,}")

        result.append(level_nth_comms)
        previous = level_nth

    return result

In [None]:
# Communities grouped by "source", with "target" as counterparty, capped by totals_sent.
print("\nProcessing comm_as_source\n")
comm_as_source = get_communities(
    top_n=TOP_N,
    n_hops=NUM_HOPS,
    data_input=data_agg,
    pov="source",
    cp="target",
    totals=totals_sent,
    to_check_in=nodes_source,
)

In [None]:
# Communities grouped by "target", with "source" as counterparty, capped by totals_received.
print("\nProcessing comm_as_target\n")
comm_as_target = get_communities(
    top_n=TOP_N,
    n_hops=NUM_HOPS,
    data_input=data_agg,
    pov="target",
    cp="source",
    totals=totals_received,
    to_check_in=nodes_target,
)

In [None]:
# Source-side pass over edges that start at a passthrough node.
# Note: amounts are capped with totals_received although the grouping column is "source".
print("\nProcessing comm_as_passthrough\n")
comm_as_passthrough = get_communities(
    top_n=TOP_N,
    n_hops=NUM_HOPS,
    data_input=data_agg[data_agg["source"].isin(nodes_passthrough)],
    pov="source",
    cp="target",
    totals=totals_received,
    to_check_in=nodes_passthrough,
)

In [None]:
# Target-side pass over edges that end at a passthrough node.
# Note: amounts are capped with totals_sent although the grouping column is "target".
print("\nProcessing comm_as_passthrough_reverse\n")
comm_as_passthrough_reverse = get_communities(
    top_n=TOP_N,
    n_hops=NUM_HOPS,
    data_input=data_agg[data_agg["target"].isin(nodes_passthrough)],
    pov="target",
    cp="source",
    totals=totals_sent,
    to_check_in=nodes_passthrough,
)
print()

In [None]:
def construct_global_features(input_data, input_nodes, totals):
    """Summarise each node's per-hop communities into one feature row.

    Parameters
    ----------
    input_data : list of dict
        One dict per hop, each mapping ``node -> {counterparty: amount}``
        (the structure returned by ``get_communities``).
    input_nodes : set
        Nodes to build features for; only those present in the first hop
        (``input_data[0]``) produce a row.
    totals : dict
        Per-node total amount, used as the hop-1 denominator and as an upper
        bound for the denominator of every later hop.

    Returns
    -------
    pandas.DataFrame
        One row per node with a ``"key"`` column and, for every hop ``h`` in
        which the node has a non-empty community:

        * ``hop_{h}_max_transferred_act`` - largest raw amount,
        * ``hop_{h}_std_transferred_act`` - standard deviation of raw amounts,
        * ``hop_{h}_std_transferred_adj`` - standard deviation of the amounts
          expressed as a share of the previous hop's volume (each share
          clipped at 1) and scaled by the fraction of that volume passed on.

        Hops without a community for a node are left out of its row and show
        up as NaN in the frame. When no node qualifies, an empty frame that
        still has the ``"key"`` column is returned.

    Raises
    ------
    KeyError
        If a processed node has no entry in ``totals``.
    """
    results = []
    # Without a first hop there is nothing to intersect the nodes with.
    first_hop = input_data[0] if input_data else {}

    for node in input_nodes.intersection(first_hop.keys()):
        node_comm_stats = {"key": node}
        for index, comms in enumerate(input_data):
            community = comms.get(node)
            if not community:
                continue
            n_hop = index + 1
            amounts_community = tuple(community.values())
            sum_this = sum(amounts_community)

            # Denominator: the node's total at hop 1, afterwards the volume of
            # the previous hop, never more than the node's total.
            if index:
                sum_prev = sum(input_data[index - 1].get(node, {}).values()) or 1
            else:
                sum_prev = totals[node] or 1
            # The zero guard has to be re-applied after min(): a node whose
            # total is 0 would otherwise end up with a zero denominator.
            sum_prev = min(totals[node], sum_prev) or 1

            amounts_community_adjusted = np.minimum(
                np.array(amounts_community, dtype=np.float64) / sum_prev, 1
            )
            perc_transferred = (sum_this / sum_prev) if sum_prev > sum_this else 1
            amounts_community_adjusted *= perc_transferred

            # Only these three statistics are emitted; per-hop account counts
            # and further moments are intentionally not computed here.
            node_comm_stats[f"hop_{n_hop}_max_transferred_act"] = np.nanmax(amounts_community)
            node_comm_stats[f"hop_{n_hop}_std_transferred_act"] = np.nanstd(amounts_community)
            node_comm_stats[f"hop_{n_hop}_std_transferred_adj"] = np.nanstd(amounts_community_adjusted)
        results.append(node_comm_stats)

    if not results:
        # Keep the "key" column so callers can still index the result by it.
        return pd.DataFrame(columns=["key"])
    return pd.DataFrame(results)

In [None]:
%%time

# Build the source-side feature table, then drop the raw communities.
print("\ncomm_as_source_features\n")
comm_as_source_features = construct_global_features(
    input_data=comm_as_source,
    input_nodes=nodes_source,
    totals=totals_sent,
)
del comm_as_source

In [None]:
%%time

# Build the target-side feature table, then drop the raw communities.
print("\ncomm_as_target_features\n")
comm_as_target_features = construct_global_features(
    input_data=comm_as_target,
    input_nodes=nodes_target,
    totals=totals_received,
)
del comm_as_target

In [None]:
%%time

# Build the passthrough feature table (normalised by totals_received), then drop the raw communities.
print("\ncomm_as_passthrough_features\n")
comm_as_passthrough_features = construct_global_features(
    input_data=comm_as_passthrough,
    input_nodes=nodes_passthrough,
    totals=totals_received,
)
del comm_as_passthrough

In [None]:
%%time

# Build the reverse passthrough feature table (normalised by totals_sent), then drop the raw communities.
print("\ncomm_as_passthrough_features_reverse\n")
comm_as_passthrough_features_reverse = construct_global_features(
    input_data=comm_as_passthrough_reverse,
    input_nodes=nodes_passthrough,
    totals=totals_sent,
)
del comm_as_passthrough_reverse

In [None]:
print("\n")
# Index every feature table by its node key (modifies the frames in place).
for _features in (
    comm_as_source_features,
    comm_as_target_features,
    comm_as_passthrough_features,
    comm_as_passthrough_features_reverse,
):
    _features.set_index("key", inplace=True)

In [None]:
# Prepend s.G_GLOB_PREFIX to every column name of each feature table (in place).
for _features in (
    comm_as_source_features,
    comm_as_target_features,
    comm_as_passthrough_features,
    comm_as_passthrough_features_reverse,
):
    _features.columns = [f"{s.G_GLOB_PREFIX}{column}" for column in _features.columns]