### Before running, set `THRESHOLD_RANK`, `NEIGHBORS_ONLY`, and `PPR_THRESHOLD_ONLY` in `communities.py` for the configuration you want to evaluate.

In [None]:
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from datetime import timedelta
from glob import glob

import leidenalg as la
import igraph as ig
import numpy as np
import pandas as pd
import seaborn as sns
import psutil

import settings as s
from common import get_processes

%load_ext autoreload
%autoreload 2

In [None]:
# Apply a white-background, "talk"-sized seaborn theme to the plots drawn below.
sns.set_theme(style="white", context="talk")

In [None]:
# Abort early unless the interpreter is exactly CPython 3.11.8, the only
# version this notebook was tested on.
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [None]:
# Wall-clock start time; the last cell uses it to report the total run time.
start_script = time.time()

In [None]:
def aggregate_edges(data_input):
    """Collapse raw transfers into weighted, directed edges.

    Rows are summed per ``(source, target)`` pair. Each resulting edge gets

        weight = source_amount / (total source_amount over the source's edges)
               + target_amount / (total target_amount over the target's edges)

    After the weights are computed, both identifiers are truncated to their
    first 8 characters and edges whose truncated endpoints coincide are
    dropped.

    NOTE(review): edges that collapse onto the same truncated pair are not
    re-aggregated, so the result may contain parallel edges -- confirm this
    is intended.

    Args:
        data_input: DataFrame with string columns ``source`` and ``target``
            and numeric columns ``source_amount`` and ``target_amount``.
            It is not modified.

    Returns:
        A new DataFrame with columns ``source``, ``target`` and ``weight``
        and a fresh ``RangeIndex``, ordered by the original (untruncated)
        ``(source, target)`` pair.
    """
    # One row per distinct (source, target) pair with the amounts summed.
    edges = data_input.groupby(["source", "target"], as_index=False)[
        ["source_amount", "target_amount"]
    ].sum()

    # Per-node totals broadcast back onto every edge; this replaces the
    # per-row dict lookups and the row-wise apply with vectorised operations.
    total_sent_by_source = edges.groupby("source")["source_amount"].transform("sum")
    total_received_by_target = edges.groupby("target")["target_amount"].transform(
        "sum"
    )
    edges["weight"] = (
        edges["source_amount"] / total_sent_by_source
        + edges["target_amount"] / total_received_by_target
    )

    # Truncate only after weighting: the totals above are keyed on full ids.
    edges["source"] = edges["source"].str.slice(0, 8)
    edges["target"] = edges["target"].str.slice(0, 8)

    not_self_loop = edges["source"] != edges["target"]
    return edges.loc[not_self_loop, ["source", "target", "weight"]].reset_index(
        drop=True
    )

In [None]:
%%time

# Read only the four columns needed to build the weighted edge list.
edge_columns = ["source", "target", "source_amount", "target_amount"]
data = aggregate_edges(
    pd.read_parquet(s.STAGED_DATA_LOCATION, columns=edge_columns)
)

In [None]:
%%time

# Build a directed graph whose vertices are identified by name rather than by
# integer id.
graph = ig.Graph.DataFrame(data, use_vids=False, directed=True)
nodes = list(graph.vs["name"])
nodes_mapping = {vertex.index: vertex["name"] for vertex in graph.vs}
# NOTE(review): the sort below orders by (size, name), which appears to undo
# this shuffle -- confirm whether it is still needed.
random.shuffle(nodes)

# Order-2 neighbourhood size of every vertex, edge direction ignored.
sizes = graph.neighborhood_size(vertices=nodes, order=2, mode="all", mindist=0)
sizes_nodes = list(zip(sizes, nodes))
# Keep only vertices whose neighbourhood has at most 500 members.
nodes = [name for size, name in sorted(sizes_nodes) if size <= 500]

# Using a random way for "seed" selection
# NOTE(review): np.random.choice samples with replacement by default, so the
# 10,000 seeds may contain duplicates -- confirm this is intended.
nodes = np.random.choice(nodes, size=10_000)

In [None]:
%%time

# Number of worker processes to start; the seeds are split into this many chunks.
NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory.
shutil.rmtree("staging", ignore_errors=True)
os.mkdir("staging")
chunks = np.array_split(nodes, NUMBER_OF_PROCESSES)

# Persist the graph and the seed chunks to disk (presumably read back by
# communities.py -- verify against that script).
for filename, payload in (("graph.pickle", graph), ("nodes.pickle", chunks)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

# Start one background worker per chunk, each tagged with a unique id that is
# passed on its command line.
process_name = "communities.py"
process_ids = set()
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    os.system(f"{sys.executable} {process_name} {chunk_number} {process_id} &")

# Poll every five seconds until get_processes reports no remaining workers.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
%%time

# Best-effort cleanup of any worker still reported for these ids.
for proc in get_processes(process_ids):
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        # The process exited between listing and killing; nothing to do.
        continue

# Concatenate the pickled results found in the staging directory.
communities = []
for filename in glob("./staging/*.pickle"):
    with open(filename, "rb") as f:
        communities.extend(pickle.load(f))

original_size = len(communities)

# Keep a copy of the raw (id, members) pairs before de-duplicating.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)

# Key each community by its sorted member tuple so identical member sets
# collapse into one entry (the last id seen wins).
communities_unique_rev = {
    tuple(sorted(members)): community_id for community_id, members in communities
}

# Re-key by "<id>-<position>" so that entries sharing an id stay distinct.
communities_unique = {
    f"{community_id}-{position}": members
    for position, (members, community_id) in enumerate(communities_unique_rev.items())
}
communities = list(set(communities_unique.values()))
new_size = len(communities)

# Count before, count after, and the fraction that survived de-duplication.
print(original_size, new_size, round(new_size / original_size, 2))
print()

In [None]:
# Member count of every unique community, then: count, mean, max, total.
sizes = list(map(len, communities))
print(new_size, round(np.mean(sizes)), round(np.max(sizes)), sum(sizes))

In [None]:
# Quartiles (25th, 50th, 75th percentile) of the community sizes.
tuple(np.percentile(sizes, q) for q in (25, 50, 75))

In [None]:
# Histogram of community sizes with a KDE overlay.
# Optional larger figure:
## sns.set_theme(rc={"figure.figsize":(12.7, 7.27)})
size_frame = pd.DataFrame({"Size": sizes})
sns.histplot(data=size_frame, x="Size", kde=True)

In [None]:
# Box plot of community sizes on a 10.7 x 5.27 figure.
# NOTE(review): set_theme is called here without style/context, so seaborn's
# defaults presumably replace the white/"talk" theme chosen earlier -- confirm
# this is intended.
sns.set_theme(rc={"figure.figsize": (10.7, 5.27)})
sns.boxplot(x=sizes)

In [None]:
# Total wall-clock time since start_script was set, rounded to whole seconds
# and printed as a timedelta.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")