In [None]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from collections import Counter
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score, recall_score
from scipy import stats

import settings as s
from common import create_workload_for_multi_proc
from communities import get_communities_multi_proc
from features import get_features_multi_proc, pov_features

%load_ext autoreload
%autoreload 2

In [None]:
# Spark properties for this session, as (property, value) pairs.
# NOTE(review): verify that `spark.worker.memory` is a property Spark honours
# here; executor memory is normally set with `spark.executor.memory`.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
spark_conf = SparkConf().setAll(config)
session_builder = SparkSession.builder.appName("testing")
# Reuses an already running session if there is one, otherwise starts it.
spark = session_builder.config(conf=spark_conf).getOrCreate()

In [None]:
# Wall-clock timestamp taken before the heavy cells run. Nothing in the visible
# cells reads it; presumably a later cell reports elapsed time from it - TODO confirm.
start = time.time()

In [None]:
# One row per (source, target) pair: amounts are summed on Spark, collected to
# pandas, rounded up to whole units, stored as uint64 and ordered from the
# largest edge to the smallest.
# NOTE(review): `data_input` is not created in any cell visible above, so it
# has to exist in the kernel already - confirm where it is loaded.
data_agg = (
    data_input.groupby(["source", "target"])
    .agg(sf.sum("amount").alias("amount"))
    .toPandas()
    .assign(amount=lambda frame: np.ceil(frame["amount"]))
    .astype({"amount": np.uint64})
    .sort_values("amount", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Total amount per node over the aggregated edges, as plain dicts:
# keyed by "source" for what a node sent, by "target" for what it received.
totals_sent = data_agg.groupby("source")["amount"].sum().to_dict()
totals_received = data_agg.groupby("target")["amount"].sum().to_dict()

In [None]:
def get_communities(top_n, n_hops, data_input, pov, cp, totals, to_check_in):
    """Collect, hop by hop, the accounts reachable from each node and the amount reaching them.

    Hop 1 keeps, for every ``pov`` node that is in ``to_check_in``, the first
    ``top_n`` rows of its group as ``Counter({counterparty: amount})``.
    Every later hop starts from the accounts reached on the previous hop and
    follows their hop-1 counterparties. Each amount is capped at the smallest
    of the edge amount, the amount that reached the intermediate account and
    ``totals[node]`` of the starting node. Amounts arriving at the same
    account are summed and only the ``top_n`` largest are kept.

    Parameters
    ----------
    top_n : int
        Maximum number of accounts kept per node and hop; must be >= 1.
    n_hops : int
        Number of hops to compute; must be between 1 and 10.
    data_input : pandas.DataFrame
        Edge table with the columns named by ``pov`` and ``cp`` plus "amount".
        Rows are taken in the order they appear inside each group, so the
        frame has to be sorted by amount (descending) beforehand for
        ``top_n`` to mean "largest".
    pov : str
        Column holding the node the community is built for.
    cp : str
        Column holding the counterparty.
    totals : mapping
        Total amount per node; read for every hop-1 node when ``n_hops > 1``.
    to_check_in : iterable
        Nodes to build communities for; other ``pov`` nodes are skipped.

    Returns
    -------
    list of dict
        One ``{node: Counter({account: amount})}`` mapping per hop.

    Raises
    ------
    NotImplementedError
        If ``n_hops`` is outside 1..10.
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if not (0 < n_hops < 11):
        raise NotImplementedError(f"n_hops must be between 1 and 10, got {n_hops}")
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Materialise once: constant-time lookups, and a one-shot iterable is not
    # exhausted by the first membership test.
    nodes_to_check = set(to_check_in)

    print("Processing hop # 1")
    n_1 = {}
    for node, group in data_input.groupby(pov):
        if node not in nodes_to_check:
            continue
        group = group.head(top_n)
        n_1[node] = Counter(dict(group.loc[:, [cp, "amount"]].values))
    communities_data = [n_1]

    for n_hop in range(1, n_hops):
        n_minus_1th = communities_data[-1]
        print(f"Processing hop # {n_hop + 1}")
        n_th = {}
        for node in n_1:
            nth_level = Counter()
            reference_amount = totals[node]
            for node_n, node_n_amount in n_minus_1th[node].items():
                onward = n_1.get(node_n)
                if not onward:
                    # The intermediate account has no recorded counterparties.
                    continue
                capped = Counter({
                    key: min(amount, node_n_amount, reference_amount)
                    for key, amount in onward.items()
                })
                # Counter's in-place addition sums per account and drops
                # entries whose total is not positive.
                nth_level += capped
            n_th[node] = Counter(dict(nth_level.most_common(top_n)))
        communities_data.append(n_th)
    return communities_data

In [None]:
print("Processing communities_as_source\n")
# Follow outgoing edges (source -> target) for 4 hops, keeping 50 accounts per
# node and hop; amounts are capped by what each node sent in total.
communities_as_source = get_communities(
    top_n=50,
    n_hops=4,
    data_input=data_agg,
    pov="source",
    cp="target",
    totals=totals_sent,
    to_check_in=nodes_source,
)

In [None]:
print("Processing communities_as_target\n")
# Follow incoming edges (target -> source) for 4 hops, keeping 50 accounts per
# node and hop; amounts are capped by what each node received in total.
communities_as_target = get_communities(
    top_n=50,
    n_hops=4,
    data_input=data_agg,
    pov="target",
    cp="source",
    totals=totals_received,
    to_check_in=nodes_target,
)

In [None]:
print("Processing communities_as_passthrough\n")
# Only edges leaving a passthrough node are considered; they are followed
# source -> target, with amounts capped by what each node received in total.
communities_as_passthrough = get_communities(
    top_n=50,
    n_hops=4,
    data_input=data_agg[data_agg["source"].isin(nodes_passthrough)],
    pov="source",
    cp="target",
    totals=totals_received,
    to_check_in=nodes_passthrough,
)

In [None]:
communities_as_source_features = []
for node in nodes_source:
    all_nodes = set()
    node_comm_stats = {"key": node}
    for index, communities in enumerate(communities_as_source):
        n_hop = index + 1
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = 0
        node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = 0
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = 0
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = 0
        if not communities[node]:
            continue
        nodes_community, amounts_community = zip(*communities[node].items())
        if not index:
            sum_prev = totals_sent[node]
        else:
            sum_prev = sum(communities_as_source[index - 1][node].values())
        sum_prev = min([totals_sent[node], sum_prev])
        amounts_community_adjusted = np.array(amounts_community, dtype=np.float64)
        amounts_community_adjusted /= sum_prev
        amounts_community_adjusted[amounts_community_adjusted > 1] = 1
        sum_this = sum(communities[node].values())
        perc_transferred = (sum_this / sum_prev) if sum_prev > sum_this else 1
        amounts_community_adjusted *= perc_transferred
        number_of_new_accounts = len(set(nodes_community) - all_nodes)
        all_nodes = all_nodes.union(nodes_community)
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = len(nodes_community)
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = len(all_nodes)
            node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = number_of_new_accounts
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = np.max(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = np.mean(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = np.median(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = np.std(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_skew_transferred"] = stats.skew(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_kurtosis_transferred"] = stats.kurtosis(amounts_community_adjusted)
    communities_as_source_features.append(node_comm_stats)

communities_as_source_features = pd.DataFrame(communities_as_source_features)

In [None]:
communities_as_target_features = []
for node in nodes_target:
    all_nodes = set()
    node_comm_stats = {"key": node}
    for index, communities in enumerate(communities_as_target):
        n_hop = index + 1
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = 0
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = 0
            node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = 0
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = 0
        if not communities[node]:
            continue
        nodes_community, amounts_community = zip(*communities[node].items())
        if not index:
            sum_prev = totals_received[node]
        else:
            sum_prev = sum(communities_as_target[index - 1][node].values())
        sum_prev = min([totals_received[node], sum_prev])
        amounts_community_adjusted = np.array(amounts_community, dtype=np.float64)
        amounts_community_adjusted /= sum_prev
        amounts_community_adjusted[amounts_community_adjusted > 1] = 1
        sum_this = sum(communities[node].values())
        perc_transferred = (sum_this / sum_prev) if sum_prev > sum_this else 1
        amounts_community_adjusted *= perc_transferred
        number_of_new_accounts = len(set(nodes_community) - all_nodes)
        all_nodes = all_nodes.union(nodes_community)
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = len(nodes_community)
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = len(all_nodes)
            node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = number_of_new_accounts
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = np.max(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = np.mean(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = np.median(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = np.std(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_skew_transferred"] = stats.skew(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_kurtosis_transferred"] = stats.kurtosis(amounts_community_adjusted)
    communities_as_target_features.append(node_comm_stats)

communities_as_target_features = pd.DataFrame(communities_as_target_features)

In [None]:
communities_as_passthrough_features = []
for node in nodes_passthrough:
    all_nodes = set()
    node_comm_stats = {"key": node}
    for index, communities in enumerate(communities_as_passthrough):
        n_hop = index + 1
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = 0
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = 0
            node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = 0
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = 0
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = 0
        if not communities[node]:
            continue
        nodes_community, amounts_community = zip(*communities[node].items())
        if not index:
            sum_prev = totals_received[node]
        else:
            sum_prev = sum(communities_as_passthrough[index - 1][node].values())
        sum_prev = min([totals_received[node], sum_prev])
        amounts_community_adjusted = np.array(amounts_community, dtype=np.float64)
        amounts_community_adjusted /= sum_prev
        amounts_community_adjusted[amounts_community_adjusted > 1] = 1
        sum_this = sum(communities[node].values())
        perc_transferred = (sum_this / sum_prev) if sum_prev > sum_this else 1
        amounts_community_adjusted *= perc_transferred
        number_of_new_accounts = len(set(nodes_community) - all_nodes)
        all_nodes = all_nodes.union(nodes_community)
        node_comm_stats[f"hop_{n_hop}_number_of_accounts"] = len(nodes_community)
        if index:
            node_comm_stats[f"hop_{n_hop}_number_of_distinct_accounts"] = len(all_nodes)
            node_comm_stats[f"hop_{n_hop}_number_of_new_accounts"] = number_of_new_accounts
        node_comm_stats[f"hop_{n_hop}_max_transferred"] = np.max(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_mean_transferred"] = np.mean(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_median_transferred"] = np.median(amounts_community_adjusted)
        node_comm_stats[f"hop_{n_hop}_std_transferred"] = np.std(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_skew_transferred"] = stats.skew(amounts_community_adjusted)
        # node_comm_stats[f"hop_{n_hop}_kurtosis_transferred"] = stats.kurtosis(amounts_community_adjusted)
    communities_as_passthrough_features.append(node_comm_stats)

communities_as_passthrough_features = pd.DataFrame(communities_as_passthrough_features)

In [None]:
# Index every feature table by node. Rebinding the name instead of mutating
# with inplace=True leaves the frame produced by the previous cell untouched.
communities_as_source_features = communities_as_source_features.set_index("key")
communities_as_target_features = communities_as_target_features.set_index("key")
communities_as_passthrough_features = communities_as_passthrough_features.set_index("key")