In [1]:
import math
import os
import pickle
import shutil
import subprocess
import sys
import time
import uuid
from collections import defaultdict, Counter
from datetime import timedelta, date
from glob import glob

import leidenalg as la
import igraph as ig
import numpy as np
import pandas as pd
import psutil
from pyspark.sql import functions as sf, types as st
from pyspark import SparkConf
from pyspark.sql import SparkSession

import settings as s

%load_ext autoreload
%autoreload 2

In [2]:
# Refuse to run on any interpreter other than the exact release this notebook
# was tuned on; the first three fields of sys.version_info are
# (major, minor, micro).
if sys.version_info[:3] != (3, 11, 7):
    raise EnvironmentError("Only runs efficiently on Python 3.11.7 | conda 24.1.2 | Apple M3 Pro")

In [3]:
# Spark settings for this session: pull in the GraphFrames package and set
# the memory options.
# NOTE(review): "spark.worker.memory" does not look like a documented Spark
# application property — confirm whether "spark.executor.memory" was intended.
config = [
    ("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.13"),
    ("spark.driver.memory", "8g"),
    ("spark.worker.memory", "8g"),
]
spark_conf = SparkConf().setAll(config)
spark = (
    SparkSession.builder
    .appName("testing")
    .config(conf=spark_conf)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/anaconda3/envs/redirect/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/haseeb.tariq/.ivy2/cache
The jars for the packages stored in: /Users/haseeb.tariq/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e5f386aa-6e17-4a8d-9ada-255ddca9ced2;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.5-s_2.13 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 60ms :: artifacts dl 2ms
	:: modules in use:
	graphframes#graphframes;0.8.3-spark3.5-s_2.13 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------

In [4]:
# Wall-clock start of the run; the final cell subtracts this value from the
# current time to report the total execution time.
start_script = time.time()

In [5]:
# Load the staged transactions, drop rows whose source equals their target,
# and keep only the ACH / Wire / Bitcoin formats.
data = (
    spark.read.parquet(s.STAGED_DATA_LOCATION)
    .where(sf.col("source") != sf.col("target"))
    .where(sf.col("format").isin(["ACH", "Wire", "Bitcoin"]))
)

In [6]:
def aggregate_edges(data_input):
    """Collapse transactions into weighted directed edges.

    Parameters
    ----------
    data_input : pyspark.sql.DataFrame
        Must expose the columns ``source``, ``target``, ``source_amount`` and
        ``target_amount``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` and ``weight`` with a fresh
        ``RangeIndex``. ``weight`` is the edge's share of everything its
        source sent plus its share of everything its target received.

    Notes
    -----
    Identifiers are truncated to their first 8 characters *after* the weights
    are computed, and edges whose truncated endpoints coincide are dropped.
    NOTE(review): edges that end up with the same truncated (source, target)
    pair are not merged again — confirm that this is intended.
    """
    # One row per (source, target) pair; the summation happens in Spark and
    # only the aggregate is collected to the driver.
    edges = data_input.groupby(["source", "target"]).agg(
        sf.sum("source_amount").alias("source_amount"),
        sf.sum("target_amount").alias("target_amount"),
    ).toPandas()

    # Broadcast the per-node totals back onto every edge without a
    # Python-level lookup per row.
    total_sent_by_source = edges.groupby("source")["source_amount"].transform("sum")
    total_received_by_target = edges.groupby("target")["target_amount"].transform("sum")

    edges["weight"] = (
        edges["source_amount"] / total_sent_by_source
        + edges["target_amount"] / total_received_by_target
    )

    # Shorten the identifiers, then remove edges that became self-loops.
    edges["source"] = edges["source"].str.slice(0, 8)
    edges["target"] = edges["target"].str.slice(0, 8)
    not_self_loop = edges["source"] != edges["target"]
    return edges.loc[not_self_loop, ["source", "target", "weight"]].reset_index(drop=True)

In [7]:
def get_reversed_graph(edges_data):
    """Build a directed igraph graph with every edge flipped.

    ``edges_data`` must have the columns ``source``, ``target`` and
    ``weight``; any other columns are ignored and the input frame is left
    untouched. In the result each original target becomes the source and
    vice versa, with the weight carried over unchanged.
    """
    # Select the columns in swapped order, then relabel them positionally.
    flipped = edges_data.loc[:, ["target", "source", "weight"]].copy(deep=True)
    flipped.columns = ["source", "target", "weight"]
    return ig.Graph.DataFrame(flipped, use_vids=False, directed=True)

In [8]:
%%time

# Weighted edge list -> directed igraph graph keyed by node name.
edges = aggregate_edges(data)
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
# Node names in vertex order; these are split into chunks for the workers.
nodes = [vertex["name"] for vertex in graph.vs]

                                                                                

CPU times: user 28.7 s, sys: 503 ms, total: 29.2 s
Wall time: 42.9 s


In [9]:
def get_processes(ids):
    """Return the running processes whose command line mentions any of ``ids``.

    Parameters
    ----------
    ids : iterable of str
        Tokens to look for; a process matches when at least one of its
        command-line arguments is exactly equal to one of them.

    Returns
    -------
    list of psutil.Process
        Matching processes, in ``psutil.process_iter()`` order.
    """
    wanted = set(ids)
    matching = []
    for process in psutil.process_iter():
        try:
            cmdline = process.cmdline()
        except (psutil.Error, OSError):
            # The process vanished or its command line is not readable;
            # it cannot be matched, so skip it.
            continue
        if wanted.intersection(cmdline):
            matching.append(process)
    return matching

In [None]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory; the next cell loads every pickle
# found in it.
shutil.rmtree("staging", ignore_errors=True)
os.mkdir("staging")
chunks = np.array_split(nodes, NUMBER_OF_PROCESSES)

# Persist the worker inputs: the full graph and the node chunks.
# NOTE(review): presumably communities.py reads these two files and writes
# its results into staging/ — the script is not shown here, confirm.
filename = "graph.pickle"
with open(filename, "wb") as f:
    pickle.dump(graph, f, protocol=pickle.HIGHEST_PROTOCOL)

filename = "nodes.pickle"
with open(filename, "wb") as f:
    pickle.dump(chunks, f, protocol=pickle.HIGHEST_PROTOCOL)

# Launch one worker per chunk. Arguments are passed as a list, so no shell is
# involved and an interpreter path containing spaces cannot break the command.
# Each worker also receives a unique id that get_processes() can find on its
# command line.
process_ids = set()
process_name = "communities.py"
workers = []
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    workers.append(
        subprocess.Popen([sys.executable, process_name, str(chunk_number), process_id])
    )

# Block until every worker has exited and report the ones that failed, so an
# incomplete result set does not go unnoticed.
failed_chunks = [
    chunk_number for chunk_number, worker in enumerate(workers) if worker.wait() != 0
]
if failed_chunks:
    print(f"WARNING: {process_name} exited with a non-zero status for chunks {failed_chunks}")

# Safety net: also wait for any process still carrying one of the ids.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Kill any worker that is still alive; one that exits in the meantime is fine.
for straggler in get_processes(process_ids):
    try:
        straggler.kill()
    except psutil.NoSuchProcess:
        pass

# Concatenate the per-chunk results written to the staging directory.
communities = []
for chunk_path in glob("./staging/*.pickle"):
    with open(chunk_path, "rb") as chunk_file:
        communities.extend(pickle.load(chunk_file))

# Keep the raw combined result on disk, then retain only the second element
# of every entry for the rest of the notebook.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)
communities = [entry[1] for entry in communities]

In [None]:
# Per-currency minimum amount, written as (limit, currencies) tiers. A later
# cell keeps a same-currency transaction only when its source_amount is at
# least the limit for that currency.
temporal_currency_limits = {
    currency: limit
    for limit, currencies in (
        (1, ("btc",)),
        (100, ("eur", "usd", "gbp", "cad", "aud", "chf")),
        (1_000, ("sar", "ils", "cny")),
        (5_000, ("rub", "brl")),
        (10_000, ("jpy", "mxn", "inr")),
    )
    for currency in currencies
}

In [None]:
%%time

# Reload the staged transactions, this time restricted to ACH and Bitcoin.
data_filtered_main = spark.read.parquet(s.STAGED_DATA_LOCATION).where(
    sf.col("format").isin(["ACH", "Bitcoin"])
)

# Keep a transaction when both sides use the same listed currency and the
# source amount reaches that currency's limit. A single OR-ed predicate
# selects the same rows as filtering once per currency and unioning the
# pieces, but with one filter over the input instead of one per currency.
meets_currency_limit = sf.lit(False)
for currency, limit in temporal_currency_limits.items():
    meets_currency_limit = meets_currency_limit | (
        (sf.col("source_currency") == currency)
        & (sf.col("target_currency") == currency)
        & (sf.col("source_amount") >= limit)
    )
data_filtered = data_filtered_main.where(meets_currency_limit)

# Same edge/graph construction as for the unfiltered data.
edges_filtered = aggregate_edges(data_filtered)
graph_filtered = ig.Graph.DataFrame(edges_filtered, use_vids=False, directed=True)
nodes_filtered = [vertex["name"] for vertex in graph_filtered.vs]

In [None]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory; the next cell loads every pickle
# found in it.
shutil.rmtree("staging-filtered", ignore_errors=True)
os.mkdir("staging-filtered")
chunks = np.array_split(nodes_filtered, NUMBER_OF_PROCESSES)

# Persist the worker inputs: the filtered graph and the node chunks.
# NOTE(review): presumably communities_cycles.py reads these two files and
# writes its results into staging-filtered/ — the script is not shown here,
# confirm.
filename = "graph_filtered.pickle"
with open(filename, "wb") as f:
    pickle.dump(graph_filtered, f, protocol=pickle.HIGHEST_PROTOCOL)

filename = "nodes_filtered.pickle"
with open(filename, "wb") as f:
    pickle.dump(chunks, f, protocol=pickle.HIGHEST_PROTOCOL)

# Launch one worker per chunk. Arguments are passed as a list, so no shell is
# involved and an interpreter path containing spaces cannot break the command.
# Each worker also receives a unique id that get_processes() can find on its
# command line.
process_ids = set()
process_name = "communities_cycles.py"
workers = []
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    workers.append(
        subprocess.Popen([sys.executable, process_name, str(chunk_number), process_id])
    )

# Block until every worker has exited and report the ones that failed, so an
# incomplete result set does not go unnoticed.
failed_chunks = [
    chunk_number for chunk_number, worker in enumerate(workers) if worker.wait() != 0
]
if failed_chunks:
    print(f"WARNING: {process_name} exited with a non-zero status for chunks {failed_chunks}")

# Safety net: also wait for any process still carrying one of the ids.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Kill any worker that is still alive; one that exits in the meantime is fine.
for straggler in get_processes(process_ids):
    try:
        straggler.kill()
    except psutil.NoSuchProcess:
        pass

# Concatenate the per-chunk results written to the filtered staging directory.
communities_filtered = []
for chunk_path in glob("./staging-filtered/*.pickle"):
    with open(chunk_path, "rb") as chunk_file:
        communities_filtered.extend(pickle.load(chunk_file))

# Keep the raw combined result on disk, then retain only the second element
# of every entry for the rest of the notebook.
filename = "communities_filtered.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities_filtered, f)
communities_filtered = [entry[1] for entry in communities_filtered]

In [None]:
# Append the communities found on the amount-filtered graph to the ones found
# on the full graph, so later cells search both sets.
# NOTE(review): `+=` extends `communities` in place, so re-running only this
# cell appends the filtered communities a second time.
communities += communities_filtered

In [None]:
# Size of every community, then (mean, median, max, total) as the cell output.
sizes = list(map(len, communities))
round(np.mean(sizes)), round(np.median(sizes)), round(np.max(sizes)), sum(sizes)

In [None]:
# Reference flows to evaluate against; later cells read the columns
# "id", "source" and "target" from this frame.
flows = pd.read_parquet("flows.parquet")
# Per-flow metadata looked up by "id"; later cells also use its
# "number_components" and "type" columns.
flow_stats = pd.read_parquet("flow_stats.parquet")

In [None]:
%%time

# Inverted index: node -> positions (in `communities`) of every community
# that contains it.
search_hash = defaultdict(list)
for community_index, members in enumerate(communities):
    for member in members:
        search_hash[member].append(community_index)

In [None]:
%%time

# Score every reference flow by the detected community that covers it best.
#
# For each flow (rows of `flows` sharing an "id") the score is the largest
# fraction of the flow's nodes found together in one community; 1 means some
# community contains the whole flow. A flow none of whose nodes belongs to any
# community gets score 0 and matched_node_comm None instead of raising.

# Index the metadata once so each lookup is by label rather than a full
# boolean mask over `flow_stats`; the first row per id is kept and "id"
# stays available as a column.
stats_by_flow = flow_stats.drop_duplicates(subset="id").set_index("id", drop=False)

percentages = []
start = time.time()
for index, (group, grouped) in enumerate(flows.groupby("id")):
    flow_nodes = set(grouped["source"]).union(grouped["target"])
    size = len(flow_nodes)
    best_node, best_score = None, 0
    for node in flow_nodes:
        # .get() avoids inserting empty entries into the defaultdict for
        # nodes that are in no community.
        for i in search_hash.get(node, ()):
            # Intersect from the (small) flow side so the community is not
            # copied into a new set on every lookup.
            score = len(flow_nodes.intersection(communities[i])) / size
            if best_node is None or score > best_score:
                best_node, best_score = node, score
            if best_score == 1:
                break
        if best_score == 1:
            # A full match cannot be improved; stop searching this flow.
            break
    stats = stats_by_flow.loc[group].to_dict()
    stats["score"] = best_score
    stats["matched_node_comm"] = best_node
    percentages.append(stats)
    # Progress report: flows processed and seconds since the last report.
    if not (index % 2_000):
        print(index, round(time.time() - start))
        start = time.time()

percentages = pd.DataFrame(percentages)

In [None]:
# NOTE(review): this rebinds `data_filtered_main` (previously the format-only
# filter) to the fully filtered frame.
data_filtered_main = data_filtered.select("*")
# Cell output: number of transactions that passed the currency/amount filter.
data_filtered_main.count()

In [None]:
# Mean match score as a whole-number percentage. Rounding after scaling
# avoids float noise such as 56.99999999999999 that appears when a rounded
# fraction is multiplied by 100 afterwards.
round(float(percentages["score"].mean()) * 100)

In [None]:
# Share of flows matched completely (score == 1) as a whole-number percentage.
# Rounding after scaling avoids float noise in the displayed value.
perfect_matches = int((percentages["score"] == 1).sum())
round(100 * perfect_matches / len(percentages))

In [None]:
# Restrict to flows with exactly one component, then show the mean score for
# each flow type.
filter_ = percentages["number_components"] == 1
percentages_scope = percentages[filter_].reset_index(drop=True)
percentages_scope.groupby("type")[["score"]].mean()

In [None]:
# Fraction of single-component flows that were not matched completely.
below_perfect = percentages_scope.loc[percentages_scope["score"] < 1]
len(below_perfect) / len(percentages_scope)

In [None]:
# def left_column(column):
#     return f"{column}_left"

In [None]:
# WINDOW_SIZE = 30
# L1_AMOUNT = 1000
# MAX_ALLOWED_DIFF_L1 = 0.15
# MAX_ALLOWED_DIFF_L2 = 0.25

# max_date = max(dates)
# for date in dates:
#     end_date = date + timedelta(days=WINDOW_SIZE)
#     day = data_filtered.where(sf.col("timestamp").astype(st.DateType()) == date)
#     day = day.select(*[sf.col(x).alias(left_column(x)) for x in day.columns])
#     window = data_filtered.where(
#         (sf.col("timestamp").astype(st.DateType()) >= date) &
#         (sf.col("timestamp").astype(st.DateType()) <= end_date)
#     )
#     joined = day.join(
#         window, 
#         on=(
#             (sf.col("timestamp") >= sf.col(left_column("timestamp"))) &
#             (sf.col("source") == sf.col(left_column("target")))
#         ),
#         how="inner"
#     )
#     join_diff_currency = joined.where(sf.col("currency") != sf.col(left_column("currency"))).withColumn(
#         "diff", sf.lit(0)
#     )
#     join_same_currency = joined.where(sf.col("currency") == sf.col(left_column("currency"))).withColumn(
#         "diff", 
#         sf.abs(sf.col("amount") - sf.col(left_column("amount"))) / (sf.col("amount") - sf.col(left_column("amount")))
#     )
#     join_same_currency_l1 = join_same_currency.where(
#         (sf.col("amount") <= L1_AMOUNT) | (sf.col(left_column("amount")) <= L1_AMOUNT)
#     )
#     join_same_currency_l2 = join_same_currency.where(
#         (sf.col("amount") > L1_AMOUNT) & (sf.col(left_column("amount")) > L1_AMOUNT)
#     )
#     combined = join_diff_currency.union(
#         join_same_currency_l1.where(sf.col("diff") <= MAX_ALLOWED_DIFF_L1)
#     ).union(
#         join_same_currency_l2.where(sf.col("diff") <= MAX_ALLOWED_DIFF_L2)
#     )
#     break

In [None]:
# Report the total wall-clock time since `start_script` was recorded.
elapsed_seconds = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=elapsed_seconds)}")