In [69]:
import math
import os
import pickle
import shutil
import subprocess
import sys
import time
import uuid
from collections import defaultdict, Counter
from datetime import timedelta, date
from glob import glob

import leidenalg as la
import igraph as ig
import numpy as np
import pandas as pd
import psutil
from pyspark.sql import functions as sf, types as st
from pyspark import SparkConf
from pyspark.sql import SparkSession

import settings as s

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Refuse to run on anything but the exact interpreter build this notebook was
# tuned on (major, minor, micro must all match).
if sys.version_info[:3] != (3, 11, 7):
    raise EnvironmentError("Only runs efficiently on Python 3.11.7 | conda 24.1.2 | Apple M3 Pro")

In [3]:
# Spark settings: fetch the GraphFrames package and size the JVM memory.
config = [
    ("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.13"),
    ("spark.driver.memory", "8g"),
    # NOTE(review): "spark.worker.memory" does not appear to be a documented
    # Spark property (executor memory is "spark.executor.memory") - confirm
    # that this entry has the intended effect.
    ("spark.worker.memory", "8g"),
]
spark_conf = SparkConf().setAll(config)
spark = (
    SparkSession.builder
    .appName("testing")
    .config(conf=spark_conf)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/anaconda3/envs/redirect/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/haseeb.tariq/.ivy2/cache
The jars for the packages stored in: /Users/haseeb.tariq/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2790df24-b6ec-487d-aec9-b815efa387af;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.5-s_2.13 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 63ms :: artifacts dl 2ms
	:: modules in use:
	graphframes#graphframes;0.8.3-spark3.5-s_2.13 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------

In [4]:
# Wall-clock start of the run; the last cell subtracts it from the current
# time to report the total duration.
start_script = time.time()

In [5]:
# Load the staged transactions and drop self-transfers (source == target).
data = (
    spark.read.parquet(s.STAGED_DATA_LOCATION)
    .where(sf.col("source") != sf.col("target"))
)

In [6]:
def aggregate_edges(data_input):
    """Collapse transactions into weighted, directed edges between accounts.

    The amounts are summed per ``(source, target)`` pair in Spark and the
    result is collected to pandas. Each edge then gets the weight

        source_amount / (total sent by its source)
        + target_amount / (total received by its target)

    After the weights are computed, node identifiers are cut down to their
    first 8 characters and edges that became self-loops through that
    truncation are removed. Edges that end up with the same truncated
    ``(source, target)`` pair are NOT merged again.

    Args:
        data_input: Spark DataFrame with the columns ``source``, ``target``,
            ``source_amount`` and ``target_amount``.

    Returns:
        pandas DataFrame with the columns ``source``, ``target`` and
        ``weight`` and a fresh 0..n-1 index.
    """
    data_aggregated = (
        data_input.groupby(["source", "target"])
        .agg(
            sf.sum("source_amount").alias("source_amount"),
            sf.sum("target_amount").alias("target_amount"),
        )
        .toPandas()
    )

    # Per-node totals broadcast back onto the edges. `transform` keeps this
    # vectorised instead of doing a Python-level lookup for every row.
    total_sent_by_source = data_aggregated.groupby("source")["source_amount"].transform("sum")
    total_received_by_target = data_aggregated.groupby("target")["target_amount"].transform("sum")

    data_aggregated["weight"] = (
        (data_aggregated["source_amount"] / total_sent_by_source) +
        (data_aggregated["target_amount"] / total_received_by_target)
    )

    # Truncate the identifiers only after the totals were computed on the
    # full identifiers, then drop the self-loops the truncation introduced.
    data_aggregated["source"] = data_aggregated["source"].str.slice(0, 8)
    data_aggregated["target"] = data_aggregated["target"].str.slice(0, 8)
    filter_self = data_aggregated["source"] != data_aggregated["target"]
    return data_aggregated.loc[
        filter_self, ["source", "target", "weight"]
    ].reset_index(drop=True)

In [7]:
def get_reversed_graph(edges_data):
    """Return a directed igraph graph in which every edge of ``edges_data`` is flipped.

    ``edges_data`` must provide the columns ``source``, ``target`` and
    ``weight``; it is not modified. In the resulting graph the original
    targets act as sources and vice versa, with the weights unchanged.
    """
    flipped = edges_data.loc[:, ["target", "source", "weight"]].copy(deep=True)
    # Relabel positionally: the first column (old targets) becomes "source".
    flipped.columns = ["source", "target", "weight"]
    return ig.Graph.DataFrame(flipped, use_vids=False, directed=True)

In [8]:
%%time

# Aggregate the transactions into weighted edges and build the directed graph.
edges = aggregate_edges(data)
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
# Vertex names, in vertex order.
nodes = graph.vs["name"]

                                                                                

CPU times: user 56.5 s, sys: 1.18 s, total: 57.7 s
Wall time: 1min 35s


In [9]:
def get_processes(ids):
    """Return the running processes whose command line contains any of ``ids``.

    Args:
        ids: set of strings; a process matches when at least one of them
            appears as a separate argument on its command line.

    Returns:
        list of ``psutil.Process`` objects that matched. Processes whose
        command line cannot be read are skipped.
    """
    processes = []
    for process in psutil.process_iter():
        try:
            cmdline = process.cmdline()
        except (psutil.Error, OSError):
            # The process vanished, is a zombie, or may not be inspected by
            # this user: it cannot be matched, so skip it. Anything else is
            # a genuine error and is allowed to surface.
            continue
        if ids.intersection(cmdline):
            processes.append(process)
    return processes

In [10]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory. `exist_ok=True` covers the case in
# which the best-effort removal above left the directory in place.
shutil.rmtree("staging", ignore_errors=True)
os.makedirs("staging", exist_ok=True)
chunks = np.array_split(nodes, NUMBER_OF_PROCESSES)

# Hand the graph and the node chunks over to the workers through pickle files.
# NOTE(review): presumably communities.py reads these two files and writes its
# results into ./staging - confirm against that script.
filename = "graph.pickle"
with open(filename, "wb") as f:
    pickle.dump(graph, f, protocol=pickle.HIGHEST_PROTOCOL)

filename = "nodes.pickle"
with open(filename, "wb") as f:
    pickle.dump(chunks, f, protocol=pickle.HIGHEST_PROTOCOL)

# One worker per chunk. Each receives its chunk number and a unique id on the
# command line; the id is what `get_processes` looks for.
process_ids = set()
process_name = "communities.py"
workers = []
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    # Argument list instead of a shell string: no quoting problems when the
    # interpreter path contains spaces, and a handle to wait on.
    workers.append(
        subprocess.Popen([sys.executable, process_name, str(chunk_number), process_id])
    )

for worker in workers:
    worker.wait()

failed_workers = [worker.args for worker in workers if worker.returncode != 0]
if failed_workers:
    # Results of these chunks may be missing from ./staging.
    print(f"WARNING: {len(failed_workers)} worker(s) exited with an error: {failed_workers}")

# Also wait for any other process that still carries one of the ids.
while get_processes(process_ids):
    time.sleep(5)

0 0.58
100000 440.26
0 0.58
100000 744.03
0 0.66
100000 860.47
0 0.61
100000 873.43
0 0.57
100000 981.73
0 0.55
100000 980.23
0 0.57
100000 965.89
0 0.58
100000 964.95
0 0.59
100000 1052.5
0 0.77
100000 1246.18
CPU times: user 4.43 s, sys: 7.95 s, total: 12.4 s
Wall time: 30min 22s


In [11]:
# Clean-up: kill every process that still carries one of the worker ids.
for proc in get_processes(process_ids):
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        # The process ended between the lookup and the kill; nothing to do.
        pass

In [12]:
# Merge the per-worker result files into one list.
# `glob` returns files in an unspecified order; sorting makes the order of
# `communities` (and every index derived from it) identical between runs.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this pipeline.
communities = []
for filename in sorted(glob("./staging/*.pickle")):
    with open(filename, "rb") as f:
        communities.extend(pickle.load(f))

# Persist the merged records as loaded, before they are reduced below.
filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities, f)
# Keep only element 1 of every record.
# NOTE(review): presumably records are (node, members) pairs - confirm in communities.py.
communities = [x[1] for x in communities]

In [13]:
# Community sizes and their summary: (mean, median, max, total); the first
# three are rounded to whole numbers.
sizes = list(map(len, communities))
# Unrounded values from a recorded run:
# (np.float64(66.50524665198381), np.float64(56.0), np.int64(150), 126_643_414)
(
    round(np.mean(sizes)),
    round(np.median(sizes)),
    round(np.max(sizes)),
    sum(sizes),
)

(67, 56, 150, 126643414)

In [14]:
# Baseline to compare the communities against: for every vertex, the number of
# vertices within 2 hops in either direction (mindist=0 counts the vertex itself).
sizes_original = graph.neighborhood_size(None, order=2, mode="all", mindist=0)

In [15]:
# How much smaller the communities are, in total, than the 2-hop neighbourhoods.
retained_share = sum(sizes) / sum(sizes_original)
reduction = round(100 - (retained_share * 100), 2)
print(f"Reduction of {reduction}%")

Reduction of 97.59%


In [16]:
# Reference flows and their per-flow statistics, read from the working
# directory. Later cells use `id`, `source`, `target`, `source_currency` and
# `source_amount` from `flows`, and `id`, `type`, `sub_type`,
# `number_components` and `max_days_diff` from `flow_stats`.
flows = pd.read_parquet("flows.parquet")
flow_stats = pd.read_parquet("flow_stats.parquet")

In [17]:
%%time

# Inverted index: node -> positions (in `communities`) of every community
# that contains the node.
search_hash = defaultdict(list)
for index, community in enumerate(communities):
    for node in community:
        search_hash[node].append(index)

CPU times: user 51.8 s, sys: 312 ms, total: 52.1 s
Wall time: 52.2 s


In [18]:
%%time

def _best_community_match(flow_nodes, communities, search_hash):
    """Find the community that covers the largest share of one flow.

    Only communities that contain at least one node of the flow are examined;
    they are located through ``search_hash`` (node -> indices into
    ``communities``). The search stops at the first community that contains
    the whole flow.

    Args:
        flow_nodes: set with the nodes of the flow.
        communities: list of node collections.
        search_hash: mapping from node to the indices of its communities.

    Returns:
        ``(node, share)``: ``share`` is the fraction of ``flow_nodes`` present
        in the best community and ``node`` is the flow node through which that
        community was reached (the first one found in case of ties).
        ``(None, 0.0)`` when no node of the flow belongs to any community.
    """
    size = len(flow_nodes)
    best_node, best_share = None, 0.0
    for node in flow_nodes:
        # `.get` rather than `[...]`: looking up an unknown node must not
        # insert an empty entry into the defaultdict.
        for i in search_hash.get(node, ()):
            share = len(set(communities[i]).intersection(flow_nodes)) / size
            if best_node is None or share > best_share:
                best_node, best_share = node, share
            if share == 1:
                # The whole flow sits in one community; nothing can beat that.
                return best_node, best_share
    return best_node, best_share


# Score every flow by the best coverage any single community achieves.
percentages = []
start = time.time()
for index, (group, grouped) in enumerate(flows.groupby("id")):
    flow_nodes = set(grouped["source"]).union(grouped["target"])
    matched_node_comm, perc = _best_community_match(flow_nodes, communities, search_hash)
    # First statistics row recorded for this flow id.
    stats = flow_stats.loc[flow_stats["id"] == group, :].iloc[0].to_dict()
    stats["score"] = perc
    stats["matched_node_comm"] = matched_node_comm
    percentages.append(dict(stats))
    if not (index % 2_000):
        # Progress: flow counter and seconds spent on the last batch.
        print(index, round(time.time() - start))
        start = time.time()

percentages = pd.DataFrame(percentages)

0 0
2000 3
4000 3
6000 3
8000 3
10000 3
12000 3
14000 3
16000 3


In [19]:
# Mean best-match score over all flows, as a whole-number percentage.
# Scale first and round afterwards: rounding to 2 decimals and then
# multiplying by 100 can leave float noise such as 56.99999999999999.
round(percentages["score"].mean() * 100, 0)

np.float64(83.0)

In [20]:
# Percentage of flows that are fully contained in one community (score == 1).
# Scale first and round afterwards to avoid float noise in the displayed value.
round(100 * percentages[percentages["score"] == 1].shape[0] / percentages.shape[0], 0)

75.0

In [None]:
# Restrict the scores to the listed flow types that form a single component,
# then show the mean score per type.
patterns = ["bipartite", "stack", "fan-in", "fan-out", "gather-scatter", "scatter-gather"]
filter_ = percentages["type"].isin(patterns) & percentages["number_components"].eq(1)
percentages_scope = percentages.loc[filter_, :].reset_index(drop=True)
percentages_scope.groupby("type")[["score"]].mean()

In [None]:
# Fraction (0..1) of the in-scope flows that are fully contained in one community.
percentages_scope[percentages_scope["score"] == 1].shape[0] / percentages_scope.shape[0]

In [21]:
# Single-component flows that no community covers completely.
is_partial_match = percentages["score"].lt(1)
is_single_component = percentages["number_components"].eq(1)
filter_remaining = is_partial_match & is_single_component
remaining = percentages.loc[filter_remaining, :].reset_index(drop=True)
remaining_flow_ids = remaining["id"].tolist()

In [23]:
# Largest `max_days_diff` and the distinct sub-types among the remaining flows.
(
    remaining["max_days_diff"].max(),
    sorted(remaining["sub_type"].unique()),
)

(np.int64(29),
 [' max 10 hops',
  ' max 11 hops',
  ' max 12 hops',
  ' max 13 hops',
  ' max 4 hops',
  ' max 5 hops',
  ' max 6 hops',
  ' max 7 hops',
  ' max 8 hops',
  ' max 9 hops'])

In [28]:
# For every remaining flow, record its smallest source amount per source
# currency: currency -> list with one minimum per flow.
currencies_mins = defaultdict(list)
for remaining_flow_id in remaining_flow_ids:
    remaining_flow = flows.loc[flows["id"] == remaining_flow_id, :]
    smallest_per_currency = (
        remaining_flow.groupby("source_currency")["source_amount"].min().to_dict()
    )
    for c, a in smallest_per_currency.items():
        currencies_mins[c].append(a)

In [55]:
# Smallest source amount seen per currency across all remaining flows.
observed_temporal_currency_limits = {c: min(a) for c, a in currencies_mins.items()}
# Per-currency amount thresholds; the next cell keeps only transactions with
# `source_amount >= limit`.
temporal_currency_limits = {
    "btc": 1,
    "eur": 100,
    "usd": 100,
    "gbp": 100,
    "cad": 100,
    "aud": 100,
    "chf": 100,
    "sar": 1_000,
    "ils": 1_000,
    "cny": 1_000,
    "rub": 5_000,
    "brl": 5_000,
    "jpy": 10_000,
    "mxn": 10_000,
    "inr": 10_000,
}
# Every threshold must lie strictly below the smallest amount observed in the
# remaining flows, so that the amount filter cannot remove their transactions.
for k, v in observed_temporal_currency_limits.items():
    assert k in temporal_currency_limits, (
        f"No temporal limit configured for currency {k!r}"
    )
    assert v > temporal_currency_limits[k], (
        f"Limit {temporal_currency_limits[k]} for currency {k!r} is not below "
        f"the smallest amount observed in the remaining flows ({v})"
    )

In [73]:
%%time

# Keep same-currency transactions whose source amount reaches the limit of
# that currency. The per-currency conditions are OR-ed into one predicate so
# that the selection is a single filter over `data` rather than a union of
# one filtered copy of `data` per currency; the selected rows are the same
# because a transaction has exactly one source currency.
in_scope = sf.lit(False)
for currency, limit in temporal_currency_limits.items():
    in_scope = in_scope | (
        (sf.col("source_currency") == currency) &
        (sf.col("target_currency") == currency) &
        (sf.col("source_amount") >= limit)
    )
data_filtered = data.where(in_scope).repartition(128, "transaction_id")
# Materialise the result and continue from the written files instead of the
# lazy plan.
data_filtered.write.parquet(s.STAGED_FILTERED_DATA_LOCATION, mode="overwrite")
data_filtered = spark.read.parquet(s.STAGED_FILTERED_DATA_LOCATION)

                                                                                

CPU times: user 406 ms, sys: 153 ms, total: 559 ms
Wall time: 2min 36s


In [74]:
# Read the staged filtered dataset (same location the previous cell wrote to).
data_filtered = spark.read.parquet(s.STAGED_FILTERED_DATA_LOCATION)

In [75]:
# Number of transactions that passed the currency/amount filter.
data_filtered.count()

131221439

In [76]:
# Preview the first rows of the filtered dataset.
data_filtered.show()

+--------------------+-------------------+-------------+-------------+-----------+-----------+---------------+---------------+------------------+------------------+-------+-------------+
|      transaction_id|          timestamp|       source|       target|source_bank|target_bank|source_currency|target_currency|     source_amount|     target_amount| format|is_laundering|
+--------------------+-------------------+-------------+-------------+-----------+-----------+---------------+---------------+------------------+------------------+-------+-------------+
|id--2603031962711...|2022-08-05 21:30:00|84DD24301-btc|851C01971-btc|   01209169|   00212492|            btc|            btc|10.467229843139648|10.467229843139648|Bitcoin|            0|
|id-71574820641461...|2022-08-16 22:45:00|84EEC3C71-btc|85014FE61-btc|       0017|   00214266|            btc|            btc|1.3586139678955078|1.3586139678955078|Bitcoin|            0|
|id--4246324583813...|2022-08-18 05:48:00|84DABCAC1-btc|84DABC9D1

In [39]:
# Total wall-clock time since `start_script` was set, in whole seconds,
# printed as H:MM:SS.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 0:35:24
