In [1]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score, recall_score

import settings as s
from common import get_processes

In [2]:
# Spark settings for this notebook, kept as (key, value) pairs.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
session_builder = SparkSession.builder.appName("testing")
session_builder = session_builder.config(conf=SparkConf().setAll(config))
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/06 14:54:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock start of the run; the final cell prints the elapsed minutes from it.
start = time.time()

In [4]:
# Number of days added before / after a date when building its windows
# (it is passed to timedelta() as the day count).
WINDOW_SIZE = 14
# Fractions of the time-ordered transactions used for each split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Compare with a tolerance: float sums such as 0.7 + 0.2 + 0.1 are not exactly 1,
# so an exact `== 1` check would reject valid splits.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC, VALIDATION_PERC and TEST_PERC must sum to 1"
)

In [5]:
# Load the staged transactions from the location configured in the settings module.
data = spark.read.parquet(s.STAGED_DATA_LOCATION)

In [6]:
# The last few days only contain incomplete data.
# Keep the latest day whose transaction count is above 10% of the mean daily
# count, then take the latest timestamp seen on that day.
daily_counts = data.groupby(sf.to_date("timestamp").alias("date")).count().toPandas()
trx_count_per_day = daily_counts.sort_values("date").set_index("date")
mean_per_day = trx_count_per_day["count"].mean()
mean_per_day_ratio = trx_count_per_day["count"] / mean_per_day
last_complete_date = max(mean_per_day_ratio[mean_per_day_ratio > 0.1].index)
latest_timestamp_row = (
    data.where(sf.to_date("timestamp") == last_complete_date)
    .select(sf.max("timestamp").alias("x"))
    .collect()[0]
)
complete_data_present_till = latest_timestamp_row["x"]
print(complete_data_present_till)

                                                                                

2022-09-10 23:59:00


In [7]:
# Transaction ids ordered by timestamp, collected to the driver through pandas.
ids_by_time = data.sort("timestamp").select("transaction_id").toPandas()
trx_ids_sorted = ids_by_time["transaction_id"].values
trx_count = len(trx_ids_sorted)

                                                                                

In [8]:
def _ids_to_spark(transaction_ids):
    """Wrap transaction ids in a cached, single-partition Spark DataFrame."""
    ids_frame = pd.DataFrame(transaction_ids, columns=["transaction_id"])
    return spark.createDataFrame(ids_frame).repartition(1).cache()


# Chronological split: the first TRAIN_PERC of the time-ordered ids is train,
# the next VALIDATION_PERC is validation, and whatever remains is test.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_ids = trx_ids_sorted[:last_train_index]
validation_ids = trx_ids_sorted[last_train_index:last_validation_index]
test_ids = trx_ids_sorted[last_validation_index:]

# count() is printed for each split in turn.
train_indexes = _ids_to_spark(train_ids)
print(train_indexes.count())
validation_indexes = _ids_to_spark(validation_ids)
print(validation_indexes.count())
test_indexes = _ids_to_spark(test_ids)
print(test_indexes.count())

25/06/06 14:55:01 WARN TaskSetManager: Stage 11 contains a task of very large size (1645 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

3043615
1014538
1014540


In [9]:
def _with_transaction_columns(id_frame):
    """Left-join the full transaction columns onto a frame of transaction ids."""
    return id_frame.join(data, on="transaction_id", how="left")


train = _with_transaction_columns(train_indexes)
validation = _with_transaction_columns(validation_indexes)
test = _with_transaction_columns(test_indexes)
train_validation = train.union(validation)

In [10]:
def get_pandas(df):
    """Convert a Spark DataFrame to pandas through a parquet round trip.

    The frame is written to ``temp.parquet`` (overwriting whatever is there),
    read back with pandas, and its ``timestamp`` column is shifted forward by
    two hours before the pandas frame is returned.
    """
    staging_path = "temp.parquet"
    df.write.parquet(staging_path, mode="overwrite")
    frame = pd.read_parquet(staging_path)
    # Because of tz discrepancy
    frame.loc[:, "timestamp"] += timedelta(hours=2)
    return frame

In [11]:
def get_windowed_datasets(data_dates, data_input, window_size=None):
    """Yield ``(left, pov, right)`` pandas frames for every calendar date in ``data_dates``.

    For each date, in ascending order:

    * ``left``  - rows of ``data_input`` from ``window_size`` days before the
      start of the date up to the end of the date;
    * ``pov``   - rows of ``data_input`` that fall on the date itself;
    * ``right`` - rows of ``data_input`` from the start of the date up to
      ``window_size`` days after its end.

    Args:
        data_dates: Spark DataFrame whose ``timestamp`` column supplies the dates.
        data_input: Spark DataFrame the windows are cut from.
        window_size: window extension in days; defaults to ``WINDOW_SIZE``.

    Yields:
        Tuples of three pandas DataFrames ``(left, pov, right)``.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    span = timedelta(days=window_size)

    def _between(start, end):
        # Rows of data_input with start <= timestamp <= end, as pandas.
        return get_pandas(
            data_input.where(
                (data_input["timestamp"] >= start) & (data_input["timestamp"] <= end)
            )
        )

    # Collect distinct calendar dates rather than distinct timestamps: every
    # timestamp of a day maps to the same windows, so iterating over raw
    # timestamps rebuilt (and re-yielded) identical windows many times per day.
    date_rows = (
        data_dates.select(sf.to_date("timestamp").alias("date")).distinct().collect()
    )
    for date_trx in sorted(row["date"] for row in date_rows if row["date"] is not None):
        datetime_trx_start = datetime.combine(date_trx, datetime.min.time())
        datetime_trx_end = datetime.combine(date_trx, datetime.max.time())
        # Same materialisation order as before: left, right, then pov.
        left = _between(datetime_trx_start - span, datetime_trx_end)
        right = _between(datetime_trx_start, datetime_trx_end + span)
        pov = _between(datetime_trx_start, datetime_trx_end)
        yield left, pov, right

In [12]:
%%time

# Only the first date's windows are used in this notebook. Fetch them
# explicitly instead of relying on `for ... break`, which silently left the
# names undefined when the generator yielded nothing.
first_windows = next(get_windowed_datasets(train, train), None)
if first_windows is None:
    raise ValueError("get_windowed_datasets(train, train) yielded no windows")
left_df, pov_df, right_df = first_windows
# The look-back (left) window is the one analysed below; the same object is
# kept so that later in-place edits behave as before.
in_scope_window = left_df

                                                                                

CPU times: user 781 ms, sys: 115 ms, total: 896 ms
Wall time: 12.7 s


In [13]:
# Whole seconds elapsed since the earliest transaction in the window, plus 1.
# Floor-dividing by one second counts full days as well; `.dt.seconds` only
# returns the seconds component (0-86399) and therefore wrapped around for
# transactions more than a day after the window start.
window_start = in_scope_window["timestamp"].min()
in_scope_window.loc[:, "window_delta"] = (
    (in_scope_window["timestamp"] - window_start) // pd.Timedelta(seconds=1)
) + 1

In [14]:
# Because of data quality issue: cap the window end at the last timestamp for
# which complete data is present.
window_end = min([in_scope_window["timestamp"].max(), complete_data_present_till])
in_scope_window_span = window_end - in_scope_window["timestamp"].min()
# total_seconds() covers the whole span. `.seconds` dropped full days and,
# for a negative span, returned a large positive value that bypassed the
# lower bound of 1 applied here.
in_scope_window_size = max(int(in_scope_window_span.total_seconds()), 1)

In [15]:
# Show the window length (in seconds, minimum 1) computed in the previous cell.
in_scope_window_size

86340

In [16]:
# One row per (source, target) pair in the point-of-view frame; the pair's
# is_laundering is the maximum over its transactions.
pov_pair_groups = pov_df.groupby(["source", "target"])
in_scope_edges = pov_pair_groups.agg(is_laundering=("is_laundering", "max")).reset_index()
# Every account that appears as either endpoint of such a pair.
in_scope_nodes = set(in_scope_edges["source"]) | set(in_scope_edges["target"])

In [17]:
# Aggregate the window per (source, target) pair: summed amount and the
# maximum is_laundering flag. Each pair then gets an id of the form "i-<row>".
nodes = (
    in_scope_window.groupby(["source", "target"])
    .agg(amount=("amount", "sum"), is_laundering=("is_laundering", "max"))
    .reset_index()
)
nodes = nodes.assign(id=[f"i-{position}" for position in nodes.index])
nodes = nodes.loc[:, ["id", "source", "target", "amount", "is_laundering"]]

In [18]:
# First-order graph: accounts as vertices, aggregated amounts as edge weights.
first_order_edges = nodes.loc[:, ["source", "target", "amount"]].rename(
    columns={"amount": "weight"}
)
graph_fo = ig.Graph.DataFrame(first_order_edges, use_vids=False, directed=True)
# Keep only the in-scope accounts that are present in the graph, in random order.
graph_fo_names = [vertex["name"] for vertex in graph_fo.vs()]
nodes_fo = list(in_scope_nodes.intersection(graph_fo_names))
random.shuffle(nodes_fo)

In [19]:
# Second-order edges: pair A->B with B->C by joining on the shared account
# (target of the left pair equals source of the right pair).
left = nodes.set_index("target")
right = nodes.set_index("source")
chained = left.join(right, how="inner", lsuffix="_left")
# Drop rows where a pair was joined with itself.
is_distinct_pair = chained["id_left"] != chained["id"]
edges = (
    chained.loc[is_distinct_pair, ["id_left", "id", "amount_left", "amount"]]
    .reset_index(drop=True)
    .rename(columns={"id_left": "source", "id": "target"})
)
# Weight is 1 when both amounts are equal and decreases as they diverge.
amount_gap = (edges["amount_left"] - edges["amount"]).abs()
edges["weight"] = 1 - amount_gap / (edges["amount_left"] + edges["amount"])

In [20]:
# Second-order graph: the (source, target) pairs from `nodes` are the vertices.
graph_so = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
# Ids of the pairs that also occur among the point-of-view (in-scope) pairs.
pair_to_id = nodes[["id", "source", "target"]].set_index(["source", "target"])
in_scope_pairs = in_scope_edges[["source", "target"]].set_index(["source", "target"])
in_scope_ids = pair_to_id.join(in_scope_pairs, how="inner")["id"].tolist()
graph_so_names = set([vertex["name"] for vertex in graph_so.vs()])
nodes_so = list(graph_so_names.intersection(in_scope_ids))
random.shuffle(nodes_so)

In [21]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory.
shutil.rmtree("staging", ignore_errors=True)
os.mkdir("staging")
chunks = np.array_split(nodes_fo, NUMBER_OF_PROCESSES)

# Write the first-order graph and the node chunks to pickle files
# (presumably read by communities.py - verify against that script).
for filename, payload in (("graph.pickle", graph_fo), ("nodes.pickle", chunks)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

process_name = "communities.py"
process_ids = set()
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    # The trailing "&" backgrounds the worker so the chunks run concurrently.
    os.system(f"{sys.executable} {process_name} {chunk_number} {process_id} &")

# Poll until get_processes reports nothing for these ids any more.
while get_processes(process_ids):
    time.sleep(5)

Done -> 0
Done -> 6
Done -> 3
Done -> 7
Done -> 8
Done -> 1
Done -> 2
Done -> 9
Done -> 4
Done -> 5
CPU times: user 292 ms, sys: 234 ms, total: 525 ms
Wall time: 1min


In [22]:
%%time

# Kill any worker that get_processes still reports for these ids.
# NOTE(review): `psutil` is not imported in this notebook's import cell, so the
# except clause below would raise NameError if kill() ever raised - confirm
# and add the import.
for proc in get_processes(process_ids):
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        pass

# Concatenate the contents of every pickle found in the staging directory.
communities_fo = []
for result_path in glob("./staging/*.pickle"):
    with open(result_path, "rb") as f:
        communities_fo.extend(pickle.load(f))

original_size = len(communities_fo)

filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities_fo, f)

# Displayed summary: rounded mean size, largest size, total number of members.
sizes = [len(entry[1]) for entry in communities_fo]
round(np.mean(sizes)), round(np.max(sizes)), sum(sizes)

CPU times: user 1.7 s, sys: 79.7 ms, total: 1.78 s
Wall time: 1.8 s


(3, 35, 1121276)

In [23]:
%%time

NUMBER_OF_PROCESSES = 10

# Start from an empty staging directory.
shutil.rmtree("staging", ignore_errors=True)
os.mkdir("staging")
chunks = np.array_split(nodes_so, NUMBER_OF_PROCESSES)

# Write the second-order graph and the node chunks to pickle files
# (presumably read by communities.py - verify against that script).
for filename, payload in (("graph.pickle", graph_so), ("nodes.pickle", chunks)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

process_name = "communities.py"
process_ids = set()
for chunk_number in range(NUMBER_OF_PROCESSES):
    process_id = str(uuid.uuid4())
    process_ids.add(process_id)
    # The trailing "&" backgrounds the worker so the chunks run concurrently.
    os.system(f"{sys.executable} {process_name} {chunk_number} {process_id} &")

# Poll until get_processes reports nothing for these ids any more.
while get_processes(process_ids):
    time.sleep(5)

Done -> 2
Done -> 5
Done -> 3
Done -> 1
Done -> 4
Done -> 8
Done -> 0
Done -> 6
Done -> 9
Done -> 7
CPU times: user 410 ms, sys: 263 ms, total: 673 ms
Wall time: 1min 15s


In [24]:
%%time

# Kill any worker that get_processes still reports for these ids.
# NOTE(review): `psutil` is not imported in this notebook's import cell, so the
# except clause below would raise NameError if kill() ever raised - confirm
# and add the import.
for proc in get_processes(process_ids):
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        pass

# Concatenate the contents of every pickle found in the staging directory.
communities_so = []
for result_path in glob("./staging/*.pickle"):
    with open(result_path, "rb") as f:
        communities_so.extend(pickle.load(f))

original_size = len(communities_so)

filename = "communities.pickle"
with open(filename, "wb") as f:
    pickle.dump(communities_so, f)

# Displayed summary: rounded mean size, largest size, total number of members.
sizes = [len(entry[1]) for entry in communities_so]
round(np.mean(sizes)), round(np.max(sizes)), sum(sizes)

CPU times: user 3.45 s, sys: 130 ms, total: 3.58 s
Wall time: 3.6 s


(6, 42, 3267954)

In [25]:
# Key every transaction by its direction-agnostic account pair,
# "<smaller>-<larger>", and use that key as the index for the joins below.
# Built in a single pass over the two columns instead of two row-wise
# `apply` calls that first stored intermediate tuples in the frame.
in_scope_window["edge"] = [
    "{}-{}".format(*sorted(pair))
    for pair in zip(in_scope_window["source"], in_scope_window["target"])
]
in_scope_window.set_index("edge", inplace=True)

In [26]:
# Directory that receives the first-order transaction/community parquet parts.
location_trx_comm_fo = "transactions_communities_fo"

In [27]:
%%time

# Recreate the output directory so parts from an earlier run cannot linger
# (the original did this twice in a row; once is enough).
shutil.rmtree(location_trx_comm_fo, ignore_errors=True)
os.mkdir(location_trx_comm_fo)

communities_fo = dict(communities_fo)
communities_keys = list(communities_fo)

# Process the communities in chunks of roughly 50,000 keys.
number_of_chunks = int(np.ceil(len(communities_keys) / 50_000))
# np.array_split rejects 0 sections, so with no communities no parts are written.
chunks = np.array_split(communities_keys, number_of_chunks) if number_of_chunks else []
for index, chunk in enumerate(chunks):
    comm_inner = []
    for key in chunk:
        # Every unordered pair of community members, keyed as "<smaller>-<larger>"
        # to match the index of in_scope_window.
        for pair in combinations(communities_fo[key], 2):
            first, second = sorted(pair)
            comm_inner.append([key, f"{first}-{second}"])
    edge_combinations = pd.DataFrame(comm_inner, columns=["id", "edge"]).set_index("edge")
    # Keep only the pairs that have transactions in the window.
    edge_combinations.join(in_scope_window, how="inner").reset_index(drop=True).to_parquet(
        f"{location_trx_comm_fo}/part-{index}.parquet"
    )
    if not (index % 5):
        print(index, len(chunks))

0 9
5 9
CPU times: user 6.52 s, sys: 124 ms, total: 6.64 s
Wall time: 6.65 s


In [28]:
# trx_communities_fo = pd.read_parquet(location_trx_comm_fo)

In [29]:
# Map each pair id ("i-<n>") back to its (source, target) tuple.
account_pairs = list(zip(nodes["source"], nodes["target"]))
nodes["source_target"] = pd.Series(account_pairs, index=nodes.index)
edges_mapping = dict(zip(nodes["id"], nodes["source_target"]))

In [30]:
# Directory that receives the second-order transaction/community parquet parts.
location_trx_comm_so = "transactions_communities_so"

In [31]:
%%time

# Recreate the output directory so parts from an earlier run cannot linger
# (the original did this twice in a row; once is enough).
shutil.rmtree(location_trx_comm_so, ignore_errors=True)
os.mkdir(location_trx_comm_so)

communities_so = dict(communities_so)
# Translate pair ids back to accounts: each key becomes its (source, target)
# tuple and each member list is flattened into the accounts of its pairs.
# NOTE: this overwrites communities_so, so re-running this cell alone fails on
# the edges_mapping lookup; re-run the cell that loads communities_so first.
communities_so = {
    edges_mapping[k]: [account for member in v for account in edges_mapping[member]]
    for k, v in communities_so.items()
}
communities_keys = list(communities_so)

# Process the communities in chunks of roughly 50,000 keys.
number_of_chunks = int(np.ceil(len(communities_keys) / 50_000))
# np.array_split rejects 0 sections, so with no communities no parts are written.
chunks = np.array_split(communities_keys, number_of_chunks) if number_of_chunks else []
for index, chunk in enumerate(chunks):
    edge_labels = []
    for key in chunk:
        # array_split returns the tuple keys as array rows; make them hashable again.
        key = tuple(key)
        for pair in combinations(communities_so[key], 2):
            first, second = sorted(pair)
            edge_labels.append(f"{first}-{second}")
    # The written frame is indexed by the edge label and repeats it in "id";
    # the community key itself was never part of the output.
    edge_combinations = pd.DataFrame({"edge": edge_labels}).set_index("edge")
    edge_combinations["id"] = edge_combinations.index
    edge_combinations.join(in_scope_window, how="inner").to_parquet(
        f"{location_trx_comm_so}/part-{index}.parquet"
    )
    if not (index % 5):
        print(index, len(chunks))

0 11
5 11
10 11
CPU times: user 1min 49s, sys: 5.53 s, total: 1min 54s
Wall time: 1min 54s


In [32]:
# trx_communities_so = pd.read_parquet(location_trx_comm_so)

In [33]:
# Output directory passed to features.py for the first-order parts.
location_features_fo = "features_fo"

In [34]:
%%time

NUMBER_OF_PROCESSES = 10

# Parquet parts ordered by their numeric suffix ("part-<n>.parquet").
parts = sorted(
    glob(f"{location_trx_comm_fo}/*.parquet"),
    key=lambda x: int(x.split("-")[-1].split(".")[0]),
)

shutil.rmtree(location_features_fo, ignore_errors=True)
os.mkdir(location_features_fo)

process_ids = set()
process_name = "features.py"
while parts:
    if len(get_processes(process_ids)) < NUMBER_OF_PROCESSES:
        process_id = str(uuid.uuid4())
        process_ids.add(process_id)
        # pop() takes parts from the end of the list; "&" backgrounds the worker.
        os.system(
            f"{sys.executable} {process_name} {parts.pop()} {location_features_fo} {process_id} &"
        )
    else:
        # All worker slots are busy: wait before polling again instead of
        # spinning in a tight loop.
        time.sleep(1)

# Poll until get_processes reports nothing for these ids any more.
while get_processes(process_ids):
    time.sleep(5)

CPU times: user 171 ms, sys: 670 ms, total: 842 ms
Wall time: 3min 27s


In [35]:
# Output directory passed to features.py for the second-order parts.
location_features_so = "features_so"

In [None]:
%%time

NUMBER_OF_PROCESSES = 10

# Parquet parts ordered by their numeric suffix ("part-<n>.parquet").
parts = sorted(
    glob(f"{location_trx_comm_so}/*.parquet"),
    key=lambda x: int(x.split("-")[-1].split(".")[0]),
)

shutil.rmtree(location_features_so, ignore_errors=True)
os.mkdir(location_features_so)

process_ids = set()
process_name = "features.py"
while parts:
    if len(get_processes(process_ids)) < NUMBER_OF_PROCESSES:
        process_id = str(uuid.uuid4())
        process_ids.add(process_id)
        # pop() takes parts from the end of the list; "&" backgrounds the worker.
        os.system(
            f"{sys.executable} {process_name} {parts.pop()} {location_features_so} {process_id} &"
        )
    else:
        # All worker slots are busy: wait before polling again instead of
        # spinning in a tight loop.
        time.sleep(1)

# Poll until get_processes reports nothing for these ids any more.
while get_processes(process_ids):
    time.sleep(5)

In [None]:
# Preview the weighted timestamp statistics written by the second-order
# feature workers. Reads from the configured output directory rather than a
# second hard-coded copy of its path.
pd.read_parquet(location_features_so)[["ts_weighted_mean", "ts_weighted_median", "ts_weighted_std"]]

In [None]:
# Whole minutes elapsed since `start` was recorded near the top of the notebook.
print((time.time() - start) // 60)