In [1]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score, recall_score

import settings as s
from common import create_workload_for_multi_proc
from communities import get_communities_multi_proc
from features import get_features_multi_proc, get_pov_features


%load_ext autoreload
%autoreload 2

In [2]:
# Memory-related Spark settings, each fixed at 16g.
# NOTE(review): "spark.worker.memory" is not listed among Spark's documented
# application properties ("spark.executor.memory" is the documented executor
# setting) - confirm which one was intended.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
# Build the configuration object first, then create (or reuse) the session.
spark_conf = SparkConf().setAll(config)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 13:23:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock reference point; a later cell prints the minutes elapsed since this.
start = time.time()

In [4]:
# Number of days (passed to `timedelta`) added on each side of a transaction
# date when the surrounding window is built.
WINDOW_SIZE = 7

# Fractions of the chronologically ordered transactions per split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# NOTE(review): not referenced in the visible cells of this notebook -
# presumably read by the notebooks executed through `%run`; confirm.
NUM_PROCS = 10

# Compare with a tolerance: an exact `== 1` on a float sum rejects valid splits
# such as 0.7 / 0.2 / 0.1, whose float sum is 0.9999999999999999.
assert abs((TRAIN_PERC + VALIDATION_PERC + TEST_PERC) - 1.0) < 1e-9, (
    "TRAIN_PERC, VALIDATION_PERC and TEST_PERC must sum to 1"
)

In [5]:
# Read the staged parquet dataset from the location configured in `settings`.
# Later cells rely on its `transaction_id`, `source`, `target`, `timestamp`
# and `is_laundering` columns.
data = spark.read.parquet(s.STAGED_DATA_LOCATION)

In [6]:
# Undirected view of the edge list: every (source, target) pair is also emitted
# reversed, so counterparties are counted regardless of direction.
reversed_edges = data.select(
    sf.col("target").alias("source"), sf.col("source").alias("target")
)
data_ud = data.select("source", "target").union(reversed_edges)

# Number of distinct counterparties per node, collected into pandas.
top_nodes = (
    data_ud.groupby("source")
    .agg(sf.countDistinct("target").alias("count"))
    .toPandas()
)

# Nodes strictly above the 99.99th percentile of that count are "extreme";
# every other node is "normal".
cutoff_extreme = np.percentile(top_nodes["count"], 99.99)
is_extreme = top_nodes["count"] > cutoff_extreme
nodes_extreme = top_nodes.loc[is_extreme, "source"].tolist()
nodes_normal = list(set(top_nodes["source"].tolist()) - set(nodes_extreme))

print("normal", len(nodes_normal))
print("extreme", len(nodes_extreme))
print("cuttoff", cutoff_extreme)

                                                                                

normal 2025934
extreme 192
cuttoff 76.0


In [7]:
# The last few days only contain incomplete data, so look for the latest day
# whose transaction count is still above 10% of the mean daily count.
trx_count_per_day = (
    data.groupby(sf.to_date("timestamp").alias("date"))
    .count()
    .toPandas()
    .sort_values("date")
    .set_index("date")
)
mean_per_day = np.mean(trx_count_per_day["count"])
mean_per_day_ratio = trx_count_per_day["count"] / mean_per_day
last_complete_day = max(mean_per_day_ratio[mean_per_day_ratio > 0.1].index)

# Latest timestamp recorded on that day.
rows_on_last_complete_day = data.where(sf.to_date("timestamp") == last_complete_day)
complete_data_present_till = (
    rows_on_last_complete_day.select(sf.max("timestamp").alias("x")).collect()[0]["x"]
)
print(complete_data_present_till)

                                                                                

2022-09-16 23:59:00


In [8]:
# Transaction ids in chronological order; the positional train / validation /
# test split is cut from this array.
# `transaction_id` is a secondary sort key so that rows sharing a timestamp
# always come back in the same order. Sorting on `timestamp` alone leaves ties
# in arbitrary order, which lets transactions at a split boundary move between
# splits from one run to the next.
# (assumes `transaction_id` is unique per row - the later joins on it rely on
# the same assumption)
trx_ids_sorted = (
    data.sort("timestamp", "transaction_id")
    .select("transaction_id")
    .toPandas()["transaction_id"]
    .values
)
trx_count = len(trx_ids_sorted)

                                                                                

In [9]:
def _as_cached_id_frame(transaction_ids):
    """Wrap an array of transaction ids in a cached, single-partition Spark DataFrame."""
    ids_pdf = pd.DataFrame(transaction_ids, columns=["transaction_id"])
    return spark.createDataFrame(ids_pdf).repartition(1).cache()


# Positional cut points in the chronologically sorted id array; the test split
# takes whatever remains after train and validation.
train_size = int(np.floor(trx_count * TRAIN_PERC))
validation_size = int(np.floor(trx_count * VALIDATION_PERC))
last_train_index = train_size
last_validation_index = last_train_index + validation_size

# `count()` both reports the split size and materialises the cache.
train_indexes = _as_cached_id_frame(trx_ids_sorted[:last_train_index])
print(train_indexes.count())
validation_indexes = _as_cached_id_frame(
    trx_ids_sorted[last_train_index:last_validation_index]
)
print(validation_indexes.count())
test_indexes = _as_cached_id_frame(trx_ids_sorted[last_validation_index:])
print(test_indexes.count())

25/06/23 13:26:08 WARN TaskSetManager: Stage 17 contains a task of very large size (10591 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

18734115


25/06/23 13:26:39 WARN TaskSetManager: Stage 22 contains a task of very large size (3579 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

6244705


25/06/23 13:27:09 WARN TaskSetManager: Stage 27 contains a task of very large size (3579 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

6244705


In [10]:
# Attach the full transaction rows to each split's id list (left join on
# `transaction_id`, ids on the left).
train, validation, test = (
    split_ids.join(data, on="transaction_id", how="left")
    for split_ids in (train_indexes, validation_indexes, test_indexes)
)
# Train and validation rows stacked together.
train_validation = train.union(validation)

In [11]:
def get_pandas(df):
    """Materialise a Spark DataFrame as pandas through a parquet round trip.

    The frame is written to ``temp.parquet`` (overwriting whatever was there)
    and read back with pandas; every value of the ``timestamp`` column is then
    shifted forward by two hours.

    NOTE(review): the fixed two-hour shift presumably compensates for a
    timezone difference between Spark and the parquet/pandas read - confirm it
    still holds for the environment the notebook runs in.
    """
    scratch_path = "temp.parquet"
    df.write.parquet(scratch_path, mode="overwrite")
    pandas_df = pd.read_parquet(scratch_path)
    # Because of tz discrepancy
    shifted_timestamps = pandas_df.loc[:, "timestamp"] + timedelta(hours=2)
    pandas_df.loc[:, "timestamp"] = shifted_timestamps
    return pandas_df

In [12]:
def get_windowed_datasets(data_dates, data_input):
    """Yield, per calendar day, the day's rows and the rows of its surrounding window.

    The distinct dates of ``data_dates["timestamp"]`` are visited in ascending
    order. For each one a tuple ``(number_of_dates, date, pov, window)`` is
    yielded, where ``pov`` holds the rows of ``data_input`` timestamped within
    that day and ``window`` the rows timestamped between ``WINDOW_SIZE`` days
    before the start of the day and ``WINDOW_SIZE`` days after its end. Both
    frames are pandas DataFrames produced by ``get_pandas``.
    """
    date_rows = data_dates.select(sf.to_date("timestamp").alias("x")).distinct().collect()
    dates = sorted(row["x"] for row in date_rows)
    timestamp_col = data_input["timestamp"]
    for date_trx in dates:
        day_start = datetime.combine(date_trx, datetime.min.time())
        day_end = datetime.combine(date_trx, datetime.max.time())
        window_start = day_start - timedelta(WINDOW_SIZE)
        window_end = day_end + timedelta(WINDOW_SIZE)

        within_day = (timestamp_col >= day_start) & (timestamp_col <= day_end)
        pov = get_pandas(data_input.where(within_day))

        within_window = (timestamp_col >= window_start) & (timestamp_col <= window_end)
        window = get_pandas(data_input.where(within_window))

        yield (len(dates), date_trx, pov, window)

In [13]:
# Root folder for generated feature files: "features/<postfix>", where the
# postfix is `s.OUTPUT_POSTFIX` with any leading "-" characters stripped.
location_main_features = os.path.join("features", s.OUTPUT_POSTFIX.lstrip("-"))

In [14]:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [15]:
# Per-split output folders. Each path keeps its trailing separator because
# later cells build file names by appending "<date>.parquet" directly to it.
location_train = f"{location_main_features}{os.sep}train{os.sep}"
location_validation = f"{location_main_features}{os.sep}validation{os.sep}"
location_test = f"{location_main_features}{os.sep}test{os.sep}"

# `exist_ok=True` accepts an already existing directory while still raising
# if the path exists but is not a directory.
os.makedirs(location_main_features, exist_ok=True)

In [16]:
# %%time

# data_input = train.select("*")
# nodes_source = set(train.select("source").distinct().toPandas()["source"])
# nodes_target = set(train.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

In [17]:
# communities_as_source_features = pd.read_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
# communities_as_target_features = pd.read_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
# communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

In [18]:
# %%time

# try:
#     os.makedirs(location_train)
# except FileExistsError:
#     pass

# st = time.time()
# for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(train, train):
#     %run model_experiment_nested_hm.ipynb
    
#     all_features = all_features.join(
#         communities_as_source_features, how="left", rsuffix="_dispense"
#     ).join(
#         communities_as_target_features, how="left", rsuffix="_sink"
#     ).join(
#         communities_as_passthrough_features, how="left", rsuffix="_passthrough"
#     )
    
#     pov_features_df = []
#     for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
#         ["source", "target"]
#     ):
#         pov_features_df.append(get_pov_features(k, v))
#     pov_features = pd.DataFrame(pov_features_df)
    
#     pov_features = pov_features.set_index("target").join(
#         all_features, how="left", rsuffix="_source"
#     ).reset_index().set_index("source").join(
#         all_features, how="left", rsuffix="_target"
#     ).reset_index()
    
#     pov_features.to_parquet(f"{location_train}{dt_trx}.parquet")
#     print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
#     st = time.time()

In [19]:
# %%time

# grouped = window_df.groupby(["source", "target"]).agg(amount=("amount", "sum")).reset_index()
# grouped_rev = grouped.copy(deep=True).rename(columns={"target": "source", "source": "target"})
# grouped_ud = pd.concat([
#     grouped,
#     grouped_rev
# ], ignore_index=True)
# nodes = set(grouped_ud["target"].unique().tolist()) - set(top_nodes)
# nodes = list(nodes)
# random.shuffle(nodes)
# df_left = grouped_ud.set_index("target")
# df_right = grouped_ud.set_index("source")

In [20]:
# %%time

# comms = []
# for chunk in [[x] for x in top_nodes] + list(np.array_split(nodes, np.ceil(len(nodes) / 100_000))):
#     joined = df_left.loc[chunk, :].join(
#         df_right, how="inner", lsuffix="_left"
#     ).reset_index(drop=True)
#     joined = joined.loc[joined["source"] != joined["target"], :]
#     joined.loc[:, "amount"] = joined[["amount_left", "amount"]].min(axis=1)
#     del joined["amount_left"]
#     joined = joined.groupby(["source", "target"]).agg(amount=("amount", "sum")).reset_index()
#     joined = joined.sort_values("amount", ascending=False).reset_index(drop=True)
#     joined = joined.groupby("source").head(100).reset_index(drop=True)
#     comms.append(joined.groupby("source").agg(nodes=("target", list)))
#     print(len(chunk), comms[-1].shape[0])

In [21]:
# grouped = window_df.groupby(["source", "target"]).agg(sf.sum("amount").alias("amount"))
# grouped.cache()
# grouped.count()
# grouped_ud = grouped.union(
#     grouped.select(sf.col("target").alias("source"), sf.col("source").alias("target"), "amount")
# )
# grouped_ud_left = grouped_ud.select(*[sf.col(x).alias(f"left_{x}") for x in grouped_ud_left.columns])
# joined = grouped_ud_left.join(
#     grouped_ud,
#     (grouped_ud_left["left_target"] == grouped_ud["source"]) &
#     (grouped_ud_left["left_source"] != grouped_ud["target"])
# ).select(
#     sf.col("left_source").alias("source"), "target", 
#     sf.least("left_amount", "amount").alias("amount")
# )
# window = Window.partitionBy(joined["source"]).orderBy(joined["amount"].desc())
# joined = joined.select(
#     "*", sf.row_number().over(window).alias("row_number")
# ).where(sf.col("row_number") <= 100).cache()
# joined.count()

In [22]:
# %%time

# data_input = train_validation.select("*")
# nodes_source = set(validation.select("source").distinct().toPandas()["source"])
# nodes_target = set(validation.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/valid_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/valid_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/valid_communities_as_passthrough_features.parquet")

In [23]:
# Community feature tables for the validation run, read from the parquet files
# stored under the main feature folder (source, target, passthrough - in that
# order).
community_feature_paths = (
    f"{location_main_features}/valid_communities_as_source_features.parquet",
    f"{location_main_features}/valid_communities_as_target_features.parquet",
    f"{location_main_features}/valid_communities_as_passthrough_features.parquet",
)
(
    communities_as_source_features,
    communities_as_target_features,
    communities_as_passthrough_features,
) = map(pd.read_parquet, community_feature_paths)

In [None]:
%%time

# Make sure the validation output folder exists.
try:
    os.makedirs(location_validation)
except FileExistsError:
    pass

# `st` times each iteration; it is reset at the end of the loop body.
st = time.time()
# The days to process come from the validation split; the rows for each day
# and its window are drawn from train + validation.
for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(validation, train_validation):
    # NOTE(review): `all_features` and `in_scope_sources` are not assigned
    # anywhere in this notebook - presumably the notebook executed here
    # defines them (likely from `window_df`); confirm.
    %run model_experiment_nested_hm.ipynb

    # Index-based left joins with the three community feature tables; columns
    # of a right-hand table that clash with existing names get the suffix shown.
    all_features = all_features.join(
        communities_as_source_features, how="left", rsuffix="_dispense"
    ).join(
        communities_as_target_features, how="left", rsuffix="_sink"
    ).join(
        communities_as_passthrough_features, how="left", rsuffix="_passthrough"
    )
    
    # One record per (source, target) pair of the day's rows, restricted to
    # sources listed in `in_scope_sources`.
    pov_features_df = []
    for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
        ["source", "target"]
    ):
        pov_features_df.append(get_pov_features(k, v))
    pov_features = pd.DataFrame(pov_features_df)

    # Join `all_features` twice: first keyed by `target`, then keyed by `source`.
    # NOTE(review): the lookup keyed by `target` uses rsuffix "_source" and the
    # lookup keyed by `source` uses rsuffix "_target", so columns ending in
    # "_target" hold values looked up by `source` - confirm this naming is
    # intended (already generated feature files depend on it).
    pov_features = pov_features.set_index("target").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_target"
    ).reset_index()
    # One parquet file per processed date.
    pov_features.to_parquet(f"{location_validation}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

In [None]:
# %%time

# data_input = data.select("*")
# nodes_source = set(test.select("source").distinct().toPandas()["source"])
# nodes_target = set(test.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/test_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/test_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/test_communities_as_passthrough_features.parquet")

In [None]:
# Community feature tables for the test run, read from the parquet files
# stored under the main feature folder (source, target, passthrough - in that
# order). These replace the tables loaded for the validation run.
community_feature_paths = (
    f"{location_main_features}/test_communities_as_source_features.parquet",
    f"{location_main_features}/test_communities_as_target_features.parquet",
    f"{location_main_features}/test_communities_as_passthrough_features.parquet",
)
(
    communities_as_source_features,
    communities_as_target_features,
    communities_as_passthrough_features,
) = map(pd.read_parquet, community_feature_paths)

In [None]:
%%time

# Make sure the test output folder exists.
try:
    os.makedirs(location_test)
except FileExistsError:
    pass

# Earliest timestamp in the test split; used below to drop earlier rows.
first_test_timestamp = test.select(sf.min("timestamp").alias("x")).collect()[0]["x"]

# `st` times each iteration; it is reset at the end of the loop body.
st = time.time()
# The days to process come from the test split; the rows for each day and its
# window are drawn from the full dataset.
for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(test, data):
    # Just to (double) make sure
    # (the day's rows are selected from the full dataset by calendar day, so
    # the first test day can include rows from before the test split starts)
    pov_df = pov_df.loc[pov_df["timestamp"] >= first_test_timestamp, :].copy(deep=True)
    
    # NOTE(review): `all_features` and `in_scope_sources` are not assigned
    # anywhere in this notebook - presumably the notebook executed here
    # defines them (likely from `window_df`); confirm.
    %run model_experiment_nested_hm.ipynb

    # Index-based left joins with the three community feature tables; columns
    # of a right-hand table that clash with existing names get the suffix shown.
    all_features = all_features.join(
        communities_as_source_features, how="left", rsuffix="_dispense"
    ).join(
        communities_as_target_features, how="left", rsuffix="_sink"
    ).join(
        communities_as_passthrough_features, how="left", rsuffix="_passthrough"
    )
    
    # One record per (source, target) pair of the day's rows, restricted to
    # sources listed in `in_scope_sources`.
    pov_features_df = []
    for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
        ["source", "target"]
    ):
        pov_features_df.append(get_pov_features(k, v))
    pov_features = pd.DataFrame(pov_features_df)

    # Join `all_features` twice: first keyed by `target`, then keyed by `source`.
    # NOTE(review): the lookup keyed by `target` uses rsuffix "_source" and the
    # lookup keyed by `source` uses rsuffix "_target", so columns ending in
    # "_target" hold values looked up by `source` - confirm this naming is
    # intended (already generated feature files depend on it).
    pov_features = pov_features.set_index("target").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_target"
    ).reset_index()
    
    # One parquet file per processed date.
    pov_features.to_parquet(f"{location_test}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

In [None]:
# Whole minutes elapsed since `start` was recorded near the top of the notebook.
print((time.time() - start) // 60)

In [None]:
def load_dataset(loc_main):
    """Load every parquet file directly under ``loc_main`` into one DataFrame.

    Each file contributes its rows plus a ``date`` column holding the part of
    the file name before the first ".". Files are read in sorted path order so
    the row order of the result is the same on every run (``glob`` itself
    returns matches in arbitrary order).

    Parameters:
        loc_main: directory containing the ``*.parquet`` files; a trailing
            path separator is accepted.

    Returns:
        A pandas DataFrame with the rows of all files and a fresh integer index.

    Raises:
        ValueError: if ``loc_main`` contains no ``*.parquet`` file.
    """
    locations = sorted(glob(os.path.join(loc_main, "*.parquet")))
    if not locations:
        # Fail with the folder name instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f"No parquet files found in {loc_main!r}")
    dfs = []
    for location in locations:
        df_date = pd.read_parquet(location)
        df_date["date"] = os.path.basename(location).split(".")[0]
        dfs.append(df_date)
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Identifier / label columns that are split off from the model inputs.
label_columns = ["source", "target", "date", "is_laundering"]

train_features = load_dataset(location_train)
train_features_labels = train_features.loc[:, label_columns].copy(deep=True)
train_features = train_features.drop(columns=label_columns)

validation_features = load_dataset(location_validation)
validation_features_labels = validation_features.loc[:, label_columns].copy(deep=True)

test_features = load_dataset(location_test)
test_features_labels = test_features.loc[:, label_columns].copy(deep=True)

# TODO: This is not ideal
# Drop every training column that is absent from the validation or the test
# features (each dropped name is printed).
for eval_features in (validation_features, test_features):
    for missing in set(train_features.columns) - set(eval_features.columns):
        del train_features[missing]
        print(missing)

# Align both evaluation frames only after ALL drops are done. Aligning the
# validation frame before the test-driven drops would leave it with columns
# the training frame no longer has.
validation_features = validation_features.loc[:, train_features.columns]
test_features = test_features.loc[:, train_features.columns]

# Labels of the individual test transactions (one row per transaction).
test_labels_orig = test.select(["source", "target", "is_laundering"]).toPandas()

In [None]:
# Number of training rows divided by the number of positive (`is_laundering`)
# rows, truncated to an int.
# NOTE(review): no visible cell passes this value on - the `train_model` calls
# below do not use it; confirm whether it is still needed.
scale_pos_weight = int(train_features_labels.shape[0] / train_features_labels["is_laundering"].sum())

In [None]:
def f1_eval(y, y_):
    """Return ``1 - F1`` so that a better F1 score gives a smaller value.

    ``y`` holds the true labels and ``y_`` the predicted values, which are
    rounded with ``np.round`` before the F1 score is computed.
    """
    rounded_predictions = np.round(y_)
    f1 = f1_score(y, rounded_predictions)
    return 1 - f1

In [None]:
# For HI
# def train_model(x, y, x_, y_):
#     model = xgb.XGBClassifier(
#         early_stopping_rounds=20, scale_pos_weight=10,
#         eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
#         colsample_bytree=0.5, subsample=0.5,
#     )
#     model.fit(x, y, verbose=False, eval_set=[(x_, y_)])
#     print(f"Best iteration: {model.best_iteration}\n")
#     return model


# For LI
def train_model(x, y, x_, y_, scale_pos_weight=3):
    """Fit an XGBoost classifier with early stopping on an evaluation set.

    Parameters:
        x: training features.
        y: training labels.
        x_: evaluation features used for early stopping.
        y_: evaluation labels used for early stopping.
        scale_pos_weight: weight given to the positive class. Defaults to 3,
            the value used for the LI dataset; the HI variant kept in the
            comments above uses 10, so pass ``scale_pos_weight=10`` for it
            instead of swapping function definitions.

    Returns:
        The fitted ``xgb.XGBClassifier``. The best iteration is printed.
    """
    model = xgb.XGBClassifier(
        # Stop when `f1_eval` (1 - F1) on the evaluation set has not improved
        # for 20 rounds; the default metric is disabled so only `f1_eval` counts.
        early_stopping_rounds=20, scale_pos_weight=scale_pos_weight,
        eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
        colsample_bytree=0.5, subsample=0.5,
    )
    model.fit(x, y, verbose=False, eval_set=[(x_, y_)])
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [None]:
def get_orig_prediction_data(labels_data, labels_orig, prediction_values):
    """Map predictions made per feature row back onto the original label rows.

    ``prediction_values`` is stored as a ``predicted`` column on a copy of
    ``labels_data``; the maximum prediction per (source, target) pair is then
    left-joined onto a copy of ``labels_orig``. Rows of ``labels_orig`` whose
    pair has no prediction end up with a missing ``predicted`` value. Neither
    input frame is modified.
    """
    pair_keys = ["source", "target"]

    scored = labels_data.copy(deep=True)
    scored.loc[:, "predicted"] = prediction_values
    max_per_pair = (
        scored.groupby(pair_keys).agg(predicted=("predicted", "max")).reset_index()
    )

    originals_by_pair = labels_orig.copy(deep=True).set_index(pair_keys)
    predictions_by_pair = max_per_pair.set_index(pair_keys)
    return originals_by_pair.join(predictions_by_pair, how="left").reset_index()

In [None]:
%%time

# Fit on the training features, with the validation features as the
# early-stopping evaluation set.
y_train = train_features_labels["is_laundering"].values
y_validation = validation_features_labels["is_laundering"].values
model = train_model(train_features, y_train, validation_features, y_validation)
y_test_predicted = model.predict(test_features)

# F1 / recall (in percent) against the labels of the test feature rows.
y_test_rows = test_features_labels["is_laundering"]
f1_aggregated = round(f1_score(y_test_rows, y_test_predicted) * 100, 2)
recall_aggregated = round(recall_score(y_test_rows, y_test_predicted) * 100, 2)
print("aggregated", f1_aggregated, recall_aggregated)

# F1 / recall (in percent) after mapping predictions back onto the original
# test transactions.
predictions_data = get_orig_prediction_data(
    test_features_labels, test_labels_orig, y_test_predicted
)
y_final_true = predictions_data["is_laundering"]
y_final_predicted = predictions_data["predicted"]
f1_final = round(f1_score(y_final_true, y_final_predicted) * 100, 2)
recall_final = round(recall_score(y_final_true, y_final_predicted) * 100, 2)
print("final", f1_final, recall_final)
print()

In [None]:
%%time

# Fraction of the train / validation rows drawn for each repetition, and the
# number of repetitions.
CV_FOLD_PERC = 0.8
N_FOLDS = 5
# Base seed for the row subsampling. Each fold uses CV_SEED + fold, so folds
# differ from each other while a re-run reproduces the same subsamples.
CV_SEED = 42

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    fold_seed = CV_SEED + fold
    x_train = train_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    # `x.loc[:, []]` keeps only the sampled index; joining on it picks the
    # label rows that belong to the sampled feature rows, in the same order.
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_laundering"].values, 
        x_validation, x_validation_labels["is_laundering"].values
    )
    # Every fold is scored on the full test set, mapped back onto the original
    # test transactions.
    y_test_predicted = model.predict(test_features)
    predictions_data = get_orig_prediction_data(
        test_features_labels, test_labels_orig, y_test_predicted
    )
    f1_cv = f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
    )
    f1_scores.append(f1_cv)

In [None]:
# Final F1 together with the standard deviation of the per-fold F1 scores.
# The plus-minus sign is written as an escape so it cannot be mangled by a
# wrong file encoding.
print(f"{f1_final} \u00b1{round(np.std(f1_scores), 2)}")