In [1]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score, recall_score

import settings as s
from common import create_workload_for_multi_proc
from communities import get_communities_multi_proc
from features import get_features_multi_proc, pov_features

%load_ext autoreload
%autoreload 2

In [2]:
# Spark properties for this session, as (property, value) pairs.
# NOTE(review): "spark.worker.memory" does not look like a documented Spark
# property (executor memory is normally "spark.executor.memory") - confirm that
# it has the intended effect.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
spark_conf = SparkConf().setAll(config)
session_builder = SparkSession.builder.appName("testing").config(conf=spark_conf)
# Attach to an existing session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/10 14:01:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock reference point; a later cell prints the minutes elapsed since here.
start = time.time()

In [4]:
# Length, in days, of the look-back ("left") and look-ahead ("right") windows
# built around each transaction date.
WINDOW_SIZE = 14
# Fractions of the chronologically ordered transactions assigned to each split.
# The test split is taken as "everything after validation", so TEST_PERC is only
# used by the sanity check below.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Not referenced by the cells of this notebook; presumably read by the nested
# notebook executed via %run - TODO confirm.
NUM_PROCS = 8

# Compare with a tolerance: an exact `== 1` check on a float sum rejects valid
# splits such as 0.7 / 0.2 / 0.1, whose sum is 0.9999999999999999.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC, VALIDATION_PERC and TEST_PERC must sum to 1"
)

In [5]:
# Load the staged parquet data and keep only rows whose source differs from
# their target.
staged_rows = spark.read.parquet(s.STAGED_DATA_LOCATION)
data = staged_rows.filter(sf.col("source") != sf.col("target"))

In [6]:
# The last few days only contain incomplete data.
# Find the latest calendar day whose row count is more than 10% of the mean
# daily count, then take the latest timestamp seen on that day.
# NOTE(review): in the cells of this notebook the resulting cutoff is only
# printed; `data` is not filtered by it, and the test run further down still
# processes days after it. Confirm whether a filter such as
# `data.where(data["timestamp"] <= complete_data_present_till)` was intended
# (or is applied inside the nested notebook).
trx_count_per_day = data.groupby(sf.to_date("timestamp").alias("date")).count().toPandas()
trx_count_per_day = trx_count_per_day.sort_values("date").set_index("date")
mean_per_day = np.mean(trx_count_per_day["count"])
# Each day's volume relative to the average day.
mean_per_day_ratio = trx_count_per_day["count"] / mean_per_day
# First a date (latest day above the 10% threshold) ...
complete_data_present_till = max(mean_per_day_ratio[mean_per_day_ratio > 0.1].index)
# ... then replaced by the maximum timestamp recorded on that date.
complete_data_present_till = data.where(sf.to_date("timestamp") == complete_data_present_till).select(
    sf.max("timestamp").alias("x")
).collect()[0]["x"]
print(complete_data_present_till)

                                                                                

2022-09-10 23:59:00


In [7]:
# Transaction ids in chronological order; the train/validation/test split below
# cuts this array by position.
# `transaction_id` is added as a tie-breaker: sorting on "timestamp" alone leaves
# rows that share a timestamp in an unspecified order, so the rows that land on
# either side of a cut point could change between runs.
trx_ids_sorted = (
    data.sort("timestamp", "transaction_id")
    .select("transaction_id")
    .toPandas()["transaction_id"]
    .values
)
trx_count = len(trx_ids_sorted)

                                                                                

In [8]:
def _ids_to_spark(transaction_ids):
    """Wrap transaction ids in a cached, single-partition Spark DataFrame."""
    id_frame = spark.createDataFrame(
        pd.DataFrame(transaction_ids, columns=["transaction_id"])
    )
    return id_frame.repartition(1).cache()


# Positional cut points in the chronologically sorted id array.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))

train_ids = trx_ids_sorted[:last_train_index]
validation_ids = trx_ids_sorted[last_train_index:last_validation_index]
# The test split is whatever remains after the validation cut.
test_ids = trx_ids_sorted[last_validation_index:]

# Each count() is an action, so it evaluates the frame and reports the split size.
train_indexes = _ids_to_spark(train_ids)
print(train_indexes.count())
validation_indexes = _ids_to_spark(validation_ids)
print(validation_indexes.count())
test_indexes = _ids_to_spark(test_ids)
print(test_indexes.count())

25/06/09 23:04:45 WARN TaskSetManager: Stage 11 contains a task of very large size (1507 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

2689317
896439
896440


In [9]:
def _with_transaction_columns(id_frame):
    """Left-join the rows of `data` onto a frame of transaction ids."""
    return id_frame.join(data, on="transaction_id", how="left")


train = _with_transaction_columns(train_indexes)
validation = _with_transaction_columns(validation_indexes)
test = _with_transaction_columns(test_indexes)
# Train and validation rows combined; used as the window input when building
# validation features.
train_validation = train.union(validation)

In [10]:
def get_pandas(df, location="temp.parquet", tz_offset_hours=2):
    """Convert a Spark DataFrame to pandas by round-tripping through parquet.

    The frame is written to ``location`` (overwriting whatever is there) and
    read back with pandas; the ``timestamp`` column is then shifted by
    ``tz_offset_hours``.

    Parameters
    ----------
    df : Spark DataFrame
        Must expose ``df.write.parquet(path, mode=...)`` and contain a
        ``timestamp`` column.
    location : str, default "temp.parquet"
        Scratch path for the intermediate parquet data. It is reused on every
        call and is not removed afterwards.
    tz_offset_hours : int or float, default 2
        Hours added to ``timestamp`` after reading. The default keeps the
        previous hard-coded behaviour.
        NOTE(review): a fixed offset looks like it assumes a constant UTC+2
        difference between the Spark session and the parquet/pandas reading -
        confirm it holds for the whole data range and on other machines.

    Returns
    -------
    pandas.DataFrame
        The contents of ``df`` with the shifted ``timestamp`` column.
    """
    df.write.parquet(location, mode="overwrite")
    frame = pd.read_parquet(location)
    # Because of tz discrepancy
    frame.loc[:, "timestamp"] += timedelta(hours=tz_offset_hours)
    return frame

In [11]:
def get_windowed_datasets(data_dates, data_input):
    """Yield, per calendar day of `data_dates`, three pandas slices of `data_input`.

    The days are the distinct dates of `data_dates["timestamp"]`, visited in
    ascending order. For each day, `data_input` is filtered on its `timestamp`
    column (bounds inclusive) and materialised with `get_pandas`:

    * left  - from WINDOW_SIZE days before the start of the day to the end of the day
    * pov   - the day itself
    * right - from the start of the day to WINDOW_SIZE days after the end of the day

    All three slices come from `data_input`, so `pov` holds every `data_input`
    row of that day, not only the rows present in `data_dates`.

    Yields
    ------
    tuple
        (number of distinct days, current day, left, pov, right)
    """

    def _slice(lower, upper):
        # Inclusive timestamp range of `data_input`, converted to pandas.
        stamp = data_input["timestamp"]
        return get_pandas(data_input.where((stamp >= lower) & (stamp <= upper)))

    date_rows = data_dates.select(sf.to_date("timestamp").alias("x")).distinct().collect()
    days = sorted(row["x"] for row in date_rows)
    for day in days:
        # 00:00:00 and 23:59:59.999999 of the current day.
        day_start = datetime.combine(day, datetime.min.time())
        day_end = datetime.combine(day, datetime.max.time())
        look_back_from = day_start - timedelta(WINDOW_SIZE)
        look_ahead_to = day_end + timedelta(WINDOW_SIZE)
        left = _slice(look_back_from, day_end)
        pov = _slice(day_start, day_end)
        right = _slice(day_start, look_ahead_to)
        yield (len(days), day, left, pov, right)

In [12]:
# Root directory for generated features; the train/validation/test cells below
# each write one parquet file per day into their own sub-directory of it.
location_main_features = "features"

In [13]:
# Uncomment to delete previously generated feature files before regenerating them:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [14]:
%%time

SAMPLE_SIZE_NEG_SOURCES = 1

location_train = f"{location_main_features}{os.sep}train{os.sep}"
try:
    os.makedirs(location_train)
except FileExistsError:
    pass

st = time.time()
for num_days, dt_trx, left_df, pov_df, right_df in get_windowed_datasets(train, train):
    pov_features_generated = False
    for in_scope_window, window_name in [(left_df, "left"), (right_df, "right")]:
        %run model_experiment_nested.ipynb
    all_features.to_parquet(f"{location_train}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-01 in 25.0 minutes | (310355, 449) | num_days=6


                                                                                

Processed 2022-09-02 in 34.0 minutes | (391950, 449) | num_days=6


                                                                                

Processed 2022-09-03 in 16.0 minutes | (108876, 449) | num_days=6


                                                                                

Processed 2022-09-04 in 15.0 minutes | (108889, 449) | num_days=6


                                                                                

Processed 2022-09-05 in 22.0 minutes | (234285, 449) | num_days=6


                                                                                

Processed 2022-09-06 in 21.0 minutes | (232668, 449) | num_days=6
CPU times: user 30min 15s, sys: 1min 20s, total: 31min 36s
Wall time: 2h 15min 17s


In [15]:
%%time

SAMPLE_SIZE_NEG_SOURCES = 1

location_validation = f"{location_main_features}{os.sep}validation{os.sep}"
try:
    os.makedirs(location_validation)
except FileExistsError:
    pass

st = time.time()
for num_days, dt_trx, left_df, pov_df, right_df in get_windowed_datasets(validation, train_validation):
    pov_features_generated = False
    for in_scope_window, window_name in [(left_df, "left"), (right_df, "right")]:
        %run model_experiment_nested.ipynb
    all_features.to_parquet(f"{location_validation}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-06 in 22.0 minutes | (233744, 449) | num_days=3


                                                                                

Processed 2022-09-07 in 22.0 minutes | (234400, 449) | num_days=3


                                                                                

Processed 2022-09-08 in 21.0 minutes | (210737, 449) | num_days=3
CPU times: user 15min 37s, sys: 41.8 s, total: 16min 19s
Wall time: 1h 5min 47s


In [16]:
%%time

SAMPLE_SIZE_NEG_SOURCES = 1

location_test = f"{location_main_features}{os.sep}test{os.sep}"
try:
    os.makedirs(location_test)
except FileExistsError:
    pass

st = time.time()
for num_days, dt_trx, left_df, pov_df, right_df in get_windowed_datasets(test, data):
    pov_features_generated = False
    for in_scope_window, window_name in [(left_df, "left"), (right_df, "right")]:
        %run model_experiment_nested.ipynb
    all_features.to_parquet(f"{location_test}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-08 in 23.0 minutes | (234414, 449) | num_days=11


                                                                                

Processed 2022-09-09 in 33.0 minutes | (350115, 449) | num_days=11


                                                                                

Processed 2022-09-10 in 15.0 minutes | (109754, 449) | num_days=11


                                                                                

Processed 2022-09-11 in 1.0 minutes | (229, 407) | num_days=11


                                                                                

Processed 2022-09-12 in 1.0 minutes | (170, 398) | num_days=11


                                                                                

Processed 2022-09-13 in 1.0 minutes | (106, 386) | num_days=11


                                                                                

Processed 2022-09-14 in 1.0 minutes | (70, 373) | num_days=11


                                                                                

Processed 2022-09-15 in 1.0 minutes | (27, 354) | num_days=11


                                                                                

Processed 2022-09-16 in 1.0 minutes | (26, 343) | num_days=11


                                                                                

Processed 2022-09-17 in 1.0 minutes | (14, 324) | num_days=11


                                                                                

Processed 2022-09-18 in 1.0 minutes | (8, 304) | num_days=11
CPU times: user 20min 10s, sys: 56.6 s, total: 21min 6s
Wall time: 1h 21min 1s


In [17]:
# Whole minutes elapsed since `start` was recorded near the top of the notebook.
print((time.time() - start) // 60)

282.0


In [18]:
def load_dataset(loc_main):
    """Load every ``*.parquet`` file directly under ``loc_main`` into one frame.

    Each file's rows get a ``date`` column holding the file name up to its first
    dot (the feature files are written as ``<date>.parquet``).

    Parameters
    ----------
    loc_main : str
        Directory containing the per-day parquet files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated in ascending path order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no parquet file is found under ``loc_main``.
    """
    # glob() returns paths in arbitrary order; sorting makes the row order of
    # the result reproducible.
    locations = sorted(glob(f"{loc_main}{os.sep}*.parquet"))
    if not locations:
        raise ValueError(f"No parquet files found in {loc_main!r}")
    dfs = []
    for location in locations:
        df_date = pd.read_parquet(location)
        df_date.loc[:, "date"] = os.path.basename(location).split(".")[0]
        dfs.append(df_date)
    return pd.concat(dfs, ignore_index=True)

In [19]:
# Identifier and target columns that are kept aside and not fed to the model.
label_columns = ["source", "target", "date", "is_laundering"]

# Training split: copy the label columns out, then drop them from the features.
train_features = load_dataset(location_train)
train_features_labels = train_features[label_columns].copy(deep=True)
train_features = train_features.drop(columns=label_columns)

# Validation and test are restricted to the training feature columns, in the
# same order.
feature_columns = train_features.columns

validation_features = load_dataset(location_validation)
validation_features_labels = validation_features[label_columns].copy(deep=True)
validation_features = validation_features[feature_columns]

test_features = load_dataset(location_test)
test_features_labels = test_features[label_columns].copy(deep=True)
test_features = test_features[feature_columns]

In [20]:
# Number of training rows divided by the sum of `is_laundering`, truncated to an
# int (i.e. rows per positive row, assuming `is_laundering` is a 0/1 flag).
# NOTE(review): the model cell below passes the literal scale_pos_weight=5 and
# does not read this variable - confirm which value is intended.
scale_pos_weight = int(train_features_labels.shape[0] / train_features_labels["is_laundering"].sum())

In [21]:
def f1_eval(y, y_):
    """Return ``1 - F1`` for true labels `y` and predicted scores `y_`.

    The scores are rounded to the nearest integer before scoring, so the result
    is an error-style value where lower is better. It is passed to the
    classifier below as its ``eval_metric``.
    """
    hard_predictions = np.round(y_)
    score = f1_score(y, hard_predictions)
    return 1 - score

In [79]:
%%time

# Gradient-boosted classifier evaluated with the custom `f1_eval` metric
# (1 - F1) on the validation split; early stopping is set to 20 rounds.
# NOTE(review): scale_pos_weight is the literal 5 here, while an earlier cell
# computes a `scale_pos_weight` variable from the training labels that is never
# used - confirm which one is intended.
model = xgb.XGBClassifier(
    early_stopping_rounds=20, scale_pos_weight=5,
    eval_metric=f1_eval, disable_default_eval_metric=True,
)
model.fit(
    train_features, train_features_labels["is_laundering"].values,
    eval_set=[
        # Training set deliberately left out of the evaluation sets:
        # (train_features, train_features_labels["is_laundering"].values), 
        (validation_features, validation_features_labels["is_laundering"].values)
    ]
)

[0]	validation_0-f1_eval:0.44351
[1]	validation_0-f1_eval:0.42519
[2]	validation_0-f1_eval:0.42308
[3]	validation_0-f1_eval:0.38539
[4]	validation_0-f1_eval:0.37785
[5]	validation_0-f1_eval:0.37192
[6]	validation_0-f1_eval:0.36518
[7]	validation_0-f1_eval:0.36426
[8]	validation_0-f1_eval:0.36257
[9]	validation_0-f1_eval:0.36269
[10]	validation_0-f1_eval:0.36396
[11]	validation_0-f1_eval:0.36208
[12]	validation_0-f1_eval:0.36150
[13]	validation_0-f1_eval:0.35927
[14]	validation_0-f1_eval:0.35627
[15]	validation_0-f1_eval:0.35680
[16]	validation_0-f1_eval:0.35683
[17]	validation_0-f1_eval:0.35652
[18]	validation_0-f1_eval:0.35751
[19]	validation_0-f1_eval:0.35723
[20]	validation_0-f1_eval:0.35201
[21]	validation_0-f1_eval:0.35257
[22]	validation_0-f1_eval:0.35129
[23]	validation_0-f1_eval:0.35199
[24]	validation_0-f1_eval:0.34966
[25]	validation_0-f1_eval:0.34729
[26]	validation_0-f1_eval:0.34781
[27]	validation_0-f1_eval:0.34568
[28]	validation_0-f1_eval:0.34468
[29]	validation_0-f1_eva

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,20
,enable_categorical,False


In [80]:
# Class predictions for the held-out test features.
# NOTE(review): the test feature files include the sparsely populated days after
# the complete-data cutoff printed earlier - confirm they should be scored.
y_test_predicted = model.predict(test_features)

In [81]:
# F1 score on the test split, rounded to 4 decimals.
round(f1_score(test_features_labels["is_laundering"], y_test_predicted), 4)

0.7102

In [82]:
# Recall on the test split, rounded to 4 decimals.
round(recall_score(test_features_labels["is_laundering"], y_test_predicted), 4)

0.6062