In [53]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score, recall_score

import settings as s
from common import create_workload_for_multi_proc
from communities import get_communities_multi_proc
from features import get_features_multi_proc, get_pov_features


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Spark session settings. Later cells pull sizeable results onto the driver with
# toPandas(), hence the raised driver memory and result-size limits.
# NOTE(review): "spark.worker.memory" does not look like a documented Spark
# application property (executors are sized via "spark.executor.memory") —
# confirm this entry has any effect.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
]
# getOrCreate() reuses an already running session if one exists.
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(config))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/19 12:33:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock reference used to report the total feature-generation time later on.
start = time.time()

In [4]:
# Number of days added on each side of a day when building its context window.
WINDOW_SIZE = 7
# Fractions of the chronologically ordered transactions assigned to each split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

NUM_PROCS = 10

# Fractions are binary floats, so compare with a tolerance: an exact `== 1`
# check may fail for perfectly valid splits purely because of rounding.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must sum to 1"
)

In [5]:
# Load the staged transactions from the location configured in `settings`.
data = spark.read.parquet(s.STAGED_DATA_LOCATION)
# Keep only rows whose source differs from their target.
data = data.where(data["source"] != data["target"])

In [6]:
# The last few days only contain incomplete data
# Count transactions per calendar day and express each day's count as a ratio of
# the mean daily count; the latest day with a ratio above 0.1 is treated as the
# last day with complete data.
trx_count_per_day = data.groupby(sf.to_date("timestamp").alias("date")).count().toPandas()
trx_count_per_day = trx_count_per_day.sort_values("date").set_index("date")
mean_per_day = np.mean(trx_count_per_day["count"])
mean_per_day_ratio = trx_count_per_day["count"] / mean_per_day
complete_data_present_till = max(mean_per_day_ratio[mean_per_day_ratio > 0.1].index)
# Replace that date with the maximum transaction timestamp found on it.
complete_data_present_till = data.where(sf.to_date("timestamp") == complete_data_present_till).select(
    sf.max("timestamp").alias("x")
).collect()[0]["x"]
# NOTE(review): in the cells visible here the cutoff is only printed; `data` is
# not filtered by it, so later days still reach the test split — confirm intent.
print(complete_data_present_till)

                                                                                

2022-09-10 23:59:00


In [7]:
# Transaction ids ordered by timestamp, collected to the driver as a NumPy array;
# the train/validation/test split below slices this array by position.
trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)

                                                                                

In [8]:
# Positional boundaries of the chronological split; the test split takes
# whatever remains after the train and validation slices.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))


def _ids_to_cached_frame(transaction_ids):
    """Wrap an array of transaction ids in a cached, single-partition Spark frame."""
    ids_pdf = pd.DataFrame(transaction_ids, columns=["transaction_id"])
    return spark.createDataFrame(ids_pdf).repartition(1).cache()


# Each count() both reports the split size and materialises the cached frame.
train_indexes = _ids_to_cached_frame(trx_ids_sorted[:last_train_index])
print(train_indexes.count())
validation_indexes = _ids_to_cached_frame(
    trx_ids_sorted[last_train_index:last_validation_index]
)
print(validation_indexes.count())
test_indexes = _ids_to_cached_frame(trx_ids_sorted[last_validation_index:])
print(test_indexes.count())

25/06/19 12:34:07 WARN TaskSetManager: Stage 11 contains a task of very large size (1507 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

2689317
896439
896440


In [9]:
def _attach_transactions(index_frame):
    """Left-join the full transaction rows onto a frame of transaction ids."""
    return index_frame.join(data, on="transaction_id", how="left")


# Materialise each split as full transaction rows.
train = _attach_transactions(train_indexes)
validation = _attach_transactions(validation_indexes)
test = _attach_transactions(test_indexes)
# Train and validation rows combined; used as the window source for validation days.
train_validation = train.union(validation)

In [10]:
def get_pandas(df):
    """Convert a Spark DataFrame to pandas via a parquet round trip.

    The frame is written to ``temp.parquet`` (overwriting any previous
    content), read back with pandas, and its ``timestamp`` column is shifted
    forward by two hours to compensate for a timezone discrepancy.
    """
    scratch_path = "temp.parquet"
    df.write.parquet(scratch_path, mode="overwrite")
    frame = pd.read_parquet(scratch_path)
    # Because of tz discrepancy
    frame.loc[:, "timestamp"] = frame["timestamp"] + timedelta(hours=2)
    return frame

In [11]:
def get_windowed_datasets(data_dates, data_input):
    """Yield, for every distinct day in ``data_dates``, that day's data and its window.

    Both datasets are taken from ``data_input`` and converted with ``get_pandas``:
    the "pov" frame holds the transactions of the day itself, the "window" frame
    holds the transactions from ``WINDOW_SIZE`` days before the day's start up to
    ``WINDOW_SIZE`` days after the day's end.

    Yields:
        Tuples ``(number_of_days, day, pov, window)``, in ascending day order.
    """
    date_rows = data_dates.select(sf.to_date("timestamp").alias("x")).distinct().collect()
    dates = sorted(row["x"] for row in date_rows)
    total_days = len(dates)
    span = timedelta(WINDOW_SIZE)
    ts = data_input["timestamp"]
    for date_trx in dates:
        day_start = datetime.combine(date_trx, datetime.min.time())
        day_end = datetime.combine(date_trx, datetime.max.time())
        window_start = day_start - span
        window_end = day_end + span
        pov = get_pandas(data_input.where((ts >= day_start) & (ts <= day_end)))
        window = get_pandas(data_input.where((ts >= window_start) & (ts <= window_end)))
        yield (total_days, date_trx, pov, window)

In [10]:
# Root directory (relative to the working directory) for all feature parquet files.
location_main_features = "features"

In [13]:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [14]:
# %%time

# data_input = train.select("*")
# nodes_source = set(train.select("source").distinct().toPandas()["source"])
# nodes_target = set(train.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

In [15]:
# Load the precomputed community features for the train split.
# The cell that writes these files (via communities_global.ipynb) is commented
# out above, so the parquet files must already exist on disk.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

In [11]:
# Per-split output directories under the feature root; each ends with a path
# separator so file names can be appended directly.
location_train, location_validation, location_test = (
    f"{location_main_features}{os.sep}{split_name}{os.sep}"
    for split_name in ("train", "validation", "test")
)

In [16]:
%%time

# Build the per-day feature files for the train split (one parquet per day).
try:
    os.makedirs(location_train)
except FileExistsError:
    pass

st = time.time()
# Both the days and the window data come from the train split only.
for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(train, train):
    # The nested notebook runs once per day. `all_features` and `in_scope_sources`
    # are used below but never assigned in this cell — presumably the nested
    # notebook defines them from the loop variables; confirm.
    %run model_experiment_nested.ipynb
    
    # Attach the split-level community features by index; rsuffix only renames
    # right-hand columns whose names collide with existing ones.
    all_features = all_features.join(
        communities_as_source_features, how="left", rsuffix="_dispense"
    ).join(
        communities_as_target_features, how="left", rsuffix="_sink"
    ).join(
        communities_as_passthrough_features, how="left", rsuffix="_passthrough"
    )
    
    # One feature record per (source, target) pair of the day, restricted to
    # in-scope sources.
    pov_features_df = []
    for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
        ["source", "target"]
    ):
        pov_features_df.append(get_pov_features(k, v))
    pov_features = pd.DataFrame(pov_features_df)
    
    # Join node features twice: first keyed on target, then keyed on source.
    # NOTE(review): the second join's colliding columns (the ones looked up by
    # source) receive the "_target" suffix, and the target-keyed ones stay
    # unsuffixed — confirm this naming is intended.
    pov_features = pov_features.set_index("target").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_target"
    ).reset_index()
    
    pov_features.to_parquet(f"{location_train}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-01 in 15.0 minutes | (273775, 185) | num_days=6


                                                                                

Processed 2022-09-02 in 21.0 minutes | (352061, 185) | num_days=6


                                                                                

Processed 2022-09-03 in 9.0 minutes | (146548, 185) | num_days=6


                                                                                

Processed 2022-09-04 in 9.0 minutes | (146618, 185) | num_days=6


                                                                                

Processed 2022-09-05 in 13.0 minutes | (224048, 185) | num_days=6


                                                                                

Processed 2022-09-06 in 13.0 minutes | (223096, 185) | num_days=6
CPU times: user 22min 23s, sys: 1min 4s, total: 23min 28s
Wall time: 1h 21min 42s


In [17]:
# %%time

# data_input = train_validation.select("*")
# nodes_source = set(validation.select("source").distinct().toPandas()["source"])
# nodes_target = set(validation.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/valid_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/valid_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/valid_communities_as_passthrough_features.parquet")

In [18]:
# Load the precomputed community features for the validation split, replacing
# the train-split frames held in the same variables.
# The cell that writes these files is commented out above, so the parquet files
# must already exist on disk.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_passthrough_features.parquet")

In [19]:
%%time

# Build the per-day feature files for the validation split (one parquet per day).
try:
    os.makedirs(location_validation)
except FileExistsError:
    pass

st = time.time()
# Days come from the validation split; window data is drawn from train + validation.
for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(validation, train_validation):
    # The nested notebook runs once per day. `all_features` and `in_scope_sources`
    # are used below but never assigned in this cell — presumably the nested
    # notebook defines them from the loop variables; confirm.
    %run model_experiment_nested.ipynb

    # Attach the split-level community features by index; rsuffix only renames
    # right-hand columns whose names collide with existing ones.
    all_features = all_features.join(
        communities_as_source_features, how="left", rsuffix="_dispense"
    ).join(
        communities_as_target_features, how="left", rsuffix="_sink"
    ).join(
        communities_as_passthrough_features, how="left", rsuffix="_passthrough"
    )
    
    # One feature record per (source, target) pair of the day, restricted to
    # in-scope sources.
    pov_features_df = []
    for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
        ["source", "target"]
    ):
        pov_features_df.append(get_pov_features(k, v))
    pov_features = pd.DataFrame(pov_features_df)

    # Join node features twice: first keyed on target, then keyed on source.
    # NOTE(review): the second join's colliding columns (the ones looked up by
    # source) receive the "_target" suffix, and the target-keyed ones stay
    # unsuffixed — confirm this naming is intended.
    pov_features = pov_features.set_index("target").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_target"
    ).reset_index()

    pov_features.to_parquet(f"{location_validation}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-06 in 13.0 minutes | (223618, 185) | num_days=3


                                                                                

Processed 2022-09-07 in 13.0 minutes | (224078, 185) | num_days=3


                                                                                

Processed 2022-09-08 in 13.0 minutes | (212250, 185) | num_days=3
CPU times: user 11min 20s, sys: 33.9 s, total: 11min 54s
Wall time: 40min 50s


In [20]:
# %%time

# data_input = data.select("*")
# nodes_source = set(test.select("source").distinct().toPandas()["source"])
# nodes_target = set(test.select("target").distinct().toPandas()["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# communities_as_source_features.to_parquet(f"{location_main_features}/test_communities_as_source_features.parquet")
# communities_as_target_features.to_parquet(f"{location_main_features}/test_communities_as_target_features.parquet")
# communities_as_passthrough_features.to_parquet(f"{location_main_features}/test_communities_as_passthrough_features.parquet")

In [21]:
# Load the precomputed community features for the test split, replacing the
# validation-split frames held in the same variables.
# The cell that writes these files is commented out above, so the parquet files
# must already exist on disk.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/test_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/test_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/test_communities_as_passthrough_features.parquet")

In [22]:
%%time

# Build the per-day feature files for the test split (one parquet per day).
try:
    os.makedirs(location_test)
except FileExistsError:
    pass

st = time.time()
# Days come from the test split; window data is drawn from the full dataset.
# NOTE(review): the recorded output shows days after the printed complete-data
# cutoff being processed with far fewer rows and columns — confirm whether those
# days should be excluded.
for num_days, dt_trx, pov_df, window_df in get_windowed_datasets(test, data):
    # The nested notebook runs once per day. `all_features` and `in_scope_sources`
    # are used below but never assigned in this cell — presumably the nested
    # notebook defines them from the loop variables; confirm.
    %run model_experiment_nested.ipynb

    # Attach the split-level community features by index; rsuffix only renames
    # right-hand columns whose names collide with existing ones.
    all_features = all_features.join(
        communities_as_source_features, how="left", rsuffix="_dispense"
    ).join(
        communities_as_target_features, how="left", rsuffix="_sink"
    ).join(
        communities_as_passthrough_features, how="left", rsuffix="_passthrough"
    )
    
    # One feature record per (source, target) pair of the day, restricted to
    # in-scope sources.
    pov_features_df = []
    for k, v in pov_df[pov_df["source"].isin(in_scope_sources)].groupby(
        ["source", "target"]
    ):
        pov_features_df.append(get_pov_features(k, v))
    pov_features = pd.DataFrame(pov_features_df)

    # Join node features twice: first keyed on target, then keyed on source.
    # NOTE(review): the second join's colliding columns (the ones looked up by
    # source) receive the "_target" suffix, and the target-keyed ones stay
    # unsuffixed — confirm this naming is intended.
    pov_features = pov_features.set_index("target").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_target"
    ).reset_index()
    
    pov_features.to_parquet(f"{location_test}{dt_trx}.parquet")
    print(f"Processed {dt_trx} in {(time.time() - st) // 60} minutes | {all_features.shape} | {num_days=}")
    st = time.time()

                                                                                

Processed 2022-09-08 in 14.0 minutes | (224172, 185) | num_days=11


                                                                                

Processed 2022-09-09 in 18.0 minutes | (345481, 185) | num_days=11


                                                                                

Processed 2022-09-10 in 8.0 minutes | (147496, 185) | num_days=11


                                                                                

Processed 2022-09-11 in 0.0 minutes | (338, 174) | num_days=11


                                                                                

Processed 2022-09-12 in 0.0 minutes | (239, 174) | num_days=11


                                                                                

Processed 2022-09-13 in 0.0 minutes | (146, 172) | num_days=11


                                                                                

Processed 2022-09-14 in 0.0 minutes | (95, 170) | num_days=11


                                                                                

Processed 2022-09-15 in 0.0 minutes | (37, 166) | num_days=11


                                                                                

Processed 2022-09-16 in 0.0 minutes | (35, 165) | num_days=11
Processed 2022-09-17 in 0.0 minutes | (21, 164) | num_days=11
Processed 2022-09-18 in 0.0 minutes | (10, 159) | num_days=11
CPU times: user 12min 59s, sys: 40.1 s, total: 13min 39s
Wall time: 46min 14s


In [23]:
# Whole minutes elapsed since `start` was recorded.
print((time.time() - start) // 60)

169.0


In [12]:
def load_dataset(loc_main):
    """Load every ``*.parquet`` file in a directory into one DataFrame.

    Each file's rows get a ``date`` column holding the file name up to its
    first dot (files are written as ``<date>.parquet``). Files are read in
    sorted name order so the row order of the result is reproducible instead
    of depending on filesystem listing order.

    Args:
        loc_main: Directory containing the per-day parquet files; a trailing
            path separator is accepted.

    Returns:
        The concatenation of all files with a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no parquet files.
    """
    paths = sorted(glob(os.path.join(loc_main, "*.parquet")))
    if not paths:
        # pd.concat would raise ValueError as well, but without naming the directory.
        raise ValueError(f"No parquet files found in {loc_main!r}")
    dfs = []
    for location in paths:
        df_date = pd.read_parquet(location)
        df_date["date"] = os.path.basename(location).split(".")[0]
        dfs.append(df_date)
    return pd.concat(dfs, ignore_index=True)

In [25]:
# Identifier and label columns that must not be fed to the model.
label_columns = ["source", "target", "date", "is_laundering"]


def _load_with_labels(location):
    """Load one split and return it together with a deep copy of its label columns."""
    frame = load_dataset(location)
    return frame, frame.loc[:, label_columns].copy(deep=True)


# Train: the feature matrix is everything except the label columns.
train_features, train_features_labels = _load_with_labels(location_train)
train_features = train_features.drop(columns=label_columns)

# Validation and test are aligned to the train feature columns (same order).
validation_features, validation_features_labels = _load_with_labels(location_validation)
validation_features = validation_features.loc[:, train_features.columns]

test_features, test_features_labels = _load_with_labels(location_test)
test_features = test_features.loc[:, train_features.columns]
# Transaction-level labels of the test split, used for the final scoring.
test_labels_orig = test.select(["source", "target", "is_laundering"]).toPandas()

                                                                                

In [26]:
# bool_cols = [
#     "source_currency_aud", "source_currency_brl", "source_currency_btc", 
#     "source_currency_cad", "source_currency_chf", "source_currency_cny", 
#     "source_currency_gbp", "source_currency_ils", "source_currency_inr", 
#     "source_currency_jpy", "source_currency_mxn", "source_currency_rub", 
#     "format_Bitcoin", "format_Cash", "format_Cheque", "format_Credit Card", 
#     "format_Wire"
# ]
# bool_cols_types = {c: bool for c in bool_cols}
# for col in bool_cols:
#     test_features.loc[:, col] = test_features[col].apply(
#         lambda x: False if np.isnan(x) else x
#     )
# test_features = test_features.astype(bool_cols_types)

In [191]:
# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
# from scipy.stats import randint

# param_grid = {
#     "min_child_weight": [1, 3, 5],
#     "subsample": [0.5, 0.8, 1.0],
#     "colsample_bytree": [0.5, 0.8, 1.0],
#     "learning_rate": [0.01, 0.1, 0.3],
#     "scale_pos_weight": [1, 5, 10],
# }
# param_distributions = {
#     "min_child_weight": np.random.randint(1, 6),
#     "subsample": 0.1 * np.random.randint(5, 11),
#     "colsample_bytree": 0.1 * np.random.randint(5, 11),
#     "scale_pos_weight": np.random.randint(1, 11),
# }
# # Create XGBoost classifier
# model_xgb = xgb.XGBClassifier(
#     n_estimators=100, objective="binary:logistic", 
#     # eval_metric=f1_eval, disable_default_eval_metric=True,
# )

# # Perform halving grid search
# halving_search = HalvingRandomSearchCV(
#     estimator=model_xgb, param_distributions=param_distributions,  # param_grid=param_grid, 
#     cv=5, factor=3, resource="n_estimators", max_resources=100, verbose=2
# )
# halving_search.fit(
#     train_features, train_features_labels["is_laundering"].values,
#     eval_set=[
#         # (train_features, train_features_labels["is_laundering"].values), 
#         (validation_features, validation_features_labels["is_laundering"].values)
#     ]
# )

# # Print best parameters
# print(f"Best parameters: {halving_search.best_params_}")
# print(f"Best score: {halving_search.best_score_}")

In [27]:
# Number of training rows per positive (is_laundering) row, truncated to an int.
# NOTE(review): train_model as defined in this notebook does not read this
# global; it uses its own value of 10 — confirm whether that is intended.
scale_pos_weight = int(train_features_labels.shape[0] / train_features_labels["is_laundering"].sum())

In [22]:
def f1_eval(y, y_):
    """Return ``1 - F1`` for labels ``y`` and scores ``y_`` rounded to hard labels.

    Lower values are better, which makes the result usable as a loss-style
    evaluation metric.
    """
    hard_labels = np.round(y_)
    score = f1_score(y, hard_labels)
    return 1 - score

In [33]:
def train_model(x, y, x_, y_, **overrides):
    """Fit an XGBoost classifier on ``(x, y)`` with early stopping on ``(x_, y_)``.

    Args:
        x: Training feature matrix.
        y: Training labels.
        x_: Evaluation feature matrix used for early stopping.
        y_: Evaluation labels.
        **overrides: Optional ``XGBClassifier`` keyword arguments that replace
            the defaults below, e.g. ``scale_pos_weight=<class ratio>``. With no
            overrides the model is configured exactly as before.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    params = dict(
        early_stopping_rounds=20,
        scale_pos_weight=10,
        # Custom metric (1 - F1) replaces XGBoost's default evaluation metric.
        eval_metric=f1_eval,
        disable_default_eval_metric=True,
        num_parallel_tree=20,
        max_depth=6,
        colsample_bytree=0.5,
        subsample=0.5,
    )
    params.update(overrides)
    model = xgb.XGBClassifier(**params)
    model.fit(x, y, verbose=False, eval_set=[(x_, y_)])
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [42]:
def get_orig_prediction_data(labels_data, labels_orig, prediction_values):
    labels_data = labels_data.copy(deep=True)
    labels_orig = labels_orig.copy(deep=True)
    labels_data.loc[:, "predicted"] = prediction_values
    predictions_agg = labels_data.groupby(["source", "target"]).agg(
        predicted=("predicted", "max")
    ).reset_index()
    final_predictions = labels_orig.set_index(["source", "target"]).join(
        predictions_agg.set_index(["source", "target"]), how="left"
    ).reset_index()
    return final_predictions

In [56]:
%%time

# Train on the full train split, early-stopping on the full validation split.
y_train = train_features_labels["is_laundering"].values
y_validation = validation_features_labels["is_laundering"].values
model = train_model(train_features, y_train, validation_features, y_validation)
y_test_predicted = model.predict(test_features)

# Scores at the aggregated (feature-row) level, as fractions.
y_test_aggregated = test_features_labels["is_laundering"]
f1_aggregated = round(f1_score(y_test_aggregated, y_test_predicted), 4)
recall_aggregated = round(recall_score(y_test_aggregated, y_test_predicted), 4)
print("aggregated", f1_aggregated, recall_aggregated)

# Scores after mapping predictions back to the original test transactions,
# as percentages.
predictions_data = get_orig_prediction_data(
    test_features_labels, test_labels_orig, y_test_predicted
)
y_true_final = predictions_data["is_laundering"]
y_pred_final = predictions_data["predicted"]
f1_final = round(f1_score(y_true_final, y_pred_final) * 100, 2)
recall_final = round(recall_score(y_true_final, y_pred_final) * 100, 2)
print("final", f1_final, recall_final)
print()

Best iteration: 5

aggregated 0.7231 0.6516
final 75.39 69.03

CPU times: user 11min 1s, sys: 7.43 s, total: 11min 9s
Wall time: 1min 19s


In [59]:
%%time

# Repeated random subsampling: each round trains on a random CV_FOLD_PERC
# fraction of the train rows, early-stops on a random CV_FOLD_PERC fraction of
# the validation rows, and is scored on the full test split. The rounds are not
# disjoint k-fold partitions.
CV_FOLD_PERC = 0.8
N_FOLDS = 5
# Base seed for the subsampling so the reported spread is reproducible;
# each round uses CV_SEED + fold to draw a different sample.
CV_SEED = 42

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    fold_seed = CV_SEED + fold
    x_train = train_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    # Look the labels up by the sampled row index.
    x_train_labels = train_features_labels.loc[x_train.index]
    x_validation = validation_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    x_validation_labels = validation_features_labels.loc[x_validation.index]
    model = train_model(
        x_train, x_train_labels["is_laundering"].values,
        x_validation, x_validation_labels["is_laundering"].values
    )
    y_test_predicted = model.predict(test_features)
    predictions_data = get_orig_prediction_data(
        test_features_labels, test_labels_orig, y_test_predicted
    )
    f1_cv = f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
    )
    f1_scores.append(f1_cv)

Fold 1
Best iteration: 5

75.14 69.39
Fold 2
Best iteration: 4

75.46 68.18
Fold 3
Best iteration: 6

74.91 68.91
Fold 4
Best iteration: 5

74.98 69.33
Fold 5
Best iteration: 5

76.58 69.81
CPU times: user 49min 50s, sys: 30.1 s, total: 50min 20s
Wall time: 5min 59s


In [60]:
# Final test F1 (percent) from the single full run, with the standard deviation
# across the subsampled rounds. "\u00b1" is the plus-minus sign, written as an
# escape so it cannot be mangled by a wrong file encoding.
print(f"{f1_final} \u00b1{round(np.std(f1_scores), 2)}")

75.39 ±0.61
