In [1]:
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import settings as s
from common import create_workload_for_multi_proc, get_weights
from communities import get_communities_multi_proc
from features import get_features_multi_proc, get_edge_features

%load_ext autoreload
%autoreload 2

In [2]:
# Settings for the single Spark session shared by every cell below.
# NOTE(review): "spark.worker.memory" does not look like a standard Spark
# property name - confirm that it has any effect.
config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    # ("spark.sql.autoBroadcastJoinThreshold", "2g"),
]
spark_conf = SparkConf().setAll(config)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/02 21:07:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock reference point; the final cell prints the minutes elapsed since here.
start = time.time()

In [4]:
WINDOW_SIZE = 7

# Chronological split fractions; they must add up to the whole data set.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

NUM_PROCS = 10

# Compare with a tolerance: decimal fractions are not exactly representable in
# binary floating point, so a valid split such as 0.7 / 0.2 / 0.1 adds up to
# 0.9999999999999999 and would fail an exact `== 1` check.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC, VALIDATION_PERC and TEST_PERC must sum to 1"
)

In [5]:
def get_pandas(df, path="temp.parquet", tz_shift_hours=2):
    """Materialise a Spark DataFrame as a pandas DataFrame via a parquet file.

    The frame is written to ``path`` (overwriting whatever is there) and read
    back with pandas.

    Parameters
    ----------
    df : Spark DataFrame
        Frame to convert; only ``df.write.parquet`` is used.
    path : str, default "temp.parquet"
        Scratch location for the intermediate parquet data. Every call with the
        same path replaces the previous contents.
    tz_shift_hours : int or float, default 2
        Hours added to the ``timestamp`` column, when present, to compensate for
        the timezone discrepancy between Spark and pandas. Pass 0 to leave the
        timestamps untouched.

    Returns
    -------
    pandas.DataFrame
    """
    df.write.parquet(path, mode="overwrite")
    result = pd.read_parquet(path)
    if tz_shift_hours and "timestamp" in result.columns:
        # Because of tz discrepancy
        result["timestamp"] = result["timestamp"] + timedelta(hours=tz_shift_hours)
    return result

In [6]:
# Load the staged transactions and remember how many rows there are so that
# the effect of the edge pruning below can be reported as a percentage.
data = spark.read.parquet(s.STAGED_DATA_LOCATION)
data_count_original = data.count()

In [7]:
%%time

# Maximum number of heaviest edges retained per source and per target node.
KEEP_TOP_N = 100

# Total amount per (source, target) edge, weighted and ordered heaviest first.
edge_totals = data.groupby(["source", "target"]).agg(sf.sum("amount").alias("amount"))
data_agg_weights = get_weights(get_pandas(edge_totals)).sort_values("weight", ascending=False)

# First cap the edges per source, then re-sort and cap the survivors per target.
top_per_source = (
    data_agg_weights.groupby("source")
    .head(KEEP_TOP_N)
    .reset_index(drop=True)
    .sort_values("weight", ascending=False)
)
top_per_target = top_per_source.groupby("target").head(KEEP_TOP_N).reset_index(drop=True)
edges_to_keep = spark.createDataFrame(
    top_per_target.loc[:, ["source", "target"]].drop_duplicates()
)

# Keep only the transactions that travel along a retained edge.
edge_filter = edges_to_keep.select(
    sf.col("source").alias("src"), sf.col("target").alias("dst")
)
same_edge = (sf.col("source") == sf.col("src")) & (sf.col("target") == sf.col("dst"))
data_graph = (
    data.join(edge_filter, same_edge)
    .drop("src", "dst")
    .persist(StorageLevel.DISK_ONLY)
)

data_count_graph = data_graph.count()
reduction = round((data_count_graph / data_count_original) * 100, 2)
print(f"\nReduced to {reduction}%\n")

25/07/02 21:09:50 WARN TaskSetManager: Stage 7 contains a task of very large size (16227 KiB). The maximum recommended task size is 1000 KiB.


Reduced to 90.53%

CPU times: user 1min 49s, sys: 1.15 s, total: 1min 50s
Wall time: 3min 25s


                                                                                

In [8]:
%%time

# Self-join of the pruned graph: pair every transaction A -> B (the "left_"
# side) with the transactions B -> C that happen at the same time or later.
left = data_graph.select("source", "target", "timestamp", "amount")
left = left.select(*[sf.col(name).alias(f"left_{name}") for name in left.columns])
right = data_graph.select("source", "target", "timestamp", "amount")

continues_later = (left["left_target"] == right["source"]) & (
    left["left_timestamp"] <= right["timestamp"]
)
paired = left.join(right, continues_later, how="inner")

# One row per (A, B, C) path carrying the summed incoming and outgoing amounts.
flows_temporal = (
    paired.groupby(["left_source", "left_target", "source", "target"])
    .agg(
        sf.sum("left_amount").alias("left_amount"),
        sf.sum("amount").alias("amount"),
    )
    .drop("left_target")
    .persist(StorageLevel.DISK_ONLY)
)

flows_temporal.count()



CPU times: user 209 ms, sys: 77.5 ms, total: 287 ms
Wall time: 4min 28s


                                                                                

28558658

In [9]:
# Transaction ids in chronological order; the splits below are slices of this array.
ids_by_time = get_pandas(data.sort("timestamp").select("transaction_id"))
trx_ids_sorted = ids_by_time["transaction_id"].values
trx_count = len(trx_ids_sorted)
print(trx_count)

                                                                                

179504480


In [10]:
%%time

def _persist_index(transaction_ids, path):
    """Write the ids to `path`, reload them as a persisted Spark frame and print its row count."""
    pd.DataFrame(transaction_ids, columns=["transaction_id"]).to_parquet(path)
    frame = spark.read.parquet(path).repartition(1).persist(StorageLevel.DISK_ONLY)
    print(frame.count())
    return frame


# Chronological split: the oldest TRAIN_PERC of the transactions go to train, the
# next VALIDATION_PERC to validation and whatever remains to test.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))

train_indexes = _persist_index(
    trx_ids_sorted[:last_train_index], "temp-train.parquet"
)
validation_indexes = _persist_index(
    trx_ids_sorted[last_train_index:last_validation_index], "temp-valid.parquet"
)
test_indexes = _persist_index(
    trx_ids_sorted[last_validation_index:], "temp-test.parquet"
)
print()

                                                                                

107702688


                                                                                

35900896


[Stage 58:>                                                         (0 + 1) / 1]

35900896

CPU times: user 175 ms, sys: 237 ms, total: 413 ms
Wall time: 32.8 s


                                                                                

In [11]:
# Attach the full transaction rows to each split's id list.
train, validation, test = (
    split_ids.join(data, on="transaction_id", how="left")
    for split_ids in (train_indexes, validation_indexes, test_indexes)
)
train_validation = train.union(validation)

In [12]:
# Directory for this run's feature files: "features/<OUTPUT_POSTFIX>" with any
# leading "-" characters stripped from the postfix.
location_main_features = os.path.join("features", s.OUTPUT_POSTFIX.lstrip("-"))

In [13]:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [14]:
# Sub-directory that receives the train/validation/test feature files. The
# trailing separator is kept because later cells build paths from this string.
location_train = f"{location_main_features}{os.sep}train{os.sep}"

# Creates `location_main_features` as well, since it is a parent of
# `location_train`. `exist_ok=True` tolerates directories that are already
# there but still fails if a regular file occupies the path.
os.makedirs(location_train, exist_ok=True)

In [15]:
# %%time

# data_input = data.select("*")
# nodes_source = set(get_pandas(data.select("source").distinct())["source"])
# nodes_target = set(get_pandas(data.select("target").distinct())["target"])
# nodes_passthrough = nodes_source.intersection(nodes_target)

# %run communities_global.ipynb

# comm_as_source_features.to_parquet(f"{location_main_features}/comm_as_source_features.parquet")
# comm_as_target_features.to_parquet(f"{location_main_features}/comm_as_target_features.parquet")
# comm_as_passthrough_features.to_parquet(f"{location_main_features}/comm_as_passthrough_features.parquet")
# comm_as_passthrough_features_reverse.to_parquet(f"{location_main_features}/comm_as_passthrough_features_reverse.parquet")

In [16]:
# Load the community features that were saved to disk earlier (the cell that
# computes them is commented out above).
(
    comm_as_source_features,
    comm_as_target_features,
    comm_as_passthrough_features,
    comm_as_passthrough_features_reverse,
) = (
    pd.read_parquet(f"{location_main_features}/{name}.parquet")
    for name in (
        "comm_as_source_features",
        "comm_as_target_features",
        "comm_as_passthrough_features",
        "comm_as_passthrough_features_reverse",
    )
)

In [17]:
%%time

# Reference instant one minute before the earliest transaction in the graph.
earliest_timestamp = data_graph.select(sf.min("timestamp").alias("x")).collect()[0]["x"]
ts_min = earliest_timestamp - timedelta(minutes=1)

# Each transaction becomes an [offset from ts_min cast to long, amount] pair.
offset_and_amount = sf.array(
    (sf.col("timestamp") - ts_min).cast("long"), sf.col("amount")
)

# One row per edge (plus bank and currency attributes) with totals and the
# JSON-encoded list of the pairs above.
edge_keys = ["source", "target", "source_bank", "target_bank", "source_currency"]
data_graph_agg = get_pandas(
    data_graph.groupby(edge_keys).agg(
        sf.count("source").alias("num_transactions"),
        sf.sum("amount").alias("amount"),
        sf.sum("source_amount").alias("source_amount"),
        sf.to_json(sf.collect_list(offset_and_amount)).alias("timestamps_amounts"),
    )
)

                                                                                

CPU times: user 3.97 s, sys: 913 ms, total: 4.88 s
Wall time: 27.2 s


In [19]:
# %%time

# %run model_experiment_nested_new.ipynb

# all_features = all_features.join(
#     comm_as_source_features, how="left", rsuffix="_dispense"
# ).join(
#     comm_as_target_features, how="left", rsuffix="_sink"
# ).join(
#     comm_as_passthrough_features, how="left", rsuffix="_passthrough"
# ).join(
#     comm_as_passthrough_features_reverse, how="left", rsuffix="_passthrough_rev"
# )

# all_features.to_parquet(f"{location_main_features}/features_if.parquet")

Constructing features
CPU times: user 14 s, sys: 8.82 s, total: 22.8 s
Wall time: 1h 16min 46s
CPU times: user 5min 41s, sys: 32.1 s, total: 6min 13s
Wall time: 2h 34min 47s


In [None]:
# Load the features saved as features_if.parquet (the cell that writes this
# file is commented out above).
all_features = pd.read_parquet(f"{location_main_features}/features_if.parquet")

In [20]:
%%time

# Fixed seed so the anomaly scores, and every feature derived from them, are
# identical across kernel restarts.
ISOLATION_FOREST_SEED = 42

# Fill missing values once and reuse the result for both fit and scoring.
features_filled = all_features.fillna(0)

# Column-less frame that only carries the index of `all_features`; the copy
# makes it an independent object that is safe to assign into.
anomalies = all_features.loc[:, []].copy()
isolation_forest = IsolationForest(random_state=ISOLATION_FOREST_SEED).fit(features_filled)
anomalies["anomaly_score"] = isolation_forest.decision_function(features_filled)

# Shift the scores by the magnitude of the smallest one.
anomalies["anomaly_score"] += abs(anomalies["anomaly_score"].min())

In [21]:
%%time

# Project the L1-row-normalised features onto 10 principal components.
pca = PCA(n_components=10)
features_normalized = normalize(all_features.fillna(0), norm="l1", axis=1)
components = pca.fit_transform(features_normalized)
print(sum(pca.explained_variance_ratio_))

# Columns are named pca_1 ... pca_N; the index is taken from `all_features`.
all_features_dim_reduced = pd.DataFrame(
    components,
    index=all_features.index,
    columns=[f"pca_{position + 1}" for position in range(components.shape[1])],
)
# del all_features

0.998574501186977
CPU times: user 27.9 s, sys: 1.49 s, total: 29.4 s
Wall time: 4.6 s


In [None]:
%%time

# Build one feature record per (source, target) edge from all its transactions.
edge_transactions = get_pandas(
    data.select("source", "target", "source_currency", "source_amount", "amount", "format")
)

edge_records = []
for position, (edge, transactions) in enumerate(
    edge_transactions.groupby(["source", "target"])
):
    # Progress marker every 100,000 edges.
    if position % 100_000 == 0:
        print(position)
    edge_records.append(get_edge_features(edge, transactions))

edge_features = pd.DataFrame(edge_records)
edge_features.to_parquet(f"{location_main_features}/features_edges.parquet")

                                                                                

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


In [None]:
# Reload the edge features written to features_edges.parquet by the previous cell.
edge_features = pd.read_parquet(f"{location_main_features}/features_edges.parquet")

In [None]:
# Distinct (source, target) pairs of each split, used as a pandas MultiIndex.
train_edges, valid_edges, test_edges = (
    get_pandas(split.select("source", "target").drop_duplicates()).set_index(
        ["source", "target"]
    )
    for split in (train, validation, test)
)

In [None]:
# Index the edge features once and reuse that frame for all three splits.
edge_features_by_edge = edge_features.set_index(["source", "target"])

train_features = train_edges.join(edge_features_by_edge, how="left").reset_index()
validation_features = valid_edges.join(edge_features_by_edge, how="left").reset_index()
test_features = test_edges.join(edge_features_by_edge, how="left").reset_index()

In [None]:
# Attach the anomaly score of each edge's target node, then of its source node
# (the source copy receives the "_source" suffix).
train_features = train_features.set_index("target").join(anomalies, how="left").reset_index()
train_features = (
    train_features.set_index("source")
    .join(anomalies, how="left", rsuffix="_source")
    .reset_index()
)

# Same two steps for the PCA-reduced node features.
train_features = (
    train_features.set_index("target")
    .join(all_features_dim_reduced, how="left")
    .reset_index()
)
train_features = (
    train_features.set_index("source")
    .join(all_features_dim_reduced, how="left", rsuffix="_source")
    .reset_index()
)

# Row 0: target-side scores, row 1: source-side scores.
train_score_pair = np.array(
    [
        train_features["anomaly_score"].values,
        train_features["anomaly_score_source"].values,
    ]
)
train_features["anom_scores_diff"] = (
    train_features["anomaly_score"] - train_features["anomaly_score_source"]
)
train_features["anom_scores_min"] = train_score_pair.min(axis=0)
train_features["anom_scores_max"] = train_score_pair.max(axis=0)
train_features["anom_scores_mean"] = train_score_pair.mean(axis=0)

train_features.to_parquet(f"{location_train}/train.parquet")

In [None]:
# Attach the anomaly score of each edge's target node, then of its source node
# (the source copy receives the "_source" suffix).
validation_features = (
    validation_features.set_index("target").join(anomalies, how="left").reset_index()
)
validation_features = (
    validation_features.set_index("source")
    .join(anomalies, how="left", rsuffix="_source")
    .reset_index()
)

# Same two steps for the PCA-reduced node features.
validation_features = (
    validation_features.set_index("target")
    .join(all_features_dim_reduced, how="left")
    .reset_index()
)
validation_features = (
    validation_features.set_index("source")
    .join(all_features_dim_reduced, how="left", rsuffix="_source")
    .reset_index()
)

# Row 0: target-side scores, row 1: source-side scores.
validation_score_pair = np.array(
    [
        validation_features["anomaly_score"].values,
        validation_features["anomaly_score_source"].values,
    ]
)
validation_features["anom_scores_diff"] = (
    validation_features["anomaly_score"] - validation_features["anomaly_score_source"]
)
validation_features["anom_scores_min"] = validation_score_pair.min(axis=0)
validation_features["anom_scores_max"] = validation_score_pair.max(axis=0)
validation_features["anom_scores_mean"] = validation_score_pair.mean(axis=0)

validation_features.to_parquet(f"{location_train}/validation.parquet")

In [None]:
# Attach the anomaly score of each edge's target node, then of its source node
# (the source copy receives the "_source" suffix).
test_features = test_features.set_index("target").join(anomalies, how="left").reset_index()
test_features = (
    test_features.set_index("source")
    .join(anomalies, how="left", rsuffix="_source")
    .reset_index()
)

# Same two steps for the PCA-reduced node features.
test_features = (
    test_features.set_index("target")
    .join(all_features_dim_reduced, how="left")
    .reset_index()
)
test_features = (
    test_features.set_index("source")
    .join(all_features_dim_reduced, how="left", rsuffix="_source")
    .reset_index()
)

# Row 0: target-side scores, row 1: source-side scores.
test_score_pair = np.array(
    [
        test_features["anomaly_score"].values,
        test_features["anomaly_score_source"].values,
    ]
)
test_features["anom_scores_diff"] = (
    test_features["anomaly_score"] - test_features["anomaly_score_source"]
)
test_features["anom_scores_min"] = test_score_pair.min(axis=0)
test_features["anom_scores_max"] = test_score_pair.max(axis=0)
test_features["anom_scores_mean"] = test_score_pair.mean(axis=0)

test_features.to_parquet(f"{location_train}/test.parquet")

In [None]:
columns = ["source", "target", "source_currency", "target_currency", "format", "amount", "is_laundering"]
columns_category = ["source_currency", "target_currency", "format"]


def _get_trx_features(split):
    """Return the per-transaction columns of a split as pandas, plus an `inter_currency` flag."""
    frame = get_pandas(split.select(*columns))
    # True when the transaction converts between two different currencies.
    frame["inter_currency"] = frame["source_currency"] != frame["target_currency"]
    return frame


train_trx_features = _get_trx_features(train)
valid_trx_features = _get_trx_features(validation)
test_trx_features = _get_trx_features(test)

# One categorical dtype per column, built from the values seen in all three
# splits. Casting each split to a bare "category" would derive the categories,
# and therefore the integer codes, from that split alone, so the same value
# could get different codes in train and test.
trx_frames = (train_trx_features, valid_trx_features, test_trx_features)
new_types = {
    column: pd.CategoricalDtype(
        sorted(set().union(*(frame[column].dropna().unique() for frame in trx_frames)))
    )
    for column in columns_category
}

# `DataFrame.astype` is the pandas method; `astypes` does not exist.
train_trx_features = train_trx_features.astype(new_types)
valid_trx_features = valid_trx_features.astype(new_types)
test_trx_features = test_trx_features.astype(new_types)

In [None]:
%%time

# Columns that are kept aside as labels / edge identifiers and not fed to the model.
label_columns = ["source", "target", "is_laundering"]

# Reload the edge-level feature files written by the three cells above.
# NOTE(review): none of the visible cells adds an "is_laundering" column to these
# files, so the `del` statements below raise KeyError unless `get_edge_features`
# emits that column - confirm.
train_features = pd.read_parquet(f"{location_train}/train.parquet")
del train_features["is_laundering"]
validation_features = pd.read_parquet(f"{location_train}/validation.parquet")
del validation_features["is_laundering"]
test_features = pd.read_parquet(f"{location_train}/test.parquet")
del test_features["is_laundering"]

# Broadcast the edge-level features onto every transaction of the split by
# joining on the (source, target) pair.
train_features = train_trx_features.set_index(["source", "target"]).join(
    train_features.set_index(["source", "target"]), how="left"
).reset_index()
# TODO: Why is `format_Reinvestment` not in validation/test data
# NOTE(review): presumably this column comes from `get_edge_features`; confirm.
del train_features["format_Reinvestment"]

validation_features = valid_trx_features.set_index(["source", "target"]).join(
    validation_features.set_index(["source", "target"]), how="left"
).reset_index()

test_features = test_trx_features.set_index(["source", "target"]).join(
    test_features.set_index(["source", "target"]), how="left"
).reset_index()

# Keep a copy of the label columns, then strip the label and the edge
# identifiers from the training matrix.
train_features_labels = train_features.loc[:, label_columns].copy(deep=True)
del train_features["is_laundering"]
del train_features["source"]
del train_features["target"]

# Validation and test matrices are restricted to exactly the training columns,
# in the same order.
validation_features_labels = validation_features.loc[:, label_columns].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

test_features_labels = test_features.loc[:, label_columns].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]
# NOTE(review): `test_labels_orig` is referenced by the cross-validation cell
# further down but is only assigned in the commented-out line below.
# test_labels_orig = test.select(["source", "target", "is_laundering"]).toPandas()

In [None]:
def f1_eval(y, y_):
    """Return 1 - F1 of the rounded predictions `y_` against the labels `y`.

    Passed to XGBoost as `eval_metric` in `train_model`.
    """
    hard_predictions = np.round(y_)
    return 1 - f1_score(y, hard_predictions)

In [None]:
# For HI
# def train_model(x, y, x_, y_):
#     model = xgb.XGBClassifier(
#         early_stopping_rounds=20, scale_pos_weight=10,
#         eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
#         colsample_bytree=0.5, subsample=0.5, enable_categorical=True,
#     )
#     model.fit(x, y, verbose=False, eval_set=[(x_, y_)])
#     print(f"Best iteration: {model.best_iteration}\n")
#     return model


# For LI
def train_model(x, y, x_, y_):
    """Fit an XGBoost classifier on (x, y), early-stopped on the evaluation set (x_, y_).

    Prints the best iteration and returns the fitted model.
    """
    # model = xgb.XGBClassifier(
    #     scale_pos_weight=10,
    #     eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
    #     colsample_bytree=0.5, subsample=0.5, n_estimators=1000,
    #     enable_categorical=True,
    # )
    classifier_params = dict(
        n_estimators=500,
        early_stopping_rounds=20,
        num_parallel_tree=5,
        max_depth=6,
        colsample_bytree=0.5,
        subsample=0.5,
        scale_pos_weight=5,
        eval_metric=f1_eval,
        disable_default_eval_metric=True,
        enable_categorical=True,
    )
    model = xgb.XGBClassifier(**classifier_params)
    model.fit(x, y, eval_set=[(x_, y_)], verbose=False)
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [None]:
def get_orig_prediction_data(labels_data, labels_orig, prediction_values):
    """Attach edge-level predictions to the rows of `labels_orig`.

    `prediction_values` holds one prediction per row of `labels_data`. They are
    reduced to the maximum per (source, target) pair and left-joined onto
    `labels_orig`, so pairs without any prediction get a missing `predicted`
    value. Neither input frame is modified.
    """
    edge_key = ["source", "target"]

    scored = labels_data.copy(deep=True)
    scored.loc[:, "predicted"] = prediction_values
    # Indexed by (source, target) straight from the group-by.
    max_per_edge = scored.groupby(edge_key).agg(predicted=("predicted", "max"))

    originals = labels_orig.copy(deep=True).set_index(edge_key)
    return originals.join(max_per_edge, how="left").reset_index()

In [None]:
%%time

# Train on the full training split, early-stopping on the validation split.
y_train = train_features_labels["is_laundering"].values
y_validation = validation_features_labels["is_laundering"].values
model = train_model(train_features, y_train, validation_features, y_validation)

# Score the held-out test split: F1 and recall as percentages.
y_test_predicted = model.predict(test_features)
y_test_true = test_features_labels["is_laundering"]
f1_percent = round(f1_score(y_test_true, y_test_predicted) * 100, 2)
recall_percent = round(recall_score(y_test_true, y_test_predicted) * 100, 2)
print("aggregated", f1_percent, recall_percent)
# predictions_data = get_orig_prediction_data(
#     test_features_labels, test_labels_orig, y_test_predicted
# )
# f1_final = round(f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
# print(
#     "final",
#     f1_final,
#     round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
# )
print()

In [None]:
%%time

# Fraction of the train / validation rows drawn for each fold, and fold count.
CV_FOLD_PERC = 0.8
N_FOLDS = 5

# Transaction-level labels of the test split. The only other assignment of this
# name in the notebook is commented out, so it is built here to keep the cell
# runnable on a fresh kernel.
test_labels_orig = get_pandas(test.select("source", "target", "is_laundering"))

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    # Seed the subsampling with the fold number so every run draws the same folds.
    x_train = train_features.sample(frac=CV_FOLD_PERC, random_state=fold)
    # The column-less frame keeps only the sampled index; joining on it picks
    # the matching label rows.
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC, random_state=fold)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_laundering"].values,
        x_validation, x_validation_labels["is_laundering"].values
    )
    y_test_predicted = model.predict(test_features)
    # Every fold is evaluated on the same, complete test split.
    predictions_data = get_orig_prediction_data(
        test_features_labels, test_labels_orig, y_test_predicted
    )
    f1_cv = f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
    )
    f1_scores.append(f1_cv)

In [None]:
# Mean cross-validation F1 with its standard deviation. `f1_final` is only
# assigned in commented-out code, so the mean over the folds is reported.
print(f"{round(np.mean(f1_scores), 2)} ±{round(np.std(f1_scores), 2)}")

In [None]:
# Whole minutes elapsed since `start` was recorded near the top of the notebook.
print((time.time() - start) // 60)