In [1]:
import os
import pickle
import shutil
import sys
import time
from datetime import timedelta, datetime

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import settings as s
from common import create_workload_for_multi_proc, get_weights, MULTI_PROC_INPUT, MULTI_PROC_OUTPUT
from communities import get_communities_multi_proc
from features import get_features_multi_proc, get_edge_features, get_edge_features_multi_proc

%load_ext autoreload
%autoreload 2

In [2]:
shutil.rmtree("artifacts", ignore_errors=True)

config = [
    ("spark.driver.memory", "16g"),
    ("spark.worker.memory", "16g"),
    ("spark.driver.maxResultSize", "16g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
]
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(config))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/04 00:02:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
start = time.time()

In [4]:
WINDOW_SIZE = 7
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

NUM_PROCS = 10

assert(sum([TRAIN_PERC, VALIDATION_PERC, TEST_PERC]) == 1)

In [5]:
location_main_features = os.path.join("features", s.OUTPUT_POSTFIX.lstrip("-"))
# shutil.rmtree(location_main_features, ignore_errors=True)

location_flow_dispense = f"{location_main_features}{os.sep}flow_dispense.parquet"
location_flow_passthrough = f"{location_main_features}{os.sep}flow_passthrough.parquet"
location_flow_sink = f"{location_main_features}{os.sep}flow_sink.parquet"

location_comm_as_source_features = f"{location_main_features}/comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main_features}/comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main_features}/comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main_features}/comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main_features}/features_node_level.parquet"
location_features_edges = f"{location_main_features}/features_edges.parquet"

try:
    os.makedirs(location_main_features)
except FileExistsError:
    pass

In [6]:
data = spark.read.parquet(s.STAGED_DATA_LOCATION)
data_count_original = data.count()

In [7]:
%%time

KEEP_TOP_N = 100

data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)
data_agg_weights.sort_values("weight", ascending=False, inplace=True)

edges_to_keep = data_agg_weights.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
edges_to_keep.sort_values("weight", ascending=False, inplace=True)
edges_to_keep = edges_to_keep.groupby("target").head(KEEP_TOP_N).reset_index(drop=True)
edges_to_keep = edges_to_keep.loc[:, ["source", "target"]].drop_duplicates()
edges_to_keep = spark.createDataFrame(edges_to_keep)

data_graph = data.join(
    edges_to_keep.select(sf.col("source").alias("src"), sf.col("target").alias("dst")),
    (sf.col("source") == sf.col("src")) &
    (sf.col("target") == sf.col("dst"))
).drop("src", "dst").persist(StorageLevel.DISK_ONLY)
data_count_graph = data_graph.count()
reduction = round((data_count_graph / data_count_original) * 100, 2)
print(f"\nReduced to {reduction}%\n")




Reduced to 90.53%

CPU times: user 59.8 s, sys: 1.07 s, total: 1min
Wall time: 3min 28s


                                                                                

In [8]:
%%time

left = data_graph.select("source", "target", "timestamp", "amount")
select = []
for column in left.columns:
    select.append(sf.col(column).alias(f"left_{column}"))
left = left.select(*select)
right = data_graph.select("source", "target", "timestamp", "amount")

flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
).persist(StorageLevel.DISK_ONLY)
flows_temporal.count()
flows_temporal = flows_temporal.toPandas()


# TODO: This can be made much faster!
flow_dispense, flow_passthrough, flow_sink = [], [], []
for flow_data, flow_type in [
    (flow_dispense, "dispense"), (flow_passthrough, "passthrough"), (flow_sink, "sink")
]:
    print(flow_type)
    prefix = f"{s.G_FLOW_PREFIX}{flow_type}_"
    for key, group in flows_temporal.groupby(flow_type):
        cycle = group[(group["dispense"] == group["sink"]) & (group["dispense"] != group["passthrough"])]
        row = {
            "key": key,
            f"{prefix}amount_sum": group["amount"].sum(),
            f"{prefix}amount_mean": group["amount"].mean(),
            f"{prefix}amount_max": group["amount"].max(),
            f"{prefix}dispense_count": group["dispense"].nunique(),
            f"{prefix}passthrough_count": group["passthrough"].nunique(),
            f"{prefix}sink_count": group["sink"].nunique(),
            f"{prefix}cycle_sum": cycle["amount"].sum(),
            f"{prefix}cycle_mean": cycle["amount"].mean(),
            f"{prefix}cycle_max": cycle["amount"].max(),
            f"{prefix}cycle_passthrough_count": cycle["passthrough"].nunique(),
        }
        flow_data.append(row)

pd.DataFrame(flow_dispense).set_index("key").to_parquet(location_flow_dispense)
pd.DataFrame(flow_passthrough).set_index("key").to_parquet(location_flow_passthrough)
pd.DataFrame(flow_sink).set_index("key").to_parquet(location_flow_sink)

del flows_temporal
del flow_dispense
del flow_passthrough
del flow_sink

                                                                                

dispense
passthrough
sink
CPU times: user 23min 52s, sys: 21.4 s, total: 24min 14s
Wall time: 28min 26s


In [9]:
%%time

trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)
print(trx_count)

last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
test_indexes = trx_ids_sorted[last_validation_index:]

train_indexes = spark.createDataFrame(
    pd.DataFrame(train_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)
validation_indexes = spark.createDataFrame(
    pd.DataFrame(validation_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)
test_indexes = spark.createDataFrame(
    pd.DataFrame(test_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)

train = train_indexes.join(
    data, on="transaction_id", how="left"
)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
)
test = test_indexes.join(
    data, on="transaction_id", how="left"
)
train_validation = train.union(validation)

                                                                                

179504480
CPU times: user 1.78 s, sys: 947 ms, total: 2.73 s
Wall time: 16.1 s


In [12]:
%%time

data_input = data.select("*")
nodes_source = set(data.select("source").distinct().toPandas()["source"])
nodes_target = set(data.select("target").distinct().toPandas()["target"])
nodes_passthrough = nodes_source.intersection(nodes_target)

%run nested_communities_global.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 6,792,234 | 1,707,738
Processed hop #2 | 12,751,288 | 1,513,895
Processed hop #3 | 23,531,566 | 1,456,691
Processed hop #4 | 32,069,701 | 1,440,156

Processing comm_as_target

Processed hop #1 | 6,802,442 | 1,352,114
Processed hop #2 | 19,753,739 | 1,234,156
Processed hop #3 | 35,125,799 | 1,216,266
Processed hop #4 | 46,743,035 | 1,207,335

Processing comm_as_passthrough

Processed hop #1 | 6,279,937 | 1,329,490
Processed hop #2 | 11,029,056 | 1,154,274
Processed hop #3 | 20,118,081 | 1,109,985
Processed hop #4 | 25,603,374 | 1,096,681

Processing comm_as_passthrough_reverse

Processed hop #1 | 6,669,849 | 1,323,277
Processed hop #2 | 19,196,985 | 1,205,401
Processed hop #3 | 34,298,330 | 1,187,666
Processed hop #4 | 45,818,132 | 1,178,795
CPU times: user 2min 39s, sys: 5.06 s, total: 2min 44s
Wall time: 2min 45s
CPU times: user 2min 19s, sys: 3.49 s, total: 2min 22s
Wall time: 2min 22s
CPU times: user 2min 3s, sys: 2.24 s, total: 2min 6s

In [10]:
%%time

ts_min = data_graph.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data_graph.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
data_graph_agg = data_graph_agg.toPandas()
data_graph_agg = data_graph_agg.convert_dtypes()

                                                                                

CPU times: user 15.8 s, sys: 4.88 s, total: 20.7 s
Wall time: 38.4 s


In [18]:
%%time

%run nested_features_generation.ipynb

all_features = all_features.join(
    pd.read_parquet(location_comm_as_source_features), how="left", rsuffix="_dispense"
).join(
    pd.read_parquet(location_comm_as_target_features), how="left", rsuffix="_sink"
).join(
    pd.read_parquet(location_comm_as_passthrough_features), how="left", rsuffix="_passthrough"
).join(
    pd.read_parquet(location_comm_as_passthrough_features_reverse), how="left", rsuffix="_passthrough_rev"
).join(
    pd.read_parquet(location_flow_dispense), how="left"
).join(
    pd.read_parquet(location_flow_passthrough), how="left"
).join(
    pd.read_parquet(location_flow_sink), how="left"
)

all_features.to_parquet(location_features_node_level)

In [None]:
all_features = pd.read_parquet(location_features_node_level)

In [20]:
%%time

anomalies = all_features.loc[:, []]
anomalies.loc[:, "anomaly_score"] = IsolationForest().fit(
    all_features.fillna(0)
).decision_function(all_features.fillna(0))
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min())

In [29]:
%%time

pca = PCA(n_components=5)
all_features_dim_reduced = pd.DataFrame(
    pca.fit_transform(normalize(all_features.fillna(0), norm="l1", axis=1)),
    index=all_features.index
)
print(round(sum(pca.explained_variance_ratio_) * 100, 2))
all_features_dim_reduced.columns = [
    f"pca_{x + 1}" for x in all_features_dim_reduced.columns
]
del all_features

97.96
CPU times: user 26.2 s, sys: 2.6 s, total: 28.8 s
Wall time: 6.39 s


In [None]:
%%time

to_select = ["source", "target", "format", "source_currency", "source_amount", "amount"]

edges_features_input = data.select(to_select).groupby(
    ["source", "target", "format", "source_currency"]
).agg(
    sf.sum("source_amount").alias("source_amount"), sf.sum("amount").alias("amount")
).toPandas()
iterator_chunk_as_pickles, _ = create_workload_for_multi_proc(
    edges_features_input[["source", "target"]].drop_duplicates().shape[0],
    edges_features_input.groupby(["source", "target"]), 
    NUM_PROCS, 
    shuffle=False
)
edge_features = get_edge_features_multi_proc(iterator_chunk_as_pickles)
edge_features.to_parquet(location_features_edges)
del edge_features

                                                                                

In [23]:
edge_features = pd.read_parquet(location_features_edges)

In [31]:
%%time

train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
valid_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)

                                                                                

In [32]:
train_features = train_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
validation_features = valid_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
test_features = test_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()

In [33]:
%%time

train_features = train_features.set_index("target").join(
    anomalies, how="left"
).reset_index().set_index("source").join(
    anomalies, how="left", rsuffix="_source"
).reset_index().set_index("target").join(
    all_features_dim_reduced, how="left"
).reset_index().set_index("source").join(
    all_features_dim_reduced, how="left", rsuffix="_source"
).reset_index()
train_features.loc[:, "anom_scores_diff"] = train_features.loc[:, "anomaly_score"] - train_features.loc[:, "anomaly_score_source"]
train_features.loc[:, "anom_scores_min"] = np.array(
    [
        train_features.loc[:, "anomaly_score"].values, 
        train_features.loc[:, "anomaly_score_source"].values
    ],
).min(axis=0)
train_features.loc[:, "anom_scores_max"] = np.array(
    [
        train_features.loc[:, "anomaly_score"].values, 
        train_features.loc[:, "anomaly_score_source"].values
    ],
).max(axis=0)
train_features.loc[:, "anom_scores_mean"] = np.array(
    [
        train_features.loc[:, "anomaly_score"].values, 
        train_features.loc[:, "anomaly_score_source"].values
    ],
).mean(axis=0)
train_features.to_parquet(f"{location_train}/train.parquet")

CPU times: user 21 s, sys: 3.63 s, total: 24.6 s
Wall time: 25.2 s


In [34]:
%%time

validation_features = validation_features.set_index("target").join(
    anomalies, how="left"
).reset_index().set_index("source").join(
    anomalies, how="left", rsuffix="_source"
).reset_index().set_index("target").join(
    all_features_dim_reduced, how="left"
).reset_index().set_index("source").join(
    all_features_dim_reduced, how="left", rsuffix="_source"
).reset_index()
validation_features.loc[:, "anom_scores_diff"] = validation_features.loc[:, "anomaly_score"] - validation_features.loc[:, "anomaly_score_source"]
validation_features.loc[:, "anom_scores_min"] = np.array(
    [
        validation_features.loc[:, "anomaly_score"].values, 
        validation_features.loc[:, "anomaly_score_source"].values
    ],
).min(axis=0)
validation_features.loc[:, "anom_scores_max"] = np.array(
    [
        validation_features.loc[:, "anomaly_score"].values, 
        validation_features.loc[:, "anomaly_score_source"].values
    ],
).max(axis=0)
validation_features.loc[:, "anom_scores_mean"] = np.array(
    [
        validation_features.loc[:, "anomaly_score"].values, 
        validation_features.loc[:, "anomaly_score_source"].values
    ],
).mean(axis=0)
validation_features.to_parquet(f"{location_train}/validation.parquet")

CPU times: user 14.1 s, sys: 2.29 s, total: 16.3 s
Wall time: 16.5 s


In [35]:
%%time

test_features = test_features.set_index("target").join(
    anomalies, how="left"
).reset_index().set_index("source").join(
    anomalies, how="left", rsuffix="_source"
).reset_index().set_index("target").join(
    all_features_dim_reduced, how="left"
).reset_index().set_index("source").join(
    all_features_dim_reduced, how="left", rsuffix="_source"
).reset_index()
test_features.loc[:, "anom_scores_diff"] = test_features.loc[:, "anomaly_score"] - test_features.loc[:, "anomaly_score_source"]
test_features.loc[:, "anom_scores_min"] = np.array(
    [
        test_features.loc[:, "anomaly_score"].values, 
        test_features.loc[:, "anomaly_score_source"].values
    ],
).min(axis=0)
test_features.loc[:, "anom_scores_max"] = np.array(
    [
        test_features.loc[:, "anomaly_score"].values, 
        test_features.loc[:, "anomaly_score_source"].values
    ],
).max(axis=0)
test_features.loc[:, "anom_scores_mean"] = np.array(
    [
        test_features.loc[:, "anomaly_score"].values, 
        test_features.loc[:, "anomaly_score_source"].values
    ],
).mean(axis=0)
test_features.to_parquet(f"{location_train}/test.parquet")

CPU times: user 15 s, sys: 2.29 s, total: 17.3 s
Wall time: 17.5 s


In [14]:
%%time

columns = ["source", "target", "source_currency", "target_currency", "format", "amount", "is_laundering"]
columns_category = ["source_currency", "target_currency", "format"]
train_trx_features = train.select(*columns).toPandas()
train_trx_features.loc[:, "inter_currency"] = train_trx_features["source_currency"] != train_trx_features["target_currency"]
valid_trx_features = validation.select(*columns).toPandas()
valid_trx_features.loc[:, "inter_currency"] = valid_trx_features["source_currency"] != valid_trx_features["target_currency"]
test_trx_features = test.select(*columns).toPandas()
test_trx_features.loc[:, "inter_currency"] = test_trx_features["source_currency"] != test_trx_features["target_currency"]

train_trx_features.to_parquet(f"{location_train}/train_trx_features")
valid_trx_features.to_parquet(f"{location_train}/valid_trx_features")
test_trx_features.to_parquet(f"{location_train}/test_trx_features")

del train_trx_features
del valid_trx_features
del test_trx_features

                                                                                

CPU times: user 56.8 s, sys: 26.8 s, total: 1min 23s
Wall time: 2min 47s


In [28]:
train_trx_features = spark.read.parquet(f"{location_train}/train_trx_features")
valid_trx_features = spark.read.parquet(f"{location_train}/valid_trx_features")
test_trx_features = spark.read.parquet(f"{location_train}/test_trx_features")

In [38]:
%%time

label_columns = ["source", "target", "is_laundering"]
new_types = {column: "category" for column in columns_category}
new_types.update({"is_laundering": bool})

# train_features = spark.read.parquet(f"{location_train}/train.parquet")
# train_trx_features = train_trx_features.withColumn(
#     "source_left", sf.col("source")
# ).withColumn(
#     "target_left", sf.col("target")
# ).drop("source", "target")
# train_features = train_trx_features.join(
#     train_features,
#     (train_trx_features["source_left"] == train_features["source"]) &
#     (train_trx_features["target_left"] == train_features["target"]),
#     how="left"
# ).drop("source_left", "target_left")
# train_features = train_features.toPandas()
# train_features = train_features.astype(new_types)

validation_features = spark.read.parquet(f"{location_train}/validation.parquet")
valid_trx_features = valid_trx_features.withColumn(
    "source_left", sf.col("source")
).withColumn(
    "target_left", sf.col("target")
).drop("source", "target")
validation_features = valid_trx_features.join(
    validation_features,
    (valid_trx_features["source_left"] == validation_features["source"]) &
    (valid_trx_features["target_left"] == validation_features["target"]),
    how="left"
).drop("source_left", "target_left")
validation_features = validation_features.toPandas()
validation_features = validation_features.astype(new_types)

test_features = spark.read.parquet(f"{location_train}/test.parquet")
test_trx_features = test_trx_features.withColumn(
    "source_left", sf.col("source")
).withColumn(
    "target_left", sf.col("target")
).drop("source", "target")
test_features = test_trx_features.join(
    test_features,
    (test_trx_features["source_left"] == test_features["source"]) &
    (test_trx_features["target_left"] == test_features["target"]),
    how="left"
).drop("source_left", "target_left")
test_features = test_features.toPandas()
test_features = test_features.astype(new_types)

                                                                                

CPU times: user 21.6 s, sys: 11.9 s, total: 33.6 s
Wall time: 1min 52s


In [39]:
%%time

label_columns = ["source", "target", "is_laundering"]

# TODO: Why is `format_Reinvestment` not in validation/test data
# del train_features["format_Reinvestment"]

train_features_labels = train_features.loc[:, label_columns].copy(deep=True)
del train_features["is_laundering"]
del train_features["source"]
del train_features["target"]

validation_features_labels = validation_features.loc[:, label_columns].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

test_features_labels = test_features.loc[:, label_columns].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]

CPU times: user 10.4 s, sys: 8.29 s, total: 18.6 s
Wall time: 23.4 s


In [40]:
def f1_eval(y, y_):
    return 1 - f1_score(y, np.round(y_))

In [41]:
# For HI
# def train_model(x, y, x_, y_):
#     model = xgb.XGBClassifier(
#         early_stopping_rounds=20, scale_pos_weight=10,
#         eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
#         colsample_bytree=0.5, subsample=0.5, enable_categorical=True,
#     )
#     model.fit(x, y, verbose=False, eval_set=[(x_, y_)])
#     print(f"Best iteration: {model.best_iteration}\n")
#     return model


# For LI
def train_model(x, y, x_, y_):
    # model = xgb.XGBClassifier(
    #     scale_pos_weight=10,
    #     eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=20, max_depth=6,
    #     colsample_bytree=0.5, subsample=0.5, n_estimators=1000,
    #     enable_categorical=True,
    # )
    model = xgb.XGBClassifier(
        early_stopping_rounds=20, scale_pos_weight=5,
        eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=1, max_depth=6,
        colsample_bytree=1, subsample=1, n_estimators=500,
        enable_categorical=True,
    )
    model.fit(x, y, verbose=True, eval_set=[(x_, y_)])
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [42]:
%%time

model = train_model(
    train_features, train_features_labels["is_laundering"].values, 
    validation_features, validation_features_labels["is_laundering"].values
)
y_test_predicted = model.predict(test_features)
print(
    "aggregated",
    round(f1_score(test_features_labels["is_laundering"], y_test_predicted) * 100, 2),
    round(recall_score(test_features_labels["is_laundering"], y_test_predicted) * 100, 2)
)
# predictions_data = get_orig_prediction_data(
#     test_features_labels, test_labels_orig, y_test_predicted
# )
# f1_final = round(f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
# print(
#     "final",
#     f1_final,
#     round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
# )
print()

[0]	validation_0-f1_eval:0.53664
[1]	validation_0-f1_eval:0.51116
[2]	validation_0-f1_eval:0.50245
[3]	validation_0-f1_eval:0.49714
[4]	validation_0-f1_eval:0.49438
[5]	validation_0-f1_eval:0.49283
[6]	validation_0-f1_eval:0.49157
[7]	validation_0-f1_eval:0.49153
[8]	validation_0-f1_eval:0.48984
[9]	validation_0-f1_eval:0.49019
[10]	validation_0-f1_eval:0.48905
[11]	validation_0-f1_eval:0.48755
[12]	validation_0-f1_eval:0.48613
[13]	validation_0-f1_eval:0.48505
[14]	validation_0-f1_eval:0.48358
[15]	validation_0-f1_eval:0.48341
[16]	validation_0-f1_eval:0.48268
[17]	validation_0-f1_eval:0.48437
[18]	validation_0-f1_eval:0.48031
[19]	validation_0-f1_eval:0.47996
[20]	validation_0-f1_eval:0.47923
[21]	validation_0-f1_eval:0.47921
[22]	validation_0-f1_eval:0.47892
[23]	validation_0-f1_eval:0.47857
[24]	validation_0-f1_eval:0.47760
[25]	validation_0-f1_eval:0.47734
[26]	validation_0-f1_eval:0.47728
[27]	validation_0-f1_eval:0.47691
[28]	validation_0-f1_eval:0.47637
[29]	validation_0-f1_eva

In [None]:
%%time

CV_FOLD_PERC = 0.8
N_FOLDS = 5

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    x_train = train_features.sample(frac=CV_FOLD_PERC)
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_laundering"].values, 
        x_validation, x_validation_labels["is_laundering"].values
    )
    y_test_predicted = model.predict(test_features)
    predictions_data = get_orig_prediction_data(
        test_features_labels, test_labels_orig, y_test_predicted
    )
    f1_cv = f1_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(predictions_data["is_laundering"], predictions_data["predicted"]) * 100, 2)
    )
    f1_scores.append(f1_cv)

In [None]:
print(f"{f1_final} Â±{round(np.std(f1_scores), 2)}")

In [None]:
print((time.time() - start) // 60)