## In this notebook we will ingest computed node embeddings as feature groups and also prepare training datasets for anomaly detection model training   

## Query Model Repository for best node embeddings model

In [81]:
from hops import model
from hops.model import Metric
# Model name and evaluation metric passed to the model-repository lookup below.
MODEL_NAME="NodeEmbeddings"
EVALUATION_METRIC="accuracy"

In [82]:
# Ask the model repository for the MODEL_NAME entry ranked best by EVALUATION_METRIC,
# using Metric.MAX as the ranking direction.
best_model = model.get_best_model(MODEL_NAME, EVALUATION_METRIC, Metric.MAX)

In [83]:
# Experiment id of the selected model; it is used further down to build the checkpoint path.
best_model['experimentId']

'application_1609605496842_0011_2'

## Define model and load weights

In [84]:
import tensorflow as tf
from tensorflow import keras  

import pandas as pd
from stellargraph import StellarDiGraph
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.data import UnsupervisedSampler, BiasedRandomWalk
from stellargraph.layer import Node2Vec
import pydoop.hdfs as pydoop
from pyspark.sql import functions as F
from pyspark.sql.functions import array, coalesce, concat,  col

import hsfs
from hops import hdfs


In [85]:
# Create a connection through hsfs (called without arguments, so it relies on the environment's defaults)
connection = hsfs.connection()
# Get the feature store handle for the project's feature store; `fs` is used for all dataset reads/writes below
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [86]:
# Fetch version 1 of the node and edge training datasets from the feature store.
node_td = fs.get_training_dataset("node_td", 1)
edge_td = fs.get_training_dataset("edges_td", 1)

In [87]:
# Read both training datasets and convert them to pandas DataFrames;
# `tran_timestamp` is dropped from the edges before the conversion.
node_pdf = node_td.read().toPandas()
edge_pdf = edge_td.read().drop("tran_timestamp").toPandas()

In [88]:
# Node table indexed by node id, carrying the `type` column.
# NOTE: `pd.DataFrame(node_pdf[['type']], index=node_pdf['id'])` *reindexes* the frame
# by the id labels, which are not present in the default integer index produced by
# `toPandas()`, so every `type` value would become NaN. Setting the index explicitly
# keeps the real values aligned with their ids.
node_data = node_pdf.set_index('id')[['type']]

print('Defining StellarDiGraph')
# Directed graph built from the node table and the edge table; edge types are read
# from the `tx_type` column of the edges.
G = StellarDiGraph(node_data,
                   edges=edge_pdf,
                   edge_type_column="tx_type")


Defining StellarDiGraph

In [89]:
from hops import experiment
from hops import hdfs
import json
# Load the stored hyper-parameters (JSON on HDFS) and wrap every value in a
# single-element list, keyed by the hyper-parameter name.
best_hyperparams_path = "Resources/embeddings_best_hp.json"
best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))
args_dict = {name: [value] for name, value in best_hyperparams.items()}

In [90]:
# Rebuild the Node2Vec network so that the stored weights can be loaded into it.
# Random-walk and batching settings used to construct the generators below.
walk_number = 2
walk_length = 2
batch_size = 1
# Embedding dimension comes from the stored hyper-parameters (values are wrapped in 1-element lists).
emb_size = args_dict['emb_size'][0]
# Extracting node embeddings
walker = BiasedRandomWalk(
        G,
        n=walk_number,
        length=walk_length,
        p=0.5,  # defines probability, 1/p, of returning to source node
        q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
    )
# NOTE(review): `unsupervised_samples` is not referenced again in the visible part of this notebook.
unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=walker)
generator = Node2VecLinkGenerator(G, batch_size)

node2vec = Node2Vec(emb_size, generator=generator)
x_inp, x_out = node2vec.in_out_tensors()

# Keep only the first input/output tensor pair and expose it as a standalone Keras model.
x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [91]:
# Directory of the experiment that produced the selected model.
checkpoint_dir = "hdfs:///Projects/{}/Experiments/{}".format(hdfs.project_name(), best_model['experimentId'])
# `tf.train.latest_checkpoint` returns None when the directory holds no checkpoint;
# fail early with a clear message instead of an obscure error inside `load_weights`.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest is None:
    raise FileNotFoundError("No TensorFlow checkpoint found in {}".format(checkpoint_dir))
embedding_model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus object at 0x7fc3f101d150>

In [92]:
# Materialise the node ids once, then build the generator that feeds them to the model.
nodes = list(G.nodes())
node_generator = Node2VecNodeGenerator(G, batch_size)
node_gen = node_generator.flow(nodes)

In [93]:
import numpy as np
import pandas as pd 

# Run the model over every node, name the embedding columns em_0, em_1, ...,
# expose the node id as a regular column and hand the result to Spark.
embeddings = embedding_model.predict(node_gen)
emb_feature_names = ["em_" + str(i) for i in range(embeddings.shape[1])]
pdf = pd.DataFrame(embeddings, index=G.nodes(), columns=emb_feature_names)
pdf['id'] = pdf.index
node_embeddings_df = spark.createDataFrame(pdf)

In [94]:
# Preview two rows of the embeddings (em_* columns plus the node id).
node_embeddings_df.show(2)

+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+------------------+------------------+--------+
|               em_0|               em_1|               em_2|               em_3|                em_4|               em_5|                em_6|              em_7|              em_8|               em_9|              em_10|              em_11|              em_12|               em_13|             em_14|             em_15|      id|
+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+------------------+------------------+--------+
|0.2450331

## Create a connection to hsfs

In [95]:
import hsfs
from hops import hdfs
# Create a connection
# NOTE(review): `connection` and `fs` were already created earlier in this notebook;
# this cell opens another connection and rebinds both names.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Retrieve nodes training dataset from hsfs and determine whether node was part of the previously known money laundering scheme or not 

In [96]:
# Fetch version 1 of the `node_td` training dataset and preview five rows.
node_td = fs.get_training_dataset("node_td", 1)
node_td.show(5)

+--------+----+
|      id|type|
+--------+----+
|5441fc93|   0|
|4471c138|   0|
|355a7e27|   0|
|3a73aa6e|   0|
|58986dcf|   0|
+--------+----+
only showing top 5 rows

In [97]:
# Fetch version 1 of the `edges_td` training dataset and preview five rows.
# The same dataset is also bound to `edge_td` earlier in the notebook.
edges_td = fs.get_training_dataset("edges_td", 1)
edges_td.show(5)

+--------+--------+-------+--------+-------+------+
|  source|  target|tx_type|base_amt|tran_id|is_sar|
+--------+--------+-------+--------+-------+------+
|3aa9646b|1e46e726|      4|  858.77|    496|     0|
|49203bc3|a74d1101|      4|  386.86|   1342|     0|
|616d4505|99af2455|      4|  616.43|   1580|     0|
|39be1ea2|e7ec7bdb|      4|  146.44|   2866|     0|
|e2e0d938|afc399a9|      4|  439.09|   3997|     0|
+--------+--------+-------+--------+-------+------+
only showing top 5 rows

In [98]:
# Edges flagged with is_sar == 1.
alert_edges = edges_td.read().filter(F.col("is_sar") == 1)
# Both endpoints of a flagged edge are collected under a common `id` column.
alert_sources = alert_edges.select(F.col("source").alias("id"))
alert_targets = alert_edges.select(F.col("target").alias("id"))
# One row per distinct node id, each marked with is_sar = 1.
alert_nodes = (
    alert_sources
    .union(alert_targets)
    .dropDuplicates(["id"])
    .withColumn("is_sar", F.lit(1))
)
alert_nodes.show()

+--------+------+
|      id|is_sar|
+--------+------+
|33a8ff5b|     1|
|43e028ef|     1|
|fcf3bbf3|     1|
|8b9017b8|     1|
|68c0230d|     1|
|9c187eed|     1|
|65636b63|     1|
|d73e5230|     1|
|550a25ff|     1|
|c0be245b|     1|
|cdbd2ed5|     1|
|963b978f|     1|
|84563a83|     1|
|da77c74b|     1|
|840701de|     1|
|dc37f73b|     1|
|b0f4351c|     1|
|dd2ebcf1|     1|
|c29d75dc|     1|
|d7c99aa5|     1|
+--------+------+
only showing top 20 rows

In [99]:
# Attach the alert flag to every node embedding via a left join on `id`.
# Any `is_sar` column left over from a previous run of this cell is dropped first
# (a no-op when the column is absent); without this, re-running the cell would produce
# two `is_sar` columns and an ambiguous column reference in the next statement.
node_embeddings_df = node_embeddings_df.drop("is_sar").join(alert_nodes, ['id'], "left")
# Nodes without a match in `alert_nodes` get is_sar = 0.
node_embeddings_df = node_embeddings_df.withColumn("is_sar", F.when(F.col("is_sar") == 1, F.col("is_sar")).otherwise(0))
node_embeddings_df.select("id","is_sar").show()

+--------+------+
|      id|is_sar|
+--------+------+
|01fdc089|     0|
|1a14903a|     0|
|243b1e8b|     0|
|26c56102|     0|
|2906ef08|     0|
|33a8ff5b|     1|
|3406706a|     0|
|3406d993|     0|
|43e028ef|     1|
|4b46d80d|     0|
|5132aa4d|     0|
|5628bd6c|     0|
|5645140a|     0|
|5a99160f|     0|
|5c01ec6e|     0|
|62827917|     0|
|68b90958|     0|
|7138cbc6|     0|
|8b9017b8|     1|
|8c094b0d|     0|
+--------+------+
only showing top 20 rows

In [100]:
# Row count after the join with the alert flags.
node_embeddings_df.count()

7347

In [101]:
# Two-column frame: the label (`is_sar` renamed to `target`) and all em_* columns
# packed into a single array column named `embedding`.
emb_td = node_embeddings_df.select(
    col("is_sar").alias("target"),
    array(emb_feature_names).alias("embedding"),
)

In [102]:
# Preview the (target, embedding) rows.
emb_td.show()

+------+--------------------+
|target|           embedding|
+------+--------------------+
|     0|[0.24980598688125...|
|     0|[-0.3896130621433...|
|     0|[-0.9297066330909...|
|     0|[-0.0277862437069...|
|     0|[0.20036964118480...|
|     1|[-0.6551665067672...|
|     0|[0.32614603638648...|
|     0|[-0.1033717691898...|
|     1|[-0.2198605686426...|
|     0|[-0.8199651241302...|
|     0|[0.32019841670989...|
|     0|[-1.0910882949829...|
|     0|[-0.4207568764686...|
|     0|[-0.2130762934684...|
|     0|[0.80234044790267...|
|     0|[0.79003244638442...|
|     0|[1.30539047718048...|
|     0|[-0.5524362921714...|
|     1|[-0.5746901631355...|
|     0|[0.02823481708765...|
+------+--------------------+
only showing top 20 rows

## Prepare training datasets for anomaly detection 
###### In the next notebook we are going to train a [GAN for anomaly detection](https://arxiv.org/pdf/1905.11034.pdf). During the training step we will provide only the features of accounts that have never been reported for money laundering behaviour. Previously reported accounts will be disclosed to the model only in the evaluation step.

In [103]:
# Rows whose target is 0 (nodes not flagged by any is_sar edge).
non_sar_df = emb_td.filter(col("target") == 0)

In [104]:
# Rows whose target is 1 (nodes touched by at least one is_sar edge).
sar_df = emb_td.filter(col("target") == 1)

In [105]:
# Now that the data has been prepared, let's split the dataset into a training and test dataframe
# NOTE(review): the weights 0.8 and 0.02 do not sum to 1. Spark normalises randomSplit
# weights, so the effective split is roughly 97.6% / 2.4% — confirm whether 0.8 / 0.2 was intended.
# The fixed seed makes the split repeatable.
[non_sar_train_df, non_sar_eval_df] = non_sar_df.randomSplit([0.8, 0.02],seed = 12345)

In [106]:
# Register version 1 of a TFRecord training dataset holding only the non-SAR training split,
# with `target` as the label column, then write the split into it.
# NOTE(review): name and version are fixed, so re-running this cell presumably collides
# with the already-created dataset — confirm before re-executing.
non_sar_td = fs.create_training_dataset(name="gan_non_sar_training_df",
                                       version=1,
                                       data_format="tfrecord",
                                       label=["target"], 
                                       statistics_config=False, 
                                       description="non sar dataset for gan training")
non_sar_td.save(non_sar_train_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fc4176aa250>

In [107]:
# Evaluation set: the held-out non-SAR split plus every SAR row.
eval_df = non_sar_eval_df.union(sar_df)

In [108]:
# Register version 1 of a TFRecord dataset for evaluation (label column `target`)
# and write the combined non-SAR/SAR evaluation frame into it.
# NOTE(review): name and version are fixed, so re-running this cell presumably collides
# with the already-created dataset — confirm before re-executing.
gan_eval_ds = fs.create_training_dataset(name="gan_eval_df",
                                       version=1,
                                       data_format="tfrecord",
                                       label=["target"], 
                                       statistics_config=False, 
                                       description="evaluation dataset for gan training")
gan_eval_ds.save(eval_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fc4187c93d0>