# Query Model Repository for best node embeddings model

In [87]:
from hops import model
from hops.model import Metric
# Name of the registered model and the metric used to rank its versions.
MODEL_NAME = "NodeEmbeddings"
EVALUATION_METRIC = "accuracy"

In [88]:
# Ask the model repository for the version with the maximum value of the metric.
best_model = model.get_best_model(
    MODEL_NAME,
    EVALUATION_METRIC,
    Metric.MAX,
)

In [89]:
# Experiment run that produced the best model; its id is used to locate the checkpoint.
best_model["experimentId"]

'application_1607714417077_0006_1'

# Define model and load weights

In [90]:
import tensorflow as tf
from tensorflow import keras  

import pandas as pd
from stellargraph import StellarDiGraph
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.data import UnsupervisedSampler, BiasedRandomWalk
from stellargraph.layer import Node2Vec
import pydoop.hdfs as pydoop
import hsfs
from hops import hdfs

In [91]:
# Open a connection through hsfs ...
connection = hsfs.connection()
# ... and obtain the handle of the project's feature store from it.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [92]:
# Handles to version 1 of the node and edge training datasets.
node_td, edge_td = (
    fs.get_training_dataset(td_name, 1) for td_name in ("node_td", "edges_td")
)

In [93]:
# Read both training datasets and convert them to pandas DataFrames.
# The "tran_timestamp" column is dropped from the edges before the conversion.
node_pdf = node_td.read().toPandas()
edge_sdf = edge_td.read().drop("tran_timestamp")
edge_pdf = edge_sdf.toPandas()

In [94]:
# Node feature table indexed by account id.
# `set_index` keeps every row's features attached to its own id. Passing
# `index=node_pdf['id']` to the DataFrame constructor would instead *reindex*
# the frame: each id is looked up among the existing positional row labels,
# which assigns features to the wrong node and yields NaN rows for ids that
# have no matching label. Row order (and therefore node order) is unchanged.
node_data = node_pdf.set_index('id')[['tx_behavior_id', 'prior_sar', 'initial_deposit']]

print('Defining StellarDiGraph')
# Directed graph built from the node features and the edge list; the
# "tx_type" column of the edge frame is passed as the edge type column.
G = StellarDiGraph(node_data,
                   edges=edge_pdf,
                   edge_type_column="tx_type")


Defining StellarDiGraph

In [95]:
# Random-walk and model hyper-parameters.
# NOTE(review): presumably these must match the values used when the
# checkpoint was trained (at least emb_size) - confirm against the training notebook.
walk_number, walk_length = 2, 2
batch_size = 1
emb_size = 16

# Biased random walker and unsupervised sampler defined over the whole graph.
walker = BiasedRandomWalk(
    G,
    n=walk_number,
    length=walk_length,
    p=0.5,  # defines probability, 1/p, of returning to source node
    q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
)
unsupervised_samples = UnsupervisedSampler(
    G,
    nodes=list(G.nodes()),
    walker=walker,
)
generator = Node2VecLinkGenerator(G, batch_size)

# Build the Node2Vec layers and take the first input/output tensor pair
# as the model that produces embeddings.
node2vec = Node2Vec(emb_size, generator=generator)
x_inp, x_out = node2vec.in_out_tensors()
x_inp_src, x_out_src = x_inp[0], x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [96]:
# Directory of the experiment run that produced the best model.
checkpoint_dir = "hdfs:///Projects/{}/Experiments/".format(hdfs.project_name()) + best_model['experimentId']
latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest_checkpoint returns None when no checkpoint is found; fail with a clear
# message instead of passing None on to load_weights.
if latest is None:
    raise FileNotFoundError("No checkpoint found in {}".format(checkpoint_dir))
embedding_model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus object at 0x7fcbde5f90d0>

In [97]:
# Generator that feeds every node of the graph to the embedding model.
nodes = list(G.nodes())
node_generator = Node2VecNodeGenerator(G, batch_size)
node_gen = node_generator.flow(nodes)

In [98]:
import numpy as np
import pandas as pd 

# One row per node and one column per embedding dimension, indexed by node id.
pdf = pd.DataFrame(embedding_model.predict(node_gen), index=G.nodes())
# Names of the embedding columns ("em_0", "em_1", ...). The original cell only
# evaluated `emb_feature_names` without ever assigning it, which raises a
# NameError on a fresh kernel; it is defined here so the embedding columns can
# be selected by name downstream.
emb_feature_names = ["em_" + str(c) for c in pdf.columns]
pdf.columns = emb_feature_names
# Expose the node id as a regular column so it is kept in the Spark DataFrame.
pdf['id'] = pdf.index
node_embeddings_df = spark.createDataFrame(pdf)

In [99]:
# Peek at the first two embedding rows.
node_embeddings_df.show(n=2)

+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+----+
|                em_0|                em_1|               em_2|               em_3|               em_4|               em_5|               em_6|                em_7|               em_8|                em_9|               em_10|              em_11|              em_12|              em_13|               em_14|              em_15|  id|
+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+----+
|

### Create a connection to hsfs

In [100]:
import hsfs
from hops import hdfs
# Open a connection through hsfs ...
connection = hsfs.connection()
# ... and obtain the handle of the project's feature store from it.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

### Retrieve the nodes training dataset from hsfs to get the label information: whether or not a node was part of a previously known money laundering scheme

In [101]:
from pyspark.sql.functions import array, coalesce, concat,  col

# Only the node id and the "is_sar" label are needed from the node training dataset.
node_td = fs.get_training_dataset("node_td", 1)
node_labels_df = node_td.read().select(["id", "is_sar"])

In [102]:
# Peek at the first five (id, is_sar) label rows.
node_labels_df.show(n=5)

+----+------+
|  id|is_sar|
+----+------+
|7605|     1|
|6075|     1|
|5814|     1|
|9360|     1|
|4626|     1|
+----+------+
only showing top 5 rows

In [103]:
# Attach the "is_sar" label to every embedding row by joining on the node id.
# NOTE(review): this cell rebinds node_embeddings_df, so re-running it joins the labels a second time.
node_embeddings_df = node_embeddings_df.join(node_labels_df, on="id")

In [104]:
# Number of rows left after joining the embeddings with the labels.
node_embeddings_df.count()

9998

In [105]:
# Collect the embedding columns ("em_0", "em_1", ...) from the frame itself,
# in column order, so this cell does not rely on a variable defined in another cell.
embedding_columns = [c for c in node_embeddings_df.columns if c.startswith("em_")]
# Pack the embedding columns into a single array column and expose the
# "is_sar" label under the name "target".
emb_td = (
    node_embeddings_df
    .withColumn("embedding", array(embedding_columns))
    .select(col("is_sar").alias("target"), "embedding")
)

In [106]:
# Display the first 20 (target, embedding) rows.
emb_td.show(n=20)

+------+--------------------+
|target|           embedding|
+------+--------------------+
|     1|[-0.2483904957771...|
|     1|[-0.0583792515099...|
|     1|[-0.0767477303743...|
|     1|[0.25295042991638...|
|     1|[0.09719745814800...|
|     1|[-0.1674562841653...|
|     1|[-0.2701789140701...|
|     1|[-0.6531173586845...|
|     1|[0.40688830614089...|
|     0|[-0.0990444421768...|
|     0|[0.44653317332267...|
|     0|[0.82527494430541...|
|     0|[-0.4545851051807...|
|     0|[-0.4822266995906...|
|     0|[0.31034347414970...|
|     0|[-0.1862383484840...|
|     0|[1.19107234477996...|
|     0|[1.08200967311859...|
|     0|[-1.0821152925491...|
|     0|[-0.6171256303787...|
+------+--------------------+
only showing top 20 rows

## Prepare training datasets for anomaly detection 
###### In the next notebook we are going to train a [GAN for anomaly detection](https://arxiv.org/pdf/1905.11034.pdf). During the training step we will provide only the features of accounts that have never been reported for money laundering behaviour. Previously reported accounts will be disclosed to the model only in the evaluation step.

In [107]:
# Rows whose target is 0.
non_sar_df = emb_td.filter(col("target") == 0)

In [108]:
# Rows whose target is 1.
sar_df = emb_td.filter(col("target") == 1)

In [109]:
# Randomly split the non-SAR rows into a training part and an evaluation part
# (fixed seed).
# NOTE(review): Spark normalises split weights that do not sum to 1, so
# [0.8, 0.02] gives roughly a 97.6% / 2.4% split - confirm that 0.02 (rather
# than 0.2) is intended.
non_sar_train_df, non_sar_eval_df = non_sar_df.randomSplit([0.8, 0.02], seed=12345)

In [110]:
# Define version 1 of the non-SAR training dataset (TFRecord format, "target"
# as label, False passed as statistics_config) and write the training split to it.
non_sar_td = fs.create_training_dataset(
    name="gan_non_sar_training_df",
    version=1,
    data_format="tfrecord",
    label=["target"],
    statistics_config=False,
    description="non sar dataset for gan training",
)
non_sar_td.save(non_sar_train_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fcbe1336a50>

In [111]:
# Evaluation set: the held-out non-SAR rows together with all SAR rows.
eval_df = non_sar_eval_df.union(sar_df)

In [112]:
# Define version 1 of the evaluation dataset (TFRecord format, "target" as
# label, False passed as statistics_config) and write the evaluation rows to it.
gan_eval_ds = fs.create_training_dataset(
    name="gan_eval_df",
    version=1,
    data_format="tfrecord",
    label=["target"],
    statistics_config=False,
    description="evaluation dataset for gan training",
)
gan_eval_ds.save(eval_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fcbe133a5d0>