# Query Model Repository for best node embeddings model

In [1]:
from hops import model
from hops.model import Metric
MODEL_NAME="NodeEmbeddings"
EVALUATION_METRIC="accuracy"

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
58,application_1607211657348_0060,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
best_model = model.get_best_model(MODEL_NAME, EVALUATION_METRIC, Metric.MAX)

In [3]:
best_model['experimentId']

'application_1607211657348_0059_6'

# Send Prediction Requests to the Served Model using Hopsworks REST API

In [4]:
import tensorflow as tf
from tensorflow import keras  

import pandas as pd
from stellargraph import StellarDiGraph
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.data import UnsupervisedSampler, BiasedRandomWalk
from stellargraph.layer import Node2Vec
import pydoop.hdfs as pydoop
import hsfs

In [5]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [6]:
node_td = fs.get_training_dataset("node_td", 1)
edge_td = fs.get_training_dataset("edges_td", 1)

In [7]:
# Get fg as pandas
node_pdf = node_td.read().toPandas()
edge_pdf = edge_td.read().drop("tran_timestamp").toPandas()

In [8]:
node_data = pd.DataFrame(node_pdf[['tx_behavior_id','prior_sar','initial_deposit']], index=node_pdf['id'])

print('Defining StellarDiGraph')
G =StellarDiGraph(node_data,
                      edges=edge_pdf, 
                      edge_type_column="tx_type")


Defining StellarDiGraph

In [9]:
walk_number = 2
walk_length = 2
batch_size = 1
emb_size = 16
# Extracting node embeddings
walker = BiasedRandomWalk(
        G,
        n=walk_number,
        length=walk_length,
        p=0.5,  # defines probability, 1/p, of returning to source node
        q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
    )
unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=walker)
generator = Node2VecLinkGenerator(G, batch_size)

node2vec = Node2Vec(emb_size, generator=generator)
x_inp, x_out = node2vec.in_out_tensors()

x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [10]:
latest = tf.train.latest_checkpoint("hdfs:///Projects/amlsim2/Experiments/" + best_model['experimentId'])
embedding_model.load_weights(latest)



An error was encountered:
'NoneType' object has no attribute 'endswith'
Traceback (most recent call last):
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 2172, in load_weights
    if _is_hdf5_filepath(filepath):
  File "/srv/hops/anaconda/envs/theenv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 2785, in _is_hdf5_filepath
    return (filepath.endswith('.h5') or filepath.endswith('.keras') or
AttributeError: 'NoneType' object has no attribute 'endswith'



In [11]:
nodes = list(G.nodes())
node_gen = Node2VecNodeGenerator(G, batch_size).flow(nodes)
node_embeddings = embedding_model.predict(node_gen)

In [12]:
node_embeddings[0].tolist()

[0.8658797740936279, -0.694594144821167, 0.971454381942749, 0.7033360004425049, 0.27118611335754395, -0.3557753562927246, -0.5513269901275635, 0.6284477710723877, 0.7319214344024658, -0.7909672260284424, 0.290677547454834, 0.6811003684997559, 0.7183468341827393, 0.25343942642211914, -0.6975371837615967, 0.8923580646514893]

In [None]:
import numpy as np
from pyspark.mllib.linalg import Vectors, VectorUDT
from pyspark.mllib.regression import LabeledPoint

df = np.concatenate([np.random.randint(0,2, size=(1000)), np.random.randn(1000), 3*np.random.randn(1000)+2, 6*np.random.randn(1000)-2]).reshape(1000,-1)
df = map(lambda x: LabeledPoint(x[0], Vectors.dense(x[1:])), df)

mydf = spark.createDataFrame(df,["label", "features"])