In [1]:
# Evaluate the `spark` session handle; the cell output below shows the Spark
# application being started and the SparkSession exposed under this name.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
5,application_1613335206545_0003,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f4173022f90>

### Create a connection to hsfs

In [2]:
import hsfs
# Open a connection through the hsfs client library; as the cell output notes,
# call `connection.close()` to terminate it gracefully when done.
connection = hsfs.connection()
# Handle to the project's feature store, used below to look up the training datasets
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

### Retrieve nodes and edges training datasets from hsfs

In [3]:
# Look up version 1 of the node and edge training datasets by name
node_td = fs.get_training_dataset(name="node_td", version=1)
edge_td = fs.get_training_dataset(name="edges_td", version=1)

## Convert the training datasets into pandas DataFrames
We are going to use StellarGraph library to compute node embeddings. StellarGraph supports loading data via Pandas DataFrames, NumPy arrays, Neo4j and NetworkX graphs. 

---
**NOTE**:

Loading large-scale datasets into StellarGraph for training cannot be handled with the frameworks mentioned above. It requires loading the data with frameworks such as `tf.data`.

If your training datasets range from a couple of GBs to hundreds of GBs or even TBs, contact us at Logical Clocks and we will help you set up distributed training pipelines.

---

In [4]:
# Read both training datasets and convert them to pandas DataFrames
node_pdf = node_td.read().toPandas()
# The `tran_timestamp` column is dropped from the edges before the conversion
edge_pdf = (
    edge_td.read()
    .drop("tran_timestamp")
    .toPandas()
)

### Define the Hopsworks experiment wrapper function and put all the training logic inside it

In [5]:
def embeddings_computer(walk_number, walk_length, emb_size):
    """Train a Node2Vec embedding model on the transaction graph and export it.

    The function reads the ``node_pdf`` and ``edge_pdf`` pandas DataFrames from
    the enclosing notebook scope, so both must be defined before it is called.

    Args:
        walk_number: passed to ``BiasedRandomWalk`` as ``n``.
        walk_length: passed to ``BiasedRandomWalk`` as ``length``.
        emb_size: embedding size handed to ``Node2Vec``.

    Returns:
        dict: ``{'accuracy': <binary accuracy recorded for the last epoch>}``.
    """
    # Only the modules that the training logic actually uses are imported.
    import os
    import uuid

    from hops import model as hops_model
    from hops import tensorboard

    from stellargraph import StellarDiGraph
    from stellargraph.data import BiasedRandomWalk
    from stellargraph.data import UnsupervisedSampler
    from stellargraph.mapper import Node2VecLinkGenerator
    from stellargraph.layer import Node2Vec, link_classification

    import tensorflow as tf
    from tensorflow import keras

    ###########
    batch_size = 64
    epochs = 10
    learning_rate = 1e-3
    checkpoint_every_n_epochs = 5

    # Index the node table by node id. `set_index` keeps every row's `type`
    # attached to its own id; building a new DataFrame with `index=node_pdf['id']`
    # would instead *reindex* by label and misalign (or blank out) the values.
    # NOTE(review): StellarGraph uses the remaining columns as node features —
    # assumes `type` is numeric; confirm against the training dataset schema.
    node_data = node_pdf.set_index('id')[['type']]
    ###########

    print('Defining StellarDiGraph')
    G = StellarDiGraph(
        node_data,
        edges=edge_pdf,
        edge_type_column="tx_type",
    )

    walker = BiasedRandomWalk(
        G,
        n=walk_number,
        length=walk_length,
        p=0.5,  # defines probability, 1/p, of returning to source node
        q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
    )
    unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=walker)
    generator = Node2VecLinkGenerator(G, batch_size)
    node2vec = Node2Vec(emb_size, generator=generator)

    x_inp, x_out = node2vec.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="dot"
    )(x_out)

    print('Defining the model')
    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Build the training sequence once so its length (batches per epoch) can be
    # used to express the checkpoint frequency in epochs.
    train_seq = generator.flow(unsupervised_samples)

    # Log to TensorBoard and save the model's weights every 5 epochs.
    # An integer `save_freq` counts batches, not epochs, hence the multiplication
    # by the number of batches per epoch.
    log_dir = tensorboard.logdir()
    cp_callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
        keras.callbacks.ModelCheckpoint(
            filepath=log_dir,
            verbose=1,
            save_weights_only=True,
            save_freq=checkpoint_every_n_epochs * len(train_seq),
        ),
    ]

    print('Training the model')
    history = model.fit(
        train_seq,
        epochs=epochs,
        verbose=1,
        use_multiprocessing=False,
        workers=4,
        shuffle=True,
        callbacks=cp_callbacks,
    )

    # Report the accuracy of the final epoch as the experiment metric
    binary_accuracy = history.history['binary_accuracy'][-1]
    metrics = {'accuracy': binary_accuracy}

    # Save the model under a unique directory, then export it to the model registry
    export_path = os.path.join(os.getcwd(), 'model-' + str(uuid.uuid4()))
    print('Exporting trained model to: {}'.format(export_path))
    model.save(export_path)
    print('Done exporting!')

    hops_model.export(export_path, 'NodeEmbeddings', metrics=metrics)

    return metrics

## Use the experiment wrapper function above to run the hops training experiment

In [6]:
from hops import experiment
from hops import hdfs
import json

In [7]:
# Load the stored hyperparameters (a JSON document) from the Resources dataset
best_hyperparams_path = "Resources/embeddings_best_hp.json"
best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))
# Wrap every value in a one-element list; this is the form in which the
# arguments are handed to `experiment.launch` in the next cell.
args_dict = {name: [value] for name, value in best_hyperparams.items()}

In [8]:
# Run the wrapper function as a hops experiment, reporting the 'accuracy' metric
experiment.launch(
    embeddings_computer,
    args_dict,
    name='graph_embeddings_compute',
    metric_key='accuracy',
    local_logdir=False,
)

Finished Experiment 

('hdfs://rpc.namenode.service.consul:8020/Projects/amlsim/Experiments/application_1613335206545_0003_1', {'accuracy': 0.6966959238052368, 'log': 'Experiments/application_1613335206545_0003_1/walk_number=2&walk_length=3&emb_size=32/output.log'})