In [1]:
# Evaluate the `spark` session object so the cell displays it; per the
# recorded output below, this first evaluation also starts the Spark application.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
37,application_1606998977104_0004,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f0a34d1d850>

In [2]:
import hsfs

In [3]:
# Create a feature store connection using the default settings (no arguments).
# NOTE(review): the recorded output asks for `.close()` to be called when done;
# no visible cell does so -- confirm whether that is intentional.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store.
# `fs` is used by the next cell to look up the feature groups.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [4]:
# Handles to the "node_features" and "edge_features" feature groups (both requested with 1
# as the second argument, presumably the version -- confirm against the hsfs API).
# Nothing is read here; the later cells call `.read()` / `.select(...)` on these handles.
node_fg = fs.get_feature_group("node_features", 1)
edge_fg = fs.get_feature_group("edge_features", 1)

In [5]:
# Build one (id, is_sar) row per edge endpoint: the "source" and "target"
# columns are each renamed to "id" so the two projections can be stacked.
source = (
    edge_fg.select(["source", "is_sar"])
    .read()
    .withColumnRenamed("source", "id")
)
target = (
    edge_fg.select(["target", "is_sar"])
    .read()
    .withColumnRenamed("target", "id")
)
nodes = source.union(target)

# A node is labelled anomalous if it appears on at least one edge with
# is_sar == 1; the anti-join keeps only benign rows for ids never seen as anomalous.
ano_nodes = nodes.filter(nodes["is_sar"] == 1)
ben = nodes.filter(nodes["is_sar"] == 0)
ben_nodes = ben.join(ano_nodes, on=["id"], how="leftanti")

# Exactly one label row per node id.
labels = ano_nodes.union(ben_nodes).dropDuplicates(subset=["id"])

# Write a single CSV part file (with header) that the t-SNE plot reads back.
(
    labels.coalesce(1)
    .write
    .option("header", "true")
    .option("sep", ",")
    .mode("overwrite")
    .csv("hdfs:///Projects/amlsim/Resources/node_labels_for_plotting.csv")
)

In [6]:
# Read both feature groups in full and convert them to pandas DataFrames.
# `node_pdf` and `edge_pdf` are read as notebook-level globals inside
# `embeddings_computer`, where they become the node features and the edge list.
node_pdf = node_fg.read().toPandas()
edge_pdf = edge_fg.read().toPandas()

In [7]:
def embeddings_computer(walk_number, walk_length, emb_size):
    """Train an unsupervised GraphSAGE model and export the node embeddings.

    The function is handed to ``hops.experiment`` by the launch cell, so all of
    its imports live inside the body. The graph is built from the notebook-level
    pandas frames ``node_pdf`` (node features) and ``edge_pdf`` (edge list).

    Side effects: writes the embeddings CSV and a t-SNE PDF to the working
    directory and copies both to the "Resources" dataset, writes weight
    checkpoints under the TensorBoard log directory, and exports the trained
    model under the name 'graph_embeddings'.

    Args:
        walk_number: value passed as ``number_of_walks`` to the unsupervised
            random-walk sampler.
        walk_length: value passed as ``length`` to the unsupervised sampler.
        emb_size: width of every GraphSAGE layer, and therefore the
            dimensionality of the exported embeddings.

    Returns:
        dict: ``{'accuracy': <binary accuracy of the last training epoch>}``.
    """
    import os
    import uuid

    import pandas as pd

    import pydoop.hdfs as pydoop

    import matplotlib.pyplot as plt

    from hops import hdfs
    from hops import model as hops_model
    from hops import tensorboard

    from sklearn.manifold import TSNE

    from stellargraph import StellarDiGraph
    from stellargraph.data import UnsupervisedSampler
    from stellargraph.mapper import GraphSAGELinkGenerator, GraphSAGENodeGenerator
    from stellargraph.layer import GraphSAGE, link_classification

    import tensorflow as tf
    from tensorflow import keras

    def _tsne_plot(node_embeddings, labels_file, walk_number, walk_length, emb_size):
        """Project the embeddings to 2-D with t-SNE, colour by label, save a PDF.

        ``node_embeddings`` is a DataFrame indexed by node id; ``labels_file``
        is the directory holding the labels CSV (columns ``id`` and ``is_sar``).
        Nodes without a label are coloured as 0.
        """
        transform = TSNE  # PCA
        trans = transform(n_components=2, random_state=123)
        node_embeddings_2d = trans.fit_transform(node_embeddings)

        # The labels were written by Spark as a directory; pick its CSV part file.
        csv_parts = [path for path in hdfs.ls(labels_file) if path.endswith("csv")]
        labels = pd.read_csv(pydoop.path.abspath(hdfs.get_plain_path(csv_parts[0])))

        # Draw the embedding points, colouring them by the node's is_sar label.
        alpha = 0.7
        label_map = pd.Series(labels.is_sar.values, index=labels.id).to_dict()
        node_colours = [
            label_map.get(node_id, 0) for node_id in node_embeddings.index.values
        ]

        plt.figure(figsize=(7, 7))
        plt.axes().set(aspect="equal")
        plt.scatter(
            node_embeddings_2d[:, 0],
            node_embeddings_2d[:, 1],
            c=node_colours,
            cmap="jet",
            alpha=alpha,
        )
        plt.title("{} visualization of node embeddings".format(transform.__name__))
        tsne_file = 'embeddings_features_%d_%d_%d.tsne.pdf' % (walk_number, walk_length, emb_size)
        plt.savefig(tsne_file)
        plt.close()

        hdfs.copy_to_hdfs(tsne_file, "Resources", overwrite=True, project="amlsim")

    ###########
    # Training configuration.
    batch_size = 32
    epochs = 10
    num_samples = [20, 20]
    # The layer width follows emb_size so the swept hyperparameter actually
    # changes the model (it used to be hard-coded and emb_size was ignored).
    layer_sizes = [emb_size, emb_size]
    # Same value the optimizer was previously given as a literal.
    learning_rate = 1e-3

    # Index the node features by account id. Passing `index=` to the DataFrame
    # constructor would *reindex* by label and misalign or blank out the rows.
    # NOTE(review): GraphSAGE needs numeric node features -- confirm that all
    # of these columns (e.g. 'gender') are numerically encoded.
    feature_columns = ['tx_behavior_id', 'prior_sar', 'initial_deposit', 'gender', 'age']
    node_data = node_pdf.set_index('acct_id')[feature_columns]
    ###########

    print('Defining StellarDiGraph')
    G = StellarDiGraph(node_data,
                       edges=edge_pdf,
                       edge_type_column="tx_type")

    nodes = list(G.nodes())

    # Random-walk sampler that produces the positive/negative node pairs used
    # as the unsupervised link-prediction training signal.
    unsupervised_samples = UnsupervisedSampler(
        G, nodes=nodes, length=walk_length, number_of_walks=walk_number
    )
    generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(unsupervised_samples)

    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.0, normalize="l2"
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    print('Defining the model')
    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Checkpoints go *inside* the TensorBoard log directory; using the
    # directory itself as the checkpoint prefix writes the files next to it.
    checkpoint_path = os.path.join(tensorboard.logdir(), 'cp-{epoch:04d}.ckpt')

    # Save the model's weights every 5 epochs. An integer save_freq counts
    # batches, so it is expressed as 5 x (batches per epoch).
    cp_callback = keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        save_freq=5 * len(train_gen))

    # Save the initial weights using the `checkpoint_path` format.
    model.save_weights(checkpoint_path.format(epoch=0))

    print('Training the model')
    history = model.fit(
        train_gen,
        epochs=epochs,
        verbose=0,
        use_multiprocessing=False,
        workers=4,
        shuffle=True,
        callbacks=[cp_callback],
    )

    # Extract node embeddings: the link model has two input branches (source
    # and destination), so every second input tensor together with the first
    # output gives the single-node embedding model, sharing the trained weights.
    embedding_model = keras.Model(inputs=x_inp[::2], outputs=x_out[0])

    node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(nodes)
    node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)

    embeddings_features = pd.DataFrame(node_embeddings, index=nodes)
    embb_file = 'embeddings_features_%d_%d_%d.csv' % (walk_number, walk_length, emb_size)
    # The code below works when run through hops experiments.
    embeddings_features.to_csv(embb_file, index=True)
    hdfs.copy_to_hdfs(embb_file, "Resources", overwrite=True)
    labels_file = "hdfs:///Projects/amlsim/Resources/node_labels_for_plotting.csv"
    _tsne_plot(embeddings_features, labels_file, walk_number, walk_length, emb_size)

    binary_accuracy = history.history['binary_accuracy'][-1]
    metrics = {'accuracy': binary_accuracy}

    # Save to the model registry.
    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))
    tf.saved_model.save(model, export_path)
    print('Done exporting!')

    hops_model.export(export_path, 'graph_embeddings', metrics=metrics)

    return metrics

In [None]:
from hops import hdfs
import json
#best_hyperparams_path = "Resources/embeddings_best_hp.json"
#best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))
#args_dict = {}
#for key in best_hyperparams.keys():
#    args_dict[key] = [best_hyperparams[key]]
    
from hops import experiment
from hops import hdfs

# Candidate values for each argument of `embeddings_computer`, keyed by
# parameter name.
args_dict = {
    'walk_number': [60, 80],
    'walk_length': [3, 4],
    'emb_size': [64, 128],
}

# Run the experiment; 'accuracy' is the key of the metrics dict returned by
# `embeddings_computer`.
experiment.launch(
    embeddings_computer,
    args_dict,
    name='graph_embeddings_compute',
    metric_key='accuracy',
    local_logdir=True,
)