In [1]:
# Evaluate the `spark` session handle so the notebook displays it; in the
# recorded run this cell is where the Spark application was started.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
1,application_1613331100983_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f29acae2e10>

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import hsfs

In [3]:
# Open a connection through hsfs using its default settings (no arguments are
# passed). The recorded output reminds the user to call `.close()` on it to
# terminate the connection gracefully when finished.
connection = hsfs.connection()
# Get the handle for the project's feature store; the later cells use `fs` to
# fetch feature groups and to create training datasets.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load transactions feature group from hsfs

In [None]:
# Fetch version 1 of the "transactions_fg" feature group, read it into a
# DataFrame and preview the first rows.
transactions_fg = fs.get_feature_group("transactions_fg", version=1)
transactions_df = transactions_fg.read()
transactions_df.show()

## Load alert transactions feature group from hsfs

In [5]:
# Fetch version 1 of the "alert_transactions_fg" feature group, read it into a
# DataFrame and preview the first rows (columns shown in the recorded output:
# is_sar, tran_id, alert_type, alert_id).
alert_transactions_fg = fs.get_feature_group("alert_transactions_fg", version=1)
alert_transactions_df = alert_transactions_fg.read()
alert_transactions_df.show()

+------+-------+--------------+--------+
|is_sar|tran_id|    alert_type|alert_id|
+------+-------+--------------+--------+
|  true|  11873|gather_scatter|      47|
|  true|  11874|gather_scatter|      47|
|  true|  11875|gather_scatter|      47|
|  true|  13151|gather_scatter|      47|
|  true|  23148|gather_scatter|      47|
|  true|  23779|scatter_gather|      17|
|  true|  23780|scatter_gather|      17|
|  true|  26441|scatter_gather|      17|
|  true|  26442|scatter_gather|      17|
|  true|  28329|gather_scatter|      47|
|  true|  31581|gather_scatter|      47|
|  true|  34310|gather_scatter|      47|
|  true|  34433|scatter_gather|      17|
|  true|  36131|gather_scatter|      58|
|  true|  36563|scatter_gather|      17|
|  true|  41430|scatter_gather|      17|
|  true|  42363|scatter_gather|      17|
|  true|  42511|gather_scatter|      58|
|  true|  44370|gather_scatter|      58|
|  true|  46176|gather_scatter|      58|
+------+-------+--------------+--------+
only showing top 20 rows

## Load party feature group from hsfs

In [None]:
# Fetch version 1 of the "party_fg" feature group, read it into a DataFrame
# and preview the first rows.
party_fg = fs.get_feature_group("party_fg", version=1)
party_df = party_fg.read()
party_df.show()

## Create graph edge training dataset

In [7]:
# Number of rows in the alert-transactions DataFrame (915 in the recorded run).
alert_transactions_df.count()

915

In [None]:
# Number of rows in the transactions DataFrame.
transactions_df.count()

In [9]:
# Build the edge list: one row per transaction, labelled with is_sar.
# The left join on tran_id keeps every transaction; rows without a matching
# alert get a null is_sar from the join, which the when/otherwise maps to 0.
# NOTE(review): if a tran_id can occur in more than one alert row, the join
# would repeat that transaction -- confirm tran_id is unique in the alert data.
edges = (
    transactions_df
    .join(alert_transactions_df, on="tran_id", how="left")
    # Integer label: 1 where is_sar compares equal to "true", otherwise 0.
    .withColumn("is_sar", F.when(F.col("is_sar") == "true", 1).otherwise(0))
    # Keep only the edge columns, renaming src/dst to source/target.
    .select(
        F.col("src").alias("source"),
        F.col("dst").alias("target"),
        F.col("tx_type"),
        F.col("base_amt"),
        F.col("tran_id"),
        F.col("is_sar"),
    )
)

In [None]:
# Preview the first rows of the edges DataFrame.
edges.show()

In [11]:
# Total number of edge rows. This triggers a full evaluation of the join; the
# recorded run of this cell ended with a KeyboardInterrupt instead of a count.
edges.count()

KeyboardInterrupt: 

In [None]:
# Count the edges whose is_sar label is 1.
edges.filter(F.col("is_sar") == 1).count()

In [None]:
# Define version 1 of the "edges_td" training dataset in CSV format, with
# is_sar declared as the label column.
edges_td_meta = fs.create_training_dataset(
    name="edges_td",
    version=1,
    data_format="csv",
    label=["is_sar"],
    description="edges training dataset",
)
# Save the edges DataFrame as the content of that training dataset.
edges_td_meta.save(edges)

## Create graph node training dataset

In [None]:
# Collect every value that occurs in the source or target column of an edge
# into a single "id" column, then keep one row per distinct id.
sources = edges.select(F.col("source").alias("id"))
targets = edges.select(F.col("target").alias("id"))
nodes = sources.union(targets).dropDuplicates(["id"])
nodes.show()

In [None]:
# Number of distinct node ids found across the edge endpoints.
nodes.count()

In [None]:
# Attach the party features to each node id. This is an inner join, so ids
# with no matching row in party_df are dropped; the count that follows can be
# compared with the node count to see how many were kept.
nodes_td = nodes.join(party_df, on="id", how="inner")
nodes_td.count()

In [None]:
# Preview the first rows of the joined node/party DataFrame.
nodes_td.show()

In [None]:
# Define version 1 of the "node_td" training dataset in CSV format; no label
# column is declared for it.
node_td_meta = fs.create_training_dataset(
    name="node_td",
    version=1,
    data_format="csv",
    description="node training dataset",
)
# Save the joined node/party DataFrame as the content of that training dataset.
node_td_meta.save(nodes_td)