In [1]:
# Evaluate the pre-defined `spark` name so the kernel starts the Spark
# application and echoes the SparkSession object.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
1,application_1612044880670_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f3227379a90>

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import hsfs

In [3]:
# Open a connection through the hsfs client (no explicit arguments are passed).
# The client reports that `.close()` should be called on it when finished.
connection = hsfs.connection()
# Feature store handle for the project's feature store; all feature groups
# and training datasets below are read and created through `fs`.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load transactions feature group from hsfs

In [None]:
# Version 1 of the transactions feature group, read into a Spark DataFrame.
transactions_fg = fs.get_feature_group(name="transactions_fg", version=1)
transactions_df = transactions_fg.read()
# Preview the first rows.
transactions_df.show()

## Load alert transactions feature group from hsfs

In [5]:
# Version 1 of the alert transactions feature group; it carries the
# alert_type, alert_id, is_sar and tran_id columns shown in the preview.
alert_transactions_fg = fs.get_feature_group(name="alert_transactions_fg", version=1)
alert_transactions_df = alert_transactions_fg.read()
alert_transactions_df.show()

+--------------+--------+------+-------+
|    alert_type|alert_id|is_sar|tran_id|
+--------------+--------+------+-------+
|gather_scatter|      52|  true| 447977|
|scatter_gather|      23|  true| 449282|
|gather_scatter|      52|  true| 454797|
|gather_scatter|      68|  true| 462363|
|gather_scatter|      68|  true| 468776|
|scatter_gather|      26|  true| 518050|
|scatter_gather|      10|  true| 518475|
|scatter_gather|      26|  true| 519362|
|scatter_gather|      26|  true| 521249|
|gather_scatter|      65|  true| 521357|
|scatter_gather|      25|  true| 557238|
|scatter_gather|      25|  true| 558782|
|gather_scatter|      61|  true| 559459|
|gather_scatter|      61|  true| 559460|
|gather_scatter|      69|  true| 559567|
|gather_scatter|      69|  true| 553958|
|scatter_gather|      25|  true| 554411|
|gather_scatter|      61|  true| 555697|
|gather_scatter|      61|  true| 556608|
|gather_scatter|      61|  true| 556609|
+--------------+--------+------+-------+
only showing top 20 rows

## Load party feature group from hsfs

In [6]:
# Version 1 of the party feature group (columns: id, type).
party_fg = fs.get_feature_group(name="party_fg", version=1)
party_df = party_fg.read()
party_df.show()

+--------+----+
|      id|type|
+--------+----+
|5628bd6c|   0|
|a1fcba39|   0|
|f56c9501|   1|
|9969afdd|   0|
|b356eeae|   1|
|3406706a|   0|
|26c56102|   0|
|e386ebf7|   1|
|8c094b0d|   1|
|939235aa|   1|
|de6bf2a5|   0|
|33a8ff5b|   0|
|a32807a1|   1|
|2906ef08|   0|
|c2a01b8d|   1|
|5a99160f|   1|
|8b9017b8|   0|
|fcf3bbf3|   1|
|5132aa4d|   0|
|68b90958|   1|
+--------+----+
only showing top 20 rows

## Create graph edge training dataset

In [7]:
# Number of rows in the alert transactions DataFrame.
alert_transactions_df.count()

915

In [8]:
# Number of rows in the transactions DataFrame.
# NOTE(review): the recorded run of this cell failed with an HDFS
# BlockMissingException while reading the transactions_fg_1 files; that looks
# like a storage problem rather than a code problem. Re-run once the data is
# readable again.
transactions_df.count()

An error was encountered:
An error occurred while calling o145.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 5.0 failed 4 times, most recent failure: Lost task 1.3 in stage 5.0 (TID 20, davitamlgpu-worker-1.internal.cloudapp.net, executor 3): org.apache.hadoop.hdfs.BlockMissingException: Could not obtain block: BP-900592089-10.0.0.6-1611940125785:blk_50033_1001 file=/apps/hive/warehouse/amlsim_featurestore.db/transactions_fg_1/part-00001-bbeb0685-3393-4019-aadc-130b1803cf82-c000
	at org.apache.hadoop.hdfs.DFSInputStream.chooseDataNode(DFSInputStream.java:955)
	at org.apache.hadoop.hdfs.DFSInputStream.fetchBlockByteRange(DFSInputStream.java:1054)
	at org.apache.hadoop.hdfs.DFSInputStream.pread(DFSInputStream.java:1376)
	at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:1340)
	at org.apache.hadoop.fs.FSInputStream.readFully(FSInputStream.java:121)
	at org.apache.hadoop.fs.FSDataInputStream.readFully(FSDataInputStream.java:111)

In [9]:
# Build the edge table: one row per transaction, labelled with is_sar.
# The left join keeps every transaction; transactions without a matching
# alert have a null is_sar, which the when/otherwise below maps to 0.
edges = transactions_df.join(alert_transactions_df, on=["tran_id"], how="left").select(
    F.col("src").alias("source"),
    F.col("dst").alias("target"),
    F.col("tx_type"),
    F.col("base_amt"),
    F.col("tran_id"),
    F.when(F.col("is_sar") == "true", 1).otherwise(0).alias("is_sar"),
)

In [None]:
# Preview the edge rows (source, target, tx_type, base_amt, tran_id, is_sar).
edges.show()

In [None]:
# Total number of edge rows produced by the left join.
edges.count()

In [None]:
# Number of edges whose is_sar label is 1.
edges.filter(F.col("is_sar") == 1).count()

In [None]:
# Register the edge training dataset metadata (CSV, labelled by is_sar),
# then write the edges DataFrame to it.
edges_td_meta = fs.create_training_dataset(
    name="edges_td",
    version=1,
    description="edges training dataset",
    data_format="csv",
    label=["is_sar"],
)
edges_td_meta.save(edges)

## Create graph node training dataset

In [None]:
# Every id that appears as either end of an edge becomes a node.
sources = edges.select(F.col("source").alias("id"))
targets = edges.select(F.col("target").alias("id"))
# Stack both id columns and keep each id once.
nodes = sources.union(targets).dropDuplicates(["id"])
nodes.show()

In [None]:
# Number of distinct node ids found across edge sources and targets.
nodes.count()

In [None]:
# Attach party attributes to each node id; the inner join drops node ids
# that have no row in party_df.
nodes_td = nodes.join(party_df, on="id", how="inner")
nodes_td.count()

In [None]:
# Preview the node rows after joining with the party data.
nodes_td.show()

In [None]:
# Register the node training dataset metadata (CSV, no label column),
# then write the joined node DataFrame to it.
node_td_meta = fs.create_training_dataset(
    name="node_td",
    version=1,
    description="node training dataset",
    data_format="csv",
)
node_td_meta.save(nodes_td)