In [1]:
# Display the SparkSession handle that the kernel exposes as `spark` (see the output below).
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
1,application_1627907706936_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f73da5e1b90>

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import hsfs

In [3]:
# Open an hsfs connection; no arguments are passed, so the environment's defaults are used.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store; every later cell goes through `fs`.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load transactions feature group from hsfs

In [4]:
# Look up version 1 of the transactions feature group and preview its first rows.
transactions_fg = fs.get_feature_group(name="transactions_fg", version=1)
transactions_fg.show(5)

+--------+-------+--------+--------+-------+--------------+
|  target|tran_id|base_amt|  source|tx_type|tran_timestamp|
+--------+-------+--------+--------+-------+--------------+
|ea20f3ea| 328889|  341.23|35d9e14d|      4|        Aug-18|
|42e760b1| 328890|  846.74|ecaba057|      4|        Aug-18|
|d85de509| 328891|  269.45|43e1ce2b|      4|        Aug-18|
|9dff007b| 328892|  180.32|2d76caaa|      4|        Aug-18|
|92cf8d33| 328893|  507.09|b2efe3c6|      4|        Aug-18|
+--------+-------+--------+--------+-------+--------------+
only showing top 5 rows

In [5]:
# Read the whole feature group into a DataFrame and count its rows.
transactions_fg.read().count()

438386

## Load alert transactions feature group from hsfs

In [6]:
# Look up version 1 of the alert transactions feature group and preview its first rows.
alert_transactions_fg = fs.get_feature_group(name="alert_transactions_fg", version=1)
alert_transactions_fg.show(5)

+------+-------+--------+--------------+
|is_sar|tran_id|alert_id|    alert_type|
+------+-------+--------+--------------+
|     1|1000145|      32|scatter_gather|
|     1|1002861|      18|scatter_gather|
|     1| 100431|      34|scatter_gather|
|     1|1006056|      18|scatter_gather|
|     1|1006362|      32|scatter_gather|
+------+-------+--------+--------------+
only showing top 5 rows

In [7]:
# Read the whole feature group into a DataFrame and count its rows.
alert_transactions_fg.read().count()

915

## Load party feature group from hsfs and create graph `nodes` Query object

In [8]:
# Version 1 of the party feature group supplies the graph nodes.
party_fg = fs.get_feature_group(name="party_fg", version=1)
# `nodes` selects every feature of the group; `read()` turns it into a DataFrame for the preview.
nodes = party_fg.select_all()
nodes.read().show()

+--------+----+
|      id|type|
+--------+----+
|0016359b|   0|
|0019b8d0|   0|
|001dcc27|   1|
|00298665|   1|
|003cd8f3|   0|
|003e2533|   0|
|00403fbd|   1|
|00498ec2|   1|
|0049ee5b|   0|
|0054a022|   0|
|00575ac9|   0|
|005c0c19|   1|
|006ac170|   1|
|006cc052|   0|
|0075d230|   1|
|007749eb|   0|
|00794932|   1|
|007f2674|   0|
|007f76dc|   1|
|0081b086|   0|
+--------+----+
only showing top 20 rows

## Create graph edge training dataset

In [9]:
# Edge features come from the transactions; the `is_sar` label comes from the alerts.
transaction_features = transactions_fg.select(
    ["source", "target", "tran_id", "tx_type", "base_amt"]
)
alert_labels = alert_transactions_fg.select(["is_sar"])
# Left join on `tran_id`, so transactions without an alert stay in the result.
edges = transaction_features.join(alert_labels, ["tran_id"], "left")

In [10]:
# Execute the edges query and preview the joined result.
edges.read().show()

+--------+--------+-------+-------+--------+------+
|  source|  target|tran_id|tx_type|base_amt|is_sar|
+--------+--------+-------+-------+--------+------+
|72555c71|6c344249|  36131|      4| 2775.22|     1|
|b0fe7e18|e6c76032| 811151|      4|  112.19|     1|
|5a89d195|2a348960| 864390|      4|  107.84|     1|
|78fd2e68|4f8b7770| 252225|      4| 2502.73|     1|
|a1b4f889|4f8b7770| 256932|      4| 2502.73|     1|
|c5d0e6ca|8fab72e6| 333603|      4| 2657.36|     1|
|ab638a8a|d429553b| 507116|      4| 2816.58|     1|
|6d4543d6|ea80e43e| 514090|      4| 2534.92|     1|
|a1b3bc5e|396e2618| 769337|      4| 2721.35|     1|
|a670af3d|313c12f6| 778521|      4| 2491.18|     1|
|67d2ab78|1a211334| 867611|      4| 2830.43|     1|
|4c52d76b|0c81ba35| 137352|      4|   94.95|     1|
|f48fcd16|36377e59| 601420|      4| 2516.74|     1|
|75178e11|fa1ca6b5| 832202|      4| 2867.47|     1|
|69d84dfc|053485ef| 852249|      4| 2580.73|     1|
|edfe718c|19857dba| 856332|      4|   103.5|     1|
|b98c31fe|3d

In [11]:
# Training dataset for the graph edges, with `is_sar` as the label column.
# No `splits` argument is passed, so no train/test/validate split is requested here.
edges_td_meta = fs.create_training_dataset(
    name="edges_td",
    version=1,
    data_format="csv",
    label=["is_sar"],
    description="edges training dataset",
    coalesce=True,
)
# Materialize the dataset from the `edges` query.
edges_td_meta.save(edges)

## Create graph node training dataset

In [12]:
# Training dataset for the graph nodes; no label column is declared for it.
node_td_meta = fs.create_training_dataset(
    name="node_td",
    version=1,
    data_format="csv",
    description="node training dataset",
    coalesce=True,
)
# Materialize the dataset from the `nodes` query.
node_td_meta.save(nodes)

## Create derived feature group `alert_nodes_fg`: nodes that were part of a previously known money laundering scheme

In [13]:
# Keep only the edges flagged with is_sar == 1.
alert_edges = edges.read().filter(F.col("is_sar") == 1)
# Both endpoints of a flagged edge count as alert nodes; give them a common `id` column.
alert_sources = alert_edges.select(F.col("source").alias("id"))
alert_targets = alert_edges.select(F.col("target").alias("id"))
# One row per distinct node id, each marked with is_sar = 1.
alert_nodes = (
    alert_sources.union(alert_targets)
    .dropDuplicates(["id"])
    .withColumn("is_sar", F.lit(1))
)
# Cached because the frame is shown here and joined against in the next cell.
alert_nodes.cache()
alert_nodes.show()

+--------+------+
|      id|is_sar|
+--------+------+
|33a8ff5b|     1|
|43e028ef|     1|
|fcf3bbf3|     1|
|8b9017b8|     1|
|9c187eed|     1|
|65636b63|     1|
|68c0230d|     1|
|550a25ff|     1|
|d73e5230|     1|
|c0be245b|     1|
|cdbd2ed5|     1|
|963b978f|     1|
|84563a83|     1|
|da77c74b|     1|
|840701de|     1|
|dc37f73b|     1|
|b0f4351c|     1|
|dd2ebcf1|     1|
|c29d75dc|     1|
|d7c99aa5|     1|
+--------+------+
only showing top 20 rows

In [14]:
# After the left join, nodes without a match carry a null `is_sar`; map everything that is not 1 to 0.
is_sar_flag = F.when(F.col("is_sar") == 1, F.col("is_sar")).otherwise(0)
alert_nodes_df = (
    nodes.read()
    .join(alert_nodes, on=["id"], how="left")
    .withColumn("is_sar", is_sar_flag)
)
alert_nodes_df.cache()
alert_nodes_df.show()

+--------+----+------+
|      id|type|is_sar|
+--------+----+------+
|0016359b|   0|     0|
|0019b8d0|   0|     0|
|001dcc27|   1|     0|
|00298665|   1|     0|
|003cd8f3|   0|     0|
|003e2533|   0|     0|
|00403fbd|   1|     0|
|00498ec2|   1|     0|
|0049ee5b|   0|     0|
|0054a022|   0|     0|
|00575ac9|   0|     0|
|005c0c19|   1|     0|
|006ac170|   1|     0|
|006cc052|   0|     0|
|0075d230|   1|     0|
|007749eb|   0|     1|
|00794932|   1|     0|
|007f2674|   0|     1|
|007f76dc|   1|     0|
|0081b086|   0|     0|
+--------+----+------+
only showing top 20 rows

In [15]:
# Number of nodes with is_sar == 1, i.e. nodes that are the source or target of an alert edge.
alert_nodes_df.where(F.col("is_sar") == 1).count()

816

In [16]:
# Number of remaining nodes (is_sar == 0).
alert_nodes_df.where(F.col("is_sar") == 0).count()

6531

In [17]:
# Extra Hudi write options passed to `save()` below.
# NOTE(review): shuffle parallelism is pinned to 1, presumably because the frame is small
# (a few thousand rows per the counts above) — confirm before reusing on larger data.
extra_hudi_options = {
    "hoodie.bulkinsert.shuffle.parallelism": "1",
    "hoodie.insert.shuffle.parallelism": "1",
    "hoodie.upsert.shuffle.parallelism": "1",
    "hoodie.parquet.compression.ratio": "0.5",
}

# Derived feature group keyed by node `id`. The saved frame holds `id`, `type` and `is_sar`,
# so the description states that; the previous text called it "node embeddings", which this
# feature group does not contain.
alert_nodes_fg = fs.create_feature_group(
    name="alert_nodes_fg",
    version=1,
    primary_key=["id"],
    description="party nodes labelled with is_sar (1 if the node is part of a known alert transaction), derived fg",
    time_travel_format="HUDI",
    online_enabled=True,
    statistics_config=False,
)

# Write the labelled nodes, forwarding the Hudi options defined above.
alert_nodes_fg.save(alert_nodes_df, extra_hudi_options)