In [1]:
# Evaluate the session handle so the notebook displays it.
# NOTE(review): on a remote (Livy/sparkmagic-style) kernel this first cell presumably
# also triggers the Spark application start-up — confirm.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
0,application_1613331100983_0001,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fd4babfff10>

## Import modules

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, StringType
import hsfs
from hops import hdfs

## Define utility functions

In [3]:
def action_2_code(input_str):
    """Map a transaction-type label to its integer code.

    Only the text before the first "-" is used, so "TRANSFER-FanOut" is
    encoded exactly like "TRANSFER".

    Args:
        input_str: Transaction-type label such as "TRANSFER-FanOut". May be
            None, which is what Spark passes to a Python UDF for a null cell.

    Returns:
        int: The code of the action, or 99 when the label is missing or not
        one of the known actions.
    """
    # A null cell arrives as None; treat it as "unknown" instead of letting
    # None.split() raise and fail the whole Spark job.
    if input_str is None:
        return 99

    codes = {
        "CASH_IN": 0,
        "CASH_OUT": 1,
        "DEBIT": 2,
        "PAYMENT": 3,
        "TRANSFER": 4,
        # NOTE(review): DEPOSIT shares code 4 with TRANSFER — confirm this is
        # intended and not a typo for a distinct code.
        "DEPOSIT": 4,
    }
    action = input_str.split("-")[0]
    return codes.get(action, 99)

def party_2_code(x):
    """Encode a party type label as an integer.

    Args:
        x: Party type label.

    Returns:
        int: 0 for "Organization", 1 for "Individual", 99 for anything else.
    """
    if x == "Organization":
        return 0
    if x == "Individual":
        return 1
    # Any other value (including None) is reported as unknown.
    return 99

def timestamp_2_time(x):
    """Format a timestamp as abbreviated month and day, e.g. "Jan-01".

    Args:
        x: A ``datetime`` instance, or any value whose ``str()`` has the form
            'YYYY-MM-DD HH:MM:SS'. May be None (a null cell in Spark).

    Returns:
        str: The "%b-%d" rendering of the timestamp, or None when ``x`` is None.

    Raises:
        ValueError: If ``x`` is not a datetime and its string form does not
            match '%Y-%m-%d %H:%M:%S'.
    """
    # Null cells are passed through as null instead of failing on str(None).
    if x is None:
        return None
    if isinstance(x, datetime):
        # Use datetimes directly: round-tripping through str() breaks as soon
        # as the value carries microseconds or a UTC offset, because the
        # string then no longer matches the parse format.
        dt_obj = x
    else:
        dt_obj = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    return dt_obj.strftime("%b-%d")

# Wrap the helpers as Spark UDFs. The return type is spelled out as StringType(),
# which is what F.udf uses when none is given, so the numeric codes produced by
# action_2_code / party_2_code end up as strings in the resulting columns.
action_2_code_udf = F.udf(action_2_code, returnType=StringType())
party_2_code_udf = F.udf(party_2_code, returnType=StringType())
timestamp_2_time_udf = F.udf(timestamp_2_time, returnType=StringType())

## Create a connection to Hopsworks feature store (hsfs)

In [4]:
# Create a connection; no arguments are passed, so hsfs falls back to its defaults
# (NOTE(review): presumably the project context of the running Hopsworks job — confirm).
# Call connection.close() when the notebook is done with the feature store.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load transactions datasets as spark dataframe and perform feature engineering

In [5]:
# Read transactions.csv from the project's Resources directory on HDFS.
# The first row is used as the header and Spark infers the column types.
transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/transactions.csv".format(hdfs.project_name()))
)

In [6]:
# Preview the raw rows to check the header and the inferred columns before transforming.
transactions_df.show()

+-------+----------------+--------+-------------------+--------+--------+
|tran_id|         tx_type|base_amt|     tran_timestamp|     src|     dst|
+-------+----------------+--------+-------------------+--------+--------+
|    496| TRANSFER-FanOut|  858.77|2020-01-01 00:00:00|3aa9646b|1e46e726|
|   1342| TRANSFER-Mutual|  386.86|2020-01-01 00:00:00|49203bc3|a74d1101|
|   1580| TRANSFER-FanOut|  616.43|2020-01-02 00:00:00|616d4505|99af2455|
|   2866| TRANSFER-FanOut|  146.44|2020-01-02 00:00:00|39be1ea2|e7ec7bdb|
|   3997| TRANSFER-Mutual|  439.09|2020-01-03 00:00:00|e2e0d938|afc399a9|
|   5518| TRANSFER-FanOut|   361.0|2020-01-04 00:00:00|75c9a805|d7a317f6|
|   7340| TRANSFER-Mutual|  768.98|2020-01-06 00:00:00|c14f4989|733a496b|
|   9376| TRANSFER-FanOut|   943.4|2020-01-07 00:00:00|576eb672|aa49b0eb|
|  10362|TRANSFER-Forward|   668.3|2020-01-08 00:00:00|847a9cf6|b070a6bb|
|  10817| TRANSFER-FanOut|  139.84|2020-01-08 00:00:00|12a388ff|586377aa|
|  11317| TRANSFER-Mutual|  499.47|202

In [7]:
# Encode tx_type as a code and reduce tran_timestamp to a "Mon-DD" label, then keep
# only the feature columns. No column renames are needed: the select keeps src/dst
# under their original names.
# NOTE: this cell reassigns transactions_df, so it cannot be re-run on its own output
# (tx_type would collapse to 99 and the timestamp labels would no longer parse);
# re-run the CSV load cell first.
transactions_df = (
    transactions_df
    .withColumn('tx_type', action_2_code_udf(F.col('tx_type')))
    .withColumn('tran_timestamp', timestamp_2_time_udf(F.col('tran_timestamp')).cast(StringType()))
    .select("src", "dst", "tran_timestamp", "tran_id", "tx_type", "base_amt")
)
transactions_df.show()

+--------+--------+--------------+-------+-------+--------+
|     src|     dst|tran_timestamp|tran_id|tx_type|base_amt|
+--------+--------+--------------+-------+-------+--------+
|3aa9646b|1e46e726|        Jan-01|    496|      4|  858.77|
|49203bc3|a74d1101|        Jan-01|   1342|      4|  386.86|
|616d4505|99af2455|        Jan-02|   1580|      4|  616.43|
|39be1ea2|e7ec7bdb|        Jan-02|   2866|      4|  146.44|
|e2e0d938|afc399a9|        Jan-03|   3997|      4|  439.09|
|75c9a805|d7a317f6|        Jan-04|   5518|      4|   361.0|
|c14f4989|733a496b|        Jan-06|   7340|      4|  768.98|
|576eb672|aa49b0eb|        Jan-07|   9376|      4|   943.4|
|847a9cf6|b070a6bb|        Jan-08|  10362|      4|   668.3|
|12a388ff|586377aa|        Jan-08|  10817|      4|  139.84|
|b36f9c84|1b467848|        Jan-08|  11317|      4|  499.47|
|362e42e0|385afb8b|        Jan-09|  11748|      4|  357.96|
|572014da|acd60eca|        Jan-10|  13285|      4|   630.9|
|5ff2d9a7|31976e38|        Jan-11|  1483

## Create transactions feature group metadata and save it into hsfs

In [8]:
# Define the transactions feature group (keyed by tran_id), then persist the
# engineered dataframe into it.
transactions_fg = fs.create_feature_group(
    name="transactions_fg",
    version=1,
    primary_key=["tran_id"],
    # No partition_key is set. NOTE(review): partitioning by tran_timestamp was
    # considered — confirm whether it should be enabled.
    description="transactions features",
    time_travel_format=None,
    statistics_config=False,
)
transactions_fg.save(transactions_df)

<hsfs.feature_group.FeatureGroup object at 0x7fd46e62c6d0>

## Load alert transactions datasets as spark dataframe and perform feature engineering

In [9]:
# Read alert_transactions.csv from the project's Resources directory on HDFS
# (header row, inferred column types) and preview it.
alert_transactions = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/alert_transactions.csv".format(hdfs.project_name()))
)
alert_transactions.show()

+--------+--------------+------+-------+
|alert_id|    alert_type|is_sar|tran_id|
+--------+--------------+------+-------+
|      47|gather_scatter|  true|  11873|
|      47|gather_scatter|  true|  11874|
|      47|gather_scatter|  true|  11875|
|      47|gather_scatter|  true|  13151|
|      47|gather_scatter|  true|  23148|
|      17|scatter_gather|  true|  23779|
|      17|scatter_gather|  true|  23780|
|      17|scatter_gather|  true|  26441|
|      17|scatter_gather|  true|  26442|
|      47|gather_scatter|  true|  28329|
|      47|gather_scatter|  true|  31581|
|      47|gather_scatter|  true|  34310|
|      17|scatter_gather|  true|  34433|
|      58|gather_scatter|  true|  36131|
|      17|scatter_gather|  true|  36563|
|      17|scatter_gather|  true|  41430|
|      17|scatter_gather|  true|  42363|
|      58|gather_scatter|  true|  42511|
|      58|gather_scatter|  true|  44370|
|      58|gather_scatter|  true|  46176|
+--------+--------------+------+-------+
only showing top

In [10]:
# Pin the column set/order explicitly and sort the alerts by transaction id.
alert_transactions = (
    alert_transactions
    .select("alert_id", "alert_type", "is_sar", "tran_id")
    .orderBy("tran_id")
)
alert_transactions.show()

+--------+--------------+------+-------+
|alert_id|    alert_type|is_sar|tran_id|
+--------+--------------+------+-------+
|      47|gather_scatter|  true|  11873|
|      47|gather_scatter|  true|  11874|
|      47|gather_scatter|  true|  11875|
|      47|gather_scatter|  true|  13151|
|      47|gather_scatter|  true|  23148|
|      17|scatter_gather|  true|  23779|
|      17|scatter_gather|  true|  23780|
|      17|scatter_gather|  true|  26441|
|      17|scatter_gather|  true|  26442|
|      47|gather_scatter|  true|  28329|
|      47|gather_scatter|  true|  31581|
|      47|gather_scatter|  true|  34310|
|      17|scatter_gather|  true|  34433|
|      58|gather_scatter|  true|  36131|
|      17|scatter_gather|  true|  36563|
|      17|scatter_gather|  true|  41430|
|      17|scatter_gather|  true|  42363|
|      58|gather_scatter|  true|  42511|
|      58|gather_scatter|  true|  44370|
|      58|gather_scatter|  true|  46176|
+--------+--------------+------+-------+
only showing top

## Create alert transactions feature group metadata and save it into hsfs

In [11]:
# Define the alert transactions feature group (keyed by tran_id), then persist
# the alert dataframe into it.
alert_transactions_fg = fs.create_feature_group(
    name="alert_transactions_fg",
    version=1,
    primary_key=["tran_id"],
    # No partition_key is set. NOTE(review): partitioning by alert_type was
    # considered — confirm whether it should be enabled.
    description="alert transactions",
    time_travel_format=None,
    statistics_config=False,
)
alert_transactions_fg.save(alert_transactions)

<hsfs.feature_group.FeatureGroup object at 0x7fd46e198190>

## Load party datasets as spark dataframe and ingest into hsfs

In [12]:
# Read party.csv from the project's Resources directory on HDFS
# (header row, inferred column types) and preview it.
party = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/party.csv".format(hdfs.project_name()))
)
party.show()

+--------+------------+
| partyId|   partyType|
+--------+------------+
|5628bd6c|Organization|
|a1fcba39|Organization|
|f56c9501|  Individual|
|9969afdd|Organization|
|b356eeae|  Individual|
|3406706a|Organization|
|26c56102|Organization|
|e386ebf7|  Individual|
|8c094b0d|  Individual|
|939235aa|  Individual|
|de6bf2a5|Organization|
|33a8ff5b|Organization|
|a32807a1|  Individual|
|2906ef08|Organization|
|c2a01b8d|  Individual|
|5a99160f|  Individual|
|8b9017b8|Organization|
|fcf3bbf3|  Individual|
|5132aa4d|Organization|
|68b90958|  Individual|
+--------+------------+
only showing top 20 rows

In [13]:
# Encode partyType as a code and rename the columns to id/type.
# Columns are selected and aliased by name, so the result does not depend on the
# column order of the CSV.
# NOTE: this cell reassigns party, so it cannot be re-run on its own output
# (partyId/partyType no longer exist afterwards); re-run the CSV load cell first.
party = party.select(
    F.col("partyId").alias("id"),
    party_2_code_udf(F.col("partyType")).alias("type"),
)
party.show()

+--------+----+
|      id|type|
+--------+----+
|5628bd6c|   0|
|a1fcba39|   0|
|f56c9501|   1|
|9969afdd|   0|
|b356eeae|   1|
|3406706a|   0|
|26c56102|   0|
|e386ebf7|   1|
|8c094b0d|   1|
|939235aa|   1|
|de6bf2a5|   0|
|33a8ff5b|   0|
|a32807a1|   1|
|2906ef08|   0|
|c2a01b8d|   1|
|5a99160f|   1|
|8b9017b8|   0|
|fcf3bbf3|   1|
|5132aa4d|   0|
|68b90958|   1|
+--------+----+
only showing top 20 rows

In [14]:
# Define the party feature group, then persist the encoded party dataframe into it.
# The primary key must name a column of the dataframe being saved: the party
# dataframe's columns are "id" and "type" at this point, so the key is "id"
# (the raw CSV name "partyId" no longer exists).
party_fg = fs.create_feature_group(
    name="party_fg",
    version=1,
    primary_key=["id"],
    description="party fg",
    time_travel_format=None,
    statistics_config=False,
)
party_fg.save(party)

<hsfs.feature_group.FeatureGroup object at 0x7fd46e1c7290>