In [1]:
# Evaluate the kernel-provided `spark` handle; the cell output shows the Spark
# application being started and the resulting SparkSession object.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
79,application_1609265553881_0001,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fcbfd834cd0>

## Import modules

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs

## Define utility functions

In [3]:
def action_2_code(x):
    """Translate a transaction-type label into its numeric code.

    Known labels and their codes:
        CASH_IN -> 0, CASH_OUT -> 1, DEBIT -> 2, PAYMENT -> 3,
        TRANSFER -> 4, DEPOSIT -> 4

    Any other value (including None or a differently-cased label) yields 99.

    NOTE(review): DEPOSIT shares code 4 with TRANSFER; confirm this is
    intended rather than a copy/paste slip.
    """
    known_codes = (
        ("CASH_IN", 0),
        ("CASH_OUT", 1),
        ("DEBIT", 2),
        ("PAYMENT", 3),
        ("TRANSFER", 4),
        ("DEPOSIT", 4),
    )
    # Compare with == in declaration order and return on the first match.
    for label, code in known_codes:
        if x == label:
            return code
    return 99

def timestamp_2_time(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp into POSIX epoch seconds.

    Args:
        x: A value whose ``str()`` form is 'YYYY-MM-DD HH:MM:SS', optionally
            followed by '.ffffff' fractional seconds (the form ``str()``
            produces for a datetime that carries microseconds). ``None`` is
            accepted and passed through.

    Returns:
        The epoch time as a float, or ``None`` when ``x`` is ``None`` so that
        null cells stay null instead of failing the whole Spark job.

    Raises:
        ValueError: If the text does not match the expected layout.

    Note:
        The parsed datetime is naive, so ``timestamp()`` interprets it in the
        local timezone of the Python process doing the conversion.
    """
    if x is None:
        return None
    text = str(x)
    # A '.' can only appear as the fractional-seconds separator in this layout.
    layout = '%Y-%m-%d %H:%M:%S.%f' if '.' in text else '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(text, layout).timestamp()

# Wrap the helpers as Spark UDFs. The return type is spelled out as "string",
# which is what F.udf uses when no returnType is given, so both UDFs yield
# string columns unless the caller casts the result afterwards.
action_2_code_udf = F.udf(action_2_code, returnType="string")
timestamp_2_time_udf = F.udf(timestamp_2_time, returnType="string")

## Create a connection to Hopsworks feature store (hsfs)

In [4]:
# Create a connection to Hopsworks. No arguments are passed, so hsfs falls back
# to its defaults (presumably resolved from the Hopsworks-hosted environment
# this notebook runs in -- TODO confirm). The handle stays open; call
# `connection.close()` once the notebook is done with it.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store; `fs` is what
# the later cells use to create the feature groups.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load the accounts dataset as a Spark DataFrame and perform feature engineering

In [5]:
# Read the accounts CSV from the project's Resources folder, letting Spark
# infer column types from the data, then discard the personal-detail and
# location columns in a single drop call.
accounts_csv_path = "hdfs:///Projects/{}/Resources/accounts.csv".format(hdfs.project_name())

personal_detail_columns = (
    'first_name',
    'last_name',
    'street_addr',
    'city',
    'state',
    'zip',
    'gender',
    'birth_date',
    'ssn',
    'lon',
    'lat',
)

accounts_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(accounts_csv_path)
    .drop(*personal_detail_columns)
)

In [6]:
# Derive a 0/1 `prior_sar` flag from `prior_sar_count`, then drop the source
# column together with the remaining columns that are not kept as features.
# NOTE: this cell rebinds `accounts_df`, so it needs the load cell to be
# re-run before it can be executed a second time.
has_prior_sar = F.col('prior_sar_count') == 'true'

unused_account_columns = (
    "prior_sar_count",
    "acct_rptng_crncy",
    "type",
    "acct_stat",
    "open_dt",
    "bank_id",
    "country",
    "close_dt",
    "dsply_nm",
    "branch_id",
)

accounts_df = (
    accounts_df
    .withColumn('prior_sar', F.when(has_prior_sar, 1).otherwise(0))
    .drop(*unused_account_columns)
)

## Create the accounts feature group metadata and save it to hsfs

In [7]:
# Define the metadata for the accounts feature group (described as "node
# features"), keyed by `acct_id`.
accounts_fg = fs.create_feature_group(name="account_features",
                                      version=1,
                                      primary_key=["acct_id"],
                                      description="node features",
                                      # None: presumably opts out of hsfs time travel -- TODO confirm
                                      time_travel_format=None,
                                      # False: presumably disables statistics computation -- TODO confirm
                                      statistics_config=False)
# Write the engineered accounts dataframe into the feature group.
accounts_fg.save(accounts_df)

<hsfs.feature_group.FeatureGroup object at 0x7fcc09116690>

## Load the transactions dataset as a Spark DataFrame and perform feature engineering

In [8]:
# Read the transactions CSV from the project's Resources folder, treating the
# first row as the header and letting Spark infer the column types.
transactions_csv_path = "hdfs:///Projects/{}/Resources/transactions.csv".format(hdfs.project_name())

transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(transactions_csv_path)
)


In [9]:
# Column expressions for the engineered edge features.
is_sar_flag = F.when(F.col('is_sar') == 'true', 1).otherwise(0)
tx_type_code = action_2_code_udf(F.col('tx_type'))
# NOTE(review): FloatType is single precision, so epoch seconds of this
# magnitude are stored at reduced resolution; consider a wider type if exact
# seconds matter downstream.
tran_epoch_seconds = timestamp_2_time_udf(F.col('tran_timestamp')).cast(FloatType())

# Final column order of the edge feature table.
edge_feature_columns = [
    "source",
    "target",
    "tran_timestamp",
    "is_sar",
    "alert_id",
    "tran_id",
    "tx_type",
    "base_amt",
]

# Overwrite the raw columns with their encoded versions, rename the account
# endpoints to source/target, and keep only the edge feature columns.
transactions_df = (
    transactions_df
    .withColumn('is_sar', is_sar_flag)
    .withColumn('tx_type', tx_type_code)
    .withColumn('tran_timestamp', tran_epoch_seconds)
    .withColumnRenamed("orig_acct", "source")
    .withColumnRenamed("bene_acct", "target")
    .select(*edge_feature_columns)
)
transactions_df.show()

+------+------+--------------+------+--------+-------+-------+--------+
|source|target|tran_timestamp|is_sar|alert_id|tran_id|tx_type|base_amt|
+------+------+--------------+------+--------+-------+-------+--------+
|   218|    78|   1.4832288E9|     0|      -1|      1|      4|  458.69|
|   213|    95|   1.4832288E9|     0|      -1|      2|      4|  537.69|
|   191|    74|   1.4832288E9|     0|      -1|      3|      4|  139.61|
|   166|   197|   1.4832288E9|     0|      -1|      4|      4|  717.61|
|    16|    46|   1.4832288E9|     0|      -1|      5|      4|  275.56|
|   100|   296|   1.4832288E9|     0|      -1|      6|      4|  870.63|
|   202|    76|   1.4832288E9|     1|       0|      7|      4|  157.52|
|    69|   229|   1.4832288E9|     0|      -1|      8|      4|  498.12|
|    97|   121|   1.4832288E9|     0|      -1|      9|      4|  451.32|
|     9|    62|   1.4832288E9|     0|      -1|     10|      4|  688.63|
|   118|    77|   1.4832288E9|     0|      -1|     11|      4|  

## Create the transactions feature group metadata and save it to hsfs

In [10]:
# Define the metadata for the transactions feature group (described as "edge
# features"), keyed by `tran_id`.
transactions_fg = fs.create_feature_group(name="transactions_fg",
                                       version=1,
                                       primary_key=["tran_id"],
                                       description="edge features",
                                       # None: presumably opts out of hsfs time travel -- TODO confirm
                                       time_travel_format=None,                                        
                                       # False: presumably disables statistics computation -- TODO confirm
                                       statistics_config=False)
# Write the engineered transactions dataframe into the feature group.
transactions_fg.save(transactions_df)

<hsfs.feature_group.FeatureGroup object at 0x7fcc08cb6dd0>