In [1]:
# Evaluate the session handle so the notebook displays it; in the recorded run
# this is the cell that started the Spark application.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
81,application_1609265553881_0003,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f51d2776f90>

## Import modules

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs

## Define utility functions

In [3]:
def action_2_code(x):
    """Translate a transaction-type label into its integer code.

    Only exact matches are recognised:
    CASH_IN -> 0, CASH_OUT -> 1, DEBIT -> 2, PAYMENT -> 3,
    TRANSFER -> 4, DEPOSIT -> 4. Every other value yields 99.

    NOTE(review): DEPOSIT and TRANSFER share code 4, and hyphenated labels
    such as "TRANSFER-Forward" are not exact matches, so they map to 99 --
    confirm both are intended.
    """
    known_codes = (
        ("CASH_IN", 0),
        ("CASH_OUT", 1),
        ("DEBIT", 2),
        ("PAYMENT", 3),
        ("TRANSFER", 4),
        ("DEPOSIT", 4),
    )
    # Compare with == in declaration order so that any input (including
    # None or unhashable values) is handled without raising.
    for label, code in known_codes:
        if x == label:
            return code
    return 99

def timestamp_2_time(x):
    """Convert a timestamp value to POSIX epoch seconds.

    Args:
        x: A ``datetime`` instance, or any object whose ``str()`` has the
            form ``YYYY-MM-DD HH:MM:SS`` (an optional ``.ffffff`` fractional
            part is also accepted). ``None`` is passed through.

    Returns:
        Seconds since the epoch as returned by ``datetime.timestamp()``, or
        ``None`` when ``x`` is ``None``. Naive values are interpreted in the
        local timezone of the process doing the conversion.

    Raises:
        ValueError: If the text form of ``x`` matches neither layout.
    """
    if x is None:
        # A null cell would otherwise be parsed as the text "None" and raise.
        return None
    if isinstance(x, datetime):
        # Already a datetime: skip the str()/strptime round trip, which
        # rejects values that carry microseconds or a UTC offset.
        return x.timestamp()
    text = str(x)
    try:
        dt_obj = datetime.strptime(text, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Same layout with a fractional-seconds suffix.
        dt_obj = datetime.strptime(text, '%Y-%m-%d %H:%M:%S.%f')
    return dt_obj.timestamp()

# Wrap the helpers as Spark UDFs. "string" spells out the return type that
# F.udf applies when none is given, so the resulting columns are strings
# until they are cast.
action_2_code_udf = F.udf(action_2_code, "string")
timestamp_2_time_udf = F.udf(timestamp_2_time, "string")

## Create a connection to the Hopsworks feature store (hsfs)

In [4]:
# Open a connection to Hopsworks using the defaults (no arguments are passed).
# The recorded output notes that `.close()` terminates it gracefully.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store; `fs` is the
# object the feature-group cells call `create_feature_group` on.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Load the accounts dataset as a Spark DataFrame and perform feature engineering

In [6]:
# Read accounts.csv from the project's Resources folder, taking column names
# from the header row and letting Spark infer the column types.
accounts_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/typologies/typologies/accounts.csv".format(hdfs.project_name()))
)

# Remove the columns listed below in a single drop call.
accounts_df = accounts_df.drop(
    'first_name',
    'last_name',
    'street_addr',
    'city',
    'state',
    'zip',
    'gender',
    'birth_date',
    'ssn',
    'lon',
    'lat',
)

In [7]:
# Add prior_sar as a 0/1 flag (1 where prior_sar_count equals 'true', 0 for
# anything else), then drop the source column and the other listed columns.
# NOTE(review): this overwrites accounts_df, so a second run fails to find
# prior_sar_count -- re-run the load cell first.
accounts_df = (
    accounts_df
    .withColumn(
        'prior_sar',
        F.when(F.col('prior_sar_count') == 'true', 1).otherwise(0),
    )
    .drop(
        "prior_sar_count",
        "acct_rptng_crncy",
        "type",
        "acct_stat",
        "open_dt",
        "bank_id",
        "country",
        "close_dt",
        "dsply_nm",
        "branch_id",
    )
)

## Load the transactions dataset as a Spark DataFrame and perform feature engineering

In [21]:
# Read transactions.csv from the project's Resources folder, taking column
# names from the header row and letting Spark infer the column types.
transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/transactions.csv".format(hdfs.project_name()))
)
transactions_df.show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|      1|     7720|     9769|   TRANSFER-Forward|   721.5|2020-01-01 00:00:00| false|      -1|
|      2|     8148|     2780|TRANSFER-Periodical|   719.6|2020-01-01 00:00:00| false|      -1|
|      3|     8148|     9182|TRANSFER-Periodical|  817.15|2020-01-01 00:00:00| false|      -1|
|      4|     8148|     8612|TRANSFER-Periodical|  931.53|2020-01-01 00:00:00| false|      -1|
|      5|      440|     2608|    TRANSFER-FanOut|  454.24|2020-01-01 00:00:00| false|      -1|
|      6|     5126|     3115|    TRANSFER-Mutual|  683.79|2020-01-01 00:00:00| false|      -1|
|      7|      894|     2759|    TRANSFER-FanOut|  682.61|2020-01-01 00:00:00| false|      -1|
|      8|     1316|     9766|    TRANSFER-FanOut| 

In [None]:
# Read alert_accounts.csv from the project's Resources folder, taking column
# names from the header row and letting Spark infer the column types.
alert_accounts_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/alert_accounts.csv".format(hdfs.project_name()))
)

alert_accounts_df.show()
#alert_accounts_df.orderBy("alert_id").show()

In [None]:
# Number of rows loaded from alert_accounts.csv.
alert_accounts_df.count()

In [None]:
# Read alert_transactions.csv from the project's Resources folder, taking
# column names from the header row and letting Spark infer the column types,
# then preview the rows ordered by alert_id.
alert_transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/alert_transactions.csv".format(hdfs.project_name()))
)
alert_transactions_df.orderBy("alert_id").show()

In [None]:
# Number of rows loaded from alert_transactions.csv.
alert_transactions_df.count()

In [None]:
# Keep transactions whose originator or beneficiary is an account listed in
# alert_accounts_df, then collapse to one row per tran_id.
# NOTE(review): a transaction can match several alert-account rows; which of
# them dropDuplicates keeps is not pinned down here.
alert_related_transactions_df = (
    transactions_df
    .join(
        alert_accounts_df,
        on=(
            (alert_accounts_df.acct_id == transactions_df.orig_acct)
            | (alert_accounts_df.acct_id == transactions_df.bene_acct)
        ),
        how="inner",
    )
    .dropDuplicates(subset=['tran_id'])
)
alert_related_transactions_df.show()

In [None]:
# Number of distinct transactions (by tran_id) that involve an alert account.
alert_related_transactions_df.count()

In [22]:
# Every transaction with account 165 as originator or beneficiary.
transactions_df.where(
    (F.col("orig_acct") == 165) | (F.col("bene_acct") == 165)
).show()

+-------+---------+---------+---------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|        tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+---------------+--------+-------------------+------+--------+
|    708|      165|     8297|TRANSFER-FanOut|  388.45|2020-01-01 00:00:00| false|      -1|
|  21817|      165|      645|TRANSFER-FanOut|  202.01|2020-01-08 00:00:00| false|      -1|
|  40184|      165|     8945|TRANSFER-FanOut|  647.11|2020-01-15 00:00:00| false|      -1|
|  58399|      165|     8297|TRANSFER-FanOut|  558.09|2020-01-22 00:00:00| false|      -1|
|  67968|      619|      165|TRANSFER-FanOut|  480.41|2020-01-26 00:00:00| false|      -1|
|  78332|      165|      645|TRANSFER-FanOut| 1029.48|2020-01-29 00:00:00| false|      -1|
+-------+---------+---------+---------------+--------+-------------------+------+--------+

In [26]:
# Transactions sent by account 619.
transactions_df.where(F.col("orig_acct") == 619).show()

+-------+---------+---------+---------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|        tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+---------------+--------+-------------------+------+--------+
|  13594|      619|     9942|TRANSFER-FanOut|  955.18|2020-01-05 00:00:00| false|      -1|
|  32497|      619|     1353|TRANSFER-FanOut|  899.84|2020-01-12 00:00:00| false|      -1|
|  51560|      619|     9546|TRANSFER-FanOut|  123.98|2020-01-19 00:00:00| false|      -1|
|  67968|      619|      165|TRANSFER-FanOut|  480.41|2020-01-26 00:00:00| false|      -1|
+-------+---------+---------+---------------+--------+-------------------+------+--------+

In [27]:
# Transactions received by account 619.
transactions_df.where(F.col("bene_acct") == 619).show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|  14758|     9262|      619|TRANSFER-Periodical|  790.72|2020-01-06 00:00:00| false|      -1|
|  35069|     9262|      619|TRANSFER-Periodical|   158.1|2020-01-13 00:00:00| false|      -1|
|  53272|     9262|      619|TRANSFER-Periodical|  700.75|2020-01-20 00:00:00| false|      -1|
|  73426|     9262|      619|TRANSFER-Periodical|  101.98|2020-01-27 00:00:00| false|      -1|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+

In [28]:
# Transactions sent by account 9262.
transactions_df.where(F.col("orig_acct") == 9262).show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|   6173|     9262|     7108|TRANSFER-Periodical|  185.58|2020-01-03 00:00:00|  true|      21|
|  14756|     9262|     6850|TRANSFER-Periodical|  898.97|2020-01-06 00:00:00| false|      -1|
|  14757|     9262|     8569|TRANSFER-Periodical|   725.0|2020-01-06 00:00:00| false|      -1|
|  14758|     9262|      619|TRANSFER-Periodical|  790.72|2020-01-06 00:00:00| false|      -1|
|  14759|     9262|     8209|TRANSFER-Periodical|  189.24|2020-01-06 00:00:00| false|      -1|
|  14760|     9262|     3395|TRANSFER-Periodical|   208.1|2020-01-06 00:00:00| false|      -1|
|  35067|     9262|     6850|TRANSFER-Periodical|   513.2|2020-01-13 00:00:00| false|      -1|
|  35068|     9262|     8569|TRANSFER-Periodical| 

In [29]:
# Transactions received by account 9262.
transactions_df.where(F.col("bene_acct") == 9262).show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|   5068|     1765|     9262|    TRANSFER-FanOut|   448.4|2020-01-02 00:00:00| false|      -1|
|   6257|     7328|     9262|   TRANSFER-Forward|  146.71|2020-01-03 00:00:00| false|      -1|
|  14222|     9474|     9262|TRANSFER-Periodical|  731.25|2020-01-06 00:00:00| false|      -1|
|  24324|     6892|     9262|   TRANSFER-Forward|   408.3|2020-01-09 00:00:00| false|      -1|
|  26685|     7035|     9262|   TRANSFER-Forward|  400.55|2020-01-10 00:00:00| false|      -1|
|  33868|     9474|     9262|TRANSFER-Periodical|  153.61|2020-01-13 00:00:00| false|      -1|
|  38951|     6711|     9262|   TRANSFER-Forward|  415.45|2020-01-15 00:00:00| false|      -1|
|  53055|     9474|     9262|TRANSFER-Periodical| 

In [23]:
# Distinct tx_type values among rows whose alert_id is -1.
# NOTE(review): -1 presumably marks transactions without an alert -- confirm.
(
    transactions_df
    .where(F.col("alert_id") == -1)
    .select("tx_type")
    .dropDuplicates()
    .show()
)

+-------------------+
|            tx_type|
+-------------------+
|   TRANSFER-Forward|
|TRANSFER-Periodical|
|    TRANSFER-FanOut|
|    TRANSFER-Mutual|
|     TRANSFER-FanIn|
+-------------------+

In [24]:
# Distinct tx_type values among rows whose alert_id is anything other than -1.
(
    transactions_df
    .where(F.col("alert_id") != -1)
    .select("tx_type")
    .dropDuplicates()
    .show()
)

+-------------------+
|            tx_type|
+-------------------+
|   TRANSFER-Forward|
|TRANSFER-Periodical|
|    TRANSFER-FanOut|
|    TRANSFER-Mutual|
|     TRANSFER-FanIn|
+-------------------+

In [32]:
# Transactions sent by account 7950.
transactions_df.where(F.col("orig_acct") == 7950).show()

+-------+---------+---------+----------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|         tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+----------------+--------+-------------------+------+--------+
|  18060|     7950|     1374|TRANSFER-Forward|  197.05|2020-01-07 00:00:00| false|      -1|
|  36357|     7950|     4631|TRANSFER-Forward|  594.77|2020-01-14 00:00:00| false|      -1|
|  56287|     7950|     9191|TRANSFER-Forward|  153.27|2020-01-21 00:00:00| false|      -1|
|  75210|     7950|     6405|TRANSFER-Forward|  173.72|2020-01-28 00:00:00| false|      -1|
+-------+---------+---------+----------------+--------+-------------------+------+--------+

In [33]:
# Transactions received by account 7950.
transactions_df.where(F.col("bene_acct") == 7950).show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|   1122|     8485|     7950|TRANSFER-Periodical|  194.53|2020-01-01 00:00:00|  true|       0|
|   3071|     3838|     7950|     TRANSFER-FanIn|  194.53|2020-01-02 00:00:00|  true|       0|
|   5015|     9624|     7950|TRANSFER-Periodical|  224.25|2020-01-02 00:00:00| false|      -1|
|   6753|     9726|     7950|TRANSFER-Periodical|  385.45|2020-01-03 00:00:00| false|      -1|
|   7847|     2145|     7950|     TRANSFER-FanIn|  194.53|2020-01-03 00:00:00|  true|       0|
|   9496|     8913|     7950|TRANSFER-Periodical|  194.53|2020-01-04 00:00:00|  true|       0|
|  12069|     9854|     7950|TRANSFER-Periodical|  221.99|2020-01-05 00:00:00| false|      -1|
|  12308|     6075|     7950|   TRANSFER-Forward| 

In [13]:
# Re-encode the raw transaction columns as edge features:
#   * is_sar         -> 1 where the value equals 'true', otherwise 0
#   * tx_type        -> code produced by action_2_code
#   * tran_timestamp -> value returned by timestamp_2_time, cast to FloatType
#   * orig_acct / bene_acct are renamed to source / target
# NOTE(review): this cell overwrites transactions_df, so running it a second
# time operates on already-converted columns -- reload the CSV before re-running.
# NOTE(review): FloatType is single precision; confirm it is precise enough
# for epoch-second values of this magnitude.
transactions_df = (
    transactions_df
    .withColumn('is_sar', F.when(F.col('is_sar') == 'true', 1).otherwise(0))
    .withColumn('tx_type', action_2_code_udf(F.col('tx_type')))
    .withColumn(
        'tran_timestamp',
        timestamp_2_time_udf(F.col('tran_timestamp')).cast(FloatType()),
    )
    .withColumnRenamed("orig_acct", "source")
    .withColumnRenamed("bene_acct", "target")
    .select(
        "source",
        "target",
        "tran_timestamp",
        "is_sar",
        "alert_id",
        "tran_id",
        "tx_type",
        "base_amt",
    )
)
transactions_df.show()

+-------+------+------+-------+--------+--------------+------+--------+
|tran_id|source|target|tx_type|base_amt|tran_timestamp|is_sar|alert_id|
+-------+------+------+-------+--------+--------------+------+--------+
|      1|   165|    36|     99|  458.69|   1.5778368E9|     0|      -1|
|      2|   199|   194|     99|  537.69|   1.5778368E9|     0|      -1|
|      3|   116|    95|     99|  139.61|   1.5778368E9|     0|      -1|
|      4|    16|    46|     99|  717.61|   1.5778368E9|     0|      -1|
|      5|   176|    32|     99|  275.56|   1.5778368E9|     0|      -1|
|      6|   295|    78|     99|  870.63|   1.5778368E9|     0|      -1|
|      7|    80|   268|     99|  498.12|   1.5778368E9|     0|      -1|
|      8|   178|    59|     99|  451.32|   1.5778368E9|     0|      -1|
|      9|    69|   229|     99|  688.63|   1.5778368E9|     0|      -1|
|     10|   167|   251|     99|  395.89|   1.5778368E9|     0|      -1|
|     11|   281|   192|     99|  153.48|   1.5778368E9|     0|  

## Create the accounts feature group metadata and save it to hsfs

In [None]:
# Define the "account_features" feature group (version 1, keyed on acct_id)
# and write accounts_df into it.
accounts_fg = fs.create_feature_group(
    name="account_features",
    version=1,
    primary_key=["acct_id"],
    description="node features",
    time_travel_format=None,
    statistics_config=False,
)
accounts_fg.save(accounts_df)

## Create the transactions feature group metadata and save it to hsfs

In [10]:
# Define the "transactions_fg" feature group (version 1, keyed on tran_id)
# and write transactions_df into it.
transactions_fg = fs.create_feature_group(
    name="transactions_fg",
    version=1,
    primary_key=["tran_id"],
    description="edge features",
    time_travel_format=None,
    statistics_config=False,
)
transactions_fg.save(transactions_df)

<hsfs.feature_group.FeatureGroup object at 0x7fcc08cb6dd0>