## PaySim Dataset

##### <b>step</b> - integer - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

##### <b>type</b> - string/categorical - type of transaction: CASH_IN, CASH_OUT, DEBIT, PAYMENT and TRANSFER.

##### <b>amount</b> - float - amount of the transaction in local currency.

##### <b>nameOrig</b> - string - customer who initiated the transaction

##### <b>oldbalanceOrg</b> - float initial balance before the transaction

##### <b>newbalanceOrig</b> - float - new balance after the transaction

##### <b>nameDest</b> - string - customer who is the recipient of the transaction

##### <b>oldbalanceDest</b> - float - initial balance of recipient before the transaction.

##### <b>newbalanceDest</b> - float - new balance of recipient after the transaction.

##### <b>fraud</b> - boolean/binary - determines if transaction is fraudulent

In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
24,application_1606840862501_0004,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f748579ddd0>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs

In [3]:
def hashnode(x):
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

def extract_node_type(x):
    if (x.startswith("C")):
        node_type = 0
    elif (x.startswith("B")):
        node_type = 1
    elif (x.startswith("M")):
        node_type = 2
    else:
        node_type = 99
    return node_type

def extract_fraudster(x):
    if (x.startswith("CF")):
        fraudster = 1
    else:
        fraudster = 0
    return fraudster

def action_2_code(x):
    if (x == "CASH_IN"):
        node_type = 0
    elif (x == "CASH_OUT"):
        node_type = 1
    elif (x == "DEBIT"):
        node_type = 2
    elif (x == "PAYMENT"):
        node_type = 3
    elif (x == "TRANSFER"):
        node_type = 4
    elif (x == "DEPOSIT"):
        node_type = 4        
    else:
        node_type = 99
    return node_type

def gender_2_code(x):
    if x == 'W':
        gender =  0
    elif x == 'M':
        gender = 1
    else: 
        gender = 99
    return gender

def timestamp_2_time(x):
    dt_obj = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    return dt_obj.timestamp()

hashnode_udf = func.udf(hashnode)
extract_fraudster_udf = func.udf(extract_fraudster)
node_type_udf = func.udf(extract_node_type)
action_2_code_udf = func.udf(action_2_code)
gender_2_code_udf = func.udf(gender_2_code)
timestamp_2_time_udf = func.udf(timestamp_2_time)

In [4]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
transactions_fg = fs.get_feature_group("aml_10k_transactions_fg", 1)

In [6]:
transactions_fg.show(5)

+-------------------+------+--------+-------+---------+--------+---------+--------+
|     tran_timestamp|is_sar|alert_id|tran_id|bene_acct| tx_type|orig_acct|base_amt|
+-------------------+------+--------+-------+---------+--------+---------+--------+
|2017-01-01 00:00:00| false|      -1|      1|     3259|TRANSFER|     1767| 9405.71|
|2017-01-01 00:00:00| false|      -1|      2|     5141|TRANSFER|     7363| 6884.54|
|2017-01-01 00:00:00| false|      -1|      3|     9532|TRANSFER|     7585|  7968.4|
|2017-01-01 00:00:00| false|      -1|      4|     8792|TRANSFER|     1750| 9042.67|
|2017-01-01 00:00:00| false|      -1|      5|     4670|TRANSFER|     9060| 4692.79|
+-------------------+------+--------+-------+---------+--------+---------+--------+
only showing top 5 rows

In [7]:
transactions_df = transactions_fg.read()
transactions_df = transactions_df.withColumn('is_sar',func.when(func.col('is_sar') == 'true', 1).otherwise(0))\
                                 .withColumn('tx_type', action_2_code_udf(func.col('tx_type')))\
                                 .withColumn('tran_timestamp', timestamp_2_time_udf(func.col('tran_timestamp')).cast(FloatType()))\
                                 .withColumnRenamed("orig_acct","source")\
                                 .withColumnRenamed("bene_acct","target")\
                                 .select("source","target","tran_timestamp","is_sar","alert_id","tran_id","tx_type","base_amt")
transactions_df.show()
transactions_df.printSchema()

+------+------+--------------+------+--------+-------+-------+--------+
|source|target|tran_timestamp|is_sar|alert_id|tran_id|tx_type|base_amt|
+------+------+--------------+------+--------+-------+-------+--------+
|  1767|  3259|   1.4832288E9|     0|      -1|      1|      4| 9405.71|
|  7363|  5141|   1.4832288E9|     0|      -1|      2|      4| 6884.54|
|  7585|  9532|   1.4832288E9|     0|      -1|      3|      4|  7968.4|
|  1750|  8792|   1.4832288E9|     0|      -1|      4|      4| 9042.67|
|  9060|  4670|   1.4832288E9|     0|      -1|      5|      4| 4692.79|
|  8752|  3861|   1.4832288E9|     0|      -1|      6|      4| 4089.65|
|  9645|  3805|   1.4832288E9|     0|      -1|      7|      4| 3055.04|
|  7150|  9280|   1.4832288E9|     0|      -1|      8|      4| 6473.45|
|  1685|  6369|   1.4832288E9|     0|      -1|      9|      4| 2583.42|
|  7846|  8255|   1.4832288E9|     0|      -1|     10|      4| 6753.04|
|   878|  5957|   1.4832288E9|     0|      -1|     11|      4| 1

In [8]:
accounts_fg = fs.get_feature_group("aml_10k_accounts_fg", 1)
accounts_fg.show(5)

+---------------+---------------+-------------------+--------+----------------+---------+------+-------+----+---------+-------------------+--------------+-------+-------+---+
|prior_sar_count|initial_deposit|           close_dt|dsply_nm|acct_rptng_crncy|branch_id|gender|acct_id|type|acct_stat|            open_dt|tx_behavior_id|bank_id|country|age|
+---------------+---------------+-------------------+--------+----------------+---------+------+-------+----+---------+-------------------+--------------+-------+-------+---+
|          false|       84442.19|4754-11-29 00:00:00|     C_0|             USD|        1|     W|      0| SAV|        A|2017-01-01 00:00:00|             1|   bank|     US| 37|
|          false|       75795.44|4754-11-29 00:00:00|     C_1|             USD|        1|     M|      1| SAV|        A|2017-01-01 00:00:00|             1|   bank|     US| 76|
|          false|       42057.16|4754-11-29 00:00:00|     C_2|             USD|        1|     M|      2| SAV|        A|2017-0

In [9]:
accounts_df = accounts_fg.read()
accounts_df = accounts_df.withColumn('prior_sar',func.when(func.col('prior_sar_count') == 'true', 1).otherwise(0))\
                         .withColumn('gender', gender_2_code_udf(func.col('gender')))\
                         .drop("prior_sar_count","acct_rptng_crncy","type","acct_stat","open_dt","bank_id","country","close_dt","dsply_nm","branch_id")
        

accounts_df.show(5)
accounts_df.printSchema()

+---------------+------+-------+--------------+---+---------+
|initial_deposit|gender|acct_id|tx_behavior_id|age|prior_sar|
+---------------+------+-------+--------------+---+---------+
|       84442.19|     0|      0|             1| 37|        0|
|       75795.44|     1|      1|             1| 76|        0|
|       42057.16|     1|      2|             1| 61|        0|
|       25891.68|     0|      3|             1| 95|        0|
|       51127.47|     0|      4|             1| 64|        0|
+---------------+------+-------+--------------+---+---------+
only showing top 5 rows

root
 |-- initial_deposit: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- acct_id: integer (nullable = true)
 |-- tx_behavior_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- prior_sar: integer (nullable = false)

In [10]:
edge_fg_meta = fs.create_feature_group(name="edge_features",
                                       version=1,
                                       primary_key=["tran_id"],
                                       description="edge features",
                                       time_travel_format=None,                                        
                                       statistics_config=False)
edge_fg_meta.save(transactions_df)

<hsfs.feature_group.FeatureGroup object at 0x7f7490b74290>

In [11]:
node_fg_meta = fs.create_feature_group(name="node_features",
                                              version=1,
                                              primary_key=["acct_id"],
                                              description="node features",
                                              time_travel_format=None,                                        
                                              statistics_config=False)
node_fg_meta.save(accounts_df)

<hsfs.feature_group.FeatureGroup object at 0x7f7490b4b610>

In [14]:
edge_td_meta = fs.create_training_dataset(name="edge_td",
                                       version=1,
                                       data_format="csv",   
                                       description="edge td")
edge_td_meta.save(transactions_df)

<hsfs.training_dataset.TrainingDataset object at 0x7f7490b60550>

In [16]:
node_td_meta = fs.create_training_dataset(name="node_td",
                                       version=1,
                                       data_format="csv",   
                                       description="edge td")
node_td_meta.save(accounts_df)

<hsfs.training_dataset.TrainingDataset object at 0x7f7490b41290>