In [1]:
# Evaluate the pre-created SparkSession so the cell output shows the session that will be used.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
83,application_1609326166272_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f27436b7ed0>

## Import modules

In [2]:
import hashlib
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from graphframes import *
import hsfs
from hops import hdfs

## Define utility functions

In [3]:
def action_2_code(x):
    """Translate a transaction action label into its integer code.

    Known labels map as CASH_IN=0, CASH_OUT=1, DEBIT=2, PAYMENT=3,
    TRANSFER=4 and DEPOSIT=4; any other value yields 99.
    """
    # Pairs are tested in order with ``==``, mirroring an if/elif chain.
    # NOTE(review): DEPOSIT shares code 4 with TRANSFER -- confirm this is intended.
    label_codes = (
        ("CASH_IN", 0),
        ("CASH_OUT", 1),
        ("DEBIT", 2),
        ("PAYMENT", 3),
        ("TRANSFER", 4),
        ("DEPOSIT", 4),
    )
    for label, code in label_codes:
        if x == label:
            return code
    # Fallback code for unrecognised labels.
    return 99

def timestamp_2_time(x):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` value into a POSIX timestamp (float seconds).

    ``x`` is stringified before parsing, so anything whose ``str()`` matches the
    format is accepted. The parsed datetime is naive, so ``timestamp()``
    interprets it in the local timezone of the running process.
    """
    return datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').timestamp()

# Wrap the helpers as Spark UDFs so they can be applied to DataFrame columns.
# NOTE(review): no returnType is passed, so F.udf uses its default return type
# (StringType according to the PySpark docs) even though the helpers return an
# int and a float -- confirm string-typed results are what downstream code expects.
action_2_code_udf = F.udf(action_2_code)
timestamp_2_time_udf = F.udf(timestamp_2_time)

## Load the accounts dataset as a Spark DataFrame and perform feature engineering

In [4]:
# Read the accounts CSV from the project's Resources directory on HDFS; the first
# row supplies the column names and the column types are inferred from the data.
accounts_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/accounts.csv".format(hdfs.project_name()))
)

# Discard the personal-identity, address and location columns in a single call.
accounts_df = accounts_df.drop(
    'first_name',
    'last_name',
    'street_addr',
    'city',
    'state',
    'zip',
    'gender',
    'birth_date',
    'ssn',
    'lon',
    'lat',
)

In [5]:
# Derive the 0/1 flag ``prior_sar`` (1 when prior_sar_count equals 'true'), then
# drop the source column together with the other columns not kept as features.
# This cell consumes prior_sar_count, so it cannot be re-run on its own output;
# re-run the accounts load cell first.
accounts_df = (
    accounts_df
    .withColumn('prior_sar', F.when(F.col('prior_sar_count') == 'true', 1).otherwise(0))
    .drop(
        "prior_sar_count",
        "acct_rptng_crncy",
        "type",
        "acct_stat",
        "open_dt",
        "bank_id",
        "country",
        "close_dt",
        "dsply_nm",
        "branch_id",
    )
)

In [6]:
# Print the first rows of the engineered account features.
accounts_df.show()

+-------+---------------+--------------+---------+
|acct_id|initial_deposit|tx_behavior_id|prior_sar|
+-------+---------------+--------------+---------+
|      0|       92221.09|             1|        0|
|      1|       87897.72|             1|        0|
|      2|       71028.58|             1|        0|
|      3|       62945.84|             1|        0|
|      4|       75563.74|             1|        0|
|      5|       70246.71|             1|        0|
|      6|       89189.93|             1|        0|
|      7|       65165.64|             1|        0|
|      8|       73829.85|             1|        0|
|      9|        79169.1|             1|        0|
|     10|       95405.64|             1|        0|
|     11|       75234.34|             1|        0|
|     12|       64091.89|             1|        0|
|     13|       87790.21|             1|        0|
|     14|       80918.45|             1|        0|
|     15|       62525.32|             1|        0|
|     16|       95487.31|      

In [7]:
# List the distinct tx_behavior_id values present in the accounts data.
accounts_df.select("tx_behavior_id").dropDuplicates().show()

+--------------+
|tx_behavior_id|
+--------------+
|             1|
|             3|
|             5|
|             4|
|             2|
+--------------+

In [8]:
# Spot-check the engineered feature row of a single account (acct_id 7950).
accounts_df.where(F.col("acct_id")==7950).show()

+-------+---------------+--------------+---------+
|acct_id|initial_deposit|tx_behavior_id|prior_sar|
+-------+---------------+--------------+---------+
|   7950|       64926.12|             4|        1|
+-------+---------------+--------------+---------+

## Load the transactions datasets as Spark DataFrames and perform feature engineering

In [9]:
# Read the transactions CSV from the project's Resources directory on HDFS
# (header row for column names, inferred column types) and print its first rows.
transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/transactions.csv".format(hdfs.project_name()))
)
transactions_df.show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|      1|     7720|     9769|   TRANSFER-Forward|   721.5|2020-01-01 00:00:00| false|      -1|
|      2|     8148|     2780|TRANSFER-Periodical|   719.6|2020-01-01 00:00:00| false|      -1|
|      3|     8148|     9182|TRANSFER-Periodical|  817.15|2020-01-01 00:00:00| false|      -1|
|      4|     8148|     8612|TRANSFER-Periodical|  931.53|2020-01-01 00:00:00| false|      -1|
|      5|      440|     2608|    TRANSFER-FanOut|  454.24|2020-01-01 00:00:00| false|      -1|
|      6|     5126|     3115|    TRANSFER-Mutual|  683.79|2020-01-01 00:00:00| false|      -1|
|      7|      894|     2759|    TRANSFER-FanOut|  682.61|2020-01-01 00:00:00| false|      -1|
|      8|     1316|     9766|    TRANSFER-FanOut| 

In [10]:
# Read the alert-accounts CSV from the project's Resources directory on HDFS
# (header row for column names, inferred column types) and print its first rows.
alert_accounts_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/alert_accounts.csv".format(hdfs.project_name()))
)

alert_accounts_df.show()
# Alternative view, ordered by alert:
#alert_accounts_df.orderBy("alert_id").show()

+--------+----------+-------+---------+------+--------+-----+-------+-----------+-------+
|alert_id|alert_type|acct_id|acct_name|is_sar|model_id|start|    end|schedule_id|bank_id|
+--------+----------+-------+---------+------+--------+-----+-------+-----------+-------+
|       0|    fan_in|   7950|     7950|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   8485|     8485|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   3838|     3838|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   2145|     2145|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   8913|     8913|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   6075|     6075|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   2322|     2322|  true|       2|    0|1000000|          2|   bank|
|       0|    fan_in|   3948|     3948|  true|       2|    0|1000000|          2|   bank|
|       0|

In [11]:
# Number of rows in the alert-accounts data.
alert_accounts_df.count()

770

In [12]:
# Read the alert-transactions CSV from the project's Resources directory on HDFS
# (header row for column names, inferred column types) and print its first rows
# ordered by alert_id.
alert_transactions_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///Projects/{}/Resources/10K/10K/alert_transactions.csv".format(hdfs.project_name()))
)
alert_transactions_df.orderBy("alert_id").show()

+--------+----------+------+-------+---------+---------+-------------------+--------+-------------------+
|alert_id|alert_type|is_sar|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|
+--------+----------+------+-------+---------+---------+-------------------+--------+-------------------+
|       0|    fan_in|  true|   3071|     3838|     7950|     TRANSFER-FanIn|  194.53|2020-01-02 00:00:00|
|       0|    fan_in|  true|  12308|     6075|     7950|   TRANSFER-Forward|  194.53|2020-01-05 00:00:00|
|       0|    fan_in|  true|   9496|     8913|     7950|TRANSFER-Periodical|  194.53|2020-01-04 00:00:00|
|       0|    fan_in|  true|  14316|     2322|     7950|     TRANSFER-FanIn|  194.53|2020-01-06 00:00:00|
|       0|    fan_in|  true|  18059|     3948|     7950|     TRANSFER-FanIn|  194.53|2020-01-07 00:00:00|
|       0|    fan_in|  true|   7847|     2145|     7950|     TRANSFER-FanIn|  194.53|2020-01-03 00:00:00|
|       0|    fan_in|  true|  19638|     6419|

In [13]:
# Number of rows in the alert-transactions data.
alert_transactions_df.count()

710

In [14]:
# Collect every transaction in which an alert account is either the originator or
# the beneficiary.
alert_acct_id_df = alert_accounts_df.select("acct_id")

# A transaction qualifies when either of its two endpoints is an alert account.
touches_alert_account = (
    (alert_acct_id_df.acct_id == transactions_df.orig_acct)
    | (alert_acct_id_df.acct_id == transactions_df.bene_acct)
)

# The join emits one row per matching alert account, so a transaction can appear
# more than once; keep a single row per tran_id and drop the helper join column.
alert_related_transactions_df = (
    transactions_df
    .join(alert_acct_id_df, [touches_alert_account], how="inner")
    .dropDuplicates(subset=['tran_id'])
    .drop("acct_id")
)
alert_related_transactions_df.show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|    148|     6944|     8315|   TRANSFER-Forward|  927.66|2020-01-01 00:00:00| false|      -1|
|   1580|     9915|     8387|TRANSFER-Periodical|  662.87|2020-01-01 00:00:00| false|      -1|
|   2142|     3367|     3484|     TRANSFER-FanIn|   452.9|2020-01-01 00:00:00| false|      -1|
|   5156|     9794|     8523|TRANSFER-Periodical|  337.62|2020-01-02 00:00:00| false|      -1|
|   5518|      254|     8605|    TRANSFER-FanOut|  188.55|2020-01-03 00:00:00| false|      -1|
|   6336|     9230|     8311|TRANSFER-Periodical|  253.46|2020-01-03 00:00:00| false|      -1|
|   7754|     9834|     8059|TRANSFER-Periodical|  787.57|2020-01-03 00:00:00| false|      -1|
|   8638|     9813|     8108|TRANSFER-Periodical| 

In [15]:
# Number of transactions that involve at least one alert account.
alert_related_transactions_df.count()

13271

In [16]:
# Show alert-related transactions whose own alert_id is -1
# (presumably the marker for "not assigned to an alert" -- confirm against the dataset docs).
alert_related_transactions_df.where(F.col("alert_id")==-1).show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|    148|     6944|     8315|   TRANSFER-Forward|  927.66|2020-01-01 00:00:00| false|      -1|
|   1580|     9915|     8387|TRANSFER-Periodical|  662.87|2020-01-01 00:00:00| false|      -1|
|   2142|     3367|     3484|     TRANSFER-FanIn|   452.9|2020-01-01 00:00:00| false|      -1|
|   5156|     9794|     8523|TRANSFER-Periodical|  337.62|2020-01-02 00:00:00| false|      -1|
|   5518|      254|     8605|    TRANSFER-FanOut|  188.55|2020-01-03 00:00:00| false|      -1|
|   6336|     9230|     8311|TRANSFER-Periodical|  253.46|2020-01-03 00:00:00| false|      -1|
|   7754|     9834|     8059|TRANSFER-Periodical|  787.57|2020-01-03 00:00:00| false|      -1|
|   8638|     9813|     8108|TRANSFER-Periodical| 

In [17]:
# Everything that is not alert-related: a left anti join keeps only the
# transactions whose tran_id does not occur in the alert-related set.
alert_related_transactions_ids = alert_related_transactions_df.select("tran_id")
non_alert_related_transactions_df = transactions_df.join(
    alert_related_transactions_ids,
    on=["tran_id"],
    how="leftanti",
)
non_alert_related_transactions_df.show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|    463|     7252|     7708|   TRANSFER-Forward|  277.72|2020-01-01 00:00:00| false|      -1|
|    471|     7199|     4620|   TRANSFER-Forward|  664.95|2020-01-01 00:00:00| false|      -1|
|    496|     9009|     9648|TRANSFER-Periodical|  791.27|2020-01-01 00:00:00| false|      -1|
|    833|     3874|     2457|     TRANSFER-FanIn|  131.92|2020-01-01 00:00:00| false|      -1|
|   1088|     9218|      596|TRANSFER-Periodical|  422.95|2020-01-01 00:00:00| false|      -1|
|   1238|      346|     4803|    TRANSFER-FanOut|  933.44|2020-01-01 00:00:00| false|      -1|
|   1342|     8056|     5454|TRANSFER-Periodical|  634.39|2020-01-01 00:00:00| false|      -1|
|   1591|     2413|     3546|     TRANSFER-FanIn| 

In [18]:
# Number of transactions that involve no alert account.
non_alert_related_transactions_df.count()

68146

In [19]:
# Print the first rows of the non-alert-related transactions.
non_alert_related_transactions_df.show()

+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|tran_id|orig_acct|bene_acct|            tx_type|base_amt|     tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------------------+--------+-------------------+------+--------+
|    463|     7252|     7708|   TRANSFER-Forward|  277.72|2020-01-01 00:00:00| false|      -1|
|    471|     7199|     4620|   TRANSFER-Forward|  664.95|2020-01-01 00:00:00| false|      -1|
|    496|     9009|     9648|TRANSFER-Periodical|  791.27|2020-01-01 00:00:00| false|      -1|
|    833|     3874|     2457|     TRANSFER-FanIn|  131.92|2020-01-01 00:00:00| false|      -1|
|   1088|     9218|      596|TRANSFER-Periodical|  422.95|2020-01-01 00:00:00| false|      -1|
|   1238|      346|     4803|    TRANSFER-FanOut|  933.44|2020-01-01 00:00:00| false|      -1|
|   1342|     8056|     5454|TRANSFER-Periodical|  634.39|2020-01-01 00:00:00| false|      -1|
|   1591|     2413|     3546|     TRANSFER-FanIn| 

In [20]:
# Sanity check: list non-alert-related transactions that still carry an
# alert_id greater than -1; an empty result means none slipped through.
non_alert_related_transactions_df.where(F.col("alert_id")>-1).show()

+-------+---------+---------+-------+--------+--------------+------+--------+
|tran_id|orig_acct|bene_acct|tx_type|base_amt|tran_timestamp|is_sar|alert_id|
+-------+---------+---------+-------+--------+--------------+------+--------+
+-------+---------+---------+-------+--------+--------------+------+--------+

In [21]:
# Build the SAR graph inputs from the alert-related transactions only:
# vertices are every account that originates or receives such a transaction,
# edges are the transactions themselves (src = originator, dst = beneficiary).
sar_sources = alert_related_transactions_df.select("orig_acct").toDF("id")
sar_targets = alert_related_transactions_df.select("bene_acct").toDF("id")
sar_nodes = sar_sources.union(sar_targets).dropDuplicates()
# Edges are taken from the same frame as the vertices so that every edge
# endpoint is a vertex of the graph.
sar_edges = alert_related_transactions_df.select("orig_acct", "bene_acct").toDF("src", "dst")

In [22]:
# Now let's construct the SAR graph from its vertex and edge DataFrames.
g_sar = GraphFrame(sar_nodes,sar_edges)

In [None]:
# Set a Spark checkpoint directory under the project's Logs folder before running
# connectedComponents (the GraphFrames docs state the algorithm needs one).
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel
# provides it alongside `spark` -- confirm.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Compute the connected components of the SAR graph and cache the result for reuse.
cc_sar = g_sar.connectedComponents().cache()
# Alternative kept for reference (strongly connected components; 20 is presumably maxIter):
#cc_sar = g_sar.stronglyConnectedComponents(20).cache()

In [None]:
# Count the vertices in each SAR component and list the distinct component sizes, ascending.
cc_sar.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

In [None]:
# Inputs for the "normal" graph, all taken from the non-alert-related transactions.
# Edges: one per transaction, originator -> beneficiary.
normal_edges = non_alert_related_transactions_df.select(
    F.col("orig_acct").alias("src"),
    F.col("bene_acct").alias("dst"),
)
# Vertices: every account that appears as originator or beneficiary, de-duplicated.
normal_sources = non_alert_related_transactions_df.select(F.col("orig_acct").alias("id"))
normal_targets = non_alert_related_transactions_df.select(F.col("bene_acct").alias("id"))
normal_nodes = normal_sources.union(normal_targets).dropDuplicates()

In [None]:
# Print the first vertex ids of the normal graph.
normal_nodes.show()

In [None]:
# Print the first (src, dst) edges of the normal graph.
normal_edges.show()

In [None]:
# Now let's construct the normal graph from its vertex and edge DataFrames.
g_normal = GraphFrame(normal_nodes,normal_edges)

In [None]:
# Set the Spark checkpoint directory (same path as used for the SAR graph) before
# running connectedComponents (the GraphFrames docs state the algorithm needs one).
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel
# provides it alongside `spark` -- confirm.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Compute the connected components of the normal graph and cache the result for reuse.
cc_normal = g_normal.connectedComponents().cache()
# Alternative kept for reference (strongly connected components; 20 is presumably maxIter):
#cc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [None]:
# Count the vertices in each normal-graph component and list the distinct component sizes, ascending.
cc_normal.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

## Create a connection to the Hopsworks feature store (hsfs)

In [None]:
# Create a connection to Hopsworks using hsfs's default settings
# (no host or credentials are passed; presumably resolved from the cluster environment -- confirm).
connection = hsfs.connection()
# Get the feature store handle for the project's feature store;
# the cells below use `fs` to create the feature groups.
fs = connection.get_feature_store()

## Create the accounts feature group metadata and save it to hsfs

In [None]:
# Declare the feature group holding the per-account (graph node) features, keyed
# by acct_id, with no time-travel format and statistics_config set to False.
accounts_fg = fs.create_feature_group(
    name="account_features",
    version=1,
    primary_key=["acct_id"],
    description="node features",
    time_travel_format=None,
    statistics_config=False,
)
# Write the engineered accounts DataFrame into the feature group.
accounts_fg.save(accounts_df)

## Create the transactions feature group metadata and save it to hsfs

In [10]:
# Declare the feature group holding the per-transaction (graph edge) features, keyed
# by tran_id, with no time-travel format and statistics_config set to False.
transactions_fg = fs.create_feature_group(
    name="transactions_fg",
    version=1,
    primary_key=["tran_id"],
    description="edge features",
    time_travel_format=None,
    statistics_config=False,
)
# Write the transactions DataFrame into the feature group.
transactions_fg.save(transactions_df)

<hsfs.feature_group.FeatureGroup object at 0x7fcc08cb6dd0>