In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
61,application_1607949680860_0063,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f9fb0c65f90>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [3]:
def hashnode(x):
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

hashnode_udf = func.udf(hashnode)

In [4]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
edge_fg = fs.get_feature_group('transactions_fg', 1)
node_fg = fs.get_feature_group('account_features', 1)

In [6]:
node_fg.show(5)

+--------------+---------+-------+---------------+
|tx_behavior_id|prior_sar|acct_id|initial_deposit|
+--------------+---------+-------+---------------+
|             1|        0|      0|       84442.19|
|             1|        0|      1|       75795.44|
|             1|        0|      2|       42057.16|
|             1|        0|      3|       25891.68|
|             1|        0|      4|       51127.47|
+--------------+---------+-------+---------------+
only showing top 5 rows

In [7]:
edge_fg.show(5)

+-------+------+-------+--------------+--------+------+--------+------+
|tx_type|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+-------+------+-------+--------------+--------+------+--------+------+
|      4|     0|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|      4|     0|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|      4|     0|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|      4|     0|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|      4|     0|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+-------+------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [8]:
only_sar_edge_df = edge_fg.read().filter(func.col('alert_id') != -1)
only_normal_edge_df =  edge_fg.read().filter(func.col('alert_id') == -1)

In [9]:
only_normal_edge_df.show(5)

+-------+------+-------+--------------+--------+------+--------+------+
|tx_type|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+-------+------+-------+--------------+--------+------+--------+------+
|      4|     0|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|      4|     0|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|      4|     0|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|      4|     0|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|      4|     0|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+-------+------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [10]:
only_sar_edge_df.show()

+-------+------+-------+--------------+--------+------+--------+------+
|tx_type|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+-------+------+-------+--------------+--------+------+--------+------+
|      4|     1|     98|   1.4832288E9|  108.62|  5688|      16|  2298|
|      4|     1|    108|   1.4832288E9|  183.25|  9601|      26|  8627|
|      4|     1|    135|   1.4832288E9|  142.71|  8359|      15|  2756|
|      4|     1|    137|   1.4832288E9|  132.47|  7702|       9|  7605|
|      4|     1|    218|   1.4832288E9|  119.51|  7377|      17|  5891|
|      4|     1|    335|   1.4832288E9|  136.02|  1661|      12|  6787|
|      4|     1|    439|   1.4832288E9|  194.53|  7950|       0|  8485|
|      4|     1|    477|   1.4832288E9|  184.32|  2177|       3|  5324|
|      4|     1|    514|   1.4832288E9|  130.63|  4616|      19|  4919|
|      4|     1|    564|   1.4832288E9|  183.27|  1589|       5|  4170|
|      4|     1|    580|   1.4832288E9|  135.69|  1590|      18|

In [11]:
only_sar_edge_df.count()

732

In [12]:
only_normal_edge_df.count()

1028964

In [13]:
only_sar_edge_df_grouped = only_sar_edge_df.groupBy('tran_id').agg(func.min("tran_timestamp"),func.max("tran_timestamp")).toDF("tran_id", "window_start", "window_end")
only_sar_edge_df_grouped.show(5)

+-------+------------+-----------+
|tran_id|window_start| window_end|
+-------+------------+-----------+
|  44437| 1.4859072E9|1.4859072E9|
| 100274| 1.4892768E9|1.4892768E9|
| 612597|  1.520208E9| 1.520208E9|
|  12471|   1.48392E9|  1.48392E9|
|  33855|  1.485216E9| 1.485216E9|
+-------+------------+-----------+
only showing top 5 rows

In [14]:
only_sar_edges_df_windows = only_sar_edge_df.join(only_sar_edge_df_grouped,["tran_id"])

In [15]:
only_sar_edges_df_windows.show()

+-------+-------+------+--------------+--------+------+--------+------+------------+-----------+
|tran_id|tx_type|is_sar|tran_timestamp|base_amt|target|alert_id|source|window_start| window_end|
+-------+-------+------+--------------+--------+------+--------+------+------------+-----------+
|     98|      4|     1|   1.4832288E9|  108.62|  5688|      16|  2298| 1.4832288E9|1.4832288E9|
|    108|      4|     1|   1.4832288E9|  183.25|  9601|      26|  8627| 1.4832288E9|1.4832288E9|
|    135|      4|     1|   1.4832288E9|  142.71|  8359|      15|  2756| 1.4832288E9|1.4832288E9|
|    137|      4|     1|   1.4832288E9|  132.47|  7702|       9|  7605| 1.4832288E9|1.4832288E9|
|    218|      4|     1|   1.4832288E9|  119.51|  7377|      17|  5891| 1.4832288E9|1.4832288E9|
|    335|      4|     1|   1.4832288E9|  136.02|  1661|      12|  6787| 1.4832288E9|1.4832288E9|
|    439|      4|     1|   1.4832288E9|  194.53|  7950|       0|  8485| 1.4832288E9|1.4832288E9|
|    477|      4|     1|   1.4

In [16]:
only_sar_edges_df_windows.count()

732

In [17]:
only_normal_edges_df_windows = only_sar_edge_df_grouped.select("window_start", "window_end").join(
    only_normal_edge_df,
    [(only_normal_edge_df.tran_timestamp>=only_sar_edge_df_grouped.window_start)&(only_normal_edge_df.tran_timestamp<=only_sar_edge_df_grouped.window_end)],
    how="left"
)

In [18]:
only_normal_edges_df_windows.show()

+------------+-----------+-------+------+-------+--------------+--------+------+--------+------+
|window_start| window_end|tx_type|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------------+-----------+-------+------+-------+--------------+--------+------+--------+------+
| 1.4859072E9|1.4859072E9|      4|     0|  44409|   1.4859072E9| 5887.65|  6489|      -1|  6074|
| 1.4859072E9|1.4859072E9|      4|     0|  44410|   1.4859072E9| 1753.42|  2474|      -1|  5360|
| 1.4859072E9|1.4859072E9|      4|     0|  44411|   1.4859072E9|  433.12|   678|      -1|  7416|
| 1.4859072E9|1.4859072E9|      4|     0|  44412|   1.4859072E9| 1322.19|  8767|      -1|  8663|
| 1.4859072E9|1.4859072E9|      4|     0|  44413|   1.4859072E9| 6036.48|  5370|      -1|  8258|
| 1.4859072E9|1.4859072E9|      4|     0|  44414|   1.4859072E9| 9083.92|  2420|      -1|  9893|
| 1.4859072E9|1.4859072E9|      4|     0|  44415|   1.4859072E9| 5309.47|  2418|      -1|  7987|
| 1.4859072E9|1.4859072E9|    

In [19]:
only_normal_edges_df_windows.count()

1048084

In [20]:
only_normal_edges_df_windows = only_normal_edges_df_windows.withColumnRenamed("source", "origId")\
                                                           .withColumnRenamed("target", "destId")  
only_normal_edges_df_windows = only_normal_edges_df_windows.withColumn('target',hashnode_udf(func.concat(func.col('destId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))\
                                                           .withColumn('source',hashnode_udf(func.concat(func.col('origId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))            
only_normal_edges_df_windows.show()

+------------+-----------+-------+------+-------+--------------+--------+------+--------+------+--------+--------+
|window_start| window_end|tx_type|is_sar|tran_id|tran_timestamp|base_amt|destId|alert_id|origId|  target|  source|
+------------+-----------+-------+------+-------+--------------+--------+------+--------+------+--------+--------+
| 1.4859072E9|1.4859072E9|      4|     0|  44409|   1.4859072E9| 5887.65|  6489|      -1|  6074|51f60611|b8797deb|
| 1.4859072E9|1.4859072E9|      4|     0|  44410|   1.4859072E9| 1753.42|  2474|      -1|  5360|ffea183c|2087ccb6|
| 1.4859072E9|1.4859072E9|      4|     0|  44411|   1.4859072E9|  433.12|   678|      -1|  7416|f9bcd32a|d16e7308|
| 1.4859072E9|1.4859072E9|      4|     0|  44412|   1.4859072E9| 1322.19|  8767|      -1|  8663|4e307995|6ac79954|
| 1.4859072E9|1.4859072E9|      4|     0|  44413|   1.4859072E9| 6036.48|  5370|      -1|  8258|57e45309|eab8a93e|
| 1.4859072E9|1.4859072E9|      4|     0|  44414|   1.4859072E9| 9083.92|  2420|

In [35]:
normal_sources = only_normal_edges_df_windows.select("source")
normal_targets = only_normal_edges_df_windows.select("target")
normal_nodes = normal_sources.union(normal_targets).toDF("id").dropDuplicates()
normal_nodes.count()

906318

In [36]:
normal_edges = only_normal_edges_df_windows.select("source", "target").toDF("src", "dst")
normal_edges.count()

1048084

In [23]:
only_sar_edges_df_windows = only_sar_edges_df_windows.withColumnRenamed("source", "origId")\
                                                           .withColumnRenamed("target", "destId")  
only_sar_edges_df_windows = only_sar_edges_df_windows.withColumn('target',hashnode_udf(func.concat(func.col('destId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))\
                                                           .withColumn('source',hashnode_udf(func.concat(func.col('origId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))            
only_sar_edges_df_windows.show()

+-------+-------+------+--------------+--------+------+--------+------+------------+-----------+--------+--------+
|tran_id|tx_type|is_sar|tran_timestamp|base_amt|destId|alert_id|origId|window_start| window_end|  target|  source|
+-------+-------+------+--------------+--------+------+--------+------+------------+-----------+--------+--------+
|     98|      4|     1|   1.4832288E9|  108.62|  5688|      16|  2298| 1.4832288E9|1.4832288E9|abadb2bd|2d6bcbfc|
|    108|      4|     1|   1.4832288E9|  183.25|  9601|      26|  8627| 1.4832288E9|1.4832288E9|5a2ef132|2ffeba7c|
|    135|      4|     1|   1.4832288E9|  142.71|  8359|      15|  2756| 1.4832288E9|1.4832288E9|20aba974|b6564133|
|    137|      4|     1|   1.4832288E9|  132.47|  7702|       9|  7605| 1.4832288E9|1.4832288E9|f5a54e21|6c465ac2|
|    218|      4|     1|   1.4832288E9|  119.51|  7377|      17|  5891| 1.4832288E9|1.4832288E9|53ae86b7|c59fa192|
|    335|      4|     1|   1.4832288E9|  136.02|  1661|      12|  6787| 1.483228

In [32]:
sar_sources = only_sar_edges_df_windows.select("source")
sar_targets = only_sar_edges_df_windows.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()
sar_nodes.count()

1425

In [33]:
sar_edges = only_sar_edges_df_windows.select("source", "target").toDF("src", "dst")
sar_edges.count()

732

In [38]:
# Now lets construct the graph
g_sar = GraphFrame(sar_nodes,sar_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_sar = g_sar.connectedComponents()

In [37]:
# Now lets construct the graph
g_normal = GraphFrame(normal_nodes,normal_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_normal = g_normal.connectedComponents()

In [39]:
cc_sar.cache().show()

+--------+-----------+
|      id|  component|
+--------+-----------+
|f8a266a3|          5|
|4256a087|          1|
|9b22f31e|          3|
|047d5447|          0|
|c8bc2847|          4|
|9a835811|          2|
|eaaf82d3| 8589934597|
|2076fd80| 8589934592|
|476d03c1| 8589934593|
|d9851929| 8589934601|
|de292af9| 8589934602|
|73986e68| 8589934595|
|b55d2376| 8589934598|
|f93cc381| 8589934604|
|adff9b02| 8589934597|
|75f46a58| 8589934596|
|d91ab5ea| 8589934600|
|c10e0ab4| 8589934599|
|59e84776|          5|
|c5f00bcb|17179869189|
+--------+-----------+
only showing top 20 rows

In [46]:
cc_sar.groupBy('component').count().where(func.col('count')==1).count()

0

In [43]:
cc_sar.groupBy('component').count().where(func.col('count')==2).count()

661

In [49]:
cc_sar.groupBy('component').count().where(func.col('count')>2).count()

32

In [40]:
cc_normal.cache().show()

+--------+---------+
|      id|component|
+--------+---------+
|00018e3c|        0|
|00081600|        1|
|0008f152|        2|
|004033e7|        3|
|0070da62|        4|
|008349ed|        5|
|0085127c|        6|
|00860f9e|        7|
|009b9d18|        8|
|009c6817|        9|
|00a43620|       10|
|00aaccc2|       11|
|00c0120c|       12|
|00c0648d|       13|
|00c79e18|       14|
|01112910|       15|
|0118f47a|       16|
|011db746|       17|
|0124b067|       18|
|012d1e14|       19|
+--------+---------+
only showing top 20 rows

In [47]:
cc_normal.groupBy('component').count().where(func.col('count')==1).count()

0

In [48]:
cc_normal.groupBy('component').count().where(func.col('count')==2).count()

322727

In [50]:
cc_normal.groupBy('component').count().where(func.col('count') > 1).count()

399525