In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
71,application_1607949680860_0073,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7facf5d0af90>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [3]:
def hashnode(x):
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

hashnode_udf = func.udf(hashnode)

In [4]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
edge_fg = fs.get_feature_group('transactions_fg', 1)
node_fg = fs.get_feature_group('account_features', 1)

In [6]:
node_fg.show(5)

+--------------+---------+-------+---------------+
|tx_behavior_id|prior_sar|acct_id|initial_deposit|
+--------------+---------+-------+---------------+
|             1|        0|      0|       84442.19|
|             1|        0|      1|       75795.44|
|             1|        0|      2|       42057.16|
|             1|        0|      3|       25891.68|
|             1|        0|      4|       51127.47|
+--------------+---------+-------+---------------+
only showing top 5 rows

In [7]:
edge_fg.show(5)

+------+-------+--------------+-------+--------+------+--------+------+
|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+------+-------+--------------+-------+--------+------+--------+------+
|     0|      1|   1.4832288E9|      4| 9405.71|  3259|      -1|  1767|
|     0|      2|   1.4832288E9|      4| 6884.54|  5141|      -1|  7363|
|     0|      3|   1.4832288E9|      4|  7968.4|  9532|      -1|  7585|
|     0|      4|   1.4832288E9|      4| 9042.67|  8792|      -1|  1750|
|     0|      5|   1.4832288E9|      4| 4692.79|  4670|      -1|  9060|
+------+-------+--------------+-------+--------+------+--------+------+
only showing top 5 rows

In [8]:
only_sar_edge_df = edge_fg.read().filter(func.col('alert_id') != -1)
only_normal_edge_df =  edge_fg.read().filter(func.col('alert_id') == -1)

In [9]:
only_normal_edge_df.show(5)

+------+-------+--------------+-------+--------+------+--------+------+
|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+------+-------+--------------+-------+--------+------+--------+------+
|     0|      1|   1.4832288E9|      4| 9405.71|  3259|      -1|  1767|
|     0|      2|   1.4832288E9|      4| 6884.54|  5141|      -1|  7363|
|     0|      3|   1.4832288E9|      4|  7968.4|  9532|      -1|  7585|
|     0|      4|   1.4832288E9|      4| 9042.67|  8792|      -1|  1750|
|     0|      5|   1.4832288E9|      4| 4692.79|  4670|      -1|  9060|
+------+-------+--------------+-------+--------+------+--------+------+
only showing top 5 rows

In [10]:
only_sar_edge_df.show()

+------+-------+--------------+-------+--------+------+--------+------+
|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+------+-------+--------------+-------+--------+------+--------+------+
|     1|     98|   1.4832288E9|      4|  108.62|  5688|      16|  2298|
|     1|    108|   1.4832288E9|      4|  183.25|  9601|      26|  8627|
|     1|    135|   1.4832288E9|      4|  142.71|  8359|      15|  2756|
|     1|    137|   1.4832288E9|      4|  132.47|  7702|       9|  7605|
|     1|    218|   1.4832288E9|      4|  119.51|  7377|      17|  5891|
|     1|    335|   1.4832288E9|      4|  136.02|  1661|      12|  6787|
|     1|    439|   1.4832288E9|      4|  194.53|  7950|       0|  8485|
|     1|    477|   1.4832288E9|      4|  184.32|  2177|       3|  5324|
|     1|    514|   1.4832288E9|      4|  130.63|  4616|      19|  4919|
|     1|    564|   1.4832288E9|      4|  183.27|  1589|       5|  4170|
|     1|    580|   1.4832288E9|      4|  135.69|  1590|      18|

In [11]:
only_sar_edge_df.count()

732

In [12]:
sar_sources = only_sar_edge_df.select("source")
sar_targets = only_sar_edge_df.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()
sar_nodes.count()

sar_edges = only_sar_edge_df.select("source", "target").toDF("src", "dst")
sar_edges.count()

732

In [28]:
# Now lets construct the graph
g_sar = GraphFrame(sar_nodes,sar_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_sar = g_sar.connectedComponents()

In [29]:
cc_sar.cache().show()

+----+---------+
|  id|component|
+----+---------+
|8086|     2599|
|7833|     3671|
|3997|     2773|
|5300|     2773|
| 463|      124|
|5614|      825|
|1522|     1054|
|1127|     1127|
| 540|      540|
|6393|     3396|
|7387|      643|
|3488|     2599|
|2393|      397|
|9162|     2889|
|4364|     1096|
|1265|     1009|
|4042|     1590|
|5223|     1013|
|3425|     1054|
|5157|       61|
+----+---------+
only showing top 20 rows

In [30]:
cc_sar.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+

In [32]:
cc_sar_grouped = cc_sar.groupBy('component').count().where(func.col('count')>2).drop('count')
cc_sar = cc_sar.join(cc_sar_grouped,['component'])
cc_sar.show()

+---------+----+
|component|  id|
+---------+----+
|     2599|8086|
|     3671|7833|
|     2773|3997|
|     2773|5300|
|      124| 463|
|      825|5614|
|     1054|1522|
|     1127|1127|
|      540| 540|
|     3396|6393|
|      643|7387|
|     2599|3488|
|      397|2393|
|     2889|9162|
|     1096|4364|
|     1009|1265|
|     1590|4042|
|     1013|5223|
|     1054|3425|
|       61|5157|
+---------+----+
only showing top 20 rows

In [33]:
sar_cc_grouped = cc_sar.join(
    only_sar_edge_df,
    [(only_sar_edge_df.source==cc_sar.id)|(only_sar_edge_df.target==cc_sar.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])
sar_cc_grouped.show()

+---------+----+------+-------+--------------+-------+--------+------+--------+------+
|component|  id|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+---------+----+------+-------+--------------+-------+--------+------+--------+------+
|     1823|2300|     1|  44437|   1.4859072E9|      4|  173.42|  2300|       8|  5294|
|     1009|1425|     1| 100274|   1.4892768E9|      4|  102.86|  1425|      80|  3661|
|      892|7268|     1| 612597|    1.520208E9|      4|  109.79|  8764|      10|  7268|
|      410|2491|     1|  12471|     1.48392E9|      4|  138.28|  2491|      58|  2739|
|     1794|5838|     1|  33855|    1.485216E9|      4|  101.76|  9403|      44|  5838|
|     1801|3984|     1| 284874|   1.5004224E9|      4|  103.47|  3984|      96|  9700|
|      554|8091|     1| 522545|   1.5147648E9|      4|  106.94|  8091|      91|   554|
|     1823|2368|     1| 266765|   1.4992992E9|      4|  173.42|  2300|       8|  2368|
|      825|4571|     1| 706588|    1.525824

In [38]:
sar_cc_grouped.count()

732

In [39]:
sar_cc_grouped.count()

732

In [40]:
only_sar_edge_df.count()

732

In [41]:
only_sar_edge_df = sar_cc_grouped

In [42]:
only_sar_edge_df.show(5)

+---------+----+------+-------+--------------+-------+--------+------+--------+------+
|component|  id|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+---------+----+------+-------+--------------+-------+--------+------+--------+------+
|     1823|2300|     1|  44437|   1.4859072E9|      4|  173.42|  2300|       8|  5294|
|     1009|3661|     1| 100274|   1.4892768E9|      4|  102.86|  1425|      80|  3661|
|      892|7268|     1| 612597|    1.520208E9|      4|  109.79|  8764|      10|  7268|
|      410|2739|     1|  12471|     1.48392E9|      4|  138.28|  2491|      58|  2739|
|     1794|9403|     1|  33855|    1.485216E9|      4|  101.76|  9403|      44|  5838|
+---------+----+------+-------+--------------+-------+--------+------+--------+------+
only showing top 5 rows

In [43]:
only_normal_edge_df.count()

1028964

In [44]:
only_sar_edge_df_grouped = only_sar_edge_df.groupBy('component').agg(func.min("tran_timestamp"),func.max("tran_timestamp")).toDF("component", "window_start", "window_end")
only_sar_edge_df_grouped.show(5)

+---------+------------+-----------+
|component|window_start| window_end|
+---------+------------+-----------+
|     1127| 1.4833152E9|1.4850432E9|
|     1152|  1.516752E9|1.5170976E9|
|      635|  1.483488E9|1.4957568E9|
|      399| 1.5412032E9|1.5428448E9|
|     3396| 1.4832288E9|1.5060384E9|
+---------+------------+-----------+
only showing top 5 rows

In [45]:
only_sar_edges_df_windows = only_sar_edge_df.join(only_sar_edge_df_grouped,["component"])

In [46]:
only_sar_edges_df_windows.show()

+---------+----+------+-------+--------------+-------+--------+------+--------+------+------------+-----------+
|component|  id|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|window_start| window_end|
+---------+----+------+-------+--------------+-------+--------+------+--------+------+------------+-----------+
|     1127|7188|     1|   9861|   1.4837472E9|      4|  170.32|  8279|      11|  7188| 1.4833152E9|1.4850432E9|
|     1127|3320|     1|   2135|   1.4833152E9|      4|  170.32|  8279|      11|  3320| 1.4833152E9|1.4850432E9|
|     1127|9173|     1|  30182|   1.4850432E9|      4|  170.32|  8279|      11|  9173| 1.4833152E9|1.4850432E9|
|     1127|4902|     1|  24020|   1.4846112E9|      4|  170.32|  8279|      11|  4902| 1.4833152E9|1.4850432E9|
|     1127|8279|     1|  16629|   1.4841792E9|      4|  170.32|  8279|      11|  1127| 1.4833152E9|1.4850432E9|
|     1152|4324|     1| 561547|   1.5170976E9|      4|   53.83|  2321|      65|  4324|  1.516752E9|1.517

In [47]:
only_sar_edges_df_windows.count()

732

In [48]:
only_normal_edges_df_windows = only_sar_edge_df_grouped.select("window_start", "window_end").join(
    only_normal_edge_df,
    [(only_normal_edge_df.tran_timestamp>=only_sar_edge_df_grouped.window_start)&(only_normal_edge_df.tran_timestamp<=only_sar_edge_df_grouped.window_end)],
    how="left"
)

In [49]:
only_normal_edges_df_windows.show()

+------------+-----------+------+-------+--------------+-------+--------+------+--------+------+
|window_start| window_end|is_sar|tran_id|tran_timestamp|tx_type|base_amt|target|alert_id|source|
+------------+-----------+------+-------+--------------+-------+--------+------+--------+------+
| 1.4833152E9|1.4850432E9|     0|   1498|   1.4833152E9|      4| 4300.78|  2341|      -1|  1712|
| 1.4833152E9|1.4850432E9|     0|   1499|   1.4833152E9|      4| 2702.02|  9721|      -1|  7976|
| 1.4833152E9|1.4850432E9|     0|   1500|   1.4833152E9|      4| 2987.06|  9759|      -1|  9567|
| 1.4833152E9|1.4850432E9|     0|   1501|   1.4833152E9|      4| 7066.07|  3219|      -1|  2664|
| 1.4833152E9|1.4850432E9|     0|   1502|   1.4833152E9|      4| 5341.72|  3459|      -1|  5908|
| 1.4833152E9|1.4850432E9|     0|   1503|   1.4833152E9|      4|  118.96|  2027|      -1|  5043|
| 1.4833152E9|1.4850432E9|     0|   1504|   1.4833152E9|      4| 4353.21|  6294|      -1|  6502|
| 1.4833152E9|1.4850432E9|    

In [50]:
only_normal_edges_df_windows.count()

26567736

In [51]:
only_normal_edges_df_windows = only_normal_edges_df_windows.withColumnRenamed("source", "origId")\
                                                           .withColumnRenamed("target", "destId")  
only_normal_edges_df_windows = only_normal_edges_df_windows.withColumn('target',hashnode_udf(func.concat(func.col('destId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))\
                                                           .withColumn('source',hashnode_udf(func.concat(func.col('origId'),func.lit('_'),func.col('window_start'),func.lit('_'),func.col('window_end'))))            
only_normal_edges_df_windows.show()

+------------+-----------+------+-------+--------------+-------+--------+------+--------+------+--------+--------+
|window_start| window_end|is_sar|tran_id|tran_timestamp|tx_type|base_amt|destId|alert_id|origId|  target|  source|
+------------+-----------+------+-------+--------------+-------+--------+------+--------+------+--------+--------+
| 1.4833152E9|1.4850432E9|     0|   1498|   1.4833152E9|      4| 4300.78|  2341|      -1|  1712|e63c8751|465fa954|
| 1.4833152E9|1.4850432E9|     0|   1499|   1.4833152E9|      4| 2702.02|  9721|      -1|  7976|86bfcc1e|e7f1f1a1|
| 1.4833152E9|1.4850432E9|     0|   1500|   1.4833152E9|      4| 2987.06|  9759|      -1|  9567|4531cf75|d0ac2ea1|
| 1.4833152E9|1.4850432E9|     0|   1501|   1.4833152E9|      4| 7066.07|  3219|      -1|  2664|f4027061|2f4fa476|
| 1.4833152E9|1.4850432E9|     0|   1502|   1.4833152E9|      4| 5341.72|  3459|      -1|  5908|631e2e29|ec3cdd07|
| 1.4833152E9|1.4850432E9|     0|   1503|   1.4833152E9|      4|  118.96|  2027|

In [52]:
normal_sources = only_normal_edges_df_windows.select("source")
normal_targets = only_normal_edges_df_windows.select("target")
normal_nodes = normal_sources.union(normal_targets).toDF("id").dropDuplicates()
normal_edges = only_normal_edges_df_windows.select("source", "target").toDF("src", "dst")

In [53]:
# Now lets construct the graph
g_normal = GraphFrame(normal_nodes,normal_edges)

In [None]:
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_normal = g_normal.connectedComponents().cache()
#scc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [None]:
cc_norm_comp_count = cc_normal.groupBy('component').count().where(func.col('count')>2)
cc_normal = cc_normal.join(cc_norm_comp_count,['component'])
cc_normal =  cc_normal.drop('count')
normal_cc_grouped = cc_normal.join(
    only_normal_edge_df,
    [(only_normal_edge_df.source==cc_normal.id)|(only_normal_edge_df.target==cc_normal.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])

In [None]:
normal_cc_grouped.show()

In [None]:
1-1