In [1]:
# Evaluate the session handle so the cell displays the SparkSession object.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
67,application_1607949680860_0069,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f1c4ce76fd0>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [3]:
def hashnode(x):
    """Return a short, deterministic identifier for a node key.

    Args:
        x: Node key as a string, or None.

    Returns:
        The first 8 hexadecimal characters of the SHA-1 digest of the
        UTF-8 encoded key, or None when ``x`` is None.
    """
    # Spark passes None to a Python UDF for null column values; propagate the
    # null instead of failing on ``None.encode``.
    if x is None:
        return None
    # NOTE(review): 8 hex characters keep only 32 bits of the digest, so
    # distinct keys can collide once the number of keys gets large.
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

# Wrap hashnode as a Spark UDF; no return type is passed, so udf's default applies.
hashnode_udf = func.udf(hashnode)

In [4]:
# Create a feature store connection (hsfs.connection() is called with no arguments).
# NOTE(review): no `connection.close()` call appears in the visible cells.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
# Handles to version 1 of the transaction (edge) and account (node) feature groups.
edge_fg = fs.get_feature_group(name='transactions_fg', version=1)
node_fg = fs.get_feature_group(name='account_features', version=1)

In [6]:
# Preview 5 rows of the account feature group.
node_fg.show(5)

+-------+--------------+---------+---------------+
|acct_id|tx_behavior_id|prior_sar|initial_deposit|
+-------+--------------+---------+---------------+
|      0|             1|        0|       84442.19|
|      1|             1|        0|       75795.44|
|      2|             1|        0|       42057.16|
|      3|             1|        0|       25891.68|
|      4|             1|        0|       51127.47|
+-------+--------------+---------+---------------+
only showing top 5 rows

In [7]:
# Preview 5 rows of the transaction feature group.
edge_fg.show(5)

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     0|      4|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|     0|      4|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|     0|      4|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|     0|      4|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|     0|      4|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [8]:
# Split the transactions on alert_id: rows with alert_id == -1 go to the
# "normal" frame, every other row goes to the "sar" frame.
only_sar_edge_df = edge_fg.read().where(func.col('alert_id') != -1)
only_normal_edge_df = edge_fg.read().where(func.col('alert_id') == -1)

In [9]:
# Preview 5 transactions whose alert_id is -1.
only_normal_edge_df.show(5)

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     0|      4|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|     0|      4|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|     0|      4|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|     0|      4|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|     0|      4|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [10]:
# Preview the transactions whose alert_id is not -1.
only_sar_edge_df.show()

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     1|      4|     98|   1.4832288E9|  108.62|  5688|      16|  2298|
|     1|      4|    108|   1.4832288E9|  183.25|  9601|      26|  8627|
|     1|      4|    135|   1.4832288E9|  142.71|  8359|      15|  2756|
|     1|      4|    137|   1.4832288E9|  132.47|  7702|       9|  7605|
|     1|      4|    218|   1.4832288E9|  119.51|  7377|      17|  5891|
|     1|      4|    335|   1.4832288E9|  136.02|  1661|      12|  6787|
|     1|      4|    439|   1.4832288E9|  194.53|  7950|       0|  8485|
|     1|      4|    477|   1.4832288E9|  184.32|  2177|       3|  5324|
|     1|      4|    514|   1.4832288E9|  130.63|  4616|      19|  4919|
|     1|      4|    564|   1.4832288E9|  183.27|  1589|       5|  4170|
|     1|      4|    580|   1.4832288E9|  135.69|  1590|      18|

In [11]:
# Number of transactions whose alert_id is not -1.
only_sar_edge_df.count()

732

In [12]:
# Vertices: every distinct account id that appears as source or target of a
# transaction in only_sar_edge_df.
sar_sources = only_sar_edge_df.select("source")
sar_targets = only_sar_edge_df.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()

# Edges: one (src, dst) row per transaction, using the column names GraphFrame expects.
sar_edges = only_sar_edge_df.select("source", "target").toDF("src", "dst")

# Only the last expression of a cell is displayed, so the node count is
# printed explicitly instead of being computed and thrown away.
print("SAR nodes:", sar_nodes.count())
sar_edges.count()

732

In [13]:
# connectedComponents() is called after a checkpoint directory has been set,
# so point it at the project's Logs folder first.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))

# Build the SAR graph and label every vertex with its connected component.
g_sar = GraphFrame(v=sar_nodes, e=sar_edges)
cc_sar = g_sar.connectedComponents()

In [14]:
# Mark the connected-component result for caching and preview it.
cc_sar.cache().show()

+----+---------+
|  id|component|
+----+---------+
|8086|     2599|
|7833|     3671|
|5300|     2773|
| 463|      124|
|3997|     2773|
|1127|     1127|
| 540|      540|
|6393|     3396|
|1522|     1054|
|5614|      825|
|3488|     2599|
|2393|      397|
|9162|     2889|
|7387|      643|
|4364|     1096|
|1265|     1009|
|4042|     1590|
|5223|     1013|
|3425|     1054|
|5157|       61|
+----+---------+
only showing top 20 rows

In [15]:
# Distinct connected-component sizes, smallest first.
(
    cc_sar.groupBy('component')
    .count()
    .select('count')
    .distinct()
    .sort('count')
    .show()
)

+-----+
|count|
+-----+
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+

In [16]:
# Label every SAR vertex with its strongly connected component (20 iterations max).
scc_sar = g_sar.stronglyConnectedComponents(maxIter=20)

In [17]:
# Mark the strongly-connected-component result for caching and preview it.
scc_sar.cache().show()

+----+---------+
|  id|component|
+----+---------+
|7800|     7800|
|8600|     8600|
|4000|     4000|
|5201|     1109|
|1801|     1801|
| 601|      601|
|1201|     1201|
|4601|     4601|
|9601|     9601|
|3402|     3402|
|4802|     4802|
|7802|     1945|
|6403|     1630|
|1003|     1003|
|2003|     1201|
|5403|      295|
|9403|     9403|
|7604|     1201|
|5204|     1257|
|8004|     8004|
+----+---------+
only showing top 20 rows

In [18]:
# Distinct strongly-connected-component sizes, smallest first.
(
    scc_sar.groupBy('component')
    .count()
    .select('count')
    .distinct()
    .sort('count')
    .show()
)

+-----+
|count|
+-----+
|    1|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+

In [19]:
# How many strongly connected components consist of exactly one vertex.
(
    scc_sar.groupBy('component')
    .count()
    .filter(func.col('count') == 1)
    .count()
)

471

In [20]:
# How many strongly connected components contain more than one vertex.
(
    scc_sar.groupBy('component')
    .count()
    .filter(func.col('count') > 1)
    .count()
)

40

In [21]:
# Per-component vertex counts, restricted to components with more than one vertex.
scc_comp_count = (
    scc_sar.groupBy('component')
    .count()
    .filter(func.col('count') > 1)
)

In [22]:
# Keep only the vertices whose component is listed in scc_comp_count
# (inner join on the shared 'component' column; this also adds 'count').
scc_sar = scc_sar.join(scc_comp_count, on='component', how='inner')

In [23]:
# Preview the vertices that remain after the join with the component counts.
scc_sar.show()

+---------+----+-----+
|component|  id|count|
+---------+----+-----+
|     1152|3595|   10|
|     1152|1152|   10|
|     1152|8535|   10|
|     1152|4324|   10|
|     1152|2321|   10|
|     1152|8117|   10|
|     1152|3304|   10|
|     1152|8654|   10|
|     1152|8049|   10|
|     1152|7824|   10|
|      399| 399|    6|
|      399|8965|    6|
|      399|9960|    6|
|      399|7146|    6|
|      399|2501|    6|
|      399|5621|    6|
|     3671|4969|    6|
|     3671|8546|    6|
|     3671|3671|    6|
|     3671|4867|    6|
+---------+----+-----+
only showing top 20 rows

In [24]:
# The 'count' column was only needed for the size filter; drop it again.
scc_sar =  scc_sar.drop('count')

In [25]:
# Attach each transaction in only_sar_edge_df to the connected-component row of
# an account on either side of it, then keep a single row per tran_id.
sar_cc_grouped = cc_sar.join(
    only_sar_edge_df,
    on=(only_sar_edge_df["source"] == cc_sar["id"])
    | (only_sar_edge_df["target"] == cc_sar["id"]),
    how="left",
).dropDuplicates(['tran_id'])

In [26]:
# Attach each transaction in only_sar_edge_df to the strongly-connected-component
# row of an account on either side of it, then keep a single row per tran_id.
# NOTE(review): a transaction can match through its source or its target, and
# dropDuplicates keeps an arbitrary one of those rows, so the retained `id`
# (and the component, if the two endpoints differ in it) is not deterministic.
sar_scc_grouped = scc_sar.join(
    only_sar_edge_df,
    on=(only_sar_edge_df["source"] == scc_sar["id"])
    | (only_sar_edge_df["target"] == scc_sar["id"]),
    how="left",
).dropDuplicates(['tran_id'])

In [27]:
# Row count of the connected-component grouping.
sar_cc_grouped.count()

732

In [28]:
# Row count of the strongly-connected-component grouping.
sar_scc_grouped.count()

321

In [29]:
# Row count of only_sar_edge_df, for comparison with the two grouped frames.
only_sar_edge_df.count()

732

In [30]:
# Preview the transactions attached to strongly connected components.
sar_scc_grouped.show()

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|     1009|3661|     1|      4| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|
|     1801|9700|     1|      4| 284874|   1.5004224E9|  103.47|  3984|      96|  9700|
|      554|8091|     1|      4| 522545|   1.5147648E9|  106.94|  8091|      91|   554|
|      825|7776|     1|      4| 706588|    1.525824E9|   63.66|  7776|      84|  4571|
|     1369|8581|     1|      4| 472208|   1.5117408E9|  114.67|  8581|      70|  1598|
|     1257|6837|     1|      4|  57885|   1.4866848E9|   73.41|  6837|      95|  5261|
|      907|3329|     1|      4| 509573|   1.5139872E9|   47.28|  3329|      90|  7851|
|      397|3252|     1|      4| 786028|   1.5306624E9|   79.42|  6415|      74|  3252|
|      295|6810|     1|      4| 202624|   1

In [31]:
# Rebind the name: from here on only_sar_edge_df is the SCC-grouped frame
# (with 'component' and 'id' columns), not the alert_id filter result.
only_sar_edge_df = sar_scc_grouped

In [32]:
# Preview 5 rows of the rebound only_sar_edge_df.
only_sar_edge_df.show(5)

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|     1009|1425|     1|      4| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|
|     1801|9700|     1|      4| 284874|   1.5004224E9|  103.47|  3984|      96|  9700|
|      554|8091|     1|      4| 522545|   1.5147648E9|  106.94|  8091|      91|   554|
|      825|4571|     1|      4| 706588|    1.525824E9|   63.66|  7776|      84|  4571|
|     1369|8581|     1|      4| 472208|   1.5117408E9|  114.67|  8581|      70|  1598|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [33]:
# Number of transactions whose alert_id is -1.
only_normal_edge_df.count()

1028964

In [34]:
# For every component, the earliest and latest tran_timestamp of its
# transactions, exposed as window_start / window_end.
only_sar_edge_df_grouped = (
    only_sar_edge_df
    .groupBy('component')
    .agg(func.min("tran_timestamp"), func.max("tran_timestamp"))
    .toDF("component", "window_start", "window_end")
)
only_sar_edge_df_grouped.show(5)

+---------+------------+-----------+
|component|window_start| window_end|
+---------+------------+-----------+
|     1152|  1.516752E9|1.5170976E9|
|      399| 1.5412032E9|1.5428448E9|
|     3671| 1.5287616E9| 1.530144E9|
|     3751| 1.4901408E9|1.4911776E9|
|     1945| 1.5290208E9|1.5299712E9|
+---------+------------+-----------+
only showing top 5 rows

In [35]:
# Add each component's window_start / window_end to its transactions.
only_sar_edges_df_windows = only_sar_edge_df.join(
    only_sar_edge_df_grouped,
    on="component",
)

In [36]:
# Preview the transactions together with their component time window.
only_sar_edges_df_windows.show()

+---------+----+------+-------+-------+--------------+--------+------+--------+------+------------+-----------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|window_start| window_end|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+------------+-----------+
|     1152|8535|     1|      4| 556095|    1.516752E9|  101.29|  8535|      65|  1152|  1.516752E9|1.5170976E9|
|     1152|3595|     1|      4| 556094|    1.516752E9|  112.55|  3595|      65|  8654|  1.516752E9|1.5170976E9|
|     1152|4324|     1|      4| 561547|   1.5170976E9|   53.83|  2321|      65|  4324|  1.516752E9|1.5170976E9|
|     1152|8049|     1|      4| 561548|   1.5170976E9|   48.44|  8049|      65|  2321|  1.516752E9|1.5170976E9|
|     1152|8535|     1|      4| 561549|   1.5170976E9|    43.6|  8654|      65|  8535|  1.516752E9|1.5170976E9|
|     1152|8049|     1|      4| 559541|   1.5170112E9|   73.84|  3304|      65|  8049|  1.516752E9|1.517

In [37]:
# Row count after adding the window columns.
only_sar_edges_df_windows.count()

321

In [64]:
# For every component window, collect the alert_id == -1 transactions whose
# timestamp lies inside the window (both bounds inclusive). The left join keeps
# a window even when no transaction falls into it.
only_normal_edges_df_windows = only_sar_edge_df_grouped.select(
    "window_start", "window_end"
).join(
    only_normal_edge_df,
    on=only_normal_edge_df.tran_timestamp.between(
        only_sar_edge_df_grouped.window_start,
        only_sar_edge_df_grouped.window_end,
    ),
    how="left",
)

In [65]:
# Preview the windowed alert_id == -1 transactions.
only_normal_edges_df_windows.show()

+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+
|window_start| window_end|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+
|  1.516752E9|1.5170976E9|     0|      4| 555045|    1.516752E9|  503.89|  9947|      -1|  6249|
|  1.516752E9|1.5170976E9|     0|      4| 555046|    1.516752E9|  564.26|  3095|      -1|  4557|
|  1.516752E9|1.5170976E9|     0|      4| 555047|    1.516752E9| 7773.05|  3003|      -1|  4967|
|  1.516752E9|1.5170976E9|     0|      4| 555048|    1.516752E9|  4490.0|  9866|      -1|  7090|
|  1.516752E9|1.5170976E9|     0|      4| 555049|    1.516752E9| 6136.03|  1343|      -1|  4471|
|  1.516752E9|1.5170976E9|     0|      4| 555050|    1.516752E9| 9344.63|  3076|      -1|  6975|
|  1.516752E9|1.5170976E9|     0|      4| 555051|    1.516752E9| 1527.03|  1969|      -1|  6923|
|  1.516752E9|1.5170976E9|    

In [66]:
# Row count of the window join result.
only_normal_edges_df_windows.count()

677241

In [67]:
# Keep the raw account ids as origId / destId and replace source / target with
# a hash of "<account id>_<window_start>_<window_end>", so the same account
# becomes a separate vertex in every window.
only_normal_edges_df_windows = (
    only_normal_edges_df_windows
    .withColumnRenamed("source", "origId")
    .withColumnRenamed("target", "destId")
    .withColumn(
        'target',
        hashnode_udf(
            func.concat(
                func.col('destId'),
                func.lit('_'),
                func.col('window_start'),
                func.lit('_'),
                func.col('window_end'),
            )
        ),
    )
    .withColumn(
        'source',
        hashnode_udf(
            func.concat(
                func.col('origId'),
                func.lit('_'),
                func.col('window_start'),
                func.lit('_'),
                func.col('window_end'),
            )
        ),
    )
)
only_normal_edges_df_windows.show()

+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+--------+--------+
|window_start| window_end|is_sar|tx_type|tran_id|tran_timestamp|base_amt|destId|alert_id|origId|  target|  source|
+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+--------+--------+
|  1.516752E9|1.5170976E9|     0|      4| 555045|    1.516752E9|  503.89|  9947|      -1|  6249|b786ed64|bbc43466|
|  1.516752E9|1.5170976E9|     0|      4| 555046|    1.516752E9|  564.26|  3095|      -1|  4557|b0b7a393|af529315|
|  1.516752E9|1.5170976E9|     0|      4| 555047|    1.516752E9| 7773.05|  3003|      -1|  4967|4156c046|e033872d|
|  1.516752E9|1.5170976E9|     0|      4| 555048|    1.516752E9|  4490.0|  9866|      -1|  7090|05ac06b4|3eac3624|
|  1.516752E9|1.5170976E9|     0|      4| 555049|    1.516752E9| 6136.03|  1343|      -1|  4471|e1901da0|cb468f43|
|  1.516752E9|1.5170976E9|     0|      4| 555050|    1.516752E9| 9344.63|  3076|

In [42]:
# Vertices: every distinct hashed id used as source or target.
normal_sources = only_normal_edges_df_windows.select("source")
normal_targets = only_normal_edges_df_windows.select("target")
normal_nodes = normal_sources.union(normal_targets).toDF("id").distinct()

# Edges: one (src, dst) row per windowed transaction.
normal_edges = only_normal_edges_df_windows.select("source", "target").toDF("src", "dst")

# Build the graph and find the vertices with in-degree >= 2 and out-degree >= 2.
g_normal = GraphFrame(v=normal_nodes, e=normal_edges)
inDeg = g_normal.inDegrees.where("inDegree >= 2")
ourDeg = g_normal.outDegrees.where("outDegree >= 2")

# The two inner joins keep only vertices present in both degree frames.
filtNode = g_normal.vertices.join(inDeg, on='id').join(ourDeg, on='id')
filtNode.show()

# Tag every edge with the degree columns of a kept vertex on either end, then
# reduce to one row per (src, dst) pair. The left join leaves the degree
# columns null for edges that touch no kept vertex.
filtEdges = g_normal.edges
filtEdges = filtEdges.join(
    filtNode,
    on=(filtEdges["src"] == filtNode["id"]) | (filtEdges["dst"] == filtNode["id"]),
    how="left",
).dropDuplicates(['src', 'dst'])
filtEdges.cache()
filtEdges.show()

+--------+--------+---------+
|      id|inDegree|outDegree|
+--------+--------+---------+
|00fac541|       3|        2|
|0287e0b9|       3|        3|
|02f42efb|       4|        3|
|03ec4643|       2|        2|
|041a26ac|       2|        2|
|04258f79|       3|        4|
|062475fb|       3|        2|
|06326a74|       5|        3|
|065aca96|       3|        3|
|06984758|       2|        7|
|070a287f|       2|        2|
|075a33ad|      17|        8|
|07610d16|       2|        3|
|076132d4|       3|        3|
|07d36152|       2|        3|
|07d609b3|       3|        3|
|07f8a112|       2|        2|
|0879c806|       3|        2|
|0a1fa962|       2|        4|
|0aaa6194|       5|        2|
+--------+--------+---------+
only showing top 20 rows

+--------+--------+--------+--------+---------+
|     src|     dst|      id|inDegree|outDegree|
+--------+--------+--------+--------+---------+
|006437bc|fcdd5296|fcdd5296|       2|        2|
|006593aa|16c11e9c|006593aa|       4|        2|
|008b83a7|23be

In [57]:
# Keep the edges whose joined degree columns are both >= 2 (rows left null by
# the join do not pass), then drop the helper columns so only src / dst remain.
filtEdges_2 = (
    filtEdges
    .filter((filtEdges["inDegree"] >= 2) & (filtEdges["outDegree"] >= 2))
    .drop("id", "inDegree", "outDegree")
)

In [60]:
# Number of (src, dst) pairs that survive the degree filter.
filtEdges_2.count()

328641

In [68]:
# Restrict the windowed transactions to the (src, dst) pairs kept in
# filtEdges_2; the duplicated hashed columns target / source are dropped.
only_normal_edges_df_windows = filtEdges_2.join(
    only_normal_edges_df_windows,
    on=(filtEdges_2["src"] == only_normal_edges_df_windows["source"])
    & (filtEdges_2["dst"] == only_normal_edges_df_windows["target"]),
).drop("target", "source")
only_normal_edges_df_windows.show()

+--------+--------+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+
|     src|     dst|window_start| window_end|is_sar|tx_type|tran_id|tran_timestamp|base_amt|destId|alert_id|origId|
+--------+--------+------------+-----------+------+-------+-------+--------------+--------+------+--------+------+
|006437bc|fcdd5296| 1.5423264E9|1.5434496E9|     0|      4| 990756|   1.5430176E9|  876.22|  7381|      -1|  7898|
|006593aa|16c11e9c| 1.4887584E9| 1.489536E9|     0|      4|  99215|   1.4891904E9| 7912.75|  8957|      -1|  9505|
|00af2737|680d3223| 1.4888448E9|1.4898816E9|     0|      4| 102428|   1.4893632E9|  9809.5|  2022|      -1|  9807|
|00f958bc|2c52b404| 1.4874624E9|1.4890176E9|     0|      4|  76695|    1.487808E9| 2353.56|  7213|      -1|  7273|
|010262ed|b63c7960|  1.510272E9| 1.511568E9|     0|      4| 449609|   1.5103584E9| 1169.36|   779|      -1|  5405|
|010262ed|b63c7960|  1.510272E9| 1.511568E9|     0|      4| 460135|   1.5109632E

In [70]:
# Distinct (src, dst) pairs of the filtered transactions, and how many there are.
normal_edges = only_normal_edges_df_windows.select("src", "dst").distinct()
normal_edges.count()

328641

In [71]:
# Distinct vertex ids appearing on either end of a filtered edge, and their count.
normal_sources = only_normal_edges_df_windows.select("src")
normal_targets = only_normal_edges_df_windows.select("dst")
normal_nodes = (
    normal_sources
    .union(normal_targets)
    .toDF("id")
    .distinct()
)
normal_nodes.count()

245082

In [72]:
# Rebuild the graph from the degree-filtered vertices and edges.
g_normal = GraphFrame(v=normal_nodes, e=normal_edges)

In [None]:
# Set the checkpoint directory (the project's Logs folder) before running the
# component algorithm.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Label every vertex with its strongly connected component (20 iterations max).
# cache() has to be *called*: without the parentheses scc_normal would be bound
# to the method object instead of a DataFrame.
scc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [None]:
# Keep only the vertices of strongly connected components with more than one
# vertex; 'count' is needed only for that filter.
scc_norm_comp_count = scc_normal.groupBy('component').count().where(func.col('count')>1)
scc_normal = scc_normal.join(scc_norm_comp_count,['component'])
scc_normal =  scc_normal.drop('count')
# Attach the windowed transactions to the component rows. The vertex ids in
# scc_normal are the hashed src / dst values, so the join has to go against
# only_normal_edges_df_windows (which carries src / dst) and not against the
# raw account ids in only_normal_edge_df.
# NOTE(review): deduplicating on tran_id alone collapses a transaction that
# occurs in several windows into one row; confirm that is intended.
normal_scc_grouped = scc_normal.join(
    only_normal_edges_df_windows,
    [(only_normal_edges_df_windows.src==scc_normal.id)|(only_normal_edges_df_windows.dst==scc_normal.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])

In [None]:
# Preview the grouped result for the normal graph.
normal_scc_grouped.show()