In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
72,application_1607949680860_0074,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f8a87eb1e90>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [3]:
def hashnode(x):
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

hashnode_udf = func.udf(hashnode)

In [4]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
edge_fg = fs.get_feature_group('transactions_fg', 1)
node_fg = fs.get_feature_group('account_features', 1)

In [6]:
node_fg.show(5)

+--------------+---------+-------+---------------+
|tx_behavior_id|prior_sar|acct_id|initial_deposit|
+--------------+---------+-------+---------------+
|             1|        0|      0|       84442.19|
|             1|        0|      1|       75795.44|
|             1|        0|      2|       42057.16|
|             1|        0|      3|       25891.68|
|             1|        0|      4|       51127.47|
+--------------+---------+-------+---------------+
only showing top 5 rows

In [7]:
edge_fg.show(5)

+------+-------+--------------+--------+------+--------+------+-------+
|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+------+-------+--------------+--------+------+--------+------+-------+
|     0|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|      4|
|     0|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|      4|
|     0|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|      4|
|     0|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|      4|
|     0|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|      4|
+------+-------+--------------+--------+------+--------+------+-------+
only showing top 5 rows

In [8]:
only_sar_edge_df = edge_fg.read().filter(func.col('alert_id') != -1)
only_normal_edge_df =  edge_fg.read().filter(func.col('alert_id') == -1)

In [9]:
only_normal_edge_df.show(5)

+------+-------+--------------+--------+------+--------+------+-------+
|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+------+-------+--------------+--------+------+--------+------+-------+
|     0|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|      4|
|     0|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|      4|
|     0|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|      4|
|     0|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|      4|
|     0|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|      4|
+------+-------+--------------+--------+------+--------+------+-------+
only showing top 5 rows

In [10]:
only_sar_edge_df.show()

+------+-------+--------------+--------+------+--------+------+-------+
|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+------+-------+--------------+--------+------+--------+------+-------+
|     1|     98|   1.4832288E9|  108.62|  5688|      16|  2298|      4|
|     1|    108|   1.4832288E9|  183.25|  9601|      26|  8627|      4|
|     1|    135|   1.4832288E9|  142.71|  8359|      15|  2756|      4|
|     1|    137|   1.4832288E9|  132.47|  7702|       9|  7605|      4|
|     1|    218|   1.4832288E9|  119.51|  7377|      17|  5891|      4|
|     1|    335|   1.4832288E9|  136.02|  1661|      12|  6787|      4|
|     1|    439|   1.4832288E9|  194.53|  7950|       0|  8485|      4|
|     1|    477|   1.4832288E9|  184.32|  2177|       3|  5324|      4|
|     1|    514|   1.4832288E9|  130.63|  4616|      19|  4919|      4|
|     1|    564|   1.4832288E9|  183.27|  1589|       5|  4170|      4|
|     1|    580|   1.4832288E9|  135.69|  1590|      18|  6483| 

In [11]:
only_sar_edge_df.count()

732

In [12]:
sar_sources = only_sar_edge_df.select("source")
sar_targets = only_sar_edge_df.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()
sar_nodes.count()

sar_edges = only_sar_edge_df.select("source", "target").toDF("src", "dst")
sar_edges.count()

732

In [13]:
# Now lets construct the graph
g_sar = GraphFrame(sar_nodes,sar_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_sar = g_sar.connectedComponents()

In [14]:
cc_sar.cache().show()

+----+---------+
|  id|component|
+----+---------+
|8086|     2599|
|7833|     3671|
|5300|     2773|
| 463|      124|
|3997|     2773|
|1127|     1127|
| 540|      540|
|6393|     3396|
|1522|     1054|
|5614|      825|
|7387|      643|
|3488|     2599|
|2393|      397|
|9162|     2889|
|1265|     1009|
|4042|     1590|
|5223|     1013|
|4364|     1096|
|9403|     1794|
|4000|      960|
+----+---------+
only showing top 20 rows

In [15]:
cc_sar.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+

In [16]:
cc_sar_grouped = cc_sar.groupBy('component').count().where(func.col('count')>2).drop('count')
cc_sar = cc_sar.join(cc_sar_grouped,['component'])
cc_sar.show()

+---------+----+
|component|  id|
+---------+----+
|     2599|8086|
|     3671|7833|
|     2773|5300|
|      124| 463|
|     2773|3997|
|     1127|1127|
|      540| 540|
|     3396|6393|
|     1054|1522|
|      825|5614|
|      643|7387|
|     2599|3488|
|      397|2393|
|     2889|9162|
|     1009|1265|
|     1590|4042|
|     1013|5223|
|     1096|4364|
|     1794|9403|
|      960|4000|
+---------+----+
only showing top 20 rows

In [17]:
sar_cc_grouped = cc_sar.join(
    only_sar_edge_df,
    [(only_sar_edge_df.source==cc_sar.id)|(only_sar_edge_df.target==cc_sar.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])
sar_cc_grouped.show()

+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|component|  id|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|     1823|2300|     1|  44437|   1.4859072E9|  173.42|  2300|       8|  5294|      4|
|     1009|1425|     1| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|      4|
|      892|7268|     1| 612597|    1.520208E9|  109.79|  8764|      10|  7268|      4|
|      410|2739|     1|  12471|     1.48392E9|  138.28|  2491|      58|  2739|      4|
|     1794|9403|     1|  33855|    1.485216E9|  101.76|  9403|      44|  5838|      4|
|     1801|9700|     1| 284874|   1.5004224E9|  103.47|  3984|      96|  9700|      4|
|      554| 554|     1| 522545|   1.5147648E9|  106.94|  8091|      91|   554|      4|
|     1823|2300|     1| 266765|   1.4992992E9|  173.42|  2300|       8|  2368|      4|
|      825|4571|     1| 706588|    1.525824

In [18]:
sar_cc_grouped.count()

732

In [19]:
sar_cc_grouped.count()

732

In [20]:
only_sar_edge_df.count()

732

In [21]:
only_sar_edge_df = sar_cc_grouped

In [22]:
only_sar_edge_df.show(5)

+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|component|  id|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|     1823|5294|     1|  44437|   1.4859072E9|  173.42|  2300|       8|  5294|      4|
|     1009|1425|     1| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|      4|
|      892|8764|     1| 612597|    1.520208E9|  109.79|  8764|      10|  7268|      4|
|      410|2491|     1|  12471|     1.48392E9|  138.28|  2491|      58|  2739|      4|
|     1794|9403|     1|  33855|    1.485216E9|  101.76|  9403|      44|  5838|      4|
+---------+----+------+-------+--------------+--------+------+--------+------+-------+
only showing top 5 rows

In [28]:
normal_sources = only_normal_edge_df.select("source")
normal_targets = only_normal_edge_df.select("target")
normal_nodes = normal_sources.union(normal_targets).toDF("id").dropDuplicates()
normal_edges = only_normal_edge_df.select("source", "target").toDF("src", "dst")

In [29]:
# Now lets construct the graph
g_normal = GraphFrame(normal_nodes,normal_edges)

In [30]:
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_normal = g_normal.connectedComponents().cache()
#scc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [36]:
cc_norm_comp_count = cc_normal.groupBy('component').count().where(func.col('count')>=5)
cc_normal = cc_normal.join(cc_norm_comp_count,['component'])
cc_normal =  cc_normal.drop('count')
normal_cc_grouped = cc_normal.join(
    only_normal_edge_df,
    [(only_normal_edge_df.source==cc_normal.id)|(only_normal_edge_df.target==cc_normal.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])
normal_cc_grouped.cache()

DataFrame[component: bigint, id: int, is_sar: int, tran_id: int, tran_timestamp: float, base_amt: double, target: int, alert_id: int, source: int, tx_type: string]

In [37]:
normal_cc_grouped.show()

+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|component|  id|is_sar|tran_id|tran_timestamp|base_amt|target|alert_id|source|tx_type|
+---------+----+------+-------+--------------+--------+------+--------+------+-------+
|        2|3951|     0|    148|   1.4832288E9| 8934.66|  3951|      -1|   854|      4|
|        2|4076|     0|    463|   1.4832288E9| 5695.63|  1630|      -1|  4076|      4|
|        2|1834|     0|    471|   1.4832288E9| 5781.43|  1834|      -1|  4193|      4|
|        2|3654|     0|    496|   1.4832288E9| 5231.16|  3654|      -1|  1334|      4|
|        2|1052|     0|    833|   1.4832288E9| 4552.67|  8906|      -1|  1052|      4|
|        2|9587|     0|   1088|   1.4832288E9| 6632.67|  9587|      -1|  9097|      4|
|        2|9109|     0|   1238|   1.4832288E9| 2777.34|  9378|      -1|  9109|      4|
|        2|6306|     0|   1342|   1.4832288E9|  9067.7|  6306|      -1|   176|      4|
|        2|9951|     0|   1580|   1.4833152

In [38]:
normal_cc_grouped.count()

1028964