In [11]:
# Display the active SparkSession to confirm the kernel has one.
# NOTE(review): `spark` is not created in any visible cell; presumably the
# PySpark kernel injects it - confirm.
spark

<pyspark.sql.session.SparkSession object at 0x7f103a41cfd0>

In [12]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [13]:
def hashnode(x):
    """Return a short, deterministic hex identifier for ``x``.

    Args:
        x: Text to hash, or ``None``.

    Returns:
        The first 8 hex characters of the SHA-1 digest of ``x`` encoded as
        UTF-8, or ``None`` when ``x`` is ``None``.
    """
    # Spark hands SQL NULL to Python UDFs as None; propagate it rather than
    # raising AttributeError on ``None.encode`` and failing the whole job.
    if x is None:
        return None
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

# Expose hashnode as a Spark UDF. No return type is given, so PySpark uses
# its default (StringType), which matches the hex string hashnode returns.
hashnode_udf = func.udf(hashnode)

In [14]:
# Create a connection to the feature store service.
# Called with no arguments; NOTE(review): presumably host/project/credentials
# are picked up from the notebook environment - confirm.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
# (no name passed, so the connection's default feature store is returned).
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [15]:
# Feature group handles, looked up by name and version 1.
# edge_fg: one row per transaction (source -> target); see the preview below.
edge_fg = fs.get_feature_group('transactions_fg', 1)
# node_fg: one row per account (acct_id); see the preview below.
node_fg = fs.get_feature_group('account_features', 1)

In [16]:
# Preview the first 5 rows of the account feature group.
node_fg.show(5)

+-------+--------------+---------+---------------+
|acct_id|tx_behavior_id|prior_sar|initial_deposit|
+-------+--------------+---------+---------------+
|      0|             1|        0|       84442.19|
|      1|             1|        0|       75795.44|
|      2|             1|        0|       42057.16|
|      3|             1|        0|       25891.68|
|      4|             1|        0|       51127.47|
+-------+--------------+---------+---------------+
only showing top 5 rows

In [17]:
# Preview the first 5 rows of the transaction feature group.
edge_fg.show(5)

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     0|      4|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|     0|      4|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|     0|      4|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|     0|      4|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|     0|      4|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [18]:
# Read the transactions feature group once and derive both subsets from the
# same DataFrame, instead of issuing two separate feature store reads.
all_edge_df = edge_fg.read()

# Split on alert_id: -1 is treated as a normal transaction, any other value
# as belonging to a SAR alert.
only_sar_edge_df = all_edge_df.filter(func.col('alert_id') != -1)
only_normal_edge_df = all_edge_df.filter(func.col('alert_id') == -1)

In [19]:
# Preview 5 of the normal (alert_id == -1) transactions.
only_normal_edge_df.show(5)

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     0|      4|      1|   1.4832288E9| 9405.71|  3259|      -1|  1767|
|     0|      4|      2|   1.4832288E9| 6884.54|  5141|      -1|  7363|
|     0|      4|      3|   1.4832288E9|  7968.4|  9532|      -1|  7585|
|     0|      4|      4|   1.4832288E9| 9042.67|  8792|      -1|  1750|
|     0|      4|      5|   1.4832288E9| 4692.79|  4670|      -1|  9060|
+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [20]:
# Preview the SAR (alert_id != -1) transactions; show() with no argument
# prints at most 20 rows.
only_sar_edge_df.show()

+------+-------+-------+--------------+--------+------+--------+------+
|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+------+-------+-------+--------------+--------+------+--------+------+
|     1|      4|     98|   1.4832288E9|  108.62|  5688|      16|  2298|
|     1|      4|    108|   1.4832288E9|  183.25|  9601|      26|  8627|
|     1|      4|    135|   1.4832288E9|  142.71|  8359|      15|  2756|
|     1|      4|    137|   1.4832288E9|  132.47|  7702|       9|  7605|
|     1|      4|    218|   1.4832288E9|  119.51|  7377|      17|  5891|
|     1|      4|    335|   1.4832288E9|  136.02|  1661|      12|  6787|
|     1|      4|    439|   1.4832288E9|  194.53|  7950|       0|  8485|
|     1|      4|    477|   1.4832288E9|  184.32|  2177|       3|  5324|
|     1|      4|    514|   1.4832288E9|  130.63|  4616|      19|  4919|
|     1|      4|    564|   1.4832288E9|  183.27|  1589|       5|  4170|
|     1|      4|    580|   1.4832288E9|  135.69|  1590|      18|

In [21]:
# Number of SAR transactions (edges) before any component filtering.
only_sar_edge_df.count()

732

In [22]:
# Vertex list: every distinct account id that appears as the source or the
# target of a SAR transaction, in the single-column `id` layout GraphFrame uses.
sar_sources = only_sar_edge_df.select("source")
sar_targets = only_sar_edge_df.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()

# Edge list in the `src` / `dst` layout GraphFrame uses.
sar_edges = only_sar_edge_df.select("source", "target").toDF("src", "dst")

# A notebook cell only displays its last expression, so the former bare
# `sar_nodes.count()` in the middle of the cell ran a Spark job and threw the
# result away. Print the node count explicitly; the edge count stays the
# cell's displayed value.
print("SAR nodes:", sar_nodes.count())
sar_edges.count()

732

In [23]:
# Now let's construct the graph of SAR transactions.
g_sar = GraphFrame(sar_nodes,sar_edges)
# Point Spark checkpointing at this project's Logs/sc directory on HDFS.
# NOTE(review): `sc` is not created in any visible cell; presumably the
# kernel-provided SparkContext - confirm.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Disabled alternative: plain connected components instead of strongly connected ones.
#cc_sar = g_sar.connectedComponents()
# Strongly connected components, limited to 20 iterations. The result (one
# row per node: id, component) is cached because several later cells reuse it.
cc_sar = g_sar.stronglyConnectedComponents(20).cache()

In [24]:
# Preview the (id, component) assignment. `cc_sar` was already cached when it
# was created, so the second `.cache()` call here was redundant and is dropped.
cc_sar.show()

+----+---------+
|  id|component|
+----+---------+
|7800|     7800|
|8600|     8600|
|4000|     4000|
|5201|     1109|
|1801|     1801|
| 601|      601|
|1201|     1201|
|4601|     4601|
|9601|     9601|
|3402|     3402|
|4802|     4802|
|7802|     1945|
|6403|     1630|
|1003|     1003|
|2003|     1201|
|5403|      295|
|9403|     9403|
|7604|     1201|
|5204|     1257|
|8004|     8004|
+----+---------+
only showing top 20 rows

In [25]:
# List the distinct component sizes (nodes per component), ascending, to pick
# a size threshold for the filter in the next step.
cc_sar.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|    1|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+

In [26]:
# Ids of the components that contain more than two nodes.
cc_sar_grouped = (
    cc_sar
    .groupBy('component')
    .count()
    .where(func.col('count') > 2)
    .drop('count')
)

# Keep only the nodes belonging to one of those components (inner join on
# the component id), then preview the result.
cc_sar = cc_sar.join(cc_sar_grouped, ['component'])
cc_sar.show()

+---------+----+
|component|  id|
+---------+----+
|     1109|5201|
|     1801|1801|
|      601| 601|
|     1201|1201|
|     1945|7802|
|     1630|6403|
|     1003|1003|
|     1201|2003|
|      295|5403|
|     1201|7604|
|     1257|5204|
|      255|6204|
|     1369|7006|
|      829|3607|
|     3669|7207|
|     3751|5608|
|     1109|4208|
|      515|7809|
|     1688|3609|
|      295|9009|
+---------+----+
only showing top 20 rows

In [27]:
# Attach to every retained component node the SAR transactions in which it is
# either the source or the target. The left join keeps every node row; a
# transaction whose two endpoints are both retained matches twice, so rows
# are then de-duplicated down to one per tran_id.
sar_cc_grouped = (
    cc_sar
    .join(
        only_sar_edge_df,
        [
            (only_sar_edge_df.source == cc_sar.id)
            | (only_sar_edge_df.target == cc_sar.id)
        ],
        how="left",
    )
    .dropDuplicates(subset=['tran_id'])
)
sar_cc_grouped.show()

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|     1009|1425|     1|      4| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|
|     1801|9700|     1|      4| 284874|   1.5004224E9|  103.47|  3984|      96|  9700|
|      554|8091|     1|      4| 522545|   1.5147648E9|  106.94|  8091|      91|   554|
|      825|4571|     1|      4| 706588|    1.525824E9|   63.66|  7776|      84|  4571|
|     1369|8581|     1|      4| 472208|   1.5117408E9|  114.67|  8581|      70|  1598|
|     1257|6837|     1|      4|  57885|   1.4866848E9|   73.41|  6837|      95|  5261|
|      907|7851|     1|      4| 509573|   1.5139872E9|   47.28|  3329|      90|  7851|
|      397|6415|     1|      4| 786028|   1.5306624E9|   79.42|  6415|      74|  3252|
|      295|6810|     1|      4| 202624|   1

In [28]:
# Number of SAR transactions left after restricting to the retained components.
sar_cc_grouped.count()

321

In [29]:
# NOTE(review): identical to the previous cell (same expression, same result);
# candidate for removal.
sar_cc_grouped.count()

321

In [30]:
# Unfiltered SAR transaction count, for comparison with sar_cc_grouped above.
only_sar_edge_df.count()

732

In [31]:
# Rebind only_sar_edge_df to the component-filtered frame. From this cell on
# the name refers to sar_cc_grouped, which carries the extra `component` and
# `id` columns and fewer rows than the original SAR edge frame; re-running
# earlier cells that use only_sar_edge_df will now see this frame instead.
only_sar_edge_df = sar_cc_grouped

In [32]:
# Preview the rebound SAR edge frame (now including component and id columns).
only_sar_edge_df.show(5)

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|     1009|1425|     1|      4| 100274|   1.4892768E9|  102.86|  1425|      80|  3661|
|     1801|9700|     1|      4| 284874|   1.5004224E9|  103.47|  3984|      96|  9700|
|      554|8091|     1|      4| 522545|   1.5147648E9|  106.94|  8091|      91|   554|
|      825|4571|     1|      4| 706588|    1.525824E9|   63.66|  7776|      84|  4571|
|     1369|8581|     1|      4| 472208|   1.5117408E9|  114.67|  8581|      70|  1598|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
only showing top 5 rows

In [45]:
# Number of grouped normal transactions per tran_timestamp (up to 200 rows).
# NOTE: `normal_cc_grouped` is built in a later cell of this file, so that
# cell has to run before this one.
# `DataFrame.show()` prints and returns None, so its result must not be bound
# to `only_normal_edge_df`: doing so replaced the normal-edge DataFrame with
# None and broke every later `only_normal_edge_df.select(...)` / join.
normal_cc_grouped.groupBy('tran_timestamp').count().show(200)

+--------------+-----+
|tran_timestamp|count|
+--------------+-----+
|    1.483488E9| 1056|
|      1.4904E9| 1010|
|   1.5161472E9| 1072|
|   1.5188256E9| 1005|
|   1.5254784E9|  983|
|   1.5032736E9| 1062|
|     1.50336E9| 1093|
|    1.519776E9| 1068|
|   1.5226272E9| 1037|
|   1.5437952E9| 1050|
|   1.5450048E9| 1032|
|   1.4878944E9| 1005|
|   1.4931648E9| 1100|
|   1.5218496E9|  999|
|   1.5369696E9| 1003|
|   1.4924736E9| 1048|
|   1.5074208E9| 1090|
|   1.5123456E9| 1016|
|   1.5198624E9| 1040|
|    1.518912E9| 1057|
|   1.5272928E9|  976|
|   1.5312672E9| 1085|
|   1.5374016E9| 1064|
|   1.4944608E9| 1063|
|   1.5306624E9| 1065|
|    1.535328E9| 1067|
|   1.5066432E9| 1051|
|   1.5304896E9| 1022|
|   1.4874624E9| 1032|
|   1.5101856E9| 1069|
|   1.5395616E9| 1076|
|   1.5381792E9|  948|
|   1.4996448E9| 1042|
|   1.5380064E9| 1006|
|   1.5384384E9| 1032|
|   1.4845248E9| 1074|
|   1.5253056E9| 1054|
|   1.4862528E9| 1055|
|   1.5281568E9| 1076|
|    1.540512E9| 1058|
|   1.49333

In [None]:
# NOTE(review): this never-executed cell held only the unfinished expression
# `only_normal_edge_df.` (a trailing dot with no attribute), which is a
# SyntaxError and would stop a Restart & Run All. The expression is removed;
# delete the cell if nothing was meant to go here.

In [33]:
# Edge list for the normal-transaction graph, in GraphFrame's src/dst layout.
normal_edges = only_normal_edge_df.select("source", "target").toDF("src", "dst")

# Vertex list: each distinct account id seen at either end of a normal edge.
normal_sources = only_normal_edge_df.select("source")
normal_targets = only_normal_edge_df.select("target")
normal_nodes = (
    normal_sources
    .union(normal_targets)
    .toDF("id")
    .dropDuplicates()
)

In [34]:
# Now let's construct the graph of normal transactions
# (vertices: normal_nodes with `id`; edges: normal_edges with `src`/`dst`).
g_normal = GraphFrame(normal_nodes,normal_edges)

In [35]:
# Same checkpoint directory as used for the SAR graph (project Logs/sc on HDFS).
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Disabled alternative: plain connected components instead of strongly connected ones.
#cc_normal = g_normal.connectedComponents().cache()
# Strongly connected components of the normal graph, limited to 20 iterations;
# cached because the next cell aggregates and joins it.
cc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [36]:
# Component sizes, restricted to components with at least five nodes.
cc_norm_comp_count = (
    cc_normal
    .groupBy('component')
    .count()
    .where(func.col('count') >= 5)
)

# Keep only nodes of those components; the helper `count` column brought in
# by the join is dropped again.
cc_normal = cc_normal.join(cc_norm_comp_count, ['component']).drop('count')

# Attach to every retained node the normal transactions it takes part in (as
# source or target). The left join keeps every node row; rows are then
# reduced to one per tran_id.
normal_cc_grouped = (
    cc_normal
    .join(
        only_normal_edge_df,
        [
            (only_normal_edge_df.source == cc_normal.id)
            | (only_normal_edge_df.target == cc_normal.id)
        ],
        how="left",
    )
    .dropDuplicates(subset=['tran_id'])
)

# Mark the result for caching; as the cell's last expression this also
# displays the DataFrame schema.
normal_cc_grouped.cache()

DataFrame[component: bigint, id: int, is_sar: int, tx_type: string, tran_id: int, tran_timestamp: float, base_amt: double, target: int, alert_id: int, source: int]

In [37]:
# Preview the grouped normal transactions (at most 20 rows).
normal_cc_grouped.show()

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|        4| 854|     0|      4|    148|   1.4832288E9| 8934.66|  3951|      -1|   854|
|        4|4076|     0|      4|    463|   1.4832288E9| 5695.63|  1630|      -1|  4076|
|        4|1334|     0|      4|    496|   1.4832288E9| 5231.16|  3654|      -1|  1334|
|        4|1052|     0|      4|    833|   1.4832288E9| 4552.67|  8906|      -1|  1052|
|        4|9097|     0|      4|   1088|   1.4832288E9| 6632.67|  9587|      -1|  9097|
|        4|6306|     0|      4|   1342|   1.4832288E9|  9067.7|  6306|      -1|   176|
|        4|9951|     0|      4|   1580|   1.4833152E9|  3811.5|  9951|      -1|  9478|
|        4|4819|     0|      4|   1645|   1.4833152E9| 8523.88|  1246|      -1|  4819|
|        4|5829|     0|      4|   1829|   1

In [38]:
# Number of normal transactions that touch a retained (size >= 5) component.
normal_cc_grouped.count()

751796