In [88]:
# Display the active SparkSession to confirm the session is available.
spark

<pyspark.sql.session.SparkSession object at 0x7f3f73173f90>

In [89]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [90]:
def hashnode(x, length=8):
    """Return a short, deterministic hex identifier for a node key.

    The key is UTF-8 encoded, hashed with SHA-1, and the first ``length``
    hex characters of the digest are returned.

    Args:
        x: Node key as a string, or ``None``.
        length: Number of leading hex characters to keep. Defaults to 8,
            which preserves the original behaviour.

    Returns:
        The truncated hex digest as a string, or ``None`` when ``x`` is
        ``None``.

    NOTE(review): 8 hex characters is only 32 bits, so distinct keys can
    collide once the number of nodes reaches the hundreds of thousands;
    consider passing a larger ``length`` if node identity matters.
    """
    if x is None:
        # A NULL column value reaches a Python UDF as None (for example when
        # concat() receives a NULL input); propagate it instead of failing
        # with AttributeError on .encode().
        return None
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:length]

# Wrap hashnode as a Spark UDF so it can be applied to DataFrame columns.
# No return type is passed, so PySpark's default (string) applies.
hashnode_udf = func.udf(hashnode)

In [91]:
# Open a connection to the feature store service using the default settings.
# The connection is never closed in the visible cells; call connection.close()
# when the notebook is done with it.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [92]:
# Look up the two feature groups (version 1 of each) used to build the graph:
# transactions provide the edges, account features describe the nodes.
edge_fg = fs.get_feature_group('transactions_fg', 1)
node_fg = fs.get_feature_group('account_features', 1)

In [93]:
# Preview the first 5 rows of the account feature group.
node_fg.show(5)

+--------------+---------+-------+---------------+
|tx_behavior_id|prior_sar|acct_id|initial_deposit|
+--------------+---------+-------+---------------+
|             1|        0|      0|       84442.19|
|             1|        0|      1|       75795.44|
|             1|        0|      2|       42057.16|
|             1|        0|      3|       25891.68|
|             1|        0|      4|       51127.47|
+--------------+---------+-------+---------------+
only showing top 5 rows

In [94]:
# Preview the first 5 rows of the transactions feature group.
edge_fg.show(5)

+-------+--------------+------+--------+------+--------+------+-------+
|tran_id|tran_timestamp|is_sar|base_amt|target|alert_id|source|tx_type|
+-------+--------------+------+--------+------+--------+------+-------+
|      1|   1.4832288E9|     0| 9405.71|  3259|      -1|  1767|      4|
|      2|   1.4832288E9|     0| 6884.54|  5141|      -1|  7363|      4|
|      3|   1.4832288E9|     0|  7968.4|  9532|      -1|  7585|      4|
|      4|   1.4832288E9|     0| 9042.67|  8792|      -1|  1750|      4|
|      5|   1.4832288E9|     0| 4692.79|  4670|      -1|  9060|      4|
+-------+--------------+------+--------+------+--------+------+-------+
only showing top 5 rows

In [95]:
# Read the transactions feature group once and derive both subsets from the
# same DataFrame, rather than issuing a separate read() for each filter.
edge_df = edge_fg.read()
# Rows with alert_id == -1 are treated as normal transactions; every other
# alert_id value is treated as belonging to a SAR alert.
only_sar_edge_df = edge_df.filter(func.col('alert_id') != -1)
only_normal_edge_df = edge_df.filter(func.col('alert_id') == -1)

In [96]:
# Preview the first 5 normal (alert_id == -1) transactions.
only_normal_edge_df.show(5)

+-------+--------------+------+--------+------+--------+------+-------+
|tran_id|tran_timestamp|is_sar|base_amt|target|alert_id|source|tx_type|
+-------+--------------+------+--------+------+--------+------+-------+
|      1|   1.4832288E9|     0| 9405.71|  3259|      -1|  1767|      4|
|      2|   1.4832288E9|     0| 6884.54|  5141|      -1|  7363|      4|
|      3|   1.4832288E9|     0|  7968.4|  9532|      -1|  7585|      4|
|      4|   1.4832288E9|     0| 9042.67|  8792|      -1|  1750|      4|
|      5|   1.4832288E9|     0| 4692.79|  4670|      -1|  9060|      4|
+-------+--------------+------+--------+------+--------+------+-------+
only showing top 5 rows

In [97]:
# Preview the SAR transactions (show() without an argument prints up to 20 rows).
only_sar_edge_df.show()

+-------+--------------+------+--------+------+--------+------+-------+
|tran_id|tran_timestamp|is_sar|base_amt|target|alert_id|source|tx_type|
+-------+--------------+------+--------+------+--------+------+-------+
|     98|   1.4832288E9|     1|  108.62|  5688|      16|  2298|      4|
|    108|   1.4832288E9|     1|  183.25|  9601|      26|  8627|      4|
|    135|   1.4832288E9|     1|  142.71|  8359|      15|  2756|      4|
|    137|   1.4832288E9|     1|  132.47|  7702|       9|  7605|      4|
|    218|   1.4832288E9|     1|  119.51|  7377|      17|  5891|      4|
|    335|   1.4832288E9|     1|  136.02|  1661|      12|  6787|      4|
|    439|   1.4832288E9|     1|  194.53|  7950|       0|  8485|      4|
|    477|   1.4832288E9|     1|  184.32|  2177|       3|  5324|      4|
|    514|   1.4832288E9|     1|  130.63|  4616|      19|  4919|      4|
|    564|   1.4832288E9|     1|  183.27|  1589|       5|  4170|      4|
|    580|   1.4832288E9|     1|  135.69|  1590|      18|  6483| 

In [98]:
# Number of transactions with alert_id != -1.
only_sar_edge_df.count()

732

In [99]:
# Number of transactions with alert_id == -1.
only_normal_edge_df.count()

1028964

In [100]:
# Compute the earliest and latest timestamp per group; these bounds are used
# as the [window_start, window_end] interval further down.
# NOTE(review): the grouping key is tran_id, and in the rows displayed below
# window_start always equals window_end. If the intent is one window per
# alert, the key should presumably be alert_id -- confirm, and adjust the
# join on tran_id in the following cell accordingly.
sar_timestamp_bounds = only_sar_edge_df.groupBy('tran_id').agg(
    func.min("tran_timestamp"),
    func.max("tran_timestamp"),
)
only_sar_edge_df_grouped = sar_timestamp_bounds.toDF("tran_id", "window_start", "window_end")
only_sar_edge_df_grouped.show(5)

+-------+------------+-----------+
|tran_id|window_start| window_end|
+-------+------------+-----------+
|  44437| 1.4859072E9|1.4859072E9|
| 100274| 1.4892768E9|1.4892768E9|
| 612597|  1.520208E9| 1.520208E9|
|  12471|   1.48392E9|  1.48392E9|
|  33855|  1.485216E9| 1.485216E9|
+-------+------------+-----------+
only showing top 5 rows

In [101]:
# Attach window_start / window_end to every SAR transaction by joining the
# per-tran_id bounds back onto the SAR rows (inner join on tran_id).
only_sar_edges_df_windows = only_sar_edge_df.join(
    only_sar_edge_df_grouped,
    on="tran_id",
    how="inner",
)

In [102]:
# Preview the SAR transactions with their window bounds attached.
only_sar_edges_df_windows.show()

+-------+--------------+------+--------+------+--------+------+-------+------------+-----------+
|tran_id|tran_timestamp|is_sar|base_amt|target|alert_id|source|tx_type|window_start| window_end|
+-------+--------------+------+--------+------+--------+------+-------+------------+-----------+
|     98|   1.4832288E9|     1|  108.62|  5688|      16|  2298|      4| 1.4832288E9|1.4832288E9|
|    108|   1.4832288E9|     1|  183.25|  9601|      26|  8627|      4| 1.4832288E9|1.4832288E9|
|    135|   1.4832288E9|     1|  142.71|  8359|      15|  2756|      4| 1.4832288E9|1.4832288E9|
|    137|   1.4832288E9|     1|  132.47|  7702|       9|  7605|      4| 1.4832288E9|1.4832288E9|
|    218|   1.4832288E9|     1|  119.51|  7377|      17|  5891|      4| 1.4832288E9|1.4832288E9|
|    335|   1.4832288E9|     1|  136.02|  1661|      12|  6787|      4| 1.4832288E9|1.4832288E9|
|    439|   1.4832288E9|     1|  194.53|  7950|       0|  8485|      4| 1.4832288E9|1.4832288E9|
|    477|   1.4832288E9|     1

In [103]:
# Row count after the join; it should match the SAR transaction count.
only_sar_edges_df_windows.count()

732

In [104]:
# A normal transaction belongs to a window when its timestamp lies inside
# [window_start, window_end], both ends inclusive.
in_sar_window = (
    (only_normal_edge_df.tran_timestamp >= only_sar_edge_df_grouped.window_start)
    & (only_normal_edge_df.tran_timestamp <= only_sar_edge_df_grouped.window_end)
)
sar_windows = only_sar_edge_df_grouped.select("window_start", "window_end")
# Left join: every window row is kept; a window with no matching normal
# transaction yields a row whose transaction columns are NULL.
# NOTE(review): the windows are not de-duplicated before the join, so a normal
# transaction is presumably repeated once per identical window -- confirm
# whether sar_windows should be made distinct first.
only_normal_edges_df_windows = sar_windows.join(
    only_normal_edge_df,
    on=in_sar_window,
    how="left",
)

In [105]:
# Preview the normal transactions matched to SAR windows.
only_normal_edges_df_windows.show()

+------------+-----------+-------+--------------+------+--------+------+--------+------+-------+
|window_start| window_end|tran_id|tran_timestamp|is_sar|base_amt|target|alert_id|source|tx_type|
+------------+-----------+-------+--------------+------+--------+------+--------+------+-------+
| 1.4859072E9|1.4859072E9|  44409|   1.4859072E9|     0| 5887.65|  6489|      -1|  6074|      4|
| 1.4859072E9|1.4859072E9|  44410|   1.4859072E9|     0| 1753.42|  2474|      -1|  5360|      4|
| 1.4859072E9|1.4859072E9|  44411|   1.4859072E9|     0|  433.12|   678|      -1|  7416|      4|
| 1.4859072E9|1.4859072E9|  44412|   1.4859072E9|     0| 1322.19|  8767|      -1|  8663|      4|
| 1.4859072E9|1.4859072E9|  44413|   1.4859072E9|     0| 6036.48|  5370|      -1|  8258|      4|
| 1.4859072E9|1.4859072E9|  44414|   1.4859072E9|     0| 9083.92|  2420|      -1|  9893|      4|
| 1.4859072E9|1.4859072E9|  44415|   1.4859072E9|     0| 5309.47|  2418|      -1|  7987|      4|
| 1.4859072E9|1.4859072E9|  44

In [106]:
# Number of (window, normal transaction) rows produced by the range join.
only_normal_edges_df_windows.count()

1048084

In [107]:
# Keep the raw account ids as origId / destId, then derive window-scoped node
# ids by hashing "<account id>_<window_start>_<window_end>".
# Caution: this cell rebinds only_normal_edges_df_windows, so it is meant to
# be run once per fresh result of the range-join cell.
window_key_parts = [
    func.lit('_'),
    func.col('window_start'),
    func.lit('_'),
    func.col('window_end'),
]
only_normal_edges_df_windows = (
    only_normal_edges_df_windows
    .withColumnRenamed("source", "origId")
    .withColumnRenamed("target", "destId")
    .withColumn('target', hashnode_udf(func.concat(func.col('destId'), *window_key_parts)))
    .withColumn('source', hashnode_udf(func.concat(func.col('origId'), *window_key_parts)))
)
only_normal_edges_df_windows.show()

+------------+-----------+-------+--------------+------+--------+------+--------+------+-------+--------+--------+
|window_start| window_end|tran_id|tran_timestamp|is_sar|base_amt|destId|alert_id|origId|tx_type|  target|  source|
+------------+-----------+-------+--------------+------+--------+------+--------+------+-------+--------+--------+
| 1.4859072E9|1.4859072E9|  44409|   1.4859072E9|     0| 5887.65|  6489|      -1|  6074|      4|51f60611|b8797deb|
| 1.4859072E9|1.4859072E9|  44410|   1.4859072E9|     0| 1753.42|  2474|      -1|  5360|      4|ffea183c|2087ccb6|
| 1.4859072E9|1.4859072E9|  44411|   1.4859072E9|     0|  433.12|   678|      -1|  7416|      4|f9bcd32a|d16e7308|
| 1.4859072E9|1.4859072E9|  44412|   1.4859072E9|     0| 1322.19|  8767|      -1|  8663|      4|4e307995|6ac79954|
| 1.4859072E9|1.4859072E9|  44413|   1.4859072E9|     0| 6036.48|  5370|      -1|  8258|      4|57e45309|eab8a93e|
| 1.4859072E9|1.4859072E9|  44414|   1.4859072E9|     0| 9083.92|  2420|      -1

In [108]:
# Collect every hashed node id that appears as either endpoint of a normal
# edge, then keep one row per distinct id. union() matches columns by
# position, so the resulting single column keeps the name "source".
normal_sources = only_normal_edges_df_windows.select("source")
normal_targets = only_normal_edges_df_windows.select("target")
normal_nodes = normal_sources.union(normal_targets).distinct()
normal_nodes.count()

906318

In [109]:
# Edge list for the normal graph: one (source, target) pair of hashed node ids
# per row of the windowed normal transactions.
normal_edges = only_normal_edges_df_windows.select("source", "target")
normal_edges.count()

1048084

In [110]:
# Keep the raw account ids as origId / destId, then derive window-scoped node
# ids by hashing "<account id>_<window_start>_<window_end>".
# Caution: this cell rebinds only_sar_edges_df_windows, so it is meant to be
# run once per fresh result of the SAR join cell.
sar_window_key_parts = [
    func.lit('_'),
    func.col('window_start'),
    func.lit('_'),
    func.col('window_end'),
]
only_sar_edges_df_windows = (
    only_sar_edges_df_windows
    .withColumnRenamed("source", "origId")
    .withColumnRenamed("target", "destId")
    .withColumn('target', hashnode_udf(func.concat(func.col('destId'), *sar_window_key_parts)))
    .withColumn('source', hashnode_udf(func.concat(func.col('origId'), *sar_window_key_parts)))
)
only_sar_edges_df_windows.show()

+-------+--------------+------+--------+------+--------+------+-------+------------+-----------+--------+--------+
|tran_id|tran_timestamp|is_sar|base_amt|destId|alert_id|origId|tx_type|window_start| window_end|  target|  source|
+-------+--------------+------+--------+------+--------+------+-------+------------+-----------+--------+--------+
|     98|   1.4832288E9|     1|  108.62|  5688|      16|  2298|      4| 1.4832288E9|1.4832288E9|abadb2bd|2d6bcbfc|
|    108|   1.4832288E9|     1|  183.25|  9601|      26|  8627|      4| 1.4832288E9|1.4832288E9|5a2ef132|2ffeba7c|
|    135|   1.4832288E9|     1|  142.71|  8359|      15|  2756|      4| 1.4832288E9|1.4832288E9|20aba974|b6564133|
|    137|   1.4832288E9|     1|  132.47|  7702|       9|  7605|      4| 1.4832288E9|1.4832288E9|f5a54e21|6c465ac2|
|    218|   1.4832288E9|     1|  119.51|  7377|      17|  5891|      4| 1.4832288E9|1.4832288E9|53ae86b7|c59fa192|
|    335|   1.4832288E9|     1|  136.02|  1661|      12|  6787|      4| 1.483228

In [111]:
# Collect every hashed node id that appears as either endpoint of a SAR edge,
# then keep one row per distinct id. union() matches columns by position, so
# the resulting single column keeps the name "source".
sar_sources = only_sar_edges_df_windows.select("source")
sar_targets = only_sar_edges_df_windows.select("target")
sar_nodes = sar_sources.union(sar_targets).distinct()
sar_nodes.count()

1425

In [112]:
# Edge list for the SAR graph: one (source, target) pair of hashed node ids
# per SAR transaction.
sar_edges = only_sar_edges_df_windows.select("source", "target")
sar_edges.count()

732

In [113]:
# Now let's construct the graph of normal transactions.
# Prerequisite: the GraphFrames JAR must be on the Spark classpath (for
# example via spark.jars.packages when the session is created); without it
# this cell fails with
# ClassNotFoundException: org.graphframes.GraphFramePythonAPI.
#
# GraphFrame requires the vertex DataFrame to have an "id" column and the
# edge DataFrame to have "src" and "dst" columns. The notebook's DataFrames
# use "source"/"target", so rename them here and leave the originals as is.
g_normal = GraphFrame(
    normal_nodes.toDF("id"),
    normal_edges.withColumnRenamed("source", "src").withColumnRenamed("target", "dst"),
)
# The default connected-components algorithm checkpoints intermediate
# results, so a checkpoint directory must be set before it is called.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_normal = g_normal.connectedComponents()

An error was encountered:
An error occurred while calling o918.loadClass.
: java.lang.ClassNotFoundException: org.graphframes.GraphFramePythonAPI
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thr

In [114]:
# Now let's construct the graph of SAR transactions.
# Prerequisite: the GraphFrames JAR must be on the Spark classpath (for
# example via spark.jars.packages when the session is created); without it
# this cell fails with
# ClassNotFoundException: org.graphframes.GraphFramePythonAPI.
#
# GraphFrame requires the vertex DataFrame to have an "id" column and the
# edge DataFrame to have "src" and "dst" columns. The notebook's DataFrames
# use "source"/"target", so rename them here and leave the originals as is.
g_sar = GraphFrame(
    sar_nodes.toDF("id"),
    sar_edges.withColumnRenamed("source", "src").withColumnRenamed("target", "dst"),
)
# The default connected-components algorithm checkpoints intermediate
# results, so a checkpoint directory must be set before it is called.
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
# Store the SAR components under their own name; assigning to cc_normal here
# would overwrite the normal-graph result from the previous cell.
cc_sar = g_sar.connectedComponents()

An error was encountered:
An error occurred while calling o936.loadClass.
: java.lang.ClassNotFoundException: org.graphframes.GraphFramePythonAPI
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thr