In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
80,application_1609265553881_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f414f1e4f10>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs
from hops import hdfs
import os
from pyspark.sql import SQLContext

In [3]:
def hashnode(x):
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

hashnode_udf = func.udf(hashnode)

In [4]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [5]:
edge_fg = fs.get_feature_group('transactions_fg', 1)
node_fg = fs.get_feature_group('account_features', 1)

In [6]:
node_fg.show(5)

+---------+---------------+--------------+-------+
|prior_sar|initial_deposit|tx_behavior_id|acct_id|
+---------+---------------+--------------+-------+
|        0|       92221.09|             1|      0|
|        0|       87897.72|             1|      1|
|        0|       71028.58|             1|      2|
|        0|       62945.84|             1|      3|
|        0|       75563.74|             1|      4|
+---------+---------------+--------------+-------+
only showing top 5 rows

In [7]:
edge_fg.show(5)

+------+------+-------+--------+--------------+-------+------+--------+
|source|is_sar|tran_id|alert_id|tran_timestamp|tx_type|target|base_amt|
+------+------+-------+--------+--------------+-------+------+--------+
|   218|     0|      1|      -1|   1.4832288E9|      4|    78|  458.69|
|   213|     0|      2|      -1|   1.4832288E9|      4|    95|  537.69|
|   191|     0|      3|      -1|   1.4832288E9|      4|    74|  139.61|
|   166|     0|      4|      -1|   1.4832288E9|      4|   197|  717.61|
|    16|     0|      5|      -1|   1.4832288E9|      4|    46|  275.56|
+------+------+-------+--------+--------------+-------+------+--------+
only showing top 5 rows

In [36]:
all_sources = edge_fg.read().select("source")
all_targets = edge_fg.read().select("target")
all_nodes = all_sources.union(all_targets).toDF("id").dropDuplicates()
all_edges = edge_fg.read().select("source", "target").toDF("src", "dst")
# Now lets construct the graph
g_all = GraphFrame(all_nodes,all_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
#cc_all = g_all.connectedComponents().cache()
cc_all = g_all.stronglyConnectedComponents(20).cache()
cc_all.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|  300|
+-----+

In [38]:
all_nodes.count()

300

In [39]:
node_fg.read().count()

300

In [40]:
# Run PageRank until convergence to tolerance "tol".
results = g_all.pageRank(resetProbability=0.15, tol=0.01)
# Display resulting pageranks and final edge weights
# Note that the displayed pagerank may be truncated, e.g., missing the E notation.
# In Spark 1.5+, you can use show(truncate=False) to avoid truncation.

In [44]:
results.vertices.select("id", "pagerank").orderBy("pagerank", ascending=False).show()

+---+------------------+
| id|          pagerank|
+---+------------------+
|199| 3.090645691444692|
|299|3.0616148540362587|
|298| 2.700349396074648|
|198| 2.621710988718279|
| 99| 2.603718505547572|
| 98|2.4392578463673744|
|197|2.4169416134926127|
|297| 2.413600360667969|
| 97| 2.403460226105327|
| 96| 2.196975880820476|
|292|2.1299957536737906|
|195|2.1079525426022006|
|296|2.0997526965332156|
| 95| 2.014461718748232|
|294|  2.01075730228747|
|194|1.9682507722506049|
|295|1.9621250186782841|
|196|1.9354852228725368|
| 94| 1.895490737648288|
| 91|1.8267309102516003|
+---+------------------+
only showing top 20 rows

In [47]:
all_edges.show()

+---+---+
|src|dst|
+---+---+
|218| 78|
|213| 95|
|191| 74|
|166|197|
| 16| 46|
|100|296|
|202| 76|
| 69|229|
| 97|121|
|  9| 62|
|118| 77|
| 98|162|
| 35|215|
|266|276|
|249|281|
|295| 78|
| 22|168|
|273|261|
|260|127|
|244|281|
+---+---+
only showing top 20 rows

In [51]:
import networkx as nx
pdf = all_edges.toPandas()
G=nx.from_pandas_edgelist(pdf,'src','dst')
graphs = list(nx.connected_components(G))

In [52]:
graphs

[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221

In [8]:
only_sar_edge_df = edge_fg.read().filter(func.col('alert_id') != -1)
only_normal_edge_df =  edge_fg.read().filter(func.col('alert_id') == -1)

In [9]:
only_normal_edge_df.show(5)

+------+------+-------+--------+--------------+-------+------+--------+
|source|is_sar|tran_id|alert_id|tran_timestamp|tx_type|target|base_amt|
+------+------+-------+--------+--------------+-------+------+--------+
|   218|     0|      1|      -1|   1.4832288E9|      4|    78|  458.69|
|   213|     0|      2|      -1|   1.4832288E9|      4|    95|  537.69|
|   191|     0|      3|      -1|   1.4832288E9|      4|    74|  139.61|
|   166|     0|      4|      -1|   1.4832288E9|      4|   197|  717.61|
|    16|     0|      5|      -1|   1.4832288E9|      4|    46|  275.56|
+------+------+-------+--------+--------------+-------+------+--------+
only showing top 5 rows

In [10]:
only_sar_edge_df.show()

+------+------+-------+--------+--------------+-------+------+--------+
|source|is_sar|tran_id|alert_id|tran_timestamp|tx_type|target|base_amt|
+------+------+-------+--------+--------------+-------+------+--------+
|   202|     1|      7|       0|   1.4832288E9|      4|    76|  157.52|
|    25|     1|   1791|       4|   1.4867712E9|      4|    32|  149.66|
|    32|     1|   1833|       4|   1.4868576E9|      4|    57|  149.66|
|    57|     1|   1958|       4|   1.4871168E9|      4|   177|  149.66|
|   177|     1|   2033|       4|   1.4872896E9|      4|   259|  149.66|
|   259|     1|   2232|       4|   1.4877216E9|      4|   163|  149.66|
|   258|     1|   2668|       0|   1.4885856E9|      4|    76|  157.52|
|    83|     1|   3909|       1|   1.4910912E9|      4|   249|  131.22|
|   298|     1|   5340|       0|   1.4939424E9|      4|    76|  157.52|
|   240|     1|   8002|       0|   1.4992992E9|      4|    76|  157.52|
|    83|     1|   9999|       1|     1.50336E9|      4|   201|  

In [11]:
only_sar_edge_df.count()

38

In [12]:
sar_sources = only_sar_edge_df.select("source")
sar_targets = only_sar_edge_df.select("target")
sar_nodes = sar_sources.union(sar_targets).toDF("id").dropDuplicates()
sar_nodes.count()

sar_edges = only_sar_edge_df.select("source", "target").toDF("src", "dst")
sar_edges.count()

38

In [16]:
# Now lets construct the graph
g_sar = GraphFrame(sar_nodes,sar_edges)
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
cc_sar = g_sar.connectedComponents()
#cc_sar = g_sar.stronglyConnectedComponents(20).cache()

In [17]:
cc_sar.cache().show()

+---+---------+
| id|component|
+---+---------+
| 78|        5|
| 76|       76|
|285|       25|
|230|        7|
|157|        5|
|111|        5|
|177|       25|
|259|       25|
|178|        7|
| 20|        7|
| 57|       25|
|292|        5|
|191|        5|
|163|       25|
|  5|        5|
|258|       76|
|227|       83|
|202|       76|
|107|       25|
|176|        7|
+---+---------+
only showing top 20 rows

In [18]:
cc_sar.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|    5|
|   11|
|   12|
+-----+

In [19]:
cc_sar_grouped = cc_sar.groupBy('component').count().where(func.col('count')>2).drop('count')
cc_sar = cc_sar.join(cc_sar_grouped,['component'])
cc_sar.show()

+---------+---+
|component| id|
+---------+---+
|        5| 78|
|       76| 76|
|       25|285|
|        7|230|
|        5|157|
|        5|111|
|       25|177|
|       25|259|
|        7|178|
|        7| 20|
|       25| 57|
|        5|292|
|        5|191|
|       25|163|
|        5|  5|
|       76|258|
|       83|227|
|       76|202|
|       25|107|
|        7|176|
+---------+---+
only showing top 20 rows

In [20]:
sar_cc_grouped = cc_sar.join(
    only_sar_edge_df,
    [(only_sar_edge_df.source==cc_sar.id)|(only_sar_edge_df.target==cc_sar.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])
sar_cc_grouped.show()

+---------+---+------+------+-------+--------+--------------+-------+------+--------+
|component| id|source|is_sar|tran_id|alert_id|tran_timestamp|tx_type|target|base_amt|
+---------+---+------+------+-------+--------+--------------+-------+------+--------+
|        5|191|   191|     1|  26924|       7|   1.5374016E9|      4|   292|  102.97|
|        5|292|   116|     1|  26683|       7|   1.5368832E9|      4|   292|  102.97|
|       83|237|    83|     1|  18618|       1|     1.52064E9|      4|   237|  131.22|
|        7|230|     7|     1|  16131|       5|   1.5156288E9|      4|   230|  117.81|
|       25| 38|    38|     1|  25576|       6|   1.5346368E9|      4|   242|  101.38|
|       76|258|   258|     1|   2668|       0|   1.4885856E9|      4|    76|  157.52|
|        7|230|   230|     1|  15792|       5|    1.515024E9|      4|   176|  130.91|
|       25|215|   215|     1|  25655|       6|   1.5348096E9|      4|   242|  101.38|
|       25|285|   107|     1|  25277|       6|    1.53

In [21]:
sar_cc_grouped.count()

38

In [22]:
sar_cc_grouped.count()

38

In [23]:
only_sar_edge_df.count()

38

In [24]:
only_sar_edge_df = sar_cc_grouped

In [25]:
only_sar_edge_df.show(5)

+---------+---+------+------+-------+--------+--------------+-------+------+--------+
|component| id|source|is_sar|tran_id|alert_id|tran_timestamp|tx_type|target|base_amt|
+---------+---+------+------+-------+--------+--------------+-------+------+--------+
|        5|191|   191|     1|  26924|       7|   1.5374016E9|      4|   292|  102.97|
|        5|292|   116|     1|  26683|       7|   1.5368832E9|      4|   292|  102.97|
|       83| 83|    83|     1|  18618|       1|     1.52064E9|      4|   237|  131.22|
|        7|230|     7|     1|  16131|       5|   1.5156288E9|      4|   230|  117.81|
|       25|242|    38|     1|  25576|       6|   1.5346368E9|      4|   242|  101.38|
+---------+---+------+------+-------+--------+--------------+-------+------+--------+
only showing top 5 rows

In [27]:
normal_sources = only_normal_edge_df.select("source")
normal_targets = only_normal_edge_df.select("target")
normal_nodes = normal_sources.union(normal_targets).toDF("id").dropDuplicates()
normal_edges = only_normal_edge_df.select("source", "target").toDF("src", "dst")

In [29]:
# Now lets construct the graph
g_normal = GraphFrame(normal_nodes,normal_edges)

In [32]:
sc.setCheckpointDir("hdfs:///Projects/{}/Logs/sc".format(hdfs.project_name()))
#cc_normal = g_normal.connectedComponents().cache()
cc_normal = g_normal.stronglyConnectedComponents(20).cache()

In [33]:
cc_normal.groupBy('component').count().select('count').dropDuplicates().orderBy('count').show()

+-----+
|count|
+-----+
|  300|
+-----+

In [34]:
cc_normal.show()

+---+---------+
| id|component|
+---+---------+
|200|        0|
|  0|        0|
|201|        0|
|  1|        0|
|202|        0|
|  2|        0|
|  3|        0|
|203|        0|
|  4|        0|
|204|        0|
|205|        0|
|  5|        0|
|206|        0|
|  6|        0|
|207|        0|
|  7|        0|
|208|        0|
|  8|        0|
|  9|        0|
|209|        0|
+---+---------+
only showing top 20 rows

In [36]:
cc_norm_comp_count = cc_normal.groupBy('component').count().where(func.col('count')>=5)
cc_normal = cc_normal.join(cc_norm_comp_count,['component'])
cc_normal =  cc_normal.drop('count')
normal_cc_grouped = cc_normal.join(
    only_normal_edge_df,
    [(only_normal_edge_df.source==cc_normal.id)|(only_normal_edge_df.target==cc_normal.id)],
    how="left"
).dropDuplicates(subset=['tran_id'])
normal_cc_grouped.cache()

DataFrame[component: bigint, id: int, is_sar: int, tx_type: string, tran_id: int, tran_timestamp: float, base_amt: double, target: int, alert_id: int, source: int]

In [37]:
normal_cc_grouped.show()

+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|component|  id|is_sar|tx_type|tran_id|tran_timestamp|base_amt|target|alert_id|source|
+---------+----+------+-------+-------+--------------+--------+------+--------+------+
|        4| 854|     0|      4|    148|   1.4832288E9| 8934.66|  3951|      -1|   854|
|        4|4076|     0|      4|    463|   1.4832288E9| 5695.63|  1630|      -1|  4076|
|        4|1334|     0|      4|    496|   1.4832288E9| 5231.16|  3654|      -1|  1334|
|        4|1052|     0|      4|    833|   1.4832288E9| 4552.67|  8906|      -1|  1052|
|        4|9097|     0|      4|   1088|   1.4832288E9| 6632.67|  9587|      -1|  9097|
|        4|6306|     0|      4|   1342|   1.4832288E9|  9067.7|  6306|      -1|   176|
|        4|9951|     0|      4|   1580|   1.4833152E9|  3811.5|  9951|      -1|  9478|
|        4|4819|     0|      4|   1645|   1.4833152E9| 8523.88|  1246|      -1|  4819|
|        4|5829|     0|      4|   1829|   1

In [38]:
normal_cc_grouped.count()

751796