In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
47,application_1606908943181_0049,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f4b5cf45dd0>

In [2]:
import hashlib
from datetime import datetime
from graphframes import *
from pyspark.sql import functions as func
from pyspark.sql.types import FloatType
import hsfs

import os
from pyspark.sql import SQLContext

In [3]:
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [4]:
node_df = fs.get_feature_group('node_features')

edge_df = fs.get_feature_group('edge_features')

node_df = node_df.select(["acct_id", "initial_deposit", "tx_behavior_id", "prior_sar", "gender", "age"]).read().toDF("id", "initial_deposit", "tx_behavior_id", "prior_sar", "gender", "age")
node_df.show(5)

edge_df = edge_df.select(["source","target","is_sar", 'alert_id', 'tran_timestamp','tran_id']).read().toDF("src", "dst", "is_sar", 'alert_id', 'tran_timestamp','tran_id')
edge_df.show(5)

+---+---------------+--------------+---------+------+---+
| id|initial_deposit|tx_behavior_id|prior_sar|gender|age|
+---+---------------+--------------+---------+------+---+
|  0|       84442.19|             1|        0|     0| 65|
|  1|       75795.44|             1|        0|     0| 31|
|  2|       42057.16|             1|        0|     0| 92|
|  3|       25891.68|             1|        0|     0| 43|
|  4|       51127.47|             1|        0|     1| 67|
+---+---------------+--------------+---------+------+---+
only showing top 5 rows

+----+----+------+--------+--------------+-------+
| src| dst|is_sar|alert_id|tran_timestamp|tran_id|
+----+----+------+--------+--------------+-------+
|1767|3259|     0|      -1|   1.4832288E9|      1|
|7363|5141|     0|      -1|   1.4832288E9|      2|
|7585|9532|     0|      -1|   1.4832288E9|      3|
|1750|8792|     0|      -1|   1.4832288E9|      4|
|9060|4670|     0|      -1|   1.4832288E9|      5|
+----+----+------+--------+--------------+---

In [5]:
only_alert_edge_df = edge_df.filter(func.col('alert_id') != -1)

only_normal_edge_df =  edge_df.join(only_alert_edge_df, on=['tran_id'], how='left_anti')

only_alert_edge_df = only_alert_edge_df.select(["src","dst","is_sar","alert_id","tran_timestamp"])
only_normal_edge_df = only_normal_edge_df.select(["src","dst","is_sar","alert_id","tran_timestamp"])
only_alert_edge_df.show(5)
only_normal_edge_df.show(5)

+----+----+------+--------+--------------+
| src| dst|is_sar|alert_id|tran_timestamp|
+----+----+------+--------+--------------+
|2298|5688|     1|      16|   1.4832288E9|
|8627|9601|     1|      26|   1.4832288E9|
|2756|8359|     1|      15|   1.4832288E9|
|7605|7702|     1|       9|   1.4832288E9|
|5891|7377|     1|      17|   1.4832288E9|
+----+----+------+--------+--------------+
only showing top 5 rows

+----+----+------+--------+--------------+
| src| dst|is_sar|alert_id|tran_timestamp|
+----+----+------+--------+--------------+
|1767|3259|     0|      -1|   1.4832288E9|
|7363|5141|     0|      -1|   1.4832288E9|
|7585|9532|     0|      -1|   1.4832288E9|
|1750|8792|     0|      -1|   1.4832288E9|
|9060|4670|     0|      -1|   1.4832288E9|
+----+----+------+--------+--------------+
only showing top 5 rows

In [6]:
only_alert_edge_df = only_alert_edge_df.sort('alert_id')

only_alert_edge_df_grouped = only_alert_edge_df.groupBy('alert_id').agg(func.min("tran_timestamp"),func.max("tran_timestamp"))

only_alert_edge_df_grouped.show(5)


+--------+-------------------+-------------------+
|alert_id|min(tran_timestamp)|max(tran_timestamp)|
+--------+-------------------+-------------------+
|       0|        1.4832288E9|        1.4929056E9|
|       1|        1.4832288E9|         1.521072E9|
|       2|        1.4832288E9|        1.5092352E9|
|       3|        1.4832288E9|        1.5136416E9|
|       4|        1.4832288E9|        1.5157152E9|
+--------+-------------------+-------------------+
only showing top 5 rows

In [7]:
# iterates over the different alerts and collect the normal transactions in the time interval
for row in only_alert_edge_df_grouped.rdd.collect():
    min_ts = row['min(tran_timestamp)']
    max_ts = row['max(tran_timestamp)']
    alert_id = row['alert_id']
    
    tmp_df = only_normal_edge_df.filter((func.col('tran_timestamp') >= min_ts) & (func.col('tran_timestamp') <= max_ts))              
    g = GraphFrame(node_df, tmp_df)
    sc.setCheckpointDir("hdfs:///Projects/AML/Logs/sc")
    cc = g.connectedComponents()
    #cc.save.parquet("hdfs:///Projects/AML/Logs/parquet")
    cc_fg_meta = fs.create_feature_group(name="connected_components_alert_id_"+str(alert_id),
                                       version=1,
                                       primary_key=["id"],
                                       description="connected components of normal transactions within the "+str(alert_id) + " time interval.",
                                       time_travel_format=None,                                        
                                       statistics_config=False)
    cc_fg_meta.save(cc)

<hsfs.feature_group.FeatureGroup object at 0x7f4b6840cb50>
<hsfs.feature_group.FeatureGroup object at 0x7f4b68423090>
<hsfs.feature_group.FeatureGroup object at 0x7f4b6884de90>
<hsfs.feature_group.FeatureGroup object at 0x7f4b6840c850>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683bf150>
<hsfs.feature_group.FeatureGroup object at 0x7f4b6840b050>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683b8650>
<hsfs.feature_group.FeatureGroup object at 0x7f4b68423b50>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683eefd0>
<hsfs.feature_group.FeatureGroup object at 0x7f4b68870a90>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683eac10>
<hsfs.feature_group.FeatureGroup object at 0x7f4b688704d0>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683eefd0>
<hsfs.feature_group.FeatureGroup object at 0x7f4b688702d0>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683b6750>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683b6550>
<hsfs.feature_group.FeatureGroup object at 0x7f4b683b871

In [8]:
result =  spark.read.parquet("hdfs:///Projects/AML/Logs/sc/6e297fee-a4d4-4e0d-96a9-b2bf1dd0ba03")
display(result)

An error was encountered:
'Unable to infer schema for Parquet. It must be specified manually.;'
Traceback (most recent call last):
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 316, in parquet
    return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
  File "/srv/hops/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: 'Unable to infer schema for Parquet. It must be specified manually.;'

