In [1]:
# Evaluating the pre-defined `spark` handle starts the Spark application and echoes the session object.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
40,application_1607211657348_0042,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fcd0f581f10>

In [2]:
from pyspark.sql.functions import array, coalesce, concat,  col

### Create a connection to hsfs

In [3]:
import hsfs
from hops import hdfs
# Open a connection to the feature store service with the default settings.
# The service reports that `.close()` should be called on this object to terminate gracefully.
connection = hsfs.connection()
# Get the feature store handle for the project's feature store;
# `fs` is used by later cells to read and create training datasets.
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

### Retrieve the nodes training dataset from hsfs to get label information: whether a node was part of a previously known money laundering scheme or not

In [26]:
# Load version 1 of the "node_td" training dataset and keep only the node id and its SAR label.
node_td = fs.get_training_dataset("node_td", 1)
raw_labels_df = node_td.read().select("id", "is_sar")

# Cast every retained column to float, keeping the original column names.
# NOTE(review): `id` is cast to float as well; a 32-bit float represents integers exactly
# only up to 2**24 - confirm node ids stay below that, since `id` is later used as a join key.
float_label_columns = [
    col(column_name).cast("float").alias(column_name)
    for column_name in raw_labels_df.columns
]
node_labels_df = raw_labels_df.select(*float_label_columns)

In [27]:
# Preview a few label rows to check the float cast of `id` and `is_sar`.
node_labels_df.show(5)

+------+------+
|    id|is_sar|
+------+------+
|7605.0|   1.0|
|6075.0|   1.0|
|5814.0|   1.0|
|9360.0|   1.0|
|4626.0|   1.0|
+------+------+
only showing top 5 rows

## Read computed node embeddings data and concatenate it as `array<float>` 

In [28]:
# Location of the pre-computed node embeddings inside the current project's Resources folder.
embeddings_path = "hdfs:///Projects/{}/Resources/node_embeddings_features_2_2_16.csv".format(hdfs.project_name())

# The reader options must be passed to `csv()`. Previously they sat inside `str.format()`,
# which silently ignores unused keyword arguments, so they never reached Spark.
raw_embeddings_df = spark.read.csv(embeddings_path, inferSchema=True, header=False)

# Normalise every column (node id and embedding dimensions) to float, keeping Spark's
# auto-generated `_cN` column names.
node_embeddings_df = raw_embeddings_df.select(
    *(col(c).cast("float").alias(c) for c in raw_embeddings_df.columns)
)

In [29]:
# Check that every CSV column is now typed as float.
node_embeddings_df.printSchema()

root
 |-- _c0: float (nullable = true)
 |-- _c1: float (nullable = true)
 |-- _c2: float (nullable = true)
 |-- _c3: float (nullable = true)
 |-- _c4: float (nullable = true)
 |-- _c5: float (nullable = true)
 |-- _c6: float (nullable = true)
 |-- _c7: float (nullable = true)
 |-- _c8: float (nullable = true)
 |-- _c9: float (nullable = true)
 |-- _c10: float (nullable = true)
 |-- _c11: float (nullable = true)
 |-- _c12: float (nullable = true)
 |-- _c13: float (nullable = true)
 |-- _c14: float (nullable = true)
 |-- _c15: float (nullable = true)
 |-- _c16: float (nullable = true)

In [30]:
# Build the new column names in one pass: the first CSV column is the node id,
# every other column name is wrapped as "_<original>c".
feature_names = [
    'id' if position == 0 else "_" + original_name + "c"
    for position, original_name in enumerate(node_embeddings_df.columns)
]
# Apply the new names positionally to the embeddings frame.
node_embeddings_node_embeddings_df = node_embeddings_df.toDF(*feature_names)

In [31]:
# Keep only the embedding column names; 'id' is the join key, not a feature.
# A filtering rebind is used instead of in-place `list.remove('id')` so that re-running
# this cell is harmless (a second `remove` would raise ValueError).
feature_names = [name for name in feature_names if name != 'id']

In [32]:
# Attach the SAR label to every embedding row; nodes missing from either side are dropped (inner join on id).
emb_td = node_embeddings_node_embeddings_df.join(node_labels_df, on=['id'], how="inner")

In [33]:
# The join key is no longer needed once labels and embeddings are on the same row.
labelled_features_df = emb_td.drop("id")
# Pack the individual embedding columns into a single array column.
embedding_column = array(feature_names).alias("embedding")
# Keep only the label (renamed to "target") and the packed embedding, in that order.
emb_td = labelled_features_df.select(col("is_sar").alias("target"), embedding_column)

In [34]:
# Confirm the final layout: a float `target` plus an `embedding` array of floats.
emb_td.printSchema()

root
 |-- target: float (nullable = true)
 |-- embedding: array (nullable = false)
 |    |-- element: float (containsNull = true)

## Prepare training datasets for anomaly detection 
###### In the next notebook we are going to train a [GAN for anomaly detection](https://arxiv.org/pdf/1905.11034.pdf). During the training step we will provide only the features of accounts that have never been reported for money laundering behaviour. Previously reported accounts will be disclosed to the model only in the evaluation step.

In [36]:
# Rows whose target is 0, i.e. accounts not flagged as SAR.
non_sar_df = emb_td.filter(col("target") == 0)

In [37]:
# Rows whose target is 1, i.e. accounts flagged as SAR.
sar_df = emb_td.filter(col("target") == 1)

In [38]:
# Split the non-SAR rows into a training part and a held-out evaluation part.
# The fixed seed keeps the split reproducible across runs.
# NOTE(review): the weights do not sum to 1, and Spark normalises them, so this is roughly a
# 97.6% / 2.4% split rather than 80% / 20% - confirm that 0.02 (and not 0.2) is intended.
non_sar_train_df, non_sar_eval_df = non_sar_df.randomSplit(weights=[0.8, 0.02], seed=12345)

In [39]:
# Define a TFRecord training dataset (version 1) for the non-SAR training split,
# with "target" as the label column and statistics computation switched off.
non_sar_td = fs.create_training_dataset(
    name="gan_non_sar_training_df",
    version=1,
    description="non sar dataset for gan training",
    data_format="tfrecord",
    label=["target"],
    statistics_config=False,
)
# Write the training split; kept as the cell's last expression so its result is still echoed.
non_sar_td.save(non_sar_train_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fcd1ef15b50>

##### Now let's prepare the dataset for the evaluation step.

In [40]:
# Evaluation set = held-out non-SAR rows plus all SAR rows.
# `union` matches columns by position; both inputs are derived from `emb_td`.
eval_df = non_sar_eval_df.union(sar_df)

In [41]:
# Define a TFRecord training dataset (version 1) for the evaluation rows,
# with "target" as the label column and statistics computation switched off.
gan_eval_ds = fs.create_training_dataset(
    name="gan_eval_df",
    version=1,
    description="evaluation dataset for gan training",
    data_format="tfrecord",
    label=["target"],
    statistics_config=False,
)
# Write the evaluation rows; kept as the cell's last expression so its result is still echoed.
gan_eval_ds.save(eval_df)

<hsfs.training_dataset.TrainingDataset object at 0x7fcd1ef59310>