In [1]:
import os
import uuid
from notebookutils import mssparkutils
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, MapType, ArrayType,BooleanType

# Spark UDF that returns a freshly generated random UUID (as a string) on each call.
# It is flagged non-deterministic because Spark assumes UDFs are deterministic by
# default and may then eliminate duplicate invocations or invoke the function more
# often than it appears in the query, which is unsafe for values meant to be unique.
f_uuid = F.udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

# Spark UDF that always returns True (a constant boolean column when called as f_bool()).
f_bool = F.udf(lambda: True, BooleanType())


StatementMeta(sparkpool01, 6, 1, Finished, Available)

In [3]:
# Mount the "bronze" abfss container at /mydata using the workspace linked service.
mssparkutils.fs.mount(
    "abfss://bronze@xxxxxxxxxx.dfs.core.windows.net",
    "/mydata",
    {"linkedService": "xxxxxxx-synapse-01-WorkspaceDefaultStorage"},
)

# The mount is addressed through a synfs:/ URI that embeds the current job id.
job_id = mssparkutils.env.getJobId()
bronze_mount_point = f"synfs:/{job_id}/mydata/archive.zip"

StatementMeta(sparkpool01, 6, 3, Finished, Available)

In [12]:
# Connection settings for the target Cosmos DB account / database / container.
# NOTE(review): the account key is an inline literal here; consider sourcing it
# from a secret store instead of the notebook text.
cosmosEndpoint = "https://ebcbin5oofjcs.documents.azure.com:443/"
cosmosMasterKey = "xxxxxxxxxxx"
cosmosDatabaseName = "database01"
cosmosContainerName = "graph01"

# Options passed to the "cosmos.oltp" writer.
cfg = {
    "spark.cosmos.accountEndpoint": cosmosEndpoint,
    "spark.cosmos.accountKey": cosmosMasterKey,
    "spark.cosmos.database": cosmosDatabaseName,
    "spark.cosmos.container": cosmosContainerName,
}

# Register the Cosmos catalog on the Spark session so the Catalog API can be used.
spark.conf.set(
    "spark.sql.catalog.cosmosCatalog",
    "com.azure.cosmos.spark.CosmosCatalog",
)
spark.conf.set(
    "spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint",
    cosmosEndpoint,
)
spark.conf.set(
    "spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey",
    cosmosMasterKey,
)

def write_to_cosmos_graph(df: DataFrame, data_type: str, save: bool = True):
    """Append ``df`` to the configured Cosmos DB container, optionally snapshotting it first.

    Args:
        df: The DataFrame to write.
        data_type: Name used as the final path segment of the Delta snapshot
            (``synfs:/{job_id}/mydata/{data_type}``).
        save: When truthy, ``df`` is first written (mode ``overwrite``) as a
            Delta table at the snapshot path before the Cosmos write.

    Relies on the notebook-level ``job_id`` and ``cfg`` variables.
    """
    if save:
        # Snapshot the frame that was passed in. The previous code wrote the
        # global ``cosmos_vertices_df`` here, so every snapshot (including
        # "edges") contained the vertices instead of ``df``.
        df.write.format("delta").mode("overwrite").save(f"synfs:/{job_id}/mydata/{data_type}")

    (
        df.write
        .format("cosmos.oltp")
        .options(**cfg)
        .mode("APPEND")
        .save()
    )

StatementMeta(sparkpool01, 6, 12, Finished, Available)

In [4]:
# Load the Delta data at the mounted bronze path into a DataFrame.
raw_data = spark.read.load(bronze_mount_point, format="delta")

StatementMeta(sparkpool01, 6, 4, Finished, Available)

In [5]:
# Build the account vertices from the raw transactions. An account qualifies if it
# is either very active or involved in at least one large transaction.

# 1) Accounts that appear (as origin or destination) in more than 10 transactions.
nameOrig = raw_data.select("nameOrig")
nameDest = raw_data.select("nameDest")
busy_accounts = nameOrig.union(nameDest).withColumnRenamed("nameOrig", "id")
top_account_activity = (
    busy_accounts
    .groupBy("id")
    .count()
    .where(F.col("count") > 10)
    .select("id")
)

# 2) Accounts on either side of a transaction whose amount exceeds 500000.
top_account_amount = raw_data.where(F.col("amount") > 500000)
nameOrigA = top_account_amount.select("nameOrig")
nameDestA = top_account_amount.select("nameDest")
top_account_transact = (
    nameOrigA
    .union(nameDestA)
    .withColumnRenamed("nameOrig", "id")
    .distinct()
)

# Combine both sets and drop duplicates so each account id appears once.
vertices = top_account_activity.union(top_account_transact).distinct()

# Shape the vertices for the graph container: constant label plus id / accountId.
cosmos_vertices_df = vertices.select(
    F.lit("account").alias("label"),
    "id",
    F.col("id").alias("accountId"),
)

StatementMeta(sparkpool01, 6, 5, Finished, Available)

In [8]:
# Get transactions touching the above vertices (as source or destination)
transactions_nameOrig = raw_data.join(vertices, raw_data['nameOrig'] == vertices['id'])
transactions_nameDest = raw_data.join(vertices, raw_data['nameDest'] == vertices['id'])

# A transaction whose origin AND destination are both vertices matches both joins
# above. Plainly unioning them would emit that transaction twice, and each copy
# would receive a different random id, producing duplicate edges. Keep from the
# destination-side matches only the rows whose origin is NOT a vertex.
# The id column is re-aliased so the anti-join condition cannot be confused with
# the `id` column already carried by transactions_nameDest.
_origin_ids = vertices.select(F.col("id").alias("_originVertexId"))
_dest_only_matches = transactions_nameDest.join(
    _origin_ids,
    F.col("nameOrig") == F.col("_originVertexId"),
    "left_anti",
)

edges = transactions_nameOrig.union(_dest_only_matches).select(
    "type",
    "amount",
    "nameOrig",
    "nameDest",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
)

# Project the edge document. Columns are referenced by name on `edges` itself
# rather than through `raw_data[...]`, which depends on lineage resolution.
# `newbalanceOrig` is kept alongside the other three balance columns; it was
# selected into `edges` but then left out of the final projection.
cosmos_edges_df = edges.select(
    f_uuid().alias("id"),                        # random unique id per edge
    F.col("type").alias("label"),                # edge label = transaction type
    F.col("nameDest").alias("_sink"),
    F.lit("account").alias("_sinkLabel"),
    F.col("nameDest").alias("_sinkPartition"),
    F.col("nameOrig").alias("_vertexId"),
    F.lit("account").alias("_vertexLabel"),
    f_bool().alias("_isEdge"),
    F.col("nameOrig").alias("accountId"),
    "type",
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
)

StatementMeta(sparkpool01, 6, 8, Finished, Available)

In [9]:
# # Create test data set to verify vertices and edges are being created correctly
# cosmos_edges_df_test = cosmos_edges_df.limit(10)
# source = cosmos_edges_df_test.select('_vertexId')
# dest = cosmos_edges_df_test.select('_sink')
# tmp = source.union(dest).distinct().withColumnRenamed('_vertexId','id').withColumn('label',F.lit('account'))
# cosmos_vertices_df_test = tmp.withColumn('accountId',tmp['id']).select("label","id","accountId")
# print(f"Number of Accounts/Vertices: {cosmos_vertices_df_test.count()}, Number of transactions/Edges: {cosmos_edges_df_test.count()}")
# write_to_cosmos_graph(cosmos_vertices_df_test,"vertices",False)
# write_to_cosmos_graph(cosmos_edges_df_test,"edges",False)

StatementMeta(sparkpool01, 6, 9, Finished, Available)

Number of Accounts/Vertices: 20, Number of transactions/Edges: 10

In [14]:
# Write the vertices first, then the edges; each call also saves a Delta snapshot.
write_to_cosmos_graph(df=cosmos_vertices_df, data_type="vertices", save=True)
write_to_cosmos_graph(df=cosmos_edges_df, data_type="edges", save=True)

StatementMeta(sparkpool01, 6, 14, Finished, Available)