In [None]:
import os
import uuid
from notebookutils import mssparkutils
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import StringType,BooleanType
# from graphframes import *

# Zero-argument UDF producing a random UUID string on every invocation.
# Spark assumes UDFs are deterministic by default and may eliminate or duplicate
# invocations while optimizing a plan; uuid4() is random, so the UDF is explicitly
# marked non-deterministic.
f_uuid = F.udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()
# Zero-argument UDF that always returns True.
f_bool = F.udf(lambda: True, BooleanType())


In [None]:
# Mount the bronze container at /mydata, passing the workspace linked service as the mount option.
mssparkutils.fs.mount(
    "abfss://bronze@ssattirajudemorgstg.dfs.core.windows.net",
    "/mydata",
    {"linkedService": "ssattiraju-synapse-01-WorkspaceDefaultStorage"},
)

# Paths under the mount are built from the current job id.
job_id = mssparkutils.env.getJobId()
bronze_mount_point = "synfs:/{}/mydata/archive.zip".format(job_id)

In [None]:
# Connection settings for the target Cosmos DB container (placeholder values).
# NOTE(review): the "xxxxxxx" values must be replaced before running; prefer loading the
# account key from a secret store instead of committing it to the notebook.
cosmosEndpoint = "https://xxxxxx.documents.azure.com:443/"
cosmosMasterKey = "xxxxxxx"
cosmosDatabaseName = "xxxxxxx"
cosmosContainerName = "xxxxxxx"

# Options passed to the cosmos.oltp writer.
cfg = dict(
    [
        ("spark.cosmos.accountEndpoint", cosmosEndpoint),
        ("spark.cosmos.accountKey", cosmosMasterKey),
        ("spark.cosmos.database", cosmosDatabaseName),
        ("spark.cosmos.container", cosmosContainerName),
    ]
)

# Session-level settings: Cosmos catalog registration first, then throughput control.
# NOTE(review): confirm the connector picks up throughput control from the session conf
# rather than requiring it in the writer options.
for conf_key, conf_value in (
    ("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog"),
    ("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint),
    ("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey),
    ("spark.cosmos.throughputControl.enabled", True),
    ("spark.cosmos.throughputControl.targetThroughput", 20000),
):
    spark.conf.set(conf_key, conf_value)

def write_to_cosmos_graph(df: DataFrame, data_type: str, save: bool = False):
    """Append ``df`` to Cosmos DB, optionally saving a Delta copy first.

    Args:
        df: DataFrame to write.
        data_type: Folder name used for the Delta copy under the mounted
            ``/mydata`` path (only used when ``save`` is true).
        save: When true, overwrite the Delta table (data and schema) at
            ``synfs:/{job_id}/mydata/{data_type}/`` before writing to Cosmos.

    The Cosmos write uses the ``cosmos.oltp`` format with the module-level
    ``cfg`` options and always appends.
    """
    if save:
        delta_writer = (
            df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
        )
        delta_writer.save(f"synfs:/{job_id}/mydata/{data_type}/")

    cosmos_writer = df.write.format("cosmos.oltp").options(**cfg).mode("APPEND")
    cosmos_writer.save()

In [None]:
# Alternative: read from existing paths
# edge_mount = f"synfs:/{job_id}/mydata/edges"
# vertices_mount = f"synfs:/{job_id}/mydata/vertices"

# Load the bronze path in Delta format and drop exact duplicate rows.
bronze_reader = spark.read.format("delta")
raw_data = bronze_reader.load(bronze_mount_point).distinct()

In [5]:
# Preview the first rows of the raw data to check the schema and values.
raw_data.show()

StatementMeta(sparkpool01, 18, 5, Finished, Available)

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
| 156| CASH_IN| 256203.2|C1519792972|1.795038527E7| 1.820658846E7|C1793677055|     263485.24|       7282.05|      0|             0|
| 156| CASH_IN|239326.27| C945424670|2.222391698E7| 2.246324325E7|C1853417494|     549020.76|     309694.49|      0|             0|
| 156|CASH_OUT|255774.06|C1941980439|      61334.0|           0.0|C2082577597|           0.0|     255774.06|      0|             0|
| 156| CASH_IN|101050.92| C487010990|       9987.0|     111037.92| C729448360|      38486.82|           0.0|      0|             0|
| 156|TRANSFER|416825.34|C1123901508|          0.0|           0.0| C74154567

In [None]:
# PySpark function to create vertices and edges dataframes, in a format accepted by the
# Cosmos SQL API, from a raw dataframe.
# TODO: Add vertex properties
def prepare_vertices_edge_df(
    df: DataFrame,
    source_col_name: str,
    dest_col_name: str,
    parition_key_col_name: str,
    cosmos_parition_name: str,
    edge_properties_col_name: list,
    vertex_properties_col_name: list,
    vertex_label: str = "account",
    edge_label_col_name: str = "type",
    sample: bool = False,
):
    """Derive vertex and edge DataFrames from a raw edge-list DataFrame.

    Args:
        df: Raw DataFrame with one row per edge.
        source_col_name: Column holding the source vertex id.
        dest_col_name: Column holding the destination vertex id.
        parition_key_col_name: Column of ``df`` whose value becomes the edge's
            partition key.
        cosmos_parition_name: Name of the partition key column in the output.
        edge_properties_col_name: Columns of ``df`` copied onto each edge.
        vertex_properties_col_name: Extra columns to select for each vertex.
            The vertex frame is built from the id values only, so columns of
            ``df`` are not available here yet (see TODO above); pass ``[]``.
        vertex_label: Label assigned to every vertex and used as the source
            and sink label on every edge.
        edge_label_col_name: Column of ``df`` whose value becomes the edge label.
        sample: When true, only the first 100 rows of ``df`` are used.

    Returns:
        ``(cosmos_vertices_df, cosmos_edges_df)``. Vertices have columns
        ``label``, ``id``, the partition column and any vertex properties.
        Edges have ``id``, ``label``, ``_sink``, ``_sinkLabel``,
        ``_sinkPartition``, ``_vertexId``, ``_vertexLabel``, ``_isEdge``, the
        partition column and the edge properties.
    """
    if sample:
        # NOTE(review): limit() without an ordering does not pin which rows are kept.
        df = df.limit(100)

    # Every account that appears as either source or destination becomes one vertex.
    nameOrig = df.select(source_col_name).withColumnRenamed(source_col_name, "id")
    nameDest = df.select(dest_col_name).withColumnRenamed(dest_col_name, "id")
    all_vertices = nameOrig.union(nameDest).distinct()
    cosmos_vertices_df = (
        # A vertex's partition key is its own id.
        all_vertices.withColumn(cosmos_parition_name, all_vertices["id"])
        .withColumn("label", F.lit(vertex_label))
        .select("label", "id", cosmos_parition_name, *vertex_properties_col_name)
        .distinct()
    )

    # Create dataframe with required columns
    # _sink => target account => dest_col_name
    # _sinkLabel => target label => vertex_label
    # _sinkPartition => target partition key => dest_col_name (vertices are partitioned by id)
    # _vertexId => source account => source_col_name
    # _vertexLabel => source label => vertex_label
    # cosmos_parition_name => partition key defined in Cosmos
    cosmos_edges_df = (
        # Spark's built-in uuid() is evaluated natively and is treated as
        # non-deterministic by the optimizer, unlike a default Python UDF.
        df.withColumn("id", F.expr("uuid()"))
        .withColumn(cosmos_parition_name, df[parition_key_col_name])
        .withColumn("label", df[edge_label_col_name])
        .withColumn("_sinkPartition", df[dest_col_name])
        .withColumn("_vertexId", df[source_col_name])
        .withColumn("_sink", df[dest_col_name])
        .withColumn("_sinkLabel", F.lit(vertex_label))
        .withColumn("_vertexLabel", F.lit(vertex_label))
        # Constant flag: a literal avoids a per-row Python UDF call.
        .withColumn("_isEdge", F.lit(True))
        .select(
            "id",
            "label",
            "_sink",
            "_sinkLabel",
            "_sinkPartition",
            "_vertexId",
            "_vertexLabel",
            "_isEdge",
            cosmos_parition_name,
            *edge_properties_col_name
        )
    )
    return cosmos_vertices_df, cosmos_edges_df


In [None]:
# Raw-data columns copied onto each edge.
edge_property_columns = ["amount", "oldbalanceOrg", "oldbalanceDest", "newbalanceDest"]

# Build vertices (v) and edges (e); edges are partitioned by the originating account.
v, e = prepare_vertices_edge_df(
    df=raw_data,
    source_col_name="nameOrig",
    dest_col_name="nameDest",
    parition_key_col_name="nameOrig",
    cosmos_parition_name="accountId",
    edge_properties_col_name=edge_property_columns,
    vertex_properties_col_name=[],
    sample=True,
)


In [8]:
# Only the last expression of a cell is displayed, so a bare v.count() runs a Spark job
# and discards the result; print it so the vertex count shows up in the output.
print(f"vertex count: {v.count()}")
v.show()

StatementMeta(sparkpool01, 18, 8, Finished, Available)

+-------+-----------+-----------+
|  label|         id|  accountId|
+-------+-----------+-----------+
|account|C1541840649|C1541840649|
|account| C422430677| C422430677|
|account| C473429831| C473429831|
|account| C729448360| C729448360|
|account|C1109346842|C1109346842|
|account|C1368076471|C1368076471|
|account|C1655405641|C1655405641|
|account|C1646359151|C1646359151|
|account|C1822627571|C1822627571|
|account|M2018675632|M2018675632|
|account|C1629481649|C1629481649|
|account| M776947998| M776947998|
|account| C759012282| C759012282|
|account|C1039543044|C1039543044|
|account|C1685490824|C1685490824|
|account| C443055530| C443055530|
|account|M1936404595|M1936404595|
|account|C1458319665|C1458319665|
|account|C1406712091|C1406712091|
|account|C1648846292|C1648846292|
+-------+-----------+-----------+
only showing top 20 rows

In [9]:
# Only the last expression of a cell is displayed, so a bare e.count() runs a Spark job
# and discards the result; print it so the edge count shows up in the output.
# NOTE(review): edge ids are generated randomly at evaluation time, so unless `e` is
# cached the ids shown here presumably differ from the ones written later.
print(f"edge count: {e.count()}")
e.show()

StatementMeta(sparkpool01, 18, 9, Finished, Available)

+--------------------+--------+-----------+----------+--------------+-----------+------------+-------+-----------+---------+-------------+--------------+--------------+
|                  id|   label|      _sink|_sinkLabel|_sinkPartition|  _vertexId|_vertexLabel|_isEdge|  accountId|   amount|oldbalanceOrg|oldbalanceDest|newbalanceDest|
+--------------------+--------+-----------+----------+--------------+-----------+------------+-------+-----------+---------+-------------+--------------+--------------+
|d4aa189e-0eeb-47c...| CASH_IN|C1793677055|   account|   C1793677055|C1519792972|     account|   true|C1519792972| 256203.2|1.795038527E7|     263485.24|       7282.05|
|9cdbcbf1-c5d2-45d...| CASH_IN|C1853417494|   account|   C1853417494| C945424670|     account|   true| C945424670|239326.27|2.222391698E7|     549020.76|     309694.49|
|e38e599a-fef5-428...|CASH_OUT|C2082577597|   account|   C2082577597|C1941980439|     account|   true|C1941980439|255774.06|      61334.0|           0.0|  

In [None]:
# Write vertices first, then edges; save=False skips the Delta copy.
write_to_cosmos_graph(df=v, data_type="vertices", save=False)
write_to_cosmos_graph(df=e, data_type="edges", save=False)