## Data Generator 
This notebook generates synthetic data in a Databricks notebook. It mimics work completed while serving in a position and replaces all PII and confidential data with dummy data. The notebook will not run unless it is connected to a real Databricks account, storage account, and Neo4j instance, and the corresponding code is updated to reflect those connections.

In [0]:
# Configuring access credentials and endpoints for interacting with Azure services and Neo4j, within Databricks. 

# Authority URL (a plain string literal; it contains no placeholders, so no f-string is needed).
AUTHORITY = "https://login.microsoftonline.us/dummyaccount.onmicrosoft.com"
# Dummy application / client identifiers.
NEO4J_APPLICATION_ID = "0000000-0000-0000-0000-0000000"
SP_CLIENT_ID = "0000000-0000-0000-0000-0000000"
# The client secret is fetched from a Databricks secret scope instead of being hard-coded.
SP_CLIENT_SECRET = dbutils.secrets.get(scope="NeoSecretScope", key="ABC-DEVTEST-ABC-DEF")

# Storage endpoint and container name (dummy values).
STORAGE_ACCOUNT_URL = "https://mystorageaccount.dfs.core.windows.net/my-container"
CONTAINER = "published"

In [0]:
# Import necessary libraries 

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import datetime
import numpy as np  
import os
import random
import string
from delta import configure_spark_with_delta_pip
from azure.storage.blob import BlobServiceClient

In [0]:
# Initialize a Spark session named SyntheticDataGenerator with Delta Lake support. The Delta configuration helper (configure_spark_with_delta_pip) is used so that Delta tables can be read and written and so that the session also works outside of Databricks.

# Start from a named builder, then register the Delta SQL extension and the
# Delta-aware catalog one setting at a time.
builder = SparkSession.builder.appName("SyntheticDataGenerator")
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Pass the builder through the Delta helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [0]:
# Root URI that the table paths are built from.
# STORAGE_ACCOUNT_URL carries an "https://" scheme; embedding it verbatim would give
# "abfss://<container>@https://...", which is not a valid ABFS URI. Keep only the part
# after the scheme and drop any trailing slash so "/<table>" can be appended cleanly.
_storage_endpoint = STORAGE_ACCOUNT_URL.split("://", 1)[-1].rstrip("/")
BASE_PATH = f"abfss://{CONTAINER}@{_storage_endpoint}"

# Record Counts
N_ABVR = 15
N_ORG_ID = 200
N_TASK_ID = 1000
N_COMPLETE_ID = 1000

# ID ranges
ID_RANGES = {
    'ORG_COMPL_ID_START': 200,
    'TASK_ID_START': 600,
}

# Module-level names read by the generator functions. They are derived from
# ID_RANGES so the two definitions cannot drift apart.
ORG_COMPL_ID_START = ID_RANGES['ORG_COMPL_ID_START']
TASK_ID_START = ID_RANGES['TASK_ID_START']

# Value lists
FILTER_ABC = ['0', '1', '2']

# Defined IDs
def generate_abvr_id(n, seed=42):
    """Return ``n`` IDs made of the prefix ``'ABC-005'`` plus one random uppercase letter.

    The module-level ``random`` generator is re-seeded with ``seed`` on every call,
    so the same ``(n, seed)`` pair always produces the same list. As a side effect,
    the global random state is reset for whatever code runs afterwards.
    """
    random.seed(seed)
    abvr_ids = []
    for _ in range(n):
        # choices(..., k=1) is kept (rather than choice) so the RNG draws are unchanged.
        letter = random.choices(string.ascii_uppercase, k=1)[0]
        abvr_ids.append('ABC-005' + letter)
    return abvr_ids

def generate_org_id(n, seed=42):
    """Return ``n`` consecutive organization IDs as strings, starting at ``ORG_COMPL_ID_START``.

    The IDs themselves do not use randomness, but the global ``random`` generator
    is still re-seeded with ``seed`` as a side effect of the call.
    """
    random.seed(seed)
    first_id = ORG_COMPL_ID_START
    return list(map(str, range(first_id, first_id + n)))

def generate_task_ids(n, seed=42):
    """Return ``n`` consecutive task IDs as strings, starting at ``TASK_ID_START``.

    No random values are drawn, but the global ``random`` generator is re-seeded
    with ``seed`` as a side effect of the call.
    """
    random.seed(seed)
    stop = TASK_ID_START + n
    return [str(task_number) for task_number in range(TASK_ID_START, stop)]

def generate_task_dot_num(n, seed):
    """Return ``n`` dotted task numbers such as ``'3.11.7'``.

    Each entry joins between 2 and 5 distinct integers from 1 to 17 with dots.
    The global ``random`` generator is re-seeded with ``seed`` first, so the
    output is reproducible for a given ``(n, seed)`` pair.
    """
    random.seed(seed)
    dotted_numbers = []
    for _ in range(n):
        # Draw the segment count first, then the segments, to keep the RNG order fixed.
        segment_count = random.randint(2, 5)
        segments = random.sample(range(1, 18), segment_count)
        dotted_numbers.append('.'.join(str(segment) for segment in segments))
    return dotted_numbers


In [0]:
# 1. Generate Organization Abbreviation data from flatfile
def generate_abvr_data(n_rows=N_ABVR):
    """Build the organization-abbreviation table as a pandas DataFrame.

    Args:
        n_rows: Number of rows to generate. Defaults to ``N_ABVR``.

    Returns:
        DataFrame with two columns: ``ABVR_ABVR`` (three random uppercase
        letters per row) and ``ORG_ABVR_ID`` (values from ``generate_abvr_id``).
    """
    # Both columns are sized from n_rows so a caller-supplied row count is respected.
    # The abbreviations are drawn before generate_abvr_id() re-seeds the global RNG,
    # so they depend on whatever random state exists when this function is called.
    data = {
        'ABVR_ABVR': [''.join(random.choices(string.ascii_uppercase, k=3)) for _ in range(n_rows)],
        'ORG_ABVR_ID': generate_abvr_id(n_rows),
    }
    return pd.DataFrame(data)

# Build the abbreviation frame and echo it to the cell output
df = generate_abvr_data()
print(df)

# Convert to a Spark DataFrame and overwrite the Delta table at the target path
spark_df = spark.createDataFrame(df)
abvr_path = f"{BASE_PATH}/abvr"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .save(abvr_path)
)

# Read the table back to confirm the write
print("Org Abbreviation Data Generated:")
display(spark.read.format("delta").load(abvr_path))

In [0]:
# 2. Generate Division data from flatfile
def generate_org_data(n_rows=N_ORG_ID):
    """Build the organization table as a pandas DataFrame.

    Columns: ``ID_ORG`` (from ``generate_org_id``), ``ORG_ABVR_ID`` (from
    ``generate_abvr_id``) and ``FILTER_ABC`` (a random pick from the
    ``FILTER_ABC`` value list for each row).
    """
    # Column order matters for the RNG: both ID helpers re-seed `random`, and the
    # FILTER_ABC picks continue from the state left behind by generate_abvr_id().
    org_id_column = generate_org_id(n_rows)
    abvr_id_column = generate_abvr_id(n_rows)
    filter_column = []
    for _ in range(n_rows):
        filter_column.append(random.choice(FILTER_ABC))

    return pd.DataFrame({
        'ID_ORG': org_id_column,
        'ORG_ABVR_ID': abvr_id_column,
        'FILTER_ABC': filter_column,
    })

# Build the organization frame and echo it to the cell output
df = generate_org_data()
print(df)

# Convert to a Spark DataFrame and overwrite the Delta table at the target path
spark_df = spark.createDataFrame(df)
org_path = f"{BASE_PATH}/org"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(org_path)
)

# Read the table back to confirm the write
print("Division Data Generated:")
display(spark.read.format("delta").load(org_path))

In [0]:
# 3. Generate Task Completion data, mimicking data from a SharePoint list
def generate_complete_data(n_rows=N_COMPLETE_ID):
    """Build the task-completion table as a pandas DataFrame.

    Args:
        n_rows: Number of completion rows to generate. Defaults to ``N_COMPLETE_ID``.

    Returns:
        DataFrame with columns ``Organizations`` (a comma-separated sample of
        organization names), ``task_title_choice`` (a comma-separated sample of
        task IDs) and ``ID`` (consecutive integers from ``ORG_COMPL_ID_START``).
    """
    compl_ids = list(range(ORG_COMPL_ID_START, ORG_COMPL_ID_START + n_rows))
    org_ids = generate_org_id(n_rows, seed=42)
    org_names = [f"Org Name for Org {org_id}" for org_id in org_ids]
    task_ids = generate_task_ids(n_rows)

    # random.sample raises ValueError when asked for more items than the pool
    # holds, so cap the pick count at the pool size. For n_rows >= 5 the bound
    # is still 5, which leaves the generated values unchanged.
    max_picks = min(5, n_rows)

    data = {
        'Organizations': [', '.join(random.sample(org_names, random.randint(1, max_picks))) for _ in range(n_rows)],
        'task_title_choice': [', '.join(random.sample(task_ids, random.randint(1, max_picks))) for _ in range(n_rows)],
        'ID': compl_ids
    }
    return pd.DataFrame(data)

# Build the completion frame and convert it to a Spark DataFrame
df = generate_complete_data()
spark_df = spark.createDataFrame(df)

# Overwrite the Delta table at the target path
completion_path = f"{BASE_PATH}/completion"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(completion_path)
)

# Read the table back to confirm the write
print("Task Completion Data Generated:")
display(spark.read.format("delta").load(completion_path))

In [0]:
# 4. Generate Task Information data, mimicking data from a SharePoint list
def generate_task_data(n_rows=N_TASK_ID):
    """Build the task-information table as a pandas DataFrame.

    Args:
        n_rows: Number of task rows to generate. Defaults to ``N_TASK_ID``.

    Returns:
        DataFrame with columns ``ID`` (task IDs from ``generate_task_ids``),
        ``task_ID`` (dotted numbers from ``generate_task_dot_num`` with seed 40)
        and ``task_title`` (a title string combining both values for the row).
    """
    # Generate each sequence once and reuse it, so the title column is guaranteed
    # to match the ID columns and no work is repeated.
    task_dot_nums = generate_task_dot_num(n_rows, seed=40)
    # generate_task_ids() output does not depend on its seed. It is called last,
    # with the default seed, so the global `random` state on return is the one
    # that call sets.
    task_ids = generate_task_ids(n_rows)

    data = {
        'ID': task_ids,
        'task_ID': task_dot_nums,
        'task_title': [
            f"{task} Detailed DEF task Title for task IDs {task_id}"
            for task, task_id in zip(task_dot_nums, task_ids)
        ],
    }
    return pd.DataFrame(data)

# Build the task frame and convert it to a Spark DataFrame
df = generate_task_data()
spark_df = spark.createDataFrame(df)

# Overwrite the Delta table at the target path
task_path = f"{BASE_PATH}/task"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .save(task_path)
)

# Read the table back to confirm the write
print("Task Information Data Generated:")
display(spark.read.format("delta").load(task_path))

In [0]:
# 5. Generate Crosswalk Task data from flatfile
def generate_crosswalk_task_data(n_rows=N_TASK_ID):
    """Build the task crosswalk table as a pandas DataFrame.

    Two sets of dotted task numbers are generated (seed 40 and seed 43). Each
    set is paired with a name column that embeds the corresponding number.
    """
    p_hier_ids = generate_task_dot_num(n_rows, seed=40)
    hier_ids = generate_task_dot_num(n_rows, seed=43)

    p_hier_names = []
    for dotted in p_hier_ids:
        p_hier_names.append(f"Detailed DEF task Title for task IDs {dotted}")

    hier_names = []
    for dotted in hier_ids:
        hier_names.append(f"Detailed OPTION task Title for task IDs {dotted}")

    return pd.DataFrame({
        'task_p_hier_id': p_hier_ids,
        'task_p_hier_nm': p_hier_names,
        'task_hier_id': hier_ids,
        'task_hier_nm': hier_names,
    })

# Build the crosswalk frame and convert it to a Spark DataFrame
df = generate_crosswalk_task_data()
spark_df = spark.createDataFrame(df)

# Overwrite the Delta table at the target path
crosswalk_task_path = f"{BASE_PATH}/crosswalk_task"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(crosswalk_task_path)
)

# Read the table back to confirm the write
print("Crosswalk Task Data Generated:")
display(spark.read.format("delta").load(crosswalk_task_path))