# Intro

This notebook reads the bronze tables from Mythic via the uploader folder, cleans them, and writes the cleaned tables to silver.


## Change History

<style>
  table {margin-left: 0 !important;}
</style>

| Date    | Author | Description |
| :-------- | :------- | :------- | 
|2024-10-14 | Mclain R |  Created Date|
|2025-01-16 | Mclain R |  Changes to reflect new process|
|2025-01-23 | Mclain R |  Use email instead of contact_id|
|2025-02-26 | Mclain R |  Adjust code so dcrm file contains contact_role field|

# Code

## Imports

###### notebookutils
- **mssparkutils**: A utility module in Microsoft Fabric that provides functions for handling file operations, secrets, and other notebook-related tasks within the Spark environment.

###### pyspark.sql.functions
- **col**: A function used to reference a DataFrame column in PySpark expressions, typically for transformations or filtering.
- **F**: A common alias for importing PySpark SQL functions, allowing access to various built-in functions (e.g., F.lit(), F.when(), etc.) for DataFrame transformations.

###### python
- **re**: The regular expressions module used for pattern matching, text parsing, and string manipulation in Python.

In [None]:
from notebookutils import mssparkutils
from pyspark.sql.functions import col
import re
# import pyspark.sql.functions as F
from pyspark.sql import functions as F

## Define Parameters
- none

Note: the following is a parameter cell and will be interpreted by Pipelines as such.

## Reused Functions
- none

## Define Fields

- **workspace_name**: name of the current workspace, resolved from its workspace ID and used to build the OneLake paths


In [None]:
import sempy
import sempy.fabric as fabric

# Get the current workspace ID
# (printed below so the value is visible in the cell output)
workspace_id = fabric.get_workspace_id()
print(f"Workspace ID: {workspace_id}")

# Get the workspace name from the workspace ID
# workspace_name is reused by the later cells to build the
# abfss://{workspace_name}@onelake... paths for the bronze and silver lakehouses.
workspace_name = fabric.resolve_workspace_name(workspace_id)
print(f"Workspace Name: {workspace_name}")

## Process Data

In [None]:
# Base path containing the bronze Delta table folders
base_path = f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/bronze_lakehouse.Lakehouse/Tables"

# Ordered (job_title, source values) groups shared by the dcrm and icrm tables.
# The order is the order in which the F.when branches are evaluated.
# NOTE: "No Longer Employeed" (sic) is an emitted value; the spelling is kept
# as-is because downstream consumers may match on it.
_JOB_TITLE_GROUPS = [
    (
        "Owner/Principal",
        ["CEO", "CFO", "CFO/Controller", "Comptroller", "Controller", "Dealer Principal", "Owner", "President", "Owner/Principal"],
    ),
    (
        "General Manager",
        ["Ally Systems Authorizer", "Assist Service Manager", "Branch Manager", "Business Manager", "Fixed Operations Manager", "Garage Insurance Decision Maker", "General Manager", "General Sales Manager", "GM Warranty Manager", "Inventory Manager", "Manager", "Office Manager", "Operations Director", "Treasurer", "Vice President"],
    ),
    (
        "Finance Manager",
        ["Credit Admin Contact", "DS-Decision Maker", "DS-Gate Keeper", "F&I Manager", "Finance Director", "GAP/Aftermarket Cancellation", "Used Car F&I Manager", "Warranty Administrator", "Finance Manager"],
    ),
    (
        "Sales Manager",
        ["Fleet/Commercial Admin", "Fleet/Commercial Manager", "Fleet/Commercial Sales", "New Car Manager", "Sales Manager", "Sales Staff", "SmartAuction Buyer", "SmartAuction Seller", "SmartAuction User", "Used Car Manager"],
    ),
    (
        "Other",
        ["Other"],
    ),
    (
        "No Longer Employeed",
        ["No Longer Employed", "No Longer Employeed"],
    ),
    (
        "Never Targeted",
        ["CIO", "Claims IT Manager", "Clearlane Leads", "Compliance Officer", "COVID-Communications", "Held Offering Contact", "Human Resources Manager", "Internet Manager", "Marketing Manager", "Office Staff", "Parts Manager", "Service Advisor", "Service Director", "Service Manager", "Shop Foreman", "Targeted Messaging Contact", "Title Clerk", "Never Targeted"],
    ),
]


def _add_job_title(df, source_col):
    """Return df with a 'job_title' column derived from source_col.

    source_col holds one or more values separated by "; ". Each value becomes
    its own row (so one input row can yield several output rows), and each row
    is mapped to a job title via _JOB_TITLE_GROUPS; values that match no group
    get "Unknown". The temporary exploded column is dropped before returning.

    Args:
        df: Spark DataFrame that contains source_col.
        source_col: Name of the string column to split and map.

    Returns:
        The DataFrame with one row per split value and a 'job_title' column.
    """
    single_col = f"{source_col}_single"

    # Split on "; " and explode so every individual value gets its own row.
    # NOTE(review): explode emits no row when the array is null, so rows whose
    # source_col is null are dropped here (same as before this refactor).
    # Switch to F.explode_outer if those rows should be kept - confirm intent.
    df = df.withColumn(single_col, F.explode(F.split(F.col(source_col), "; ")))

    # Build the chained when(...).when(...) expression from the mapping table.
    single_value = F.col(single_col)
    job_title_expr = None
    for job_title, source_values in _JOB_TITLE_GROUPS:
        matches = single_value.isin(source_values)
        if job_title_expr is None:
            job_title_expr = F.when(matches, job_title)
        else:
            job_title_expr = job_title_expr.when(matches, job_title)

    df = df.withColumn("job_title", job_title_expr.otherwise("Unknown"))

    # Drop the temporary exploded column after use
    return df.drop(single_col)


# List all items in the base path
items = mssparkutils.fs.ls(base_path)

# Get table paths for the nucleus icrm/dcrm contacts tables
table_paths = [
    item.path
    for item in items
    if item.isDir and item.name.startswith(('nucleus__icrm_contacts', 'nucleus__dcrm_contacts'))
]

# Process each table
for table_path in table_paths:
    # The last path segment is the table name; it is reused for the silver path
    table_name = table_path.split('/')[-1]

    # Read Delta Tables from the folder
    df = spark.read.format("delta").load(table_path)

    display(table_name)

    # dcrm carries the roles in 'contact_role'; every other matched table is
    # treated as icrm and uses 'contact_title'.
    # NOTE(review): the filter above matches by prefix but this check is an
    # exact match, so a table such as 'nucleus__dcrm_contacts_x' would take the
    # 'contact_title' path - confirm that is intended.
    if table_name == 'nucleus__dcrm_contacts':
        df = _add_job_title(df, "contact_role")
    else:
        df = _add_job_title(df, "contact_title")

    display(df.head(10))

    # Write to Delta table
    delta_table_path = f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/silver_lakehouse.Lakehouse/Tables/" + table_name
    df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(delta_table_path)

In [None]:
# Merge the silver dcrm and icrm contact tables into a single contact list.
# Each side keeps only rows that have a job_title and an email containing "@";
# the UNION then stacks the two result sets.
combined_contacts_sql = """
SELECT DISTINCT
    job_title,
    account_id,
    uniqueaccountid,
    email
FROM silver_lakehouse.nucleus__dcrm_contacts
WHERE job_title is not null
    AND email like '%@%'

UNION

SELECT DISTINCT
    job_title,
    account_id,
    uniqueaccountid,
    email
FROM silver_lakehouse.nucleus__icrm_contacts
WHERE job_title is not null
    AND email like '%@%'
"""
df_combined = spark.sql(combined_contacts_sql)

# Save the combined result as the nucleus__combined_contacts Delta table in
# the silver lakehouse, replacing both the data and the schema.
delta_table_path = f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/silver_lakehouse.Lakehouse/Tables/nucleus__combined_contacts"
(
    df_combined.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_table_path)
)