# Intro

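This notebook reads the Mythic bronze tables (loaded via the uploader folder) whose names start with `icrm_names_for_leads`, cleans them by formatting phone numbers and adding HubSpot email columns, and writes the resulting tables to silver.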


## Change History

<style>
  table {margin-left: 0 !important;}
</style>

| Date    | Author | Description |
| :-------- | :------- | :------- | 
|2025-07-01 | Mclain R |  Created Date|

# Code

## Imports

###### notebookutils
- **mssparkutils**: A utility module in Microsoft Fabric that provides functions for handling file operations, secrets, and other notebook-related tasks within the Spark environment.

###### pyspark.sql.functions
- **col**: A function used to reference a DataFrame column in PySpark expressions, typically for transformations or filtering.
- **F**: A common alias for importing PySpark SQL functions, allowing access to various built-in functions (e.g., F.lit(), F.when(), etc.) for DataFrame transformations.

###### python
- **re**: The regular expressions module used for pattern matching, text parsing, and string manipulation in Python.

In [None]:
from notebookutils import mssparkutils
from pyspark.sql.functions import col
import re
# import pyspark.sql.functions as F
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

## Define Parameters
- none

Note: the following is a parameter cell and will be interpreted by Pipelines as such.

## Reused Functions

In [None]:
# Function to format phone numbers for HubSpot
def format_phone_for_hubspot(phone_str):
    """
    Normalise a US phone number to the E.164 form HubSpot expects (+1XXXXXXXXXX).

    Only the digits of the input matter; punctuation, spaces and a leading
    '+' are ignored. The digit count then decides the result:

    - 10 digits                   -> '+1' is prepended
          '(813) 927-2957'  -> '+18139272957'
    - 11 digits starting with '1' -> '+' is prepended
          '+1 2538619993'   -> '+12538619993'
    - 12 digits starting with '11' (country code followed by a stray '1')
                                  -> the extra '1' is dropped
          '+1 15127912247'  -> '+15127912247'

    Returns None for None, empty input, or any other digit count.
    """
    if phone_str is None or phone_str == "":
        return None

    # Everything that is not a digit (spaces, '+', parentheses, dashes) is noise
    digits = re.sub(r'\D', '', str(phone_str))
    digit_count = len(digits)

    if digit_count == 10:
        # Bare national number: add the US country code
        return "+1" + digits

    if digit_count == 11 and digits[0] == '1':
        # Country code already present: only the '+' is missing
        return "+" + digits

    if digit_count == 12 and digits[:2] == '11':
        # Country code plus a duplicated leading '1': drop one of them
        return "+" + digits[1:]

    # Anything else cannot be mapped to a US E.164 number
    return None

# Wrap the phone formatter as a Spark UDF that yields a string column
format_phone_udf = F.udf(format_phone_for_hubspot, returnType=StringType())

In [None]:
#Function to format emails for HubSpot
def format_email_for_hubspot(email_str):
    """
    Map an @ally.com address onto @dealerbeyond.ally.com for HubSpot integration.

    The input is converted to a string and surrounding whitespace is removed.
    If the address ends with the '@ally.com' domain (compared
    case-insensitively, since e-mail domains are not case-sensitive), that
    trailing domain is swapped for '@dealerbeyond.ally.com'. Only the suffix
    is replaced; any earlier '@ally.com' text inside the address is untouched.
    All other addresses are returned stripped but otherwise unchanged.

    Args:
        email_str: The original e-mail address, or None.

    Returns:
        The HubSpot-ready address, or None when the input is None, empty,
        or consists only of whitespace.
    """
    if email_str is None or email_str == "":
        return None

    email_cleaned = str(email_str).strip()
    if not email_cleaned:
        # Whitespace-only input carries no address; treat it like an empty one
        return None

    old_domain = "@ally.com"
    new_domain = "@dealerbeyond.ally.com"

    # Compare only the tail of the address, ignoring case, and rebuild the
    # result by slicing so that nothing but the trailing domain is rewritten
    if email_cleaned[-len(old_domain):].lower() == old_domain:
        return email_cleaned[:-len(old_domain)] + new_domain

    return email_cleaned

# Wrap the email formatter as a Spark UDF that yields a string column
format_email_udf = F.udf(format_email_for_hubspot, returnType=StringType())

## Define Fields

- **workspace_name**: name of workspace


In [None]:
import sempy
import sempy.fabric as fabric

# Get the current workspace ID
workspace_id = fabric.get_workspace_id()
print(f"Workspace ID: {workspace_id}")

# Get the workspace name from the workspace ID.
# The name is used further down to build the OneLake (abfss://) paths
# for the bronze and silver lakehouses.
workspace_name = fabric.resolve_workspace_name(workspace_id)
print(f"Workspace Name: {workspace_name}")

## Process Data

In [None]:
# Base path containing the bronze table folders for the current workspace
base_path = f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/bronze_lakehouse.Lakehouse/Tables"

# List all items in the base path
items = mssparkutils.fs.ls(base_path)

# Keep only the table folders whose name starts with 'icrm_names_for_leads'
table_paths = [item.path for item in items if item.isDir and item.name.startswith('icrm_names_for_leads')]

# Phone number columns to format. Defined once, outside the loop, because the
# list is the same for every table.
all_phone_columns = [
    'f_i_dir_of_sales_mobile',
    'f_i_sr_dir_of_sales_mobile',
    'regional_growth_leader_1_mobile',
    'regional_growth_leader_2_mobile',
    'p_c_sr_director_of_sales_mobile',
    'p_c_director_of_sales_mobile',
    'p_c_account_manager_mobile',
]

# Source email column -> name of the HubSpot email column derived from it.
# The source column is left as-is; the HubSpot column is added next to it.
email_columns_mapping = {
    'f_i_dir_of_sales_email': 'f_i_dir_of_sales_hubspot_email',
    'f_i_sr_dir_of_sales_email': 'f_i_sr_dir_of_sales_hubspot_email',
    'regional_growth_leader_1_email': 'regional_growth_leader_1_hubspot_email',
    'regional_growth_leader_2_email': 'regional_growth_leader_2_hubspot_email',
    'p_c_sr_director_of_sales_email': 'p_c_sr_director_of_sales_hubspot_email',
    'p_c_director_of_sales_email': 'p_c_director_of_sales_hubspot_email',
    'p_c_account_manager_email': 'p_c_account_manager_hubspot_email',
}

# Process each table
for table_path in table_paths:
    # Drop any trailing '/' before taking the last path segment; otherwise the
    # table name would be empty and the silver write below would target the
    # Tables root instead of a single table.
    table_name = table_path.rstrip('/').split('/')[-1]

    try:
        # Read Delta Table from the folder
        df = spark.read.format("delta").load(table_path)
        display(f"Processing: {table_name}")

        # Only process columns that actually exist in the dataframe.
        # (Loop variable deliberately not called `col`, which is the name of
        # the function imported from pyspark.sql.functions.)
        existing_columns = set(df.columns)
        phone_columns = [name for name in all_phone_columns if name in existing_columns]
        email_columns = {
            orig: new
            for orig, new in email_columns_mapping.items()
            if orig in existing_columns
        }

        # Apply phone formatting in place to the identified columns
        for phone_col in phone_columns:
            df = df.withColumn(phone_col, format_phone_udf(F.col(phone_col)))

        # Create HubSpot email columns with @dealerbeyond.ally.com domain
        for original_email_col, hubspot_email_col in email_columns.items():
            df = df.withColumn(hubspot_email_col, format_email_udf(F.col(original_email_col)))

        # Preview the first rows of the transformed table
        display(df.head(10))

        # Write to the silver lakehouse, replacing the existing table and its schema
        delta_table_path = f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/silver_lakehouse.Lakehouse/Tables/{table_name}"
        df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(delta_table_path)

        display(f"Successfully processed {table_name} with phone formatting and HubSpot email columns")

    except Exception as e:
        # Best-effort per table: report the failure and continue with the next
        # table rather than aborting the whole run.
        display(f"Skipping {table_name}: {str(e)}")