In [1]:
from simple_salesforce import Salesforce
import pandas as pd
from dotenv import load_dotenv
from datetime import date
import os

### Load .env

In [2]:
from dotenv import load_dotenv  
load_dotenv()

True

In [3]:
SF_USERNAME = os.getenv("SF_USERNAME")
SF_PASSWORD_01 = os.getenv("SF_PASSWORD_01")
SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")

In [4]:
# Connect to Salesforce
sf = Salesforce(username=SF_USERNAME, password=SF_PASSWORD_01, security_token=SF_SECURITY_TOKEN)

### Load and Cleanse Lead Table

In [5]:
leads_query = sf.query_all("""Select 
Id,
CreatedDate, 
LeadSource, 
Lead_status__c, 
ConvertedAccountId, 
ConvertedOpportunityId, 
ConvertedDate, 
CYCLE__c,
Intake_Year__c,
Online_Source__c,
WEB_SOURCE_GRP__c,
Market_Segment__c,
Level_1__c,
Programme_1__c,
LEVEL__c,
Taylor_s_Faculty__c,
Lead_Owner_Role__c,
Campus_Preference_1__c                     
from 
Lead""")

In [6]:
def flatten_sf_record(record, parent_key='', sep='_'):
    items = []
    for k, v in record.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if new_key in ['attributes_type', 'attributes_url']:
            continue  # skip these keys
        if isinstance(v, dict):
            items.extend(flatten_sf_record(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Flatten all Salesforce records
leads_flattened_record = [flatten_sf_record(rec) for rec in leads_query['records']]

In [7]:
leads_table = pd.DataFrame(leads_flattened_record)

In [8]:
#Standardize date and time formats for leads table
leads_table['date'] = leads_table['CreatedDate'].str.split('T').str[0]  
leads_table['time'] = leads_table['CreatedDate'].str.split('T').str[1].str.split('+').str[0] 

In [9]:
leads_table['CreatedDate'] = pd.to_datetime(leads_table['CreatedDate'])

In [10]:
leads_table['date'] = pd.to_datetime(leads_table['date'], errors='coerce')
leads_table['date'] = leads_table['date'].dt.strftime('%d/%m/%Y')
leads_table['cdt_leads_table'] = pd.to_datetime(leads_table['date'] + ' ' + leads_table['time'], format='%d/%m/%Y %H:%M:%S.%f')

In [11]:
leads_table = leads_table.rename(columns={'Id': 'leads_id', 
                              'LeadSource': 'leads_source',
                              'CreatedDate' : 'cdt_leads_original',
                              'Lead_status__c': 'leads_status',
                              'ConvertedAccountId': 'account_id',
                              'ConvertedOpportunityId': 'opp_id',
                              'CYCLE__c': 'leads_cycle',
                              'Intake_Year__c': 'leads_intake_year',
                              'Online_Source__c':'leads_online_source',
                              'WEB_SOURCE_GRP__c': 'leads_web_source_grp',
                              'Market_Segment__c': 'leads_market_segment', 
                              'Level_1__c':'leads_level_1',
                              'Programme_1__c':'leads_programme_preference',
                              'LEVEL__c':'leads_level',
                              'Taylor_s_Faculty__c':'leads_taylor_faculty',
                              'Lead_Owner_Role__c':'leads_owner_role',
                              'Campus_Preference_1__c':'leads_campus_preference'})

In [12]:
leads_final=leads_table[['leads_id','leads_source','leads_status','account_id','opp_id','leads_cycle','leads_intake_year','cdt_leads_original',
                         'leads_online_source','leads_web_source_grp','leads_market_segment','leads_level_1','leads_programme_preference','leads_level','leads_taylor_faculty',
                         'leads_owner_role','leads_campus_preference']]

In [13]:
import pandas as pd
import re

def standardize_campus_simple(campus):
    if pd.isna(campus) or not str(campus).strip():
        return "Not specified"
    
    campus_str = str(campus).strip()
    
    # Check for TU patterns first
    tu_patterns = [
        r'taylor\'?s universit',  # Taylor's University variations
        r'\bTU\b',               # Standalone TU
        r'^TU[^-]',              # TU at start
        r'universit.*TU',        # University followed by TU
        r'TU.*universit'         # TU followed by University
    ]
    
    for pattern in tu_patterns:
        if re.search(pattern, campus_str, re.IGNORECASE):
            return "TU"
    
    # Check for TC patterns (including TCSH and TCSJ)
    tc_patterns = [
        r'taylor\'?s college',   # Taylor's College variations
        r'\bTC(S[HJ])?\b',       # TC, TCSH, or TCSJ
        r'^TC[^-]',              # TC at start
        r'college.*TC',          # College followed by TC
        r'TC.*college',          # TC followed by College
        r'sri hartamas',         # Sri Hartamas (TCSH)
        r'subang jaya',          # Subang Jaya (TCSJ)
        r'hartamas',             # Hartamas
        r'subang'                # Subang
    ]
    
    for pattern in tc_patterns:
        if re.search(pattern, campus_str, re.IGNORECASE):
            return "TC"
    
    # Handle special cases
    if re.search(r'not specified|unspecified|not set', campus_str, re.IGNORECASE):
        return "Not specified"
    
    if re.search(r'unknown|unknow', campus_str, re.IGNORECASE):
        return "Unknown Campus"
    
    # Return original if no patterns matched
    return campus_str

In [14]:
# Apply the regex standardization
leads_final['leads_campus_preference'] = leads_final['leads_campus_preference'].apply(standardize_campus_simple)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  leads_final['leads_campus_preference'] = leads_final['leads_campus_preference'].apply(standardize_campus_simple)


In [15]:
import pandas as pd
from datetime import datetime

current_year = datetime.now().year
min_year = 2000
max_year = current_year + 5  # 5 years in future

In [16]:
def clean_year(value):
    try:
        # Try to convert to numeric
        year = pd.to_numeric(value, errors='coerce')
        
        # Check if it's within our valid range
        if not pd.isna(year) and (min_year <= year <= max_year):
            return int(year)
        return None  # Will become NaN in pandas
    
    except:
        return None  # Will become NaN for any non-convertible values

In [17]:
# Apply cleaning and filtering
leads_final['clean_intake_year'] = leads_final['leads_intake_year'].apply(clean_year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  leads_final['clean_intake_year'] = leads_final['leads_intake_year'].apply(clean_year)


In [18]:
leads_final = leads_final.drop(['leads_intake_year'], axis=1)
leads_final = leads_final.rename(columns={'clean_intake_year': 'leads_intake_year'})
leads_final['leads_intake_year'] = pd.to_datetime(leads_final['leads_intake_year'], format='%Y')
leads_final['leads_intake_year'] = leads_final['leads_intake_year'].dt.year

### Load and Cleanse Lead History Table

In [19]:
leads_history = sf.query_all("""SELECT Id, LeadId, OldValue, NewValue ,CreatedDate
FROM LeadHistory
WHERE field = 'Status'""")

In [20]:
def flatten_sf_record(record, parent_key='', sep='_'):
    items = []
    for k, v in record.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if new_key in ['attributes_type', 'attributes_url']:
            continue  # skip these keys
        if isinstance(v, dict):
            items.extend(flatten_sf_record(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Flatten all Salesforce records
leads_history_flattened_records = [flatten_sf_record(rec) for rec in leads_history['records']]

In [21]:
lead_history_table = pd.DataFrame(leads_history_flattened_records)

In [22]:
lead_history_table['date'] = lead_history_table['CreatedDate'].str.split('T').str[0]  
lead_history_table['time'] = lead_history_table['CreatedDate'].str.split('T').str[1].str.split('+').str[0]  
lead_history_table['CreatedDate'] = pd.to_datetime(lead_history_table['CreatedDate'])
lead_history_table['date'] = pd.to_datetime(lead_history_table['date'], errors='coerce')
lead_history_table['date'] = lead_history_table['date'].dt.strftime('%d/%m/%Y')
lead_history_table['cdt_leads_table'] = pd.to_datetime(lead_history_table['date'] + ' ' + lead_history_table['time'], format='%d/%m/%Y %H:%M:%S.%f')

In [23]:
lead_history_table = lead_history_table.rename(columns={
                                            'Id' : 'leads_history_id',
                                            'LeadId': 'leads_id', 
                                            'OldValue': 'old_value',
                                            'NewValue': 'new_value'})

In [24]:
lead_history_table = lead_history_table.rename(columns={'CreatedDate': 'created_date'})
lead_history_table = lead_history_table.rename(columns={'created_date': 'cdt_leads_original'})
lead_history_table = lead_history_table.drop(['date','time'], axis=1)

In [25]:
leads_history_table_v2 = lead_history_table[['leads_id','old_value','new_value','cdt_leads_original']]
leads_history_table_v2 = lead_history_table.rename(columns={'cdt_leads_original': 'cdt_leadshistory_original'})

In [26]:
leads_final_v2 = leads_final[['leads_id','leads_source','leads_status','account_id','opp_id','cdt_leads_original','leads_cycle','leads_intake_year',
                              'leads_online_source','leads_web_source_grp','leads_market_segment','leads_level_1','leads_programme_preference','leads_level',
                              'leads_taylor_faculty','leads_owner_role','leads_campus_preference']]

# Merge leads history with leads table on leads_id to find id that do not exist in leads history

In [27]:

lh_ld_join = leads_history_table_v2.merge(leads_final_v2, how='left', on='leads_id')

In [28]:
leads_in_final = set(leads_final_v2['leads_id'])
leads_in_history = set(leads_history_table_v2['leads_id'])
missing_in_history = list(leads_in_final - leads_in_history)

In [29]:
missing_records = leads_final_v2[leads_final_v2['leads_id'].isin(missing_in_history)]

In [30]:
missing_records = missing_records.rename(columns={'cdt_leads_original': 'validfrom'})

In [31]:
import pandas as pd

missing_records['validto'] = pd.NaT
missing_records['is_current'] = True

### SCD Records for leads table

In [33]:
scd_records = []

In [34]:
for leads_id, group in lh_ld_join.groupby('leads_id'):
    group = group.reset_index(drop=True)
    
    # Handle the first record for each lead
    first_row = group.iloc[0]
    
    # Create a record for the initial OldValue
    initial_old_value = {
        'leads_id': leads_id,
        'leads_status': first_row['old_value'],
        'validfrom': first_row['cdt_leads_original'], # The start date is the createddate in leads table
        'validto': first_row['cdt_leadshistory_original'],
        'leads_source' : first_row['leads_source'],
        'leads_cycle' : first_row['leads_cycle'],
        'leads_intake_year' : first_row['leads_intake_year'],
        'leads_online_source' : first_row['leads_online_source'],
        'leads_web_source_grp' : first_row['leads_web_source_grp'],
        'leads_market_segment' : first_row['leads_market_segment'],
        'leads_level_1' : first_row['leads_level_1'],
        'leads_programme_1' : first_row['leads_programme_preference'],
        'leads_level' : first_row['leads_level'],
        'leads_taylor_faculty' : first_row['leads_taylor_faculty'],
        'leads_owner_role' : first_row['leads_owner_role'],
        'leads_campus_preference' : first_row['leads_campus_preference'],
        'is_current': False
    }
    scd_records.append(initial_old_value)
    
    # Now, iterate through the records to capture the changes
    for i in range(len(group)):
        row = group.iloc[i]
        validfrom = row['cdt_leadshistory_original']
        
        # Determine the end date for the current status.
        # It's the start date of the next record, or None for the final record.
        if i + 1 < len(group):
            validto = group.iloc[i+1]['cdt_leadshistory_original']
        else:
            validto = None
        
        is_current = (i == len(group) - 1)
        
        # Create the record for the new status
        new_status_record = {
            'leads_id': leads_id,
            'leads_status': row['new_value'],
            'validfrom': validfrom,
            'validto': validto,
            'leads_source' : row['leads_source'],
            'leads_cycle' : row['leads_cycle'],
            'leads_intake_year' : row['leads_intake_year'],
            'leads_online_source' : row['leads_online_source'],
            'leads_web_source_grp' : row['leads_web_source_grp'],
            'leads_market_segment' : row['leads_market_segment'],
            'leads_level_1' : row['leads_level_1'],
            'leads_programme_1' : row['leads_programme_preference'],
            'leads_level' : row['leads_level'],
            'leads_taylor_faculty' : row['leads_taylor_faculty'],
            'leads_owner_role' : first_row['leads_owner_role'],
            'leads_campus_preference' : first_row['leads_campus_preference'],   
            'is_current': is_current
        }
        scd_records.append(new_status_record)

In [35]:
scd_df = pd.DataFrame(scd_records)

### Appending missing records to the scd table to combine the new data generated in leads table

In [36]:
scd_df_final = pd.concat([scd_df, missing_records], ignore_index=True)

In [37]:
scd_df_final = scd_df_final.drop_duplicates(keep=False)

In [38]:
scd_df_final['validto'] = scd_df_final['validto'].replace({pd.NaT: pd.Timestamp('9999-12-31 00:00:00+00:00')})

In [39]:
scd_df_final['validfromyear'] = pd.to_datetime(scd_df_final['validfrom']).dt.year
scd_df_final['validfrommonth'] = pd.to_datetime(scd_df_final['validfrom']).dt.month
scd_df_final['validfromday'] = pd.to_datetime(scd_df_final['validfrom']).dt.day

In [42]:
import os
from urllib.parse import quote
from sqlalchemy import create_engine
from dotenv import load_dotenv
 
def marcommdb_connection():
    # Load environment variables
    load_dotenv(override=True) 
 
    # Get credentials from environment variables
    username = os.getenv("PG_USERNAME")
    password = os.getenv("PG_PASSWORD")
    host = os.getenv("PG_HOST")
    port = os.getenv("PG_PORT")
    database = os.getenv("PG_DATABASE_EXPORT")
 
    # Ensure all credentials are available
    if not all([username, password, host, port, database]):
        raise ValueError("Missing one or more PostgreSQL environment variables!")
 
    # Encode password to handle special characters
    encoded_password = quote(password, safe="") if password else ""
 
    # Construct PostgreSQL connection string
    DATABASE_URL = f"postgresql+psycopg2://{username}:{encoded_password}@{host}:{port}/{database}"
 
    # Create and return SQLAlchemy engine
    return create_engine(DATABASE_URL)

In [43]:
engine= marcommdb_connection()

### Export to Marcomm DB

In [44]:
scd_df_final.to_sql(
    'leads_history_staging',
    engine,
    schema='staging',
    if_exists='replace',
    index=False
)

346