In [None]:
from simple_salesforce import Salesforce
import pandas as pd
from dotenv import load_dotenv
from datetime import date
import os
from dotenv import load_dotenv  
load_dotenv()

### Load .env

In [3]:
SF_USERNAME = os.getenv("SF_USERNAME")
SF_PASSWORD_01 = os.getenv("SF_PASSWORD_01")
SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")

In [4]:
# Connect to Salesforce
sf = Salesforce(username=SF_USERNAME, password=SF_PASSWORD_01, security_token=SF_SECURITY_TOKEN)

### Extract Lead Table

In [24]:
leads_query = sf.query_all("""Select 
Id,
CreatedDate, 
LeadSource, 
Lead_status__c, 
ConvertedAccountId, 
ConvertedOpportunityId, 
ConvertedDate, 
CYCLE__c,
Intake_Year__c,
Online_Source__c,
WEB_SOURCE_GRP__c,
Market_Segment__c,
Level_1__c,
Programme_1__c,
LEVEL__c,
Taylor_s_Faculty__c,
Lead_Owner_Role__c,
Campus_Preference_1__c                     
from 
Lead""")

In [25]:
def flatten_sf_record(record, parent_key='', sep='_'):
    items = []
    for k, v in record.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if new_key in ['attributes_type', 'attributes_url']:
            continue  # skip these keys
        if isinstance(v, dict):
            items.extend(flatten_sf_record(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Flatten all Salesforce records
leads_flattened_record = [flatten_sf_record(rec) for rec in leads_query['records']]

In [26]:
leads_table = pd.DataFrame(leads_flattened_record)

In [27]:
#Standardize date and time formats for leads table
leads_table['date'] = leads_table['CreatedDate'].str.split('T').str[0]  
leads_table['time'] = leads_table['CreatedDate'].str.split('T').str[1].str.split('+').str[0] 

In [28]:
leads_table['CreatedDate'] = pd.to_datetime(leads_table['CreatedDate'])

In [29]:
leads_table['date'] = pd.to_datetime(leads_table['date'], errors='coerce')
leads_table['date'] = leads_table['date'].dt.strftime('%d/%m/%Y')
leads_table['cdt_leads_table'] = pd.to_datetime(leads_table['date'] + ' ' + leads_table['time'], format='%d/%m/%Y %H:%M:%S.%f')

In [30]:
leads_table = leads_table.rename(columns={'Id': 'leads_id', 
                              'LeadSource': 'leads_source',
                              'CreatedDate' : 'cdt_leads_original',
                              'Lead_status__c': 'leads_status',
                              'ConvertedAccountId': 'account_id',
                              'ConvertedOpportunityId': 'opp_id',
                              'CYCLE__c': 'leads_cycle',
                              'Intake_Year__c': 'leads_intake_year',
                              'Online_Source__c':'leads_online_source',
                              'WEB_SOURCE_GRP__c': 'leads_web_source_grp',
                              'Market_Segment__c': 'leads_market_segment', 
                              'Level_1__c':'leads_level_1',
                              'Programme_1__c':'leads_programme_preference',
                              'LEVEL__c':'leads_level',
                              'Taylor_s_Faculty__c':'leads_taylor_faculty',
                              'Lead_Owner_Role__c':'leads_owner_role',
                              'Campus_Preference_1__c':'leads_campus_preference'})

In [33]:
leads_final=leads_table[['leads_id','leads_source','leads_status','account_id','opp_id','leads_cycle','leads_intake_year','cdt_leads_original',
                         'leads_online_source','leads_web_source_grp','leads_market_segment','leads_level_1','leads_programme_preference','leads_level','leads_taylor_faculty',
                         'leads_owner_role','leads_campus_preference']]

### Data Cleansing for Campus

In [35]:
import pandas as pd
import re

def standardize_campus_simple(campus):
    if pd.isna(campus) or not str(campus).strip():
        return "Not specified"
    
    campus_str = str(campus).strip()
    
    # Check for TU patterns first
    tu_patterns = [
        r'taylor\'?s universit',  # Taylor's University variations
        r'\bTU\b',               # Standalone TU
        r'^TU[^-]',              # TU at start
        r'universit.*TU',        # University followed by TU
        r'TU.*universit'         # TU followed by University
    ]
    
    for pattern in tu_patterns:
        if re.search(pattern, campus_str, re.IGNORECASE):
            return "TU"
    
    # Check for TC patterns (including TCSH and TCSJ)
    tc_patterns = [
        r'taylor\'?s college',   # Taylor's College variations
        r'\bTC(S[HJ])?\b',       # TC, TCSH, or TCSJ
        r'^TC[^-]',              # TC at start
        r'college.*TC',          # College followed by TC
        r'TC.*college',          # TC followed by College
        r'sri hartamas',         # Sri Hartamas (TCSH)
        r'subang jaya',          # Subang Jaya (TCSJ)
        r'hartamas',             # Hartamas
        r'subang'                # Subang
    ]
    
    for pattern in tc_patterns:
        if re.search(pattern, campus_str, re.IGNORECASE):
            return "TC"
    
    # Handle special cases
    if re.search(r'not specified|unspecified|not set', campus_str, re.IGNORECASE):
        return "Not specified"
    
    if re.search(r'unknown|unknow', campus_str, re.IGNORECASE):
        return "Unknown Campus"
    
    # Return original if no patterns matched
    return campus_str

In [36]:
# Apply the regex standardization
leads_final['Standardized_Campus'] = leads_final['leads_campus_preference'].apply(standardize_campus_simple)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  leads_final['Standardized_Campus'] = leads_final['leads_campus_preference'].apply(standardize_campus_simple)


In [37]:
leads_final = leads_final.drop('leads_campus_preference', axis=1)

In [38]:
leads_final.rename(columns={'Standardized_Campus': 'leads_campus_preference'})

Unnamed: 0,leads_id,leads_source,leads_status,account_id,opp_id,leads_cycle,leads_intake_year,cdt_leads_original,leads_online_source,leads_web_source_grp,leads_market_segment,leads_level_1,leads_programme_preference,leads_level,leads_taylor_faculty,leads_owner_role,leads_campus_preference
0,00Q0I00000lMRXcUAO,Online,Qualified Converted,0010I00001bMmO3QAK,0060I00000Pdo9yQAB,C2,2018,2017-10-26 18:34:04+00:00,Application Form,ORGANIC,International,,,Other,FBL,ISR Region 5 Counselor,TU
1,00Q0I00000lMSV1UAO,Called-In,Qualified Converted,0010I00001bMRlyQAG,0060I00000PdgauQAB,C1,2018,2017-10-27 02:08:59+00:00,,Other,Transfer,Postgraduate Master - TBS - TU,Master of Management,PG-Taught,FBL,TU Sales Manager (PG),TU
2,00Q0I00000lMSW4UAO,Email-In,Invalid,,,C2,2018,2017-10-27 02:13:15+00:00,,Other,International,Foundation - SLAS - TC,Foundation in Arts,Foundation,TC,ISR Region 4 Counselor,TC
3,00Q0I00000lMSWOUA4,Email-In,Invalid,,,C2,2018,2017-10-27 02:14:48+00:00,,Other,International,Foundation - SLAS - TC,Foundation in Arts,Foundation,TC,ISR Region 4 Counselor,TC
4,00Q0I00000lMSXMUA4,Online,Qualified Converted,0010I00001bMuCmQAK,0060I00000PdrmOQAR,C1,2018,2017-10-27 02:21:24+00:00,Online Live Chat,ORGANIC,International,Postgraduate Master - SBS - TU,Master of Science,PG-Taught,FHMS,ISR Region 5 Counselor,TU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761659,00QMg00000EHoDFMA1,Online,New,,,C3,2026,2025-09-03 07:17:31+00:00,Conversation - Social Media,ORGANIC,International,,Bachelor of Computer Science(Artifical Intelle...,Degree,TC,ISR Support Team,Not specified
761660,00QMg00000EHp9JMAT,Online,New,,,C3,2025,2025-09-03 07:23:49+00:00,Digital Prospectus,ORGANIC,New Local,Degree - SOE - TU,Bachelor of Chemical Engineering with Honours,Degree,FIT,,TU
761661,00QMg00000EHpXVMA1,Online,New,,,C3,2025,2025-09-03 07:25:59+00:00,Digital Prospectus,ORGANIC,International,Postgraduate Master - TBS - TU,Master of Management,PG-Taught,FBL,ISR Support Team,TU
761662,00QMg00000EHpajMAD,Online,New,,,C3,2026,2025-09-03 07:26:22+00:00,Enquiry Form,SEM,International,Degree - TBS - TU,Bachelor of Business (Honours) in Internationa...,Degree,FBL,ISR Support Team,TU


In [40]:
import os
from urllib.parse import quote
from sqlalchemy import create_engine
from dotenv import load_dotenv
 
def marcommdb_connection():
    # Load environment variables
    load_dotenv(override=True) 
 
    # Get credentials from environment variables
    username = os.getenv("PG_USERNAME")
    password = os.getenv("PG_PASSWORD")
    host = os.getenv("PG_HOST")
    port = os.getenv("PG_PORT")
    database = os.getenv("PG_DATABASE_EXPORT")
 
    # Ensure all credentials are available
    if not all([username, password, host, port, database]):
        raise ValueError("Missing one or more PostgreSQL environment variables!")
 
    # Encode password to handle special characters
    encoded_password = quote(password, safe="") if password else ""
 
    # Construct PostgreSQL connection string
    DATABASE_URL = f"postgresql+psycopg2://{username}:{encoded_password}@{host}:{port}/{database}"
 
    # Create and return SQLAlchemy engine
    return create_engine(DATABASE_URL)
 

In [41]:
engine = marcommdb_connection()

### Export to Marcomm DB

In [None]:
leads_final.to_sql(
    'leads_staging',
    engine,
    schema='staging',
    if_exists='replace',
    index=False
)

664