In [43]:
import os

import numpy as np
import pandas as pd
import snowflake.connector

In [1]:
# Snowflake connection settings.
# Each value is read from an environment variable so that real credentials do
# not have to be saved inside the notebook. When a variable is unset, the
# original placeholder value is used, so the cell still runs unchanged.
# NOTE: `account` is rebound to a DataFrame by the table-loading cell further
# down; re-run this cell before re-running the connection cell.
account = os.environ.get('SNOWFLAKE_ACCOUNT', 'XXXXXXXXXXXXXXXX')
user = os.environ.get('SNOWFLAKE_USER', 'XXXXXXXXXXXXXXXX')
password = os.environ.get('SNOWFLAKE_PASSWORD', 'XXXXXXXXXXXXXXXX')
database = os.environ.get('SNOWFLAKE_DATABASE', 'XXXXXXXXXXXXXXXX')
schema = os.environ.get('SNOWFLAKE_SCHEMA', 'XXXXXXXXXXXXXXXX')
warehouse = os.environ.get('SNOWFLAKE_WAREHOUSE', 'XXXXXXXXXXXXXXXX')

In [45]:
# Open the Snowflake connection that the table-loading cell below runs its
# queries on. The six settings come from the configuration cell.
connection_settings = {
    "user": user,
    "password": password,
    "account": account,
    "database": database,
    "schema": schema,
    "warehouse": warehouse,
}
ctx = snowflake.connector.connect(**connection_settings)

# One full-table SELECT per object read from salesforce_db.raw_data.
raw_schema = "salesforce_db.raw_data"
account_query = f"select * from {raw_schema}.account"
contact_query = f"select * from {raw_schema}.contact"
opportunity_query = f"select * from {raw_schema}.opportunity"
opportunitystage_query = f"select * from {raw_schema}.opportunitystage"
task_query = f"select * from {raw_schema}.task"

In [46]:
def get_table(ctx, query):
    """Run ``query`` on a new cursor from ``ctx`` and return the fetched result.

    Args:
        ctx: Connection object exposing ``cursor()``.
        query: SQL text handed to the cursor's ``execute``.

    Returns:
        Whatever the cursor's ``fetch_pandas_all()`` returns (used as a pandas
        DataFrame throughout this notebook).

    The cursor is closed on every path, including when ``execute`` or the
    fetch raises; the exception then propagates to the caller.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        # Returning from inside ``try`` still runs the ``finally`` clause.
        return cursor.fetch_pandas_all()
    finally:
        cursor.close()

def print_missing_data_percentage(df, threshold=50):
    """Print and return the columns whose missing-data share is within ``threshold``.

    Args:
        df: DataFrame to inspect.
        threshold: Maximum percentage (0-100) of missing values a column may
            have and still be kept. The comparison is inclusive (``<=``).

    Returns:
        List of the kept column names, ordered from the highest to the lowest
        missing percentage (the same order in which they are printed).
    """
    # Share of null cells per column, expressed as a percentage.
    pct_missing = df.isna().mean().mul(100)

    # Keep the columns at or below the threshold, most-missing first.
    kept = pct_missing.loc[pct_missing.le(threshold)].sort_values(ascending=False)

    print(f"Columns with Missing Data Less Than or Equal to {threshold}%:")
    for name, pct in kept.items():
        print(f"{name}: {pct:.2f}%")
    return list(kept.index)

In [47]:
# Fetch every raw table over the shared connection, in the same order as the
# names on the left-hand side.
# NOTE: this rebinds `account`, which until now held the Snowflake account
# identifier string from the configuration cell.
account, opportunity, contact, opportunitystage, task = (
    get_table(ctx, query)
    for query in (
        account_query,
        opportunity_query,
        contact_query,
        opportunitystage_query,
        task_query,
    )
)

In [48]:
# All five tables were fetched into memory by the previous cell, so the
# Snowflake connection is no longer needed; close it.
ctx.close()

In [49]:
# Save the query results exactly as fetched, before any cleaning is applied.
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists first; otherwise the export fails on a fresh checkout.
os.makedirs("raw_data", exist_ok=True)
account.to_csv("raw_data/account.csv", index=False)
opportunity.to_csv("raw_data/opportunity.csv", index=False)
opportunitystage.to_csv("raw_data/opportunitystage.csv", index=False)
task.to_csv("raw_data/task.csv", index=False)
contact.to_csv("raw_data/contact.csv", index=False)

In [50]:
# Report the missing-value share of every account column and remember the
# names of the columns with at most 20% missing values.
print("Missing Values:")
account_columns = print_missing_data_percentage(df=account, threshold=20)

Missing Values:
Columns with Missing Data Less Than or Equal to 20%:
HQ_LOCATION__C: 1.64%
GROWTH_RATE__C: 1.64%
TOTAL_FUNDING_TO_DATE__C: 1.64%
YEARSTARTED: 1.64%
ACCOUNTSOURCE: 1.64%
BILLINGLATITUDE: 1.64%
BILLINGLONGITUDE: 1.64%
SITE: 1.64%
BILLINGPOSTALCODE: 1.48%
BILLINGCOUNTRY: 1.48%
ANNUALREVENUE: 0.66%
DESCRIPTION: 0.66%
RATING: 0.49%
TICKERSYMBOL: 0.49%
BILLINGCITY: 0.33%
NUMBEROFEMPLOYEES: 0.16%
OWNERSHIP: 0.16%
INDUSTRY: 0.16%
WEBSITE: 0.16%
FAX: 0.16%
PHONE: 0.16%
TYPE: 0.16%
BILLINGSTREET: 0.16%
BILLINGSTATE: 0.16%
SYSTEMMODSTAMP: 0.00%
NAME: 0.00%
OWNER_INTENT_TO_SELL__C: 0.00%
CLEANSTATUS: 0.00%
LASTMODIFIEDDATE: 0.00%
LASTMODIFIEDBYID: 0.00%
ISDELETED: 0.00%
CREATEDBYID: 0.00%
CREATEDDATE: 0.00%
OWNERID: 0.00%
PHOTOURL: 0.00%
ID: 0.00%


In [51]:
# Report how many account columns passed the missing-data filter.
print(f"# columns: {len(account_columns)}")

# Rows lacking a value in any of these fields are removed from the cleaned table.
account_required_fields = ['HQ_LOCATION__C', 'GROWTH_RATE__C', 'TOTAL_FUNDING_TO_DATE__C', 'OWNER_INTENT_TO_SELL__C']
# Columns excluded from the cleaned account table.
columns_to_drop = ['BILLINGLATITUDE', 'BILLINGLONGITUDE', 'BILLINGPOSTALCODE', 'BILLINGCOUNTRY', 'DESCRIPTION', 'TICKERSYMBOL',
                   'BILLINGCITY', 'SITE', 'WEBSITE', 'FAX', 'PHONE', 'BILLINGSTREET', 'SYSTEMMODSTAMP', 'CLEANSTATUS', 'BILLINGSTATE',
                   'LASTMODIFIEDDATE', 'LASTMODIFIEDBYID', 'ISDELETED', 'CREATEDBYID', 'CREATEDDATE', 'OWNERID', 'PHOTOURL']

# Build the cleaned frame as one non-mutating chain: select the retained
# columns, drop incomplete rows, then drop the unwanted columns. `columns=`
# already names the axis, so no separate `axis` argument is passed.
account_0 = (
    account[account_columns]
    .dropna(subset=account_required_fields)
    .drop(columns=columns_to_drop)
)

# columns: 36


In [52]:
# Count the rows of the cleaned account table that repeat an earlier row in full.
account_duplicate_count = account_0.duplicated().sum()
print(f"Duplicates: {account_duplicate_count}")

Duplicates: 0


In [53]:
# Keep only the first row for each (ID, ACCOUNTID) pair. The result is rebound
# to the same name instead of mutating the frame in place, so the step reads as
# an ordinary assignment and does not depend on `inplace` semantics.
opportunity = opportunity.drop_duplicates(subset=['ID', 'ACCOUNTID'], keep='first')

# Report the missing-value share of every opportunity column and remember the
# names of the columns with at most 20% missing values.
print("Missing Values:")
opportunity_columns = print_missing_data_percentage(opportunity, threshold=20)

Missing Values:
Columns with Missing Data Less Than or Equal to 20%:
ID: 0.00%
ISDELETED: 0.00%
HASOPENACTIVITY: 0.00%
FISCAL: 0.00%
FISCALYEAR: 0.00%
FISCALQUARTER: 0.00%
SYSTEMMODSTAMP: 0.00%
LASTMODIFIEDBYID: 0.00%
LASTMODIFIEDDATE: 0.00%
CREATEDBYID: 0.00%
CREATEDDATE: 0.00%
OWNERID: 0.00%
HASOPPORTUNITYLINEITEM: 0.00%
FORECASTCATEGORYNAME: 0.00%
FORECASTCATEGORY: 0.00%
ISWON: 0.00%
ISCLOSED: 0.00%
LEADSOURCE: 0.00%
NEXTSTEP: 0.00%
TYPE: 0.00%
CLOSEDATE: 0.00%
EXPECTEDREVENUE: 0.00%
PROBABILITY: 0.00%
AMOUNT: 0.00%
STAGENAME: 0.00%
DESCRIPTION: 0.00%
NAME: 0.00%
ISPRIVATE: 0.00%
ACCOUNTID: 0.00%
HASOVERDUETASK: 0.00%


In [54]:
# Report how many opportunity columns passed the missing-data filter.
print(f"# columns: {len(opportunity_columns)}")

# Columns excluded from the cleaned opportunity table.
columns_to_drop = ['ISDELETED', 'FISCAL', 'FISCALQUARTER', 'SYSTEMMODSTAMP', 'LASTMODIFIEDBYID', 'LASTMODIFIEDDATE',
                   'CREATEDBYID', 'CREATEDDATE', 'OWNERID', 'HASOPPORTUNITYLINEITEM', 'HASOPENACTIVITY', 'NEXTSTEP',
                   'CLOSEDATE', 'DESCRIPTION', 'ISPRIVATE']

# Select the retained columns and drop the unwanted ones without mutating in
# place. `columns=` already names the axis, so no `axis` argument is passed.
opportunity_0 = opportunity[opportunity_columns].drop(columns=columns_to_drop)

# columns: 30


In [55]:
# Count the rows of the cleaned opportunity table that repeat an earlier row in full.
opportunity_duplicate_count = opportunity_0.duplicated().sum()
print(f"Duplicates: {opportunity_duplicate_count}")

Duplicates: 0


In [56]:
# Report the missing-value share of every contact column and remember the
# names of the columns with at most 20% missing values.
print("Missing Values:")
contact_columns = print_missing_data_percentage(df=contact, threshold=20)

Missing Values:
Columns with Missing Data Less Than or Equal to 20%:
HOMEPHONE: 3.23%
DESCRIPTION: 3.23%
MAILINGCITY: 3.01%
MAILINGSTATE: 3.01%
MAILINGCOUNTRY: 3.01%
DEPARTMENT: 0.22%
MOBILEPHONE: 0.22%
BIRTHDATE: 0.22%
PHOTOURL: 0.00%
ISEMAILBOUNCED: 0.00%
SYSTEMMODSTAMP: 0.00%
LASTMODIFIEDBYID: 0.00%
LASTMODIFIEDDATE: 0.00%
CREATEDBYID: 0.00%
CREATEDDATE: 0.00%
OWNERID: 0.00%
ID: 0.00%
TITLE: 0.00%
EMAIL: 0.00%
ISDELETED: 0.00%
FAX: 0.00%
PHONE: 0.00%
MAILINGSTREET: 0.00%
NAME: 0.00%
SALUTATION: 0.00%
FIRSTNAME: 0.00%
LASTNAME: 0.00%
ACCOUNTID: 0.00%
CLEANSTATUS: 0.00%


In [57]:
# Report how many contact columns passed the missing-data filter.
print(f"# columns: {len(contact_columns)}")

# Columns excluded from the cleaned contact table.
columns_to_drop = ['HOMEPHONE', 'DESCRIPTION', 'MAILINGCITY', 'MAILINGCOUNTRY', 'MOBILEPHONE', 'BIRTHDATE', 'PHOTOURL', 'ISEMAILBOUNCED',
                   'SYSTEMMODSTAMP', 'LASTMODIFIEDBYID', 'LASTMODIFIEDDATE', 'CREATEDBYID', 'CREATEDDATE', 'OWNERID', 'EMAIL', 'ISDELETED',
                   'FAX', 'PHONE', 'MAILINGSTREET', 'SALUTATION', 'FIRSTNAME', 'LASTNAME', 'CLEANSTATUS']
# Contacts whose DEPARTMENT is one of these values are removed.
excluded_departments = ['Production', 'Technology', 'Procurement', 'Facilities', 'Operations', 'Warehouse Mgmt', 'Administration', 'Executive Team']
# Contacts whose TITLE is one of these values are removed.
excluded_titles = ['VP, Facilities', 'VP, Production', 'VP, Technology', 'SVP, Procurement', 'VP, Finance', 'Dean of Administration', 'Director,Warehouse Mgmt',
                   'SVP, Operations', 'SVP, Technology', 'SVP, Production', 'SVP, Administration and Finance', 'Regional General Manager']

# Build the cleaned frame as one non-mutating chain. `columns=` already names
# the axis, so no `axis` argument is passed. Rows with a missing DEPARTMENT or
# TITLE are kept, because `isin` is False for missing values.
contact_0 = (
    contact[contact_columns]
    .drop(columns=columns_to_drop)
    .loc[lambda frame: ~frame['DEPARTMENT'].isin(excluded_departments)]
    .loc[lambda frame: ~frame['TITLE'].isin(excluded_titles)]
)

# columns: 29


In [58]:
# Keep only the first row for each task ID. The result is rebound to the same
# name instead of mutating the frame in place, so the step reads as an ordinary
# assignment and does not depend on `inplace` semantics.
task = task.drop_duplicates(subset=['ID'], keep='first')

# Report the missing-value share of every task column and remember the names
# of the columns with at most 20% missing values.
print("Missing Values:")
task_columns = print_missing_data_percentage(task, threshold=20)

Missing Values:
Columns with Missing Data Less Than or Equal to 20%:
ID: 0.00%
WHOID: 0.00%
ISRECURRENCE: 0.00%
ISREMINDERSET: 0.00%
CALLOBJECT: 0.00%
CALLDISPOSITION: 0.00%
CALLTYPE: 0.00%
CALLDURATIONINSECONDS: 0.00%
ISARCHIVED: 0.00%
SYSTEMMODSTAMP: 0.00%
LASTMODIFIEDBYID: 0.00%
LASTMODIFIEDDATE: 0.00%
CREATEDBYID: 0.00%
CREATEDDATE: 0.00%
ISCLOSED: 0.00%
ACCOUNTID: 0.00%
ISDELETED: 0.00%
DESCRIPTION: 0.00%
OWNERID: 0.00%
ISHIGHPRIORITY: 0.00%
PRIORITY: 0.00%
STATUS: 0.00%
ACTIVITYDATE: 0.00%
SUBJECT: 0.00%
WHATID: 0.00%
TASKSUBTYPE: 0.00%


In [59]:
# Report how many task columns passed the missing-data filter.
print(f"# columns: {len(task_columns)}")

# Columns excluded from the cleaned task table.
columns_to_drop = ['ISRECURRENCE', 'ISREMINDERSET', 'CALLOBJECT', 'ISARCHIVED', 'SYSTEMMODSTAMP', 'LASTMODIFIEDBYID', 'SUBJECT',
                   'LASTMODIFIEDDATE', 'CREATEDBYID', 'CREATEDDATE', 'ISDELETED', 'DESCRIPTION', 'OWNERID', 'ACTIVITYDATE']

# Select the retained columns and drop the unwanted ones without mutating in
# place. `columns=` already names the axis, so no `axis` argument is passed.
task_0 = task[task_columns].drop(columns=columns_to_drop)

# columns: 26


In [60]:
# Write the cleaned tables to disk.
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists first; otherwise the export fails on a fresh checkout.
os.makedirs('data', exist_ok=True)
account_0.to_csv('data/account.csv', index=False)
opportunity_0.to_csv('data/opportunity.csv', index=False)
contact_0.to_csv('data/contact.csv', index=False)
task_0.to_csv('data/task.csv', index=False)