# CUSTOMER DIMENSION

In [1]:
import pandas as pd
from connection_script import connect_databases

In [2]:
# Open both database connections: db_op is queried for the source tables and
# db_etl for DimGeography in the extract step below.
db_op, db_etl = connect_databases()

## EXTRACT 

In [3]:
# --- Source tables ---------------------------------------------------------
# Customer, person, phone and demographics data are read through db_op;
# DimGeography is read through db_etl.
individual_customer = pd.read_sql_query('SELECT * FROM [Sales].[vIndividualCustomer]', db_op)
person = pd.read_sql_query('SELECT * FROM [Person].[Person]', db_op)
person_phone = pd.read_sql_query('SELECT * FROM [Person].[PersonPhone]', db_op)
person_demographics = pd.read_sql_query('SELECT * FROM [Sales].[vPersonDemographics]', db_op)
dim_geography = pd.read_sql_query('SELECT * FROM public."DimGeography"', db_etl)

# --- Joins -----------------------------------------------------------------
# merge() defaults to an inner join, so rows without a match on the key are
# dropped at every step. Columns present on both sides get the _x / _y
# suffixes (left / right), which is why the selection below uses *_x names.
# NOTE(review): City is the only geography join key; if DimGeography holds
# several rows for one city name this join multiplies customer rows --
# confirm the intended key.
merged_df = (
    individual_customer
    .merge(person, on='BusinessEntityID')
    .merge(person_phone, on='BusinessEntityID')
    .merge(person_demographics, on='BusinessEntityID')
    .merge(dim_geography, on='City')
)

# --- Column selection ------------------------------------------------------
result = merged_df[[
    # keys
    'BusinessEntityID', 'GeographyKey',
    # name and personal attributes
    'Title_x', 'FirstName_x', 'MiddleName_x', 'LastName_x', 'NameStyle',
    'BirthDate', 'MaritalStatus', 'Suffix_x', 'Gender',
    # contact and demographics
    'EmailAddress', 'YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome',
    'Education', 'Occupation', 'HomeOwnerFlag', 'NumberCarsOwned',
    # address, phone and purchase history
    'AddressLine1', 'AddressLine2', 'PhoneNumber_x', 'DateFirstPurchase',
]]

result

Unnamed: 0,BusinessEntityID,GeographyKey,Title_x,FirstName_x,MiddleName_x,LastName_x,NameStyle,BirthDate,MaritalStatus,Suffix_x,...,TotalChildren,NumberChildrenAtHome,Education,Occupation,HomeOwnerFlag,NumberCarsOwned,AddressLine1,AddressLine2,PhoneNumber_x,DateFirstPurchase
0,1699,210,Mr.,David,R.,Robinett,False,1961-02-23,M,,...,4.0,0.0,Graduate Degree,Clerical,True,0.0,Pappelallee 6667,,238-555-0100,2003-09-01
1,1700,37,Ms.,Rebecca,A.,Robinson,False,1965-06-11,M,,...,3.0,3.0,Bachelors,Professional,True,1.0,1861 Chinquapin Ct,,648-555-0100,2004-06-05
2,1701,34,Ms.,Dorothy,B.,Robinson,False,1954-09-23,S,,...,2.0,0.0,Partial College,Skilled Manual,False,2.0,4693 Mills Dr.,,423-555-0100,2002-04-07
3,1702,244,Ms.,Carol Ann,F.,Rockne,False,1943-07-15,M,,...,1.0,0.0,Bachelors,Clerical,True,0.0,1312 Skycrest Drive,,439-555-0100,2001-10-27
4,1703,22,Mr.,Scott,M.,Rodgers,False,1968-05-15,M,,...,2.0,2.0,Bachelors,Professional,True,1.0,9860 Brookview Drive,,989-555-0100,2002-04-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34674,20773,272,,Crystal,,Guo,False,1974-11-23,S,,...,0.0,0.0,Partial High School,Manual,True,2.0,988 Mt. Everest Court,,1 (11) 500 555-0171,2004-04-19
34675,20774,380,,Isabella,F,Richardson,False,1961-06-12,M,,...,1.0,0.0,Partial College,Skilled Manual,True,1.0,7413 Alpine Drive,,910-555-0166,2003-08-30
34676,20775,633,,Crystal,S,He,False,1940-04-05,M,,...,3.0,0.0,Bachelors,Management,True,2.0,4764 East Avenue,,813-555-0148,2004-04-12
34677,20776,145,,Crystal,,Zheng,False,1975-07-25,S,,...,0.0,0.0,Partial College,Clerical,True,1.0,"34334, rue Jean Mermoz",,1 (11) 500 555-0171,2004-02-15


In [4]:
# Row count after the joins and before de-duplication.
len(result)

34679

## TRANSFORM

In [5]:
# Keep a single row per customer: the first occurrence of each
# BusinessEntityID wins, later ones are discarded.
result = result.drop_duplicates(subset='BusinessEntityID', keep='first')

In [6]:
# Rename three demographic attributes to the names used by the final
# column selection.
result = result.rename(
    columns={
        'Education': 'EnglishEducation',
        'Occupation': 'EnglishOccupation',
        'HomeOwnerFlag': 'HouseOwnerFlag',
    }
)

In [7]:
# Strip the merge suffix from the person columns and shorten the phone column.
# The last three entries repeat the mapping of the preceding rename cell; once
# that cell has run those source names no longer exist and rename() ignores
# them.
result = result.rename(
    columns={
        **{f'{name}_x': name for name in ('Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix')},
        'PhoneNumber_x': 'Phone',
        'Education': 'EnglishEducation',
        'Occupation': 'EnglishOccupation',
        'HomeOwnerFlag': 'HouseOwnerFlag',
    }
)

In [8]:
# First key to assign; keys then increase by one per customer row.
FIRST_CUSTOMER_KEY = 11000

# Replace the row labels with consecutive keys starting at FIRST_CUSTOMER_KEY.
# drop=True discards the old labels instead of keeping them as an extra
# 'index' column, and assigning an explicit RangeIndex (rather than `+=` on
# the existing index) gives the same keys every time the cell is re-run.
result = result.reset_index(drop=True)
result.index = pd.RangeIndex(
    start=FIRST_CUSTOMER_KEY,
    stop=FIRST_CUSTOMER_KEY + len(result),
)

In [9]:
def generateAlternateKey(ind):
    """Build an alternate key string of the form 'AW' + zero-padded value.

    The text form of ``ind`` is left-padded with '0' to a width of eight
    characters; values that are already eight characters or longer are
    appended unchanged.

    Parameters
    ----------
    ind : any
        Value to embed in the key; it is converted with ``str()``.

    Returns
    -------
    str
        For example ``11000`` becomes ``'AW00011000'``.
    """
    padded = str(ind).rjust(8, '0')
    return f'AW{padded}'


In [10]:
# Derive the alternate key from the key held in the index,
# e.g. 11000 -> 'AW00011000'.
result = result.assign(
    CustomerAlternateKey=[generateAlternateKey(key) for key in result.index]
)

In [11]:
# Keep only the columns that are loaded, in their final order. Columns not
# listed here are dropped at this point.
result = result.loc[:, [
    # keys
    'GeographyKey', 'CustomerAlternateKey',
    # name and personal attributes
    'Title', 'FirstName', 'MiddleName', 'LastName', 'NameStyle',
    'BirthDate', 'MaritalStatus', 'Suffix', 'Gender',
    # contact and demographics
    'EmailAddress', 'YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome',
    'EnglishEducation', 'EnglishOccupation', 'HouseOwnerFlag', 'NumberCarsOwned',
    # address, phone and purchase history
    'AddressLine1', 'AddressLine2', 'Phone', 'DateFirstPurchase',
]]

In [12]:
# Preview the transformed frame that the load step writes out.
result

Unnamed: 0,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus,Suffix,...,TotalChildren,NumberChildrenAtHome,EnglishEducation,EnglishOccupation,HouseOwnerFlag,NumberCarsOwned,AddressLine1,AddressLine2,Phone,DateFirstPurchase
11000,210,AW00011000,Mr.,David,R.,Robinett,False,1961-02-23,M,,...,4.0,0.0,Graduate Degree,Clerical,True,0.0,Pappelallee 6667,,238-555-0100,2003-09-01
11001,37,AW00011001,Ms.,Rebecca,A.,Robinson,False,1965-06-11,M,,...,3.0,3.0,Bachelors,Professional,True,1.0,1861 Chinquapin Ct,,648-555-0100,2004-06-05
11002,34,AW00011002,Ms.,Dorothy,B.,Robinson,False,1954-09-23,S,,...,2.0,0.0,Partial College,Skilled Manual,False,2.0,4693 Mills Dr.,,423-555-0100,2002-04-07
11003,244,AW00011003,Ms.,Carol Ann,F.,Rockne,False,1943-07-15,M,,...,1.0,0.0,Bachelors,Clerical,True,0.0,1312 Skycrest Drive,,439-555-0100,2001-10-27
11004,22,AW00011004,Mr.,Scott,M.,Rodgers,False,1968-05-15,M,,...,2.0,2.0,Bachelors,Professional,True,1.0,9860 Brookview Drive,,989-555-0100,2002-04-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29479,272,AW00029479,,Crystal,,Guo,False,1974-11-23,S,,...,0.0,0.0,Partial High School,Manual,True,2.0,988 Mt. Everest Court,,1 (11) 500 555-0171,2004-04-19
29480,380,AW00029480,,Isabella,F,Richardson,False,1961-06-12,M,,...,1.0,0.0,Partial College,Skilled Manual,True,1.0,7413 Alpine Drive,,910-555-0166,2003-08-30
29481,633,AW00029481,,Crystal,S,He,False,1940-04-05,M,,...,3.0,0.0,Bachelors,Management,True,2.0,4764 East Avenue,,813-555-0148,2004-04-12
29482,145,AW00029482,,Crystal,,Zheng,False,1975-07-25,S,,...,0.0,0.0,Partial College,Clerical,True,1.0,"34334, rue Jean Mermoz",,1 (11) 500 555-0171,2004-02-15


In [13]:
#geography_key_backup = list(result['GeographyKey'])
#result = result.drop(columns=['GeographyKey'])
#result

In [14]:
#result = result.assign(GeographyKey=geography_key_backup)
#result

## LOAD

In [15]:
# TODO: the Commute Distance column is still missing from this dimension.
# Call connect_databases() again so the write below uses freshly returned
# connection objects.
db_op, db_etl = connect_databases()
# Write the frame to the 'DimCustomer' table through db_etl, replacing the
# table if it already exists. The DataFrame index (offset to start at 11000 in
# the transform step) is stored as the 'CustomerKey' column.
result.to_sql('DimCustomer', db_etl, if_exists='replace', index_label='CustomerKey')

484