# CUSTOMER DIMENSION

In [1]:
import pandas as pd
from connection_script import connect_databases

In [2]:
# Open both database connections. Later cells read the Sales/Person source
# tables through db_op, and read DimGeography / write DimCustomer through db_etl.
db_op, db_etl = connect_databases()

## EXTRACT 

In [3]:
# --- Source tables (operational database) ------------------------------------
individual_customer = pd.read_sql_query('SELECT * FROM [Sales].[vIndividualCustomer]', db_op)
sales_customer = pd.read_sql_query('SELECT CustomerID,PersonID,AccountNumber  FROM [Sales].[Customer]', db_op)
person = pd.read_sql_query('SELECT * FROM [Person].[Person]', db_op)
person_phone = pd.read_sql_query('SELECT * FROM [Person].[PersonPhone]', db_op)
person_demographics = pd.read_sql_query('SELECT * FROM [Sales].[vPersonDemographics]', db_op)

# --- Already-loaded geography dimension (ETL database) -----------------------
dim_geography = pd.read_sql_query('SELECT * FROM "DimGeography"', db_etl)

# --- Person-level joins -------------------------------------------------------
# All joins are pandas' default inner joins. The order matters: columns that
# exist on both sides receive the _x (left) / _y (right) suffixes that the
# column selection below and the later rename cell rely on.
merged_df = individual_customer.merge(person, on='BusinessEntityID')
merged_df = merged_df.merge(person_phone, on='BusinessEntityID')
merged_df = merged_df.merge(person_demographics, on='BusinessEntityID')

# Show the columns available before the geography join (useful for checking
# the join keys and the generated suffixes).
print(merged_df.columns)

# --- Geography lookup ---------------------------------------------------------
# 'StateProvinceName' used to be listed twice here; the duplicate key added
# nothing to the join and has been removed.
# NOTE(review): the duplicate suggests a fourth key was intended, possibly the
# country (CountryRegionName on the customer side vs. EnglishCountryRegionName
# in DimGeography) -- confirm whether the country should be part of the match.
merged_df = merged_df.merge(dim_geography, on=['City', 'PostalCode', 'StateProvinceName'])

# --- Attach CustomerID / AccountNumber ----------------------------------------
# Sales.Customer references the person through PersonID.
merged_df = merged_df.merge(sales_customer, left_on='BusinessEntityID', right_on='PersonID')

# --- Columns carried forward into the transform step --------------------------
# .copy() makes `result` an independent frame rather than a slice of merged_df,
# so later cells can modify it without touching (or warning about) merged_df.
result = merged_df[['BusinessEntityID', 'CustomerID', 'AccountNumber','GeographyKey', 'Title_x','FirstName_x', 'MiddleName_x', 'LastName_x', 'NameStyle', 'BirthDate', 'MaritalStatus', 'Suffix_x', 'Gender',
                    'EmailAddress', 'YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome', 'Education',
                    'Occupation', 'HomeOwnerFlag', 'NumberCarsOwned', 'AddressLine1', 'AddressLine2',
                    'PhoneNumber_x', 'DateFirstPurchase']].copy()

result

Index(['BusinessEntityID', 'Title_x', 'FirstName_x', 'MiddleName_x',
       'LastName_x', 'Suffix_x', 'PhoneNumber_x', 'PhoneNumberType',
       'EmailAddress', 'EmailPromotion_x', 'AddressType', 'AddressLine1',
       'AddressLine2', 'City', 'StateProvinceName', 'PostalCode',
       'CountryRegionName', 'Demographics_x', 'PersonType', 'NameStyle',
       'Title_y', 'FirstName_y', 'MiddleName_y', 'LastName_y', 'Suffix_y',
       'EmailPromotion_y', 'AdditionalContactInfo', 'Demographics_y',
       'rowguid', 'ModifiedDate_x', 'PhoneNumber_y', 'PhoneNumberTypeID',
       'ModifiedDate_y', 'TotalPurchaseYTD', 'DateFirstPurchase', 'BirthDate',
       'MaritalStatus', 'YearlyIncome', 'Gender', 'TotalChildren',
       'NumberChildrenAtHome', 'Education', 'Occupation', 'HomeOwnerFlag',
       'NumberCarsOwned'],
      dtype='object')


Unnamed: 0,BusinessEntityID,CustomerID,AccountNumber,GeographyKey,Title_x,FirstName_x,MiddleName_x,LastName_x,NameStyle,BirthDate,...,TotalChildren,NumberChildrenAtHome,Education,Occupation,HomeOwnerFlag,NumberCarsOwned,AddressLine1,AddressLine2,PhoneNumber_x,DateFirstPurchase
0,1699,11377,AW00011377,210,Mr.,David,R.,Robinett,False,1961-02-23,...,4.0,0.0,Graduate Degree,Clerical,True,0.0,Pappelallee 6667,,238-555-0100,2003-09-01
1,1700,11913,AW00011913,37,Ms.,Rebecca,A.,Robinson,False,1965-06-11,...,3.0,3.0,Bachelors,Professional,True,1.0,1861 Chinquapin Ct,,648-555-0100,2004-06-05
2,1701,11952,AW00011952,34,Ms.,Dorothy,B.,Robinson,False,1954-09-23,...,2.0,0.0,Partial College,Skilled Manual,False,2.0,4693 Mills Dr.,,423-555-0100,2002-04-07
3,1702,20164,AW00020164,244,Ms.,Carol Ann,F.,Rockne,False,1943-07-15,...,1.0,0.0,Bachelors,Clerical,True,0.0,1312 Skycrest Drive,,439-555-0100,2001-10-27
4,1703,20211,AW00020211,22,Mr.,Scott,M.,Rodgers,False,1968-05-15,...,2.0,2.0,Bachelors,Professional,True,1.0,9860 Brookview Drive,,989-555-0100,2002-04-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,20773,19379,AW00019379,272,,Crystal,,Guo,False,1974-11-23,...,0.0,0.0,Partial High School,Manual,True,2.0,988 Mt. Everest Court,,1 (11) 500 555-0171,2004-04-19
18504,20774,13933,AW00013933,380,,Isabella,F,Richardson,False,1961-06-12,...,1.0,0.0,Partial College,Skilled Manual,True,1.0,7413 Alpine Drive,,910-555-0166,2003-08-30
18505,20775,24634,AW00024634,633,,Crystal,S,He,False,1940-04-05,...,3.0,0.0,Bachelors,Management,True,2.0,4764 East Avenue,,813-555-0148,2004-04-12
18506,20776,21127,AW00021127,145,,Crystal,,Zheng,False,1975-07-25,...,0.0,0.0,Partial College,Clerical,True,1.0,"34334, rue Jean Mermoz",,1 (11) 500 555-0171,2004-02-15


In [4]:
# Inspect the full column set of the merged frame after the geography and
# customer joins (including the _x/_y suffixed duplicates).
merged_df.columns

Index(['BusinessEntityID', 'Title_x', 'FirstName_x', 'MiddleName_x',
       'LastName_x', 'Suffix_x', 'PhoneNumber_x', 'PhoneNumberType',
       'EmailAddress', 'EmailPromotion_x', 'AddressType', 'AddressLine1',
       'AddressLine2', 'City', 'StateProvinceName', 'PostalCode',
       'CountryRegionName', 'Demographics_x', 'PersonType', 'NameStyle',
       'Title_y', 'FirstName_y', 'MiddleName_y', 'LastName_y', 'Suffix_y',
       'EmailPromotion_y', 'AdditionalContactInfo', 'Demographics_y',
       'rowguid', 'ModifiedDate_x', 'PhoneNumber_y', 'PhoneNumberTypeID',
       'ModifiedDate_y', 'TotalPurchaseYTD', 'DateFirstPurchase', 'BirthDate',
       'MaritalStatus', 'YearlyIncome', 'Gender', 'TotalChildren',
       'NumberChildrenAtHome', 'Education', 'Occupation', 'HomeOwnerFlag',
       'NumberCarsOwned', 'GeographyKey', 'StateProvinceCode',
       'CountryRegionCode', 'EnglishCountryRegionName', 'SalesTerritoryID',
       'CustomerID', 'PersonID', 'AccountNumber'],
      dtype='object')

In [5]:
# Row count before de-duplication.
result.shape[0]

18508

## TRANSFORM

In [6]:
# Keep a single row per person: the first occurrence of each BusinessEntityID.
result = result.drop_duplicates(subset='BusinessEntityID', keep='first')

In [7]:
# Give the demographic columns their DimCustomer names.
result = result.rename(
    columns=dict(
        Education='EnglishEducation',
        Occupation='EnglishOccupation',
        HomeOwnerFlag='HouseOwnerFlag',
    )
)

In [8]:
# Map the source key/name columns (and the _x-suffixed columns produced by the
# merges) onto their DimCustomer names. The last three entries repeat the
# previous cell's renames; rename() skips labels that are no longer present,
# so they are no-ops when the cells run in order.
result = result.rename(
    columns=dict(
        CustomerID='CustomerKey',
        AccountNumber='CustomerAlternateKey',
        Title_x='Title',
        FirstName_x='FirstName',
        MiddleName_x='MiddleName',
        LastName_x='LastName',
        Suffix_x='Suffix',
        PhoneNumber_x='Phone',
        Education='EnglishEducation',
        Occupation='EnglishOccupation',
        HomeOwnerFlag='HouseOwnerFlag',
    )
)

In [9]:
# Keep only the columns that are loaded into DimCustomer, in their final order
# (BusinessEntityID is dropped at this point).
result = result.loc[:, [
    'CustomerKey',
    'GeographyKey',
    'CustomerAlternateKey',
    'Title',
    'FirstName',
    'MiddleName',
    'LastName',
    'NameStyle',
    'BirthDate',
    'MaritalStatus',
    'Suffix',
    'Gender',
    'EmailAddress',
    'YearlyIncome',
    'TotalChildren',
    'NumberChildrenAtHome',
    'EnglishEducation',
    'EnglishOccupation',
    'HouseOwnerFlag',
    'NumberCarsOwned',
    'AddressLine1',
    'AddressLine2',
    'Phone',
    'DateFirstPurchase',
]]

In [10]:
# Order the rows by CustomerKey.
# Rebind the sorted frame instead of sorting in place: `result` was produced by
# a column selection, and mutating such a frame in place is something pandas
# may flag with SettingWithCopyWarning. Rebinding also keeps the cell safe to
# re-run. The original row labels are kept; the load step writes index=False.
result = result.sort_values(by='CustomerKey')
result

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus,...,TotalChildren,NumberChildrenAtHome,EnglishEducation,EnglishOccupation,HouseOwnerFlag,NumberCarsOwned,AddressLine1,AddressLine2,Phone,DateFirstPurchase
11250,11000,26,AW00011000,,Jon,V,Yang,False,1966-04-08,M,...,2.0,0.0,Bachelors,Professional,True,0.0,3761 N. 14th St,,1 (11) 500 555-0162,2001-07-22
3164,11001,37,AW00011001,,Eugene,L,Huang,False,1965-05-14,S,...,3.0,3.0,Bachelors,Professional,False,1.0,2243 W St.,,1 (11) 500 555-0110,2001-07-18
8987,11002,31,AW00011002,,Ruben,,Torres,False,1965-08-12,M,...,3.0,3.0,Bachelors,Professional,True,1.0,5844 Linden Land,,1 (11) 500 555-0184,2001-07-10
9076,11003,11,AW00011003,,Christy,,Zhu,False,1968-02-15,S,...,0.0,0.0,Bachelors,Professional,False,1.0,1825 Village Pl.,,1 (11) 500 555-0162,2001-07-01
9620,11004,19,AW00011004,,Elizabeth,,Johnson,False,1968-08-08,S,...,5.0,5.0,Bachelors,Professional,True,4.0,7553 Harness Circle,,1 (11) 500 555-0131,2001-07-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,29479,145,AW00029479,,Tommy,L,Tang,False,1958-07-04,M,...,1.0,0.0,Graduate Degree,Clerical,True,0.0,"111, rue Maillard",,1 (11) 500 555-0136,2003-03-08
2181,29480,251,AW00029480,,Nina,W,Raji,False,1960-11-10,S,...,3.0,0.0,Graduate Degree,Clerical,True,0.0,9 Katherine Drive,,1 (11) 500 555-0146,2004-01-18
5882,29481,169,AW00029481,,Ivan,,Suri,False,1960-01-05,S,...,3.0,0.0,Graduate Degree,Clerical,False,0.0,Knaackstr 4,,1 (11) 500 555-0144,2002-02-13
10289,29482,114,AW00029482,,Clayton,,Zhang,False,1959-03-05,M,...,3.0,0.0,Bachelors,Clerical,True,0.0,"1080, quai de Grenelle",,1 (11) 500 555-0137,2003-03-22


## LOAD

In [11]:
# TODO: the Commute Distance column is still missing from this dimension!
# Obtain fresh connections, then write the frame to the ETL database.
# if_exists='replace' replaces any existing DimCustomer table; the DataFrame
# index is not written.
db_op, db_etl = connect_databases()
result.to_sql(name='DimCustomer', con=db_etl, if_exists='replace', index=False)

484

In [12]:
# Read the freshly written table back from the ETL database as a sanity check.
test = pd.read_sql_query(sql='SELECT * FROM "DimCustomer"', con=db_etl)
test

Unnamed: 0,CustomerKey,GeographyKey,CustomerAlternateKey,Title,FirstName,MiddleName,LastName,NameStyle,BirthDate,MaritalStatus,...,TotalChildren,NumberChildrenAtHome,EnglishEducation,EnglishOccupation,HouseOwnerFlag,NumberCarsOwned,AddressLine1,AddressLine2,Phone,DateFirstPurchase
0,11000,26,AW00011000,,Jon,V,Yang,False,1966-04-08,M,...,2.0,0.0,Bachelors,Professional,True,0.0,3761 N. 14th St,,1 (11) 500 555-0162,2001-07-22
1,11001,37,AW00011001,,Eugene,L,Huang,False,1965-05-14,S,...,3.0,3.0,Bachelors,Professional,False,1.0,2243 W St.,,1 (11) 500 555-0110,2001-07-18
2,11002,31,AW00011002,,Ruben,,Torres,False,1965-08-12,M,...,3.0,3.0,Bachelors,Professional,True,1.0,5844 Linden Land,,1 (11) 500 555-0184,2001-07-10
3,11003,11,AW00011003,,Christy,,Zhu,False,1968-02-15,S,...,0.0,0.0,Bachelors,Professional,False,1.0,1825 Village Pl.,,1 (11) 500 555-0162,2001-07-01
4,11004,19,AW00011004,,Elizabeth,,Johnson,False,1968-08-08,S,...,5.0,5.0,Bachelors,Professional,True,4.0,7553 Harness Circle,,1 (11) 500 555-0131,2001-07-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18479,29479,145,AW00029479,,Tommy,L,Tang,False,1958-07-04,M,...,1.0,0.0,Graduate Degree,Clerical,True,0.0,"111, rue Maillard",,1 (11) 500 555-0136,2003-03-08
18480,29480,251,AW00029480,,Nina,W,Raji,False,1960-11-10,S,...,3.0,0.0,Graduate Degree,Clerical,True,0.0,9 Katherine Drive,,1 (11) 500 555-0146,2004-01-18
18481,29481,169,AW00029481,,Ivan,,Suri,False,1960-01-05,S,...,3.0,0.0,Graduate Degree,Clerical,False,0.0,Knaackstr 4,,1 (11) 500 555-0144,2002-02-13
18482,29482,114,AW00029482,,Clayton,,Zhang,False,1959-03-05,M,...,3.0,0.0,Bachelors,Clerical,True,0.0,"1080, quai de Grenelle",,1 (11) 500 555-0137,2003-03-22
