# EMPLOYEE DIMENSION

In [1]:
from connection_script import connect_databases
import pandas as pd
import locale

In [2]:
db_op, db_etl = connect_databases()

## EXTRACT

In [3]:
# Extraccion de datos en la base operacional
employee_df = pd.read_sql_query('SELECT [BusinessEntityID], [NationalIDNumber], [LoginID], [OrganizationNode].ToString() as OrganizationNode, [OrganizationLevel], [JobTitle], [BirthDate], [MaritalStatus], [Gender], [HireDate], [SalariedFlag], [VacationHours], [SickLeaveHours], [CurrentFlag], [rowguid], [ModifiedDate] FROM [HumanResources].[Employee]', db_op)
employee_pay_history = pd.read_sql_query('SELECT * FROM [HumanResources].[EmployeePayHistory]', db_op)
employee_department_history = pd.read_sql_query('SELECT * FROM [HumanResources].[EmployeeDepartmentHistory]', db_op)
employee_department_history = employee_department_history.drop(columns=['ModifiedDate'])
dim_sales_territory = pd.read_sql_query("SELECT * FROM DimSalesTerritory", db_etl)
person = pd.read_sql_query('SELECT * FROM [Person].[Person]', db_op)
person_phone = pd.read_sql_query('SELECT * FROM [Person].[PersonPhone]', db_op)
person_phone = person_phone.drop(columns=['ModifiedDate'])
person_demographics = pd.read_sql_query('SELECT * FROM [Sales].[vPersonDemographics]', db_op)
sales_person = pd.read_sql_query('SELECT * FROM [Sales].[SalesPerson]', db_op)
sales_person_territory = sales_person.merge(dim_sales_territory, left_on='TerritoryID', right_on='SalesTerritoryKey')

## TRANSFORM

In [4]:
# Merge de datos
merged_df = employee_df.merge(person, left_on='BusinessEntityID', right_on='BusinessEntityID')
merged_df = merged_df.merge(employee_pay_history, left_on='BusinessEntityID', right_on='BusinessEntityID')
merged_df = merged_df.merge(employee_department_history, left_on='BusinessEntityID', right_on='BusinessEntityID')
merged_df = merged_df.merge(person_phone, left_on='BusinessEntityID', right_on='BusinessEntityID')
merged_df.drop(columns=['rowguid_x', 'rowguid_y', 'ModifiedDate_x', 'ModifiedDate_y'], inplace=True)
merged_df = merged_df.merge(sales_person_territory, on='BusinessEntityID', how='left')

# Create a set of unique BusinessEntityID in sales_person
sales_person_ids = set(sales_person['BusinessEntityID'])

# Create a new column in merged_df to check if the employee is a sales person
merged_df['IsSalesPerson'] = merged_df['BusinessEntityID'].apply(lambda id: id in sales_person_ids)

# Convert the boolean values to int (True to 1 and False to 0)
merged_df['IsSalesPerson'] = merged_df['IsSalesPerson'].astype(int)

merged_df.columns

Index(['BusinessEntityID', 'NationalIDNumber', 'LoginID', 'OrganizationNode',
       'OrganizationLevel', 'JobTitle', 'BirthDate', 'MaritalStatus', 'Gender',
       'HireDate', 'SalariedFlag', 'VacationHours', 'SickLeaveHours',
       'CurrentFlag', 'PersonType', 'NameStyle', 'Title', 'FirstName',
       'MiddleName', 'LastName', 'Suffix', 'EmailPromotion',
       'AdditionalContactInfo', 'Demographics', 'RateChangeDate', 'Rate',
       'PayFrequency', 'ModifiedDate_x', 'DepartmentID', 'ShiftID',
       'StartDate', 'EndDate', 'PhoneNumber', 'PhoneNumberTypeID',
       'TerritoryID', 'SalesQuota', 'Bonus', 'CommissionPct', 'SalesYTD',
       'SalesLastYear', 'rowguid', 'ModifiedDate_y', 'SalesTerritoryKey',
       'SalesTerritoryAlternateKey', 'SalesTerritoryRegion',
       'SalesTerritoryCountry', 'SalesTerritoryGroup', 'IsSalesPerson'],
      dtype='object')

In [5]:
from sqlalchemy import text
def get_employee_manager(row):
    # Define the SQL query
    query = text("EXEC dbo.uspGetEmployeeManagers :id")
    
    # Execute the query and fetch the result
    result = db_op.execute(query, {"id": row['BusinessEntityID']}).fetchone()
    
    # Return the manager's BusinessEntityID
    return result[1] if result else None

# Apply the function to each row of the DataFrame
merged_df['ParentEmployeeKey'] = merged_df.apply(get_employee_manager, axis=1)

# Create a mapping from BusinessEntityID to NationalIDNumber
id_map = employee_df.set_index('BusinessEntityID')['NationalIDNumber']

# Use the map to add the ParentEmployeeNationalIDAlternateKey column
merged_df['ParentEmployeeNationalIDAlternateKey'] = merged_df['ParentEmployeeKey'].map(id_map)

merged_df[['BusinessEntityID', 'ParentEmployeeKey', 'ParentEmployeeNationalIDAlternateKey']]

Unnamed: 0,BusinessEntityID,ParentEmployeeKey,ParentEmployeeNationalIDAlternateKey
0,1,,
1,2,,
2,3,3.0,509647174
3,4,4.0,112457891
4,4,4.0,112457891
...,...,...,...
329,286,286.0,758596752
330,287,287.0,982310417
331,288,288.0,954276278
332,289,289.0,668991357


In [6]:
# Seleccion de datos
result = merged_df[['BusinessEntityID', 'ParentEmployeeKey','NationalIDNumber', 'ParentEmployeeNationalIDAlternateKey', 'SalesTerritoryKey', 'FirstName', 'LastName', 'MiddleName', 
                    'Title', 'HireDate', 'BirthDate', 'LoginID', 'EmailPromotion', 'PhoneNumber', 'MaritalStatus', 'SalariedFlag', 'Gender', 'PayFrequency', 'Rate', 'VacationHours', 
                    'SickLeaveHours', 'IsSalesPerson', 'StartDate', 'EndDate']]

print(result.columns)
result

Index(['BusinessEntityID', 'ParentEmployeeKey', 'NationalIDNumber',
       'ParentEmployeeNationalIDAlternateKey', 'SalesTerritoryKey',
       'FirstName', 'LastName', 'MiddleName', 'Title', 'HireDate', 'BirthDate',
       'LoginID', 'EmailPromotion', 'PhoneNumber', 'MaritalStatus',
       'SalariedFlag', 'Gender', 'PayFrequency', 'Rate', 'VacationHours',
       'SickLeaveHours', 'IsSalesPerson', 'StartDate', 'EndDate'],
      dtype='object')


Unnamed: 0,BusinessEntityID,ParentEmployeeKey,NationalIDNumber,ParentEmployeeNationalIDAlternateKey,SalesTerritoryKey,FirstName,LastName,MiddleName,Title,HireDate,...,MaritalStatus,SalariedFlag,Gender,PayFrequency,Rate,VacationHours,SickLeaveHours,IsSalesPerson,StartDate,EndDate
0,1,,295847284,,,Ken,Sánchez,J,,2009-01-14,...,S,True,M,2,125.5000,99,69,0,2009-01-14,
1,2,,245797967,,,Terri,Duffy,Lee,,2008-01-31,...,S,True,F,2,63.4615,1,20,0,2008-01-31,
2,3,3.0,509647174,509647174,,Roberto,Tamburello,,,2007-11-11,...,M,True,M,2,43.2692,2,21,0,2007-11-11,
3,4,4.0,112457891,112457891,,Rob,Walters,,,2007-12-05,...,S,False,M,2,8.6200,48,80,0,2007-12-05,2010-05-30
4,4,4.0,112457891,112457891,,Rob,Walters,,,2007-12-05,...,S,False,M,2,8.6200,48,80,0,2010-05-31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,286,286.0,758596752,758596752,9.0,Lynn,Tsoflias,N,,2013-05-30,...,S,True,F,2,23.0769,36,38,1,2013-05-30,
330,287,287.0,982310417,982310417,,Amy,Alberts,E,,2012-04-16,...,M,True,F,2,48.1010,21,30,1,2012-04-16,
331,288,288.0,954276278,954276278,8.0,Rachel,Valdez,B,,2013-05-30,...,S,True,F,2,23.0769,35,37,1,2013-05-30,
332,289,289.0,668991357,668991357,10.0,Jae,Pak,B,,2012-05-30,...,M,True,F,2,23.0769,37,38,1,2012-05-30,


In [7]:
# Create a copy of the DataFrame to avoid SettingWithCopyWarning
result = result.copy()

# Drop the column
#result.drop(columns=['BusinessEntityID'], inplace=True)

# Fill NaN values
result.loc[:, 'SalesTerritoryKey'] = result['SalesTerritoryKey'].fillna(11)

# Rename columns
result.rename(columns={'BusinessEntityID': 'EmployeeKey',
                        'NationalIDNumber': 'EmployeeNationalIDAlternateKey',
                        'EmailPromotion': 'EmailAddress',
                        'PhoneNumber': 'Phone'}, inplace=True)

In [8]:
result.set_index('EmployeeKey', inplace=True)
result

Unnamed: 0_level_0,ParentEmployeeKey,EmployeeNationalIDAlternateKey,ParentEmployeeNationalIDAlternateKey,SalesTerritoryKey,FirstName,LastName,MiddleName,Title,HireDate,BirthDate,...,MaritalStatus,SalariedFlag,Gender,PayFrequency,Rate,VacationHours,SickLeaveHours,IsSalesPerson,StartDate,EndDate
EmployeeKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,295847284,,11.0,Ken,Sánchez,J,,2009-01-14,1969-01-29,...,S,True,M,2,125.5000,99,69,0,2009-01-14,
2,,245797967,,11.0,Terri,Duffy,Lee,,2008-01-31,1971-08-01,...,S,True,F,2,63.4615,1,20,0,2008-01-31,
3,3.0,509647174,509647174,11.0,Roberto,Tamburello,,,2007-11-11,1974-11-12,...,M,True,M,2,43.2692,2,21,0,2007-11-11,
4,4.0,112457891,112457891,11.0,Rob,Walters,,,2007-12-05,1974-12-23,...,S,False,M,2,8.6200,48,80,0,2007-12-05,2010-05-30
4,4.0,112457891,112457891,11.0,Rob,Walters,,,2007-12-05,1974-12-23,...,S,False,M,2,8.6200,48,80,0,2010-05-31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,286.0,758596752,758596752,9.0,Lynn,Tsoflias,N,,2013-05-30,1977-02-14,...,S,True,F,2,23.0769,36,38,1,2013-05-30,
287,287.0,982310417,982310417,11.0,Amy,Alberts,E,,2012-04-16,1957-09-20,...,M,True,F,2,48.1010,21,30,1,2012-04-16,
288,288.0,954276278,954276278,8.0,Rachel,Valdez,B,,2013-05-30,1975-07-09,...,S,True,F,2,23.0769,35,37,1,2013-05-30,
289,289.0,668991357,668991357,10.0,Jae,Pak,B,,2012-05-30,1968-03-17,...,M,True,F,2,23.0769,37,38,1,2012-05-30,


## Load

In [9]:
db_op, db_etl = connect_databases()
result.to_sql('DimEmployee', db_etl, if_exists='replace', index='EmployeeKey')

73

In [10]:
#Prueba de la tabla DimEmployee

dim_employee = pd.read_sql_query('SELECT * FROM DimEmployee', db_etl)
dim_employee

Unnamed: 0,EmployeeKey,ParentEmployeeKey,EmployeeNationalIDAlternateKey,ParentEmployeeNationalIDAlternateKey,SalesTerritoryKey,FirstName,LastName,MiddleName,Title,HireDate,...,MaritalStatus,SalariedFlag,Gender,PayFrequency,Rate,VacationHours,SickLeaveHours,IsSalesPerson,StartDate,EndDate
0,1,,295847284,,11.0,Ken,Sánchez,J,,2009-01-14,...,S,True,M,2,125.5000,99,69,0,2009-01-14,
1,2,,245797967,,11.0,Terri,Duffy,Lee,,2008-01-31,...,S,True,F,2,63.4615,1,20,0,2008-01-31,
2,3,3.0,509647174,509647174,11.0,Roberto,Tamburello,,,2007-11-11,...,M,True,M,2,43.2692,2,21,0,2007-11-11,
3,4,4.0,112457891,112457891,11.0,Rob,Walters,,,2007-12-05,...,S,False,M,2,8.6200,48,80,0,2007-12-05,2010-05-30
4,4,4.0,112457891,112457891,11.0,Rob,Walters,,,2007-12-05,...,S,False,M,2,8.6200,48,80,0,2010-05-31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,286,286.0,758596752,758596752,9.0,Lynn,Tsoflias,N,,2013-05-30,...,S,True,F,2,23.0769,36,38,1,2013-05-30,
330,287,287.0,982310417,982310417,11.0,Amy,Alberts,E,,2012-04-16,...,M,True,F,2,48.1010,21,30,1,2012-04-16,
331,288,288.0,954276278,954276278,8.0,Rachel,Valdez,B,,2013-05-30,...,S,True,F,2,23.0769,35,37,1,2013-05-30,
332,289,289.0,668991357,668991357,10.0,Jae,Pak,B,,2012-05-30,...,M,True,F,2,23.0769,37,38,1,2012-05-30,
