In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, monotonically_increasing_id)
from notebookUtils import readCsvAsDataframe

In [3]:
# Obtain the Spark session for this notebook (an existing session is reused
# if one is already running), registered under the app name "factsales2".
spark = (
    SparkSession.builder
    .appName("factsales2")
    .getOrCreate()
)

In [4]:
# Load the sales CSV through the project helper from notebookUtils.
# NOTE(review): the helper's path resolution and schema handling are not
# visible in this notebook — confirm against notebookUtils.
fact_sales = readCsvAsDataframe(
    spark,
    "fact_sales_data_v2.csv",
)

In [16]:
# Inspect the loaded frame; the recorded output for this cell is the
# DataFrame repr listing each column name with its type.
display(fact_sales)

DataFrame[ProductCategory: string, ProductName: string, Brand: string, StoreRegion: string, StoreName: string, StoreType: string, SalesRep: string, Department: string, EmployeeRole: string, UnitsSold: double, UnitPrice: double, Discount: double, SaleDate: date]

### Create Employee Dimension

In [10]:
# Keep only the employee attributes from the sales rows and collapse them to
# one row per distinct (SalesRep, Department, EmployeeRole) combination.
df_employee = (
    fact_sales
    .select("SalesRep", "Department", "EmployeeRole")
    .distinct()
)

In [11]:
# Append a surrogate key column after the employee attributes.
# monotonically_increasing_id() yields unique, increasing 64-bit ids; they are
# not guaranteed to be consecutive.
df_employee2 = df_employee.select(
    "*",
    monotonically_increasing_id().alias("EmployeeKey"),
)

In [12]:
# Single placeholder row: every attribute is "N/A" and the key is -1.
df_employee_na = spark.createDataFrame(
    data=[("N/A", "N/A", "N/A", -1)],
    schema=["SalesRep", "Department", "EmployeeRole", "EmployeeKey"],
)

In [14]:
# Append the "N/A" placeholder row to the keyed employee rows.
# unionByName matches columns by name, so the result does not depend on both
# frames listing their columns in the same order (unionAll matches purely by
# position and would silently mix values if the order ever diverged).
dim_employee = df_employee2.unionByName(df_employee_na)

In [15]:
# Save the dimension as the Parquet-backed table "dim_employee", overwriting
# whatever a previous run stored under that name.
(
    dim_employee.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("dim_employee")
)

In [16]:
# Read the saved table back through the catalog by name. A hard-coded
# "spark-warehouse/..." path only resolves when the warehouse directory is at
# its default location relative to the current working directory.
df_dim_employee = spark.read.table("dim_employee")

In [17]:
# Print up to 30 rows of the reloaded dimension as a quick visual check.
df_dim_employee.show(n=30)

[Row(SalesRep='Kyle Lin', Department='Electronics', EmployeeRole='Sales Associate', EmployeeKey=0),
 Row(SalesRep='Charles Fields', Department='Apparel', EmployeeRole='Manager', EmployeeKey=1),
 Row(SalesRep='Wendy Castillo', Department='Home', EmployeeRole='Manager', EmployeeKey=2),
 Row(SalesRep='Wendy Castillo', Department='Electronics', EmployeeRole='Manager', EmployeeKey=3),
 Row(SalesRep='Charles Fields', Department='Home', EmployeeRole='Cashier', EmployeeKey=4),
 Row(SalesRep='Kyle Lin', Department='Home', EmployeeRole='Cashier', EmployeeKey=5),
 Row(SalesRep='John Harris', Department='Electronics', EmployeeRole='Manager', EmployeeKey=6),
 Row(SalesRep='John Harris', Department='Apparel', EmployeeRole='Manager', EmployeeKey=7),
 Row(SalesRep='Billy Perez', Department='Electronics', EmployeeRole='Cashier', EmployeeKey=8),
 Row(SalesRep='John Harris', Department='Home', EmployeeRole='Cashier', EmployeeKey=9),
 Row(SalesRep='Emily Vazquez', Department='Electronics', EmployeeRole='S