In [1]:
# Import necessary libraries
from pyspark.sql.functions import *
from pyspark.sql.types import StringType

# Make sure the target schema exists before any table is saved into it
spark.sql("CREATE SCHEMA IF NOT EXISTS adventureworks")

# Read the bronze customer table and discard the columns that are not carried forward.
# NOTE(review): this read is qualified with "Sales_Development." while the other reads
# visible in this notebook use the shorter "bronze.adventureworks.*" form — confirm intent.
customers = (
    spark.read.table("Sales_Development.bronze.adventureworks.customer")
    .drop("PasswordHash", "PasswordSalt", "rowguid", "ModifiedDate")
)

# Function to determine gender  
def determine_gender_udf(title):
    """Translate a courtesy title into a gender label.

    Args:
        title: The title value to classify; compared by equality only.

    Returns:
        'Male' when title equals 'Mr.', 'Female' when it equals 'Ms.',
        and 'Unknown' for every other value (including None).
    """
    if title == 'Mr.':
        return 'Male'
    if title == 'Ms.':
        return 'Female'
    # Anything that is not one of the two recognised titles falls through here
    return 'Unknown'

# Rebind the name to a Spark UDF (string return type) wrapping the plain Python function
determine_gender_udf = udf(determine_gender_udf, StringType())

# Add a "Gender" column computed from the whitespace-trimmed "Title" column
customers = customers.withColumn(
    "Gender",
    determine_gender_udf(trim(customers["Title"])),
)

# Define the strip_prefix function
def strip_prefix(value, prefix="adventure-works\\"):
    """Remove a leading domain prefix from a string.

    The previous implementation used ``str.strip`` with the prefix as its
    argument. ``str.strip`` treats that argument as a *set of characters* and
    removes any of them from both ends, so it also ate leading/trailing
    letters of the actual name. This version removes the exact prefix once,
    from the start only.

    Args:
        value: The string to clean, or None.
        prefix: The exact leading text to remove. Defaults to
            "adventure-works" followed by a single backslash.

    Returns:
        None when value is None; value without the prefix when it starts
        with the prefix; otherwise value unchanged.
    """
    # Null column values reach a Python UDF as None; pass them through untouched
    if value is None:
        return None
    if prefix and value.startswith(prefix):
        return value[len(prefix):]
    return value

# Expose strip_prefix as a Spark UDF with a string return type
strip_prefix_udf = udf(strip_prefix, StringType())

# Replace the "SalesPerson" column with the UDF applied to its current values
customers = customers.withColumn(
    "SalesPerson",
    strip_prefix_udf(customers["SalesPerson"]),
)

# Normalise phone numbers by deleting every match of: "1 (", two digits, ") "
customers = customers.withColumn(
    "Phone",
    regexp_replace(customers["Phone"], r"1 \(\d{2}\) ", ""),
)

# Save the cleaned customers as a table using the "Overwrite" save mode
(
    customers.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_customer")
)

StatementMeta(, d555e1b0-de58-413f-bcbc-afed2ade0944, 3, Finished, Available, Finished)

In [11]:
# Read the bronze address table and discard its "rowguid" column in one expression
address = spark.read.table("bronze.adventureworks.address").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    address.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_address")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 13, Finished, Available, Finished)

In [12]:
# Read the bronze customeraddress table and discard its "rowguid" column in one expression
customeraddress = spark.read.table("bronze.adventureworks.customeraddress").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    customeraddress.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_customeraddress")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 14, Finished, Available, Finished)

In [4]:
# Read the bronze product table and discard its "rowguid" column in one expression
product = spark.read.table("bronze.adventureworks.product").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    product.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_product")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 6, Finished, Available, Finished)

In [5]:
# Read the bronze productcategory table and discard its "rowguid" column in one expression
productcategory = spark.read.table("bronze.adventureworks.productcategory").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    productcategory.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_productcategory")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 7, Finished, Available, Finished)

In [6]:
# Read the bronze productdescription table and discard its "rowguid" column in one expression
productdescription = spark.read.table("bronze.adventureworks.productdescription").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    productdescription.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_productdescription")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 8, Finished, Available, Finished)

In [7]:
# Read the bronze productmodel table and discard its "rowguid" column in one expression
productmodel = spark.read.table("bronze.adventureworks.productmodel").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    productmodel.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_productmodel")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 9, Finished, Available, Finished)

In [8]:
# Read the bronze productmodelproductdescription table and discard its "rowguid" column
productmodelproductdescription = (
    spark.read.table("bronze.adventureworks.productmodelproductdescription")
    .drop("rowguid")
)

# Save the result as a table using the "Overwrite" save mode
(
    productmodelproductdescription.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_productmodelproductdescription")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 10, Finished, Available, Finished)

In [9]:
# Read the bronze salesorderdetail table and discard its "rowguid" column in one expression
salesorderdetail = spark.read.table("bronze.adventureworks.salesorderdetail").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    salesorderdetail.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_salesorderdetail")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 11, Finished, Available, Finished)

In [10]:
# Read the bronze salesorderheader table and discard its "rowguid" column in one expression
salesorderheader = spark.read.table("bronze.adventureworks.salesorderheader").drop("rowguid")

# Save the result as a table using the "Overwrite" save mode
(
    salesorderheader.write
    .mode("Overwrite")
    .saveAsTable("adventureworks.clean_salesorderheader")
)

StatementMeta(, 6efdc37f-9a23-4afd-8a67-52a5726f8452, 12, Finished, Available, Finished)