In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, dayofmonth, month, quarter, year, dayofweek, date_format, sum as spark_sum

# Extra jars handed to Spark through PYSPARK_SUBMIT_ARGS.
# This cell must run before the SparkSession is created in the next cell.
_extra_jars = [
    "jars/hadoop-azure-3.3.6.jar",  # Hadoop connector (wasb + wasbs)
    "jars/mysql-connector-j-9.3.0.jar",  # JDBC driver used for the MySQL writes
    "jars/azure-storage-8.6.6.jar",  # Azure SDK for Java (lets Hadoop/Spark talk to Blob Storage)
    # Jetty jars: presumably runtime dependencies of the Azure connector -- TODO confirm
    "jars/jetty-client-9.4.43.v20210629.jar",
    "jars/jetty-http-9.4.43.v20210629.jar",
    "jars/jetty-io-9.4.43.v20210629.jar",
    "jars/jetty-util-9.4.43.v20210629.jar",
    "jars/jetty-util-ajax-9.4.43.v20210629.jar",
]
# Format: "--jars <comma-separated jar paths> pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars " + ",".join(_extra_jars) + " pyspark-shell"


In [2]:
# Create the SparkSession used by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("DW data load")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/07 14:05:43 WARN Utils: Your hostname, lenovo-slim, resolves to a loopback address: 127.0.1.1; using 192.168.199.13 instead (on interface wlp2s0)
25/07/07 14:05:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/07/07 14:05:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/07 14:05:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# MySQL (data warehouse) JDBC connection settings.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook:
#   MYSQL_URL      - JDBC URL (defaults to the local store_dw database)
#   MYSQL_USER     - account name (defaults to "bnguyen")
#   MYSQL_PASSWORD - account password (no default)
# NOTE(review): the password that used to be hardcoded here has been exposed in
# the notebook history and should be rotated.
mysql_url = os.environ.get("MYSQL_URL", "jdbc:mysql://localhost:3306/store_dw")
mysql_props = {
    "user": os.environ.get("MYSQL_USER", "bnguyen"),
    "password": os.environ.get("MYSQL_PASSWORD", ""),
    "driver": "com.mysql.cj.jdbc.Driver"
}

# An empty password is a valid MySQL configuration, so warn rather than raise;
# the warning makes a forgotten export obvious before the first JDBC write.
if not mysql_props["password"]:
    print("WARNING: MYSQL_PASSWORD is not set; connecting to MySQL with an empty password.")

In [4]:
# Silver storage account access key.
#
# The key is read from the AZURE_STORAGE_KEY environment variable instead of
# being embedded in the notebook.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be rotated in the Azure portal.
_silver_account_key = os.environ.get("AZURE_STORAGE_KEY")
if not _silver_account_key:
    # Fail here with a clear message rather than later with an opaque
    # authentication error on the first wasbs:// read.
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set; export the access key of the "
        "'mysilver' storage account before running this notebook."
    )

spark.conf.set(
    "fs.azure.account.key.mysilver.blob.core.windows.net",
    _silver_account_key,
)

In [None]:
# 1. DimProduct
# Read Products from the silver container, keep the product key, its name
# (renamed to ProductName) and the category/seller keys, then append the rows
# to the DimProduct table in MySQL.
products = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Products")
dim_product = products.select(
    "ProductID",
    col("Name").alias("ProductName"),
    "CategoryID",
    "SellerID",
)
dim_product.write.jdbc(
    url=mysql_url,
    table="DimProduct",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [12]:
# 2. DimCategory
# Read ProductCategories from the silver container and append the
# category key and name to the DimCategory table in MySQL.
categories = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/ProductCategories")
dim_category = categories.select("CategoryID", "CategoryName")
dim_category.write.jdbc(
    url=mysql_url,
    table="DimCategory",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [9]:
# 3. DimSeller
# Read Sellers from the silver container, rename Name to SellerName and
# append the rows to the DimSeller table in MySQL.
sellers = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Sellers")
dim_seller = sellers.select(
    "SellerID",
    col("Name").alias("SellerName"),
)
dim_seller.write.jdbc(
    url=mysql_url,
    table="DimSeller",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [10]:
# 4. DimCustomer
# Read Customers from the silver container, rename Name to CustomerName and
# append the rows to the DimCustomer table in MySQL.
customers = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Customers")
dim_customer = customers.select(
    "CustomerID",
    col("Name").alias("CustomerName"),
)
dim_customer.write.jdbc(
    url=mysql_url,
    table="DimCustomer",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [11]:
# 5. DimOrderStatus
# Read OrderStatus from the silver container and append the status key and
# name to the DimOrderStatus table in MySQL.
order_status = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderStatus")
dim_order_status = order_status.select("StatusID", "StatusName")
dim_order_status.write.jdbc(
    url=mysql_url,
    table="DimOrderStatus",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [13]:
# 6. DimDate (from Orders)
# Derive one calendar row per distinct order creation date: an integer
# yyyyMMdd key plus the date itself and its day/month/quarter/year/weekday
# parts, then append the rows to the DimDate table in MySQL.
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")

created_at = col("CreatedAt")  # every DimDate attribute is derived from this column
dim_date = (
    orders
    .select(
        date_format(created_at, "yyyyMMdd").cast("int").alias("DateKey"),
        created_at.cast("date").alias("Date"),
        dayofmonth(created_at).alias("Day"),
        month(created_at).alias("Month"),
        quarter(created_at).alias("Quarter"),
        year(created_at).alias("Year"),
        dayofweek(created_at).alias("DayOfWeek"),
    )
    .distinct()  # collapse orders created on the same date into one row
)
dim_date.write.jdbc(
    url=mysql_url,
    table="DimDate",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [7]:
# 7. DimReason
# Read Reasons from the silver container and append the reason key, type and
# description to the DimReason table in MySQL.
reasons = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Reasons")
dim_reason = reasons.select("ReasonID", "ReasonType", "ReasonDescription")
dim_reason.write.jdbc(
    url=mysql_url,
    table="DimReason",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [None]:
# Fact sales
# One row per order item belonging to an order that has at least one payment,
# enriched with the order's customer/status/date key and the product's
# seller/category, then appended to the FactSales table in MySQL.
order_items = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderItems")
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")
payments = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Payments")
products = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Products")

# Distinct IDs of the orders that appear in Payments
paid_orders = payments.select("OrderID").distinct()

# Numeric forms of the item measures, reused for the output columns and Revenue
quantity = col("Quantity").cast("int")
unit_price = col("CurrentPrice").cast("double")

# Order items joined to their order, restricted to paid orders, plus the product
sales_joined = (
    order_items
    .join(orders, order_items.OrderID == orders.OrderID)
    .join(paid_orders, order_items.OrderID == paid_orders.OrderID, "inner")
    .join(products, order_items.ProductID == products.ProductID)
)

fact_sales = sales_joined.select(
    order_items["OrderItemID"],
    order_items["OrderID"],
    order_items["ProductID"],
    products["SellerID"],
    orders["CustomerID"],
    products["CategoryID"],
    date_format(orders["CreatedAt"], "yyyyMMdd").cast("int").alias("OrderDateKey"),
    orders["StatusID"],
    quantity.alias("Quantity"),
    unit_price.alias("CurrentPrice"),
    # Revenue = quantity * price, computed on the numeric casts
    (quantity * unit_price).alias("Revenue"),
)
fact_sales.write.jdbc(
    url=mysql_url,
    table="FactSales",
    mode="append",
    properties=mysql_props,
)

25/07/07 14:06:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-azure-file-system.properties,hadoop-metrics2.properties
                                                                                

In [None]:
# Fact reasons
# One row per reason whose order exists in Orders, carrying the order's
# yyyyMMdd date key and status, appended to the FactOrderReason table in MySQL.
reasons = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Reasons")
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")

# Inner join: reasons without a matching order are dropped
reasons_with_orders = reasons.join(orders, reasons.OrderID == orders.OrderID, "inner")

fact_order_reason = reasons_with_orders.select(
    reasons["ReasonID"],
    reasons["OrderID"],
    date_format(orders["CreatedAt"], "yyyyMMdd").cast("int").alias("OrderDateKey"),
    orders["StatusID"],
)
fact_order_reason.write.jdbc(
    url=mysql_url,
    table="FactOrderReason",
    mode="append",
    properties=mysql_props,
)

                                                                                

In [None]:
# Fact customer order
# One row per paid order with its customer, yyyyMMdd date key and total
# revenue (sum of quantity * price over its items), appended to the
# FactCustomerOrder table in MySQL.
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")
order_items = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderItems")
payments = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Payments")

# Distinct IDs of the orders that appear in Payments
paid_orders = payments.select("OrderID").distinct()

# Per-item revenue, restricted to items of paid orders
item_revenue = col("Quantity").cast("double") * col("CurrentPrice").cast("double")
order_items_paid = (
    order_items
    .join(paid_orders, on="OrderID", how="inner")
    .withColumn("Revenue", item_revenue)
)

# Total revenue per order
order_revenue = order_items_paid.groupBy("OrderID").agg(
    spark_sum("Revenue").alias("Revenue")
)

# Attach the customer and the order date key from Orders
fact_customer_order = (
    orders
    .join(order_revenue, on="OrderID", how="inner")
    .select(
        "CustomerID",
        "OrderID",
        date_format(col("CreatedAt"), "yyyyMMdd").cast("int").alias("OrderDateKey"),
        col("Revenue").cast("double"),
    )
)
fact_customer_order.write.jdbc(
    url=mysql_url,
    table="FactCustomerOrder",
    mode="append",
    properties=mysql_props,
)