In [7]:
# pip install -r requirements.txt
# python -m ipykernel install --user --name=python3

import os
import warnings

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    date_format,
    dayofmonth,
    dayofweek,
    month,
    quarter,
    year,
    min as spark_min,
    sum as spark_sum,
)

# Directory that holds the connector JARs handed to Spark below.
# Set the SPARK_JAR_DIR environment variable to point at another location;
# when it is unset (or empty) the original path is used, so existing setups
# keep working unchanged.
jar_dir = os.environ.get("SPARK_JAR_DIR") or "/home/bnguyen/Desktop/DE_project/scripts/jars"

# JAR file names, in the order they are passed to --jars.
_jar_names = [
    "hadoop-azure-3.3.6.jar",
    "azure-storage-8.6.6.jar",
    "hadoop-common-3.3.6.jar",
    "jetty-client-9.4.43.v20210629.jar",
    "jetty-http-9.4.43.v20210629.jar",
    "jetty-io-9.4.43.v20210629.jar",
    "mysql-connector-j-9.3.0.jar",
    "jetty-util-9.4.43.v20210629.jar",
    "jetty-util-ajax-9.4.43.v20210629.jar",
]

# Build "--jars <path>,<path>,... pyspark-shell".
# NOTE(review): PySpark is expected to read PYSPARK_SUBMIT_ARGS when it starts
# the JVM, so this cell has to run before the SparkSession is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars "
    + ",".join(f"{jar_dir}/{jar_name}" for jar_name in _jar_names)
    + " pyspark-shell"
)


In [8]:
# Create the Spark session used by every cell below, or reuse the one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("DW data load")
    .getOrCreate()
)

In [9]:
# MySQL connection settings used by the JDBC writes in this notebook.
# Credentials are taken from the environment so that no secret is stored in
# the notebook itself:
#   MYSQL_URL      - JDBC URL (defaults to the local store_dw database)
#   MYSQL_USER     - account name (defaults to "bnguyen")
#   MYSQL_PASSWORD - account password (no default)
mysql_url = os.environ.get("MYSQL_URL", "jdbc:mysql://localhost:3306/store_dw")
mysql_props = {
    "user": os.environ.get("MYSQL_USER", "bnguyen"),
    "password": os.environ.get("MYSQL_PASSWORD", ""),
    "driver": "com.mysql.cj.jdbc.Driver",
}

# Only the JDBC writes need the password, so a missing value is reported here
# as a warning rather than an error; the Parquet outputs are not blocked.
if not mysql_props["password"]:
    warnings.warn(
        "MYSQL_PASSWORD is not set; JDBC writes to MySQL will be attempted "
        "with an empty password."
    )

In [10]:
# Storage account keys for the silver account (read by the cells below) and
# the gold account (written by the cells below).
# The keys are read from environment variables instead of being embedded in
# the notebook.
# NOTE(review): the keys that used to be hard-coded here should be treated as
# exposed and rotated.
_storage_key_env = {
    # Silver access key
    "fs.azure.account.key.mysilver.blob.core.windows.net": "AZURE_SILVER_ACCOUNT_KEY",
    # Gold
    "fs.azure.account.key.mygold.dfs.core.windows.net": "AZURE_GOLD_ACCOUNT_KEY",
}

# Fail here with a clear message rather than later inside the first read/write.
_missing_key_vars = [
    env_name for env_name in _storage_key_env.values() if not os.environ.get(env_name)
]
if _missing_key_vars:
    raise RuntimeError(
        "Missing storage account key environment variable(s): "
        + ", ".join(_missing_key_vars)
    )

for _conf_name, _env_name in _storage_key_env.items():
    spark.conf.set(_conf_name, os.environ[_env_name])


In [11]:
# 1. DimProduct
# Source: silver "Products". Keeps the product key, its name (renamed to
# ProductName) and the category / seller keys.
products = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Products")
dim_product = products.select(
    "ProductID",
    products["Name"].alias("ProductName"),
    "CategoryID",
    "SellerID",
)

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_product.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimProduct",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_product.write.jdbc(mysql_url, "DimProduct", mode="append", properties=mysql_props)
# dim_product.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimProduct.csv").save()

                                                                                

In [12]:
# 2. DimCategory
# Source: silver "ProductCategories". Keeps the category key and its name.
categories = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/ProductCategories")
dim_category = categories.select("CategoryID", "CategoryName")

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_category.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimCategory",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_category.write.jdbc(mysql_url, "DimCategory", mode="append", properties=mysql_props)
# dim_category.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimCategory.csv").save()

                                                                                

In [13]:
# 3. DimSeller
# Source: silver "Sellers". Keeps the seller key and its name (renamed to
# SellerName).
sellers = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Sellers")
dim_seller = sellers.select(
    "SellerID",
    sellers["Name"].alias("SellerName"),
)

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_seller.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimSeller",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_seller.write.jdbc(mysql_url, "DimSeller", mode="append", properties=mysql_props)
# dim_seller.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimSeller.csv").save()

                                                                                

In [14]:
# 4. DimCustomer
# Source: silver "Customers". Keeps the customer key and its name (renamed to
# CustomerName).
customers = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Customers")
dim_customer = customers.select(
    "CustomerID",
    customers["Name"].alias("CustomerName"),
)

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_customer.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimCustomer",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_customer.write.jdbc(mysql_url, "DimCustomer", mode="append", properties=mysql_props)
# dim_customer.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimCustomer.csv").save()

                                                                                

In [15]:
# 5. DimOrderStatus
# Source: silver "OrderStatus". Keeps the status key and its name.
order_status = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderStatus")
dim_order_status = order_status.select("StatusID", "StatusName")

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_order_status.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimOrderStatus",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_order_status.write.jdbc(mysql_url, "DimOrderStatus", mode="append", properties=mysql_props)
# dim_order_status.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimOrderStatus.csv").save()

                                                                                

In [16]:
# 6. DimDate (from Orders)
# One row per distinct set of calendar attributes derived from Orders.CreatedAt.
# DateKey is the yyyyMMdd rendering of CreatedAt cast to int.
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")

# Every attribute is derived from this single column.
created_at = col("CreatedAt")

dim_date = (
    orders
    .select(
        date_format(created_at, "yyyyMMdd").cast("int").alias("DateKey"),
        created_at.cast("date").alias("Date"),
        dayofmonth(created_at).alias("Day"),
        month(created_at).alias("Month"),
        quarter(created_at).alias("Quarter"),
        year(created_at).alias("Year"),
        dayofweek(created_at).alias("DayOfWeek"),
    )
    .distinct()
)

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_date.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimDate",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_date.write.jdbc(mysql_url, "DimDate", mode="append", properties=mysql_props)
# dim_date.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimDate.csv").save()

                                                                                

In [17]:
# 7. DimReason
# Source: silver "Reasons". Keeps the reason key, its type and its description.
reasons = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Reasons")
dim_reason = reasons.select("ReasonID", "ReasonType", "ReasonDescription")

# Active sink: Parquet in the gold-test container, replacing any previous output.
dim_reason.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/DimReason",
    mode="overwrite",
)

# Disabled alternative sinks (MySQL via JDBC, single-file CSV):
# dim_reason.write.jdbc(mysql_url, "DimReason", mode="append", properties=mysql_props)
# dim_reason.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/DimReason.csv").save()

                                                                                

In [18]:
# Fact sales
# Intended grain: one row per order item belonging to an order that has at
# least one payment.
order_items = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderItems").alias("oi")
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders").alias("o")
payments = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Payments").alias("p")
products = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Products").alias("pr")

# Only consider orders that have a payment.
# Payments are collapsed to ONE row per order before joining. Joining the raw
# payments table would repeat every order item once per payment row and
# double-count Revenue/Profit for orders with several payments.
# NOTE(review): the earliest payment timestamp is kept as CreatedAt; with a
# single payment per order this is exactly the value the raw join produced.
# Confirm that "earliest" is the desired choice when there are several.
paid_orders = (
    payments
    .groupBy("OrderID")
    .agg(spark_min("CreatedAt").alias("CreatedAt"))
    .alias("po")
)

# Measures are computed from the product's current Price and Cost, not from a
# price stored on the order item.
item_quantity = col("oi.Quantity").cast("int")
unit_price = col("pr.Price").cast("double")
unit_cost = col("pr.Cost").cast("double")

# Join OrderItems with Orders, keep paid orders only, then attach product data.
fact_sales = (
    order_items
    .join(orders, col("oi.OrderID") == col("o.OrderID"))
    .join(paid_orders, col("oi.OrderID") == col("po.OrderID"), "inner")
    .join(products, col("oi.ProductID") == col("pr.ProductID"))
    .select(
        col("oi.OrderItemID"),
        col("oi.OrderID"),
        col("oi.ProductID"),
        col("pr.SellerID"),
        col("o.CustomerID"),
        col("pr.CategoryID"),
        date_format(col("o.CreatedAt"), "yyyyMMdd").cast("int").alias("OrderDateKey"),
        col("o.StatusID"),
        item_quantity.alias("Quantity"),
        unit_price.alias("CurrentPrice"),  # Use Price from Product
        unit_cost.alias("Cost"),           # Use Cost from Product
        (item_quantity * unit_price).alias("Revenue"),
        (
            (item_quantity * unit_price) -
            (item_quantity * unit_cost)
        ).alias("Profit"),
        col("po.CreatedAt").alias("CreatedAt")  # payment timestamp
    )
)
# fact_sales.write.jdbc(mysql_url, "FactSales", mode="overwrite", properties=mysql_props)
fact_sales.write.mode("overwrite").parquet("abfss://gold-test@mygold.dfs.core.windows.net/FactSales")
# fact_sales.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/FactSales.csv").save()

                                                                                

In [20]:
# FactOrderReason (updated for new schema)
# Each reason is matched to the order items of its order, to the products of
# those items (for SellerID) and to the order itself (for date key and status).
reasons = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Reasons")
order_items = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/OrderItems")
products = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Products")
orders = spark.read.parquet("wasbs://silver@mysilver.blob.core.windows.net/Orders")

# Aliases are applied only inside the join so the frames above stay unaliased.
fact_order_reason = (
    reasons.alias("r")
    .join(order_items.alias("oi"), col("r.OrderID") == col("oi.OrderID"), "inner")
    .join(products.alias("pr"), col("oi.ProductID") == col("pr.ProductID"), "inner")
    .join(orders.alias("o"), col("r.OrderID") == col("o.OrderID"), "inner")
    .select(
        col("oi.OrderItemID"),
        col("r.ReasonID"),
        col("r.OrderID"),
        col("pr.SellerID"),
        date_format(col("o.CreatedAt"), "yyyyMMdd").cast("int").alias("OrderDateKey"),
        col("o.StatusID"),
    )
)

# Two active sinks: the MySQL table (replaced on every run) and Parquet in the
# gold-test container (also replaced).
fact_order_reason.write.jdbc(
    url=mysql_url,
    table="FactOrderReason",
    mode="overwrite",
    properties=mysql_props,
)
fact_order_reason.write.parquet(
    "abfss://gold-test@mygold.dfs.core.windows.net/FactOrderReason",
    mode="overwrite",
)

# Disabled alternative sink (single-file CSV):
# fact_order_reason.coalesce(1).write.format("csv").options(header="True", delimiter = ',').mode("overwrite").option("path", "abfss://gold-csv@mygold.dfs.core.windows.net/FactOrderReason.csv").save()

                                                                                