## Lakehouse 3 - Create Star Schema
This run-once notebook sets up the schema for the fact and dimension tables.
Configure the `sourceTableName` variable in the first cell (if needed) to match the hourly aggregation table. The begin/end years set the range covered by the date dimension table. This notebook recreates all tables and rebuilds the schema: existing fact and dimension tables will be overwritten.

In [None]:
# configure the source table name (if needed) and begin/end dates for the date dimension

from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, lit, when, substring, monotonically_increasing_id 
from datetime import datetime

# name of source table -- default is the hourly aggregation table
# (needs a Symbol column: the symbol dimension is loaded from its distinct values)
sourceTableName = 'stocks_hour_agg'

# begin/end years for dim_date table (both years are covered in full);
# endYear is evaluated at run time as two years past the current year
beginYear = 2023
endYear = datetime.today().year + 2

# warn only -- nothing is raised here, so the notebook keeps running
# even when the source table is missing
if not spark.catalog.tableExists(sourceTableName):
    msg = f'Warning! Source table not found: {sourceTableName}'
    print(msg)

In [None]:
# main fact table for stock data

def create_fact_Stocks_Daily_Prices():
    """(Re)create the empty fact_Stocks_Daily_Prices Delta table.

    Uses CREATE OR REPLACE, so an existing table with this name is replaced.
    Columns: Symbol_SK (LONG, required), PriceDateKey (DATE) and the
    MinPrice / MaxPrice / ClosePrice DOUBLE measures.
    """
    # Symbol_SK presumably references dim_symbol.Symbol_SK and PriceDateKey
    # dim_date.DateKey -- no constraint is declared here, confirm with the loader.
    ddl = """
        CREATE OR REPLACE TABLE fact_Stocks_Daily_Prices (
            Symbol_SK LONG NOT NULL
            ,PriceDateKey DATE 
            ,MinPrice DOUBLE 
            ,MaxPrice DOUBLE 
            ,ClosePrice DOUBLE)
        USING DELTA
        """
    spark.sql(ddl)
    
create_fact_Stocks_Daily_Prices()

In [None]:
# symbol dimension holds details about each company

def create_dim_symbol():
    """(Re)create the empty dim_symbol Delta table.

    Uses CREATE OR REPLACE, so an existing table with this name is replaced.
    Columns: Symbol_SK (LONG, required), Symbol (VARCHAR(5), required),
    Name (VARCHAR(25)) and Market (VARCHAR(15)).
    """
    ddl = """
        CREATE OR REPLACE TABLE dim_symbol (
            Symbol_SK LONG NOT NULL
            ,Symbol VARCHAR(5) NOT NULL
            ,Name VARCHAR(25)
            ,Market VARCHAR(15) )
        USING DELTA
        """
    spark.sql(ddl)

create_dim_symbol()

In [None]:
# load the symbol table by getting a list of distinct symbols from source table

def _mapped_column(key_column, mapping, default):
    """Build a CASE-style Column: the value mapped to the first key equal to key_column, else default."""
    expr = None
    for key, value in mapping.items():
        matches = key_column == key
        expr = when(matches, value) if expr is None else expr.when(matches, value)
    return lit(default) if expr is None else expr.otherwise(default)


def dim_symbol_initial_load(sourceTableName):
    """Insert into dim_symbol every stock symbol of the source table that it does not hold yet.

    Each new symbol gets a surrogate key, a company name and a market, then the
    rows are merged into dim_symbol (insert-only; existing rows are untouched).
    Finally the whole dimension is printed.

    Surrogate keys are assigned only to symbols missing from dim_symbol and
    start after the highest key already stored. On an empty dim_symbol this
    yields 1..n in alphabetical symbol order; on a re-run it avoids handing a
    new symbol a key that an existing row already uses.

    Args:
        sourceTableName: table to read the distinct Symbol values from. If the
            table does not exist a warning is printed and nothing is loaded.
    """
    if not spark.catalog.tableExists(sourceTableName):
        msg = f'Warning! Source table not found: {sourceTableName}. Will not load stock symbols.'
        print(msg)
        return

    # Known demo companies; any other symbol falls back to "Company Unknown".
    company_names = {
        "BCUZ": "Company Because",
        "IDGD": "Company IDontGiveADarn",
        "IDK": "Company IDontKnow",
        "TDY": "Company Today",
        "TMRW": "Company Tomorrow",
        "WHAT": "Company What",
        "WHY": "Company Why",
        "WHO": "Company Who",
    }
    # Market is derived from the first letter of the symbol.
    markets_by_initial = {
        "B": "NASDAQ",
        "W": "NASDAQ",
        "I": "NYSE",
        "T": "NYSE",
    }

    # Highest key already issued (0 when dim_symbol is empty), read eagerly so
    # the ranking below is a plain deterministic offset.
    max_sk = spark.sql("SELECT COALESCE(MAX(Symbol_SK), 0) FROM dim_symbol").first()[0]

    # Rank only the symbols dim_symbol does not know yet: ranking every source
    # symbol would renumber existing ones and produce duplicate keys on insert.
    df_symbols = spark.sql(f"""
        SELECT src.Symbol, {max_sk} + DENSE_RANK() OVER (ORDER BY src.Symbol ASC) AS Symbol_SK
        FROM (SELECT DISTINCT Symbol FROM {sourceTableName}) src
        LEFT ANTI JOIN dim_symbol dim ON src.Symbol = dim.Symbol
        """)

    symbol = col("Symbol")
    df_symbols = df_symbols.withColumn(
        "Name", _mapped_column(symbol, company_names, "Company Unknown")
    )
    df_symbols = df_symbols.withColumn(
        "Market", _mapped_column(substring(symbol, 1, 1), markets_by_initial, "No Market")
    )

    # Insert-only merge; the match condition still guards against re-inserting
    # a symbol that is already present.
    (
        DeltaTable.forName(spark, "dim_symbol").alias("dim_symbol")
        .merge(df_symbols.alias("updates"), "dim_symbol.Symbol = updates.Symbol")
        .whenNotMatchedInsert(
            values={
                "Symbol_SK": "updates.Symbol_SK",
                "Symbol": "updates.Symbol",
                "Name": "updates.Name",
                "Market": "updates.Market",
            }
        )
        .execute()
    )

    spark.sql("SELECT * FROM dim_symbol").show()

dim_symbol_initial_load(sourceTableName)

In [None]:
# create and populate the date dimension

def createAndPopulate_dim_date(beginYear=2022, endYear=2025):
    """Build the dim_date table with one row per calendar day and overwrite any existing one.

    The table covers January 1 of beginYear through December 31 of endYear,
    both inclusive, and its first 20 rows are printed after the write.

    Args:
        beginYear: first calendar year to include.
        endYear: last calendar year to include; must not be earlier than beginYear.

    Raises:
        ValueError: if beginYear is later than endYear. Without this check the
            day count is zero or negative and dim_date would be silently
            replaced by an empty table.
    """
    if beginYear > endYear:
        raise ValueError(
            f"beginYear ({beginYear}) must not be later than endYear ({endYear})"
        )

    # Number of days in the inclusive range [Jan 1 beginYear, Dec 31 endYear].
    day_count = (datetime(endYear, 12, 31) - datetime(beginYear, 1, 1)).days + 1

    # One row per day: offset the first day by the row id.
    dates = spark.range(day_count).select(
        date_add(lit(f"{beginYear}-01-01"), col("id").cast("int")).alias("date")
    )

    # "date" is already a DATE column, so it is used directly as the key
    # instead of being formatted to a string and cast back.
    datesdf = dates.select(
        col("date").alias("DateKey"),
        dayofmonth("date").alias("DayNum"),
        dayofweek("date").alias("DayOfWeekNum"),  # Spark numbering: 1 = Sunday
        date_format("date", "EEEE").alias("DayOfWeekName"),
        month("date").alias("MonthNum"),
        date_format("date", "MMMM").alias("MonthName"),
        quarter("date").alias("QuarterNum"),
        concat(lit("Q"), quarter("date")).alias("QuarterName"),
        year("date").alias("Year"),
    )

    # overwriteSchema lets the rebuild succeed even when an older dim_date with
    # a different schema exists, matching the CREATE OR REPLACE used for the
    # other tables of the star schema.
    (
        datesdf.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("dim_date")
    )
    datesdf.show()

createAndPopulate_dim_date(beginYear, endYear)

In [None]:
# drop tables for testing

def dropTables():
    """Drop the fact and dimension tables of the star schema (testing helper).

    Uses DROP TABLE IF EXISTS so a table that is already gone does not abort
    the cleanup and leave the remaining tables behind.
    """
    for table_name in ("fact_stocks_daily_prices", "dim_symbol", "dim_date"):
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")

# dropTables()

In [None]:
# optional optimization, recommended for small tables
# typically scheduled for regular maintenance

def optimizeTables(sourceTableName):
    """Run Delta file compaction on the star-schema tables and on the source table.

    dim_date, dim_symbol and fact_stocks_daily_prices are always compacted, in
    that order; the source table is compacted only when it exists.

    Args:
        sourceTableName: name of the source table to compact if present.
    """
    for star_table in ("dim_date", "dim_symbol", "fact_stocks_daily_prices"):
        DeltaTable.forName(spark, star_table).optimize().executeCompaction()

    # The source table is optional: skip it quietly when it is not there.
    if not spark.catalog.tableExists(sourceTableName):
        return
    DeltaTable.forName(spark, sourceTableName).optimize().executeCompaction()

optimizeTables(sourceTableName)