## Lakehouse 3: Historical Import to Lakehouse Dimensional Model
This notebook downloads additional historical data and loads it into the fact table, so that there is more data available for more interesting reporting.

Set `sourceTableName` to the name of the source table.

The data that is downloaded is shared with the data science module. It is downloaded as a compressed tar archive (`.tgz`) and then extracted as CSV files into the lakehouse unstructured files folder. These files can be loaded in the same way as in the previous notebook.

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

# object name used to look up this source in etl_ingestsourceinfo
sourceTableName = 'raw_stock_data'

# watermark placeholders; both stay unset until a later cell assigns them
highWaterMark = newHighWaterMark = None

In [None]:
# configuration for the historical data download

# url to the source tar archive (gzip-compressed)
FULL_URL = "https://fabricrealtimelab.blob.core.windows.net/public/AbboCost_Stock_History/stockhistory-2023-2024.tgz?sp=r&st=2023-11-26T23:59:09Z&se=2027-11-27T07:59:09Z&spr=https&sv=2022-11-02&sr=b&sig=70w%2BT6ZVGpdTd6YJr%2FzPhKUFk9JYJ2ezu6%2BBBr9ahxc%3D"
# lakehouse location -- assumes default lakehouse
LAKEHOUSE_FOLDER = "/lakehouse/default"

# archive file name and data folder (relative to the lakehouse root)
TAR_FILE_NAME = "stockhistory-2023-2024.tgz"
DATA_FOLDER = "Files/stockhistory/raw"

# LAKEHOUSE_FOLDER is already an absolute path, so it is used as the prefix
# directly; adding another "/" in front would produce "//lakehouse/...".
TAR_FILE_PATH = f"{LAKEHOUSE_FOLDER}/{DATA_FOLDER}/tar/"
CSV_FILE_PATH = f"{LAKEHOUSE_FOLDER}/{DATA_FOLDER}/csv/"


In [None]:
import os
import subprocess

# add a lakehouse if the notebook has no default lakehouse
# a new notebook will not link to any lakehouse by default
if not os.path.exists(LAKEHOUSE_FOLDER):
    raise FileNotFoundError(
        "Lakehouse not found, please add a lakehouse for the notebook."
    )

tar_file = f"{TAR_FILE_PATH}{TAR_FILE_NAME}"
downloaded = False

# download the archive only when it is not already in the lakehouse
if not os.path.exists(tar_file):
    os.makedirs(TAR_FILE_PATH, exist_ok=True)
    try:
        # argument list (no shell) so the URL's '&' and '%' need no quoting;
        # check=True turns a non-zero wget exit status into an exception
        subprocess.run(["wget", FULL_URL, "-O", tar_file], check=True)
    except BaseException:
        # wget -O creates the output file even when the transfer fails;
        # remove it so the existence check above retries on the next run
        if os.path.exists(tar_file):
            os.remove(tar_file)
        raise
    downloaded = True

#todo: better file checking
os.makedirs(CSV_FILE_PATH, exist_ok=True)

# extract after a fresh download, or when an earlier run left no CSV files
if downloaded or not os.listdir(CSV_FILE_PATH):
    subprocess.run(["tar", "-zxvf", tar_file, "-C", CSV_FILE_PATH], check=True)

In [None]:
# load every extracted CSV; files are laid out as {year}/{month}/{day}.csv

csv_glob = f"{DATA_FOLDER}/csv/*/*/*.csv"
csv_reader = spark.read.format("csv").option("header", "true")
df_stocks = csv_reader.load(csv_glob)

# peek at the last few rows
df_stocks.tail(8)

In [None]:
# find the earliest date in the fact table; coalesce falls back to today's
# date when the table has no rows

min_date_query = "SELECT coalesce(min(PriceDateKey),current_date()) as minDate FROM fact_stocks_daily_prices"
df_fact_min_date = spark.sql(min_date_query)
minDate = df_fact_min_date.first()["minDate"]
print(f"Min date: {minDate}")

# keep only rows that predate the fact table's earliest date, oldest first
before_min_date = f'timestamp < "{minDate}"'
df_stocks = df_stocks.select("*").filter(before_min_date).orderBy("timestamp")

df_stocks.tail(8)

In [None]:
# this function adds symbols to dim_symbol that may not exist in table
# this allows for new symbols to be added to feed over time

def dim_symbol_incremental_load(df_stocks, df_existing_symbols):
    """Insert symbols present in df_stocks but missing from dim_symbol.

    Args:
        df_stocks: DataFrame of incoming stock rows; only its "Symbol"
            column is read.
        df_existing_symbols: DataFrame with the current contents of the
            symbol dimension; its "Symbol_SK" and "Symbol" columns are read.

    Returns:
        df_existing_symbols unchanged when there is nothing to add,
        otherwise a fresh read of dim_symbol ordered by Symbol.
    """
    # Window is not part of the pyspark.sql.functions star import
    from pyspark.sql.window import Window

    # highest surrogate key currently in use; the aggregate yields None when
    # the dimension is empty, in which case numbering starts from 0
    max_id = df_existing_symbols.agg({"Symbol_SK": "max"}).first()[0] or 0

    # distinct incoming symbols that have no match in the current dimension
    df_symbols = (
        df_stocks.select("Symbol").distinct()
        .join(df_existing_symbols.select("Symbol"), on="Symbol", how="left_anti")
    )

    # if the dataframe is empty, there are no missing symbols
    if df_symbols.rdd.isEmpty():
        print("No new symbols.")
        return df_existing_symbols

    symbol = col("Symbol")
    first_letter = substring(symbol, 1, 1)

    # row_number() yields 1, 2, 3, ... in Symbol order, so new keys continue
    # directly after max_id. monotonically_increasing_id() is unique but not
    # consecutive (the partition id is encoded in the upper bits), which can
    # produce very large gaps in the key sequence. The un-partitioned window
    # moves all rows to one partition; that is fine for the handful of new
    # symbols handled here.
    key_order = Window.orderBy("Symbol")
    symbol_sk = (row_number().over(key_order) + max_id).cast("long")

    company_name = (
        when(symbol == "BCUZ", "Company Because")
        .when(symbol == "IDGD", "Company IDontGiveADarn")
        .when(symbol == "IDK", "Company IDontKnow")
        .when(symbol == "TDY", "Company Today")
        .when(symbol == "TMRW", "Company Tomorrow")
        .when(symbol == "WHAT", "Company What")
        .when(symbol == "WHY", "Company Why")
        .when(symbol == "WHO", "Company Who")
        .otherwise("Company Unknown")
    )

    # market is derived from the first letter of the symbol
    market = (
        when(first_letter.isin("B", "W"), "NASDAQ")
        .when(first_letter.isin("I", "T"), "NYSE")
        .otherwise("No Market")
    )

    df_symbols = df_symbols.select(
        symbol_sk.alias("Symbol_SK"),
        symbol,
        company_name.alias("Name"),
        market.alias("Market"),
    )

    print("New Symbols:")
    df_symbols.show()

    dim_symbol_table = DeltaTable.forName(spark, "dim_symbol")

    # insert-only merge: rows whose Symbol already exists are left untouched
    (
        dim_symbol_table.alias('dim_symbol')
        .merge(
            df_symbols.alias('updates'),
            'dim_symbol.Symbol = updates.Symbol'
        )
        .whenNotMatchedInsert(values={
            "Symbol_SK": "updates.Symbol_SK",
            "Symbol": "updates.Symbol",
            "Name": "updates.Name",
            "Market": "updates.Market",
        })
        .execute()
    )

    return spark.sql("SELECT * FROM dim_symbol ORDER BY Symbol ASC")

In [None]:
# note that this cell is frozen and will not run unless unfrozen.
# the watermark does not need updating for this one time historical
# import, and freezing a cell is a handy way to prevent it from being
# processed without deleting it

# look up the high watermark, which tracks which rows have already been ingested
watermark_query = f"SELECT WaterMark FROM etl_ingestsourceinfo WHERE IsActiveFlag = 'Y' and ObjectName = '{sourceTableName}'"
df_temp = spark.sql(watermark_query)

# stop with an error if the metadata table has no active record for the source
if df_temp.rdd.isEmpty():
    msg = f"No valid ingestion source: {sourceTableName}"
    print(msg)
    raise SystemExit(msg)

highWaterMark = df_temp.first()["WaterMark"]
print(f"High watermark: {highWaterMark}")

In [None]:
# read the whole date dimension, ordered by key, for the joins further down

date_dim_query = "SELECT * FROM dim_date ORDER BY DateKey ASC"
df_date = spark.sql(date_dim_query)
df_date.show()

In [None]:
# read the symbol dimension as it stands before this import

symbol_dim_query = "SELECT * FROM dim_symbol ORDER BY Symbol ASC"
df_symbol = spark.sql(symbol_dim_query)
print("Current Symbols:")
df_symbol.show()

# add any symbols from the incoming data that the dimension does not have yet
df_symbol = dim_symbol_incremental_load(df_stocks, df_symbol)

print("Symbols After Merge:")
df_symbol.show()

In [None]:
# this cell is frozen because the process is a one-time historical import

# calculate the new watermark: the latest timestamp among the rows being imported.
# The aggregate is requested by name and read by position, so this does not
# depend on Spark's generated column label or on the builtin max() being
# shadowed by the pyspark.sql.functions star import.

df_temp = df_stocks.agg({"timestamp": "max"})
newHighWaterMark = df_temp.first()[0]
print(f"New watermark: {newHighWaterMark}")

In [None]:
# Code generated by Data Wrangler for PySpark DataFrame

from pyspark.sql import functions as F

def clean_data(df_stocks):
    """Aggregate raw price rows into one row per symbol per day.

    Args:
        df_stocks: DataFrame with 'symbol', 'timestamp' and 'price' columns.

    Returns:
        DataFrame with columns symbol, datestamp, newMinPrice, newMaxPrice
        and newClosePrice, with rows containing nulls removed, sorted by
        symbol and then datestamp.
    """
    # The CSV reader in this notebook sets only the header option, so price
    # arrives as text; cast it so min/max compare numerically rather than
    # lexicographically ("99.9" would otherwise sort above "100.1").
    df_stocks = (
        df_stocks
        .withColumn('datestamp', F.to_date(F.col('timestamp')))
        .withColumn('price', F.col('price').cast('double'))
    )

    # Close price = price at the latest timestamp of the day. Taking the max
    # of a (timestamp, price) struct is deterministic, whereas last() depends
    # on row order, which is not guaranteed after the groupBy shuffle.
    latest_tick = F.max(
        F.struct(F.col('timestamp').alias('ts'), F.col('price').alias('px'))
    )

    df_stocks = df_stocks.groupBy('symbol', 'datestamp').agg(
        F.min('price').alias('newMinPrice'),
        F.max('price').alias('newMaxPrice'),
        latest_tick.getField('px').alias('newClosePrice'),
    )
    df_stocks = df_stocks.dropna()
    df_stocks = df_stocks.sort(df_stocks['symbol'].asc(), df_stocks['datestamp'].asc())
    return df_stocks

# aggregate the filtered rows to one row per symbol per day and show the result
df_stocks_agg = clean_data(df_stocks)
display(df_stocks_agg)

In [None]:
# match each aggregated day to its row in the date dimension

date_match = df_stocks_agg["datestamp"] == df_date["DateKey"]
df_join = df_stocks_agg.join(df_date, on=date_match)
display(df_join)

In [None]:
# match the date-joined rows to their row in the symbols dimension

symbol_match = df_join["symbol"] == df_symbol["Symbol"]
df_join = df_join.join(df_symbol, on=symbol_match)
display(df_join)

In [None]:
# project the joined data down to the columns the merge needs, under "new*" names

final_columns = [
    col("datekey").alias("newPriceDateKey"),
    col("dim_symbol.Symbol").alias("newSymbol"),
    col("dim_symbol.Symbol_SK").alias("newSymbol_SK"),
    "newMinPrice",
    "newMaxPrice",
    "newClosePrice",
]
df_final_view = df_join.select(*final_columns)

df_final_view.show()

In [None]:
# to insert the new data, we'll merge the dataframe with the fact table
# for existing records, update the high/low/close price of the stock
# for new records, insert a new row with the current high/low/close

from delta.tables import *

fact_stock_prices_table = DeltaTable.forName(spark, "fact_stocks_daily_prices")

# a fact row is identified by its date key plus its symbol surrogate key
merge_condition = 'fact.PriceDateKey = updates.newPriceDateKey and fact.Symbol_SK = updates.newSymbol_SK'

# existing row: keep the lower min and the higher max, overwrite the close
matched_updates = {
    "MinPrice": "CASE WHEN fact.MinPrice < updates.newMinPrice THEN fact.MinPrice ELSE updates.newMinPrice END",
    "MaxPrice": "CASE WHEN fact.MaxPrice > updates.newMaxPrice THEN fact.MaxPrice ELSE updates.newMaxPrice END",
    "ClosePrice": "updates.newClosePrice",
}

# new row: copy the incoming values straight across
inserted_values = {
    "Symbol_SK": "updates.newSymbol_SK",
    "PriceDateKey": "updates.newPriceDateKey",
    "MinPrice": "updates.newMinPrice",
    "MaxPrice": "updates.newMaxPrice",
    "ClosePrice": "updates.newClosePrice",
}

(
    fact_stock_prices_table.alias('fact')
    .merge(df_final_view.alias('updates'), merge_condition)
    .whenMatchedUpdate(set=matched_updates)
    .whenNotMatchedInsert(values=inserted_values)
    .execute()
)


In [None]:
# note that this cell is frozen and will not run unless unfrozen.
# since we do not need to update the watermark for this one time
# historical import, freezing cells can be a handy way to prevent
# processing if you do not want to delete the cell

# update the watermark for next run

if newHighWaterMark is None:
    # no watermark has been calculated (the variable still holds its initial
    # value); interpolating it would store the literal text 'None'
    print("No new watermark calculated; etl_ingestsourceinfo left unchanged.")
else:
    spark.sql(f"UPDATE etl_ingestsourceinfo SET WaterMark = '{newHighWaterMark}' WHERE IsActiveFlag = 'Y' and ObjectName = '{sourceTableName}'")

spark.sql("SELECT * FROM etl_ingestsourceinfo LIMIT 1000").show()


In [None]:
def get_latest_fact():
    """Return the daily price facts joined to their symbols.

    Rows are ordered by PriceDateKey and then by Symbol_SK, both ascending.
    """
    # adjacent literals reproduce the query text exactly, including spacing
    fact_query = (
        "SELECT dim.Symbol, fact.Symbol_SK, PriceDateKey, MinPrice, MaxPrice, ClosePrice "
        "        FROM fact_stocks_daily_prices fact "
        "        INNER JOIN dim_symbol dim on fact.Symbol_SK = dim.Symbol_SK "
        "        ORDER BY PriceDateKey ASC, fact.Symbol_SK ASC"
    )
    return spark.sql(fact_query)

In [None]:
# run results: show the fact rows together with their symbols
latest_fact = get_latest_fact()
display(latest_fact)