## Introduction

This notebook is a follow-up to DS 1 and DS 2. Here, building the models and generating the predictions are combined into a single process that can easily be scheduled as needed.

In [None]:
!pip install prophet
import pyspark.sql.functions as F
import pandas as pd
import datetime
from datetime import timedelta
from pyspark.sql.functions import concat, col, lit, when, substring 
from pyspark.sql.types import *

In [None]:
# specify date cutoff -- this is typically the current date unless testing
# generated_date is stamped on every prediction row written later in the notebook;
# cutoff_date bounds the history used for training and is where the forecast begins.
# Both are naive datetimes holding the current UTC time.

generated_date = datetime.datetime.utcnow()
cutoff_date = datetime.datetime.utcnow()

# manually specify a cutoff date
# cutoff_date = '2023-12-25 01:23:45'
# cutoff_date = datetime.datetime.strptime(cutoff_date, '%Y-%m-%d %H:%M:%S')

# truncate the cutoff to the start of its minute (seconds and microseconds are
# dropped, not rounded) so repeated runs line up on whole-minute timestamps
cutoff_date = cutoff_date.replace(second=0, microsecond=0)
print(f'Cutoff date: {cutoff_date}')

In [None]:
class HistoryData:
    """Plain value holder describing one downloadable stock-history archive.

    Attributes:
        file_uri: URL the archive is downloaded from.
        filename: name the archive is stored under locally.
        year: the year the archive covers, kept exactly as supplied.
    """

    def __init__(self, file_uri, filename, year) -> None:
        # No validation or conversion: the three values are stored as given.
        self.file_uri, self.filename, self.year = file_uri, filename, year

def getDownloadInfo(year):
    """Return the download descriptor for the given year's history archive.

    Args:
        year: the year to look up; compared with ``==`` against the known years.

    Returns:
        A HistoryData carrying the download URL, the archive file name and the
        ``year`` value as passed in, or None when no archive is known for it.
    """
    # All archives share one container URL and one query string; only the
    # year in the file name and the trailing signature differ.
    container_url = 'https://fabricrealtimelab.blob.core.windows.net/public/AbboCost_Stock_History_v2/'
    shared_query = 'sp=r&st=2024-01-01T17:00:00Z&se=2032-01-01T17:00:00Z&spr=https&sv=2022-11-02&sr=b'
    signatures = (
        (2023, 'ledWmONUdRKvcpDumZHpLPqkrTLWu%2B9GrF0gMh5QK2c%3D'),
        (2024, 'TIFg2tvEww3rdTVNOKo5ef1xTx%2Bs0XAbdEARKGhOiX8%3D'),
        (2025, 'UB4QhOmsfwhPC0rE14wRJQxeiXXutHxm%2BOVnFA3xDFQ%3D'),
        (2026, 'l4tonO4SZfuCbHrheomO0WNkuYfyTTdfdNrcfu%2Fc7dU%3D'),
    )

    for known_year, signature in signatures:
        if year == known_year:
            archive_name = f'stockhistory-{known_year}.tgz'
            return HistoryData(
                f'{container_url}{archive_name}?{shared_query}&sig={signature}',
                archive_name,
                year)

    return None



In [None]:
import os
import datetime
from datetime import timedelta

# Local mount point of the notebook's default lakehouse (absolute path).
LAKEHOUSE_FOLDER = "/lakehouse/default"
# Location of the raw stock history, relative to the lakehouse root.
DATA_FOLDER = "Files/stockhistory/raw"

# LAKEHOUSE_FOLDER already starts with "/", so no extra leading slash is added;
# a doubled "//" prefix is implementation-defined on POSIX systems.
TAR_FILE_PATH = f"{LAKEHOUSE_FOLDER}/{DATA_FOLDER}/tar/"
CSV_FILE_PATH = f"{LAKEHOUSE_FOLDER}/{DATA_FOLDER}/csv/"

def downloadHistoryIfNotExists():
    """Download and extract the history archives for this year and last year.

    For each of the two years, the archive is fetched with ``wget`` into
    TAR_FILE_PATH and unpacked with ``tar`` into CSV_FILE_PATH, unless the
    archive file is already present. If the download or the extraction fails,
    the (possibly partial) archive is removed so that a later run retries
    instead of reporting the file as already existing; the failure is printed
    and processing continues with the next year.

    Raises:
        FileNotFoundError: if the lakehouse mount point does not exist.
    """
    # local import keeps this cell self-contained
    import subprocess

    if not os.path.exists(LAKEHOUSE_FOLDER):
        # add a lakehouse if the notebook has no default lakehouse
        # a new notebook will not link to any lakehouse by default
        raise FileNotFoundError(
            "Lakehouse not found, please add a lakehouse for the notebook."
        )

    currYear = datetime.datetime.utcnow().year

    # current year first, then the previous year
    for year in range(currYear, currYear - 2, -1):
        fileInfo = getDownloadInfo(year)

        if fileInfo is None:
            print(f'No file exists for {year}')
            continue

        archive_path = f"{TAR_FILE_PATH}{fileInfo.filename}"

        # the archive's presence is the marker that this year was already handled
        if os.path.exists(archive_path):
            print(f'File already exists: {fileInfo.filename}')
            continue

        os.makedirs(TAR_FILE_PATH, exist_ok=True)
        os.makedirs(CSV_FILE_PATH, exist_ok=True)

        try:
            print(f'Downloading {fileInfo.filename}')
            # argument lists (no shell) so the URL's '&' and '?' need no quoting;
            # check=True turns a non-zero exit status into CalledProcessError
            subprocess.run(
                ["wget", fileInfo.file_uri, "-O", archive_path], check=True)

            print(f'Extracting {fileInfo.filename}')
            subprocess.run(
                ["tar", "-zxvf", archive_path, "-C", CSV_FILE_PATH], check=True)
        except (subprocess.CalledProcessError, OSError) as err:
            # wget -O creates the output file even when the download fails;
            # drop it so the existence check above does not skip a retry
            if os.path.exists(archive_path):
                os.remove(archive_path)
            print(f'Failed to download or extract {fileInfo.filename}: {err}')

downloadHistoryIfNotExists()

In [None]:
# verify csv files are available
# The listing is polled once per second until at least one entry shows up.
# Both failure modes (listing raises, or listing is empty) are bounded by
# max_checks so the cell cannot spin forever.

import time

path_to_check = f'{DATA_FOLDER}/csv'
max_checks = 10
files_found = False
check_count = 0

while not files_found:
    check_count += 1
    try:
        files = mssparkutils.fs.ls(path_to_check)
    except Exception:
        # broad on purpose: the exception type raised by the listing call for a
        # missing path is not visible here; it is re-raised once retries run out
        if check_count > max_checks:
            print('Unable to verify CSV files. Please restart session and verify files are downloading and extracting.')
            raise
        print('Checking for files...')
        time.sleep(1)
        continue

    print(f'Found {len(files)} CSV folders.')
    if len(files) > 0:
        files_found = True
    elif check_count > max_checks:
        # the folder exists but stayed empty for every attempt
        print('Unable to verify CSV files. Please restart session and verify files are downloading and extracting.')
        raise FileNotFoundError(f'No CSV folders found under {path_to_check}')
    else:
        time.sleep(1)


In [None]:
def readStockHistory():
    """Load every extracted history CSV into a single Spark DataFrame.

    Files are matched three folder levels below ``{DATA_FOLDER}/csv`` and the
    first line of each file is treated as the header. No schema is supplied
    and schema inference is not enabled.

    Returns:
        The Spark DataFrame produced by the CSV reader.
    """
    csv_glob = f"{DATA_FOLDER}/csv/*/*/*.csv"
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(csv_glob)

# read all data into a dataframe
df_stocks = readStockHistory()
df_stocks.tail(8)

In [None]:
def filterStocksByDate(date):
    """Return the rows of the global ``df_stocks`` strictly before ``date``.

    Args:
        date: cutoff value; its ``str()`` form is embedded in the filter
            expression and compared against the ``timestamp`` column.
            NOTE(review): the comparison type depends on how ``timestamp`` was
            loaded (string vs. timestamp) -- confirm against the reader.

    Returns:
        A Spark DataFrame with all columns of ``df_stocks``.
    """
    cutoff_text = str(date)
    before_cutoff = f'timestamp < "{cutoff_text}"'
    return df_stocks.select("*").where(before_cutoff)

df_stocks_filtered = filterStocksByDate(cutoff_date)
df_stocks_filtered.tail(4)

In [None]:
# gets all symbols to process.
# symbols can be specified explicitly or by code

def get_symbols(df):
    """Collect the stock symbols to process.

    Args:
        df: Spark DataFrame with a ``symbol`` column.

    Returns:
        A list of Rows, each with a single ``Symbol`` field: the distinct
        symbols of ``df`` in sorted order, or the hard-coded fallback below
        when ``df`` has no rows.
    """
    # Alternative sources, kept for reference:
    # get the symbols from the dim_symbol table if lakehouse module is completed
    # symbol_df = spark.sql("SELECT Symbol FROM StocksLakehouse.dim_symbol")

    # # create symbols manually if dim_symbol (from lakehouse module) does not exist
    # symbol_df = spark.createDataFrame( \
    #     [['BCUZ'], ['IDGD'], ['IDK'], ['TDY'], ['TMRW'], ['WHAT'], ['WHO'], ['WHY']],['Symbol'])

    # fallback: a single explicitly specified symbol
    fallback_rows = [['WHO']]
    symbol_df = spark.createDataFrame(fallback_rows, ['Symbol'])

    # by default, get all symbols from the current dataframe
    has_rows = not df.rdd.isEmpty()
    if has_rows:
        symbol_df = (
            df.select('symbol')
            .distinct()
            .sort('symbol')
            .withColumnRenamed('symbol', 'Symbol')
        )

    return symbol_df.collect()

symbols_list = get_symbols(df_stocks_filtered)
print(symbols_list)

In [None]:
def create_prediction_table():
    """Create the ``stocks_prediction`` Delta table if it does not exist yet.

    The table holds one row per predicted timestamp and symbol, with the
    forecast value, its lower/upper bounds and the time of generation.
    """
    # plain string: the statement has no placeholders to interpolate
    create_statement = """
        CREATE TABLE IF NOT EXISTS stocks_prediction (
            Predict_time TIMESTAMP
            ,Symbol VARCHAR(5)
            ,yhat DOUBLE
            ,yhat_lower DOUBLE
            ,yhat_upper DOUBLE
            ,Generated TIMESTAMP)
        USING DELTA
        """
    spark.sql(create_statement)

create_prediction_table()

## Functions for filtering, building, and merging data

In [None]:
# remove all but specified stock symbol
# individual models can be built for each stock

def filterStocksBySymbol(df_stocks, symbol):
    """Return the rows for one symbol, ordered by ``timestamp``.

    Args:
        df_stocks: Spark DataFrame with ``symbol`` and ``timestamp`` columns.
        symbol: symbol to keep; it is embedded as a double-quoted literal in
            the filter expression.

    Returns:
        A Spark DataFrame with all columns of ``df_stocks``.
    """
    symbol_condition = f'symbol == "{symbol}"'
    return (
        df_stocks.select("*")
        .where(symbol_condition)
        .sort("timestamp")
    )


In [None]:
# merge the predictions with the table in the lakehouse

from delta.tables import *

def write_predictions(predictions_pd, symbol, generated):
    """Upsert one symbol's forecast into the ``stocks_prediction`` Delta table.

    Rows are matched on prediction time (``ds`` in the forecast) and symbol.
    Matching rows get their forecast values and generation time overwritten;
    all other forecast rows are inserted.

    Args:
        predictions_pd: pandas DataFrame with at least the columns ``ds``,
            ``yhat``, ``yhat_lower`` and ``yhat_upper``.
        symbol: the symbol the forecast belongs to.
        generated: generation time; written through its ``str()`` form.
    """
    source_df = (
        spark.createDataFrame(predictions_pd)
        .withColumn("symbol", lit(symbol))
        .withColumn("generated", lit(generated))
    )

    # SQL literals shared by the update and insert clauses
    symbol_literal = f"'{symbol}'"
    generated_literal = f"'{str(generated)}'"
    forecast_values = {
        "yhat": "predictions.yhat",
        "yhat_lower": "predictions.yhat_lower",
        "yhat_upper": "predictions.yhat_upper",
    }

    match_condition = f'table.Predict_time = predictions.ds and table.Symbol = "{symbol}"'

    target_table = DeltaTable.forName(spark, "stocks_prediction")
    merge_builder = target_table.alias('table').merge(
        source_df.alias('predictions'),
        match_condition,
    )
    merge_builder = merge_builder.whenMatchedUpdate(
        set={**forecast_values, "Generated": generated_literal}
    )
    merge_builder = merge_builder.whenNotMatchedInsert(
        values={
            "Predict_time": "predictions.ds",
            "Symbol": symbol_literal,
            **forecast_values,
            "Generated": generated_literal,
        }
    )
    merge_builder.execute()

In [None]:
# establish begin/end dates for prediction
# returns a dataframe holding only the 'ds' timestamps to forecast (no predictions yet)

def make_prediction_dataframe(fromdate=None):
    """Build the frame of timestamps to forecast: one row per minute for 7 days.

    Args:
        fromdate: first timestamp of the forecast window. When omitted, the
            current UTC time at the moment of the call is used. (A default of
            ``datetime.datetime.utcnow()`` in the signature would be evaluated
            only once, when the function is defined.)

    Returns:
        A pandas DataFrame with a single ``ds`` column running from
        ``fromdate`` to ``fromdate + 7 days`` inclusive, in one-minute steps.
    """
    if fromdate is None:
        fromdate = datetime.datetime.utcnow()

    enddate = fromdate + datetime.timedelta(days=7)

    print(f'Beginning of forecast: {fromdate}')
    print(f'End of forecast: {enddate}')

    # 'min' is the minute alias; the older 'T' spelling is deprecated in pandas
    future = pd.DataFrame({'ds': pd.date_range(start=fromdate, end=enddate, freq='min')})
    return future

In [None]:
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot

def build_and_predict(dfStocks, prediction_begin_date):
    """Fit a Prophet model on one symbol's history and forecast from a start time.

    Args:
        dfStocks: Spark DataFrame with ``timestamp`` and ``price`` columns;
            it is converted to pandas in full.
        prediction_begin_date: datetime at which the forecast window starts;
            seconds and microseconds are dropped before use.

    Returns:
        The DataFrame returned by Prophet's ``predict`` for the timestamps
        produced by ``make_prediction_dataframe``.
    """
    # zero the seconds/microseconds so runs line up on the same minute
    begin_minute = prediction_begin_date.replace(second=0, microsecond=0)

    # alternative once a model is loaded:
    # predict_df = loaded_model.make_future_dataframe(periods=60*24*7, freq='min', include_history = False)
    future_frame = make_prediction_dataframe(begin_minute)

    # Prophet expects the time column as 'ds' and the value column as 'y'
    history_pd = dfStocks.toPandas().rename(
        columns={'timestamp': 'ds', 'price': 'y'})
    print('Min data date: ', history_pd['ds'].min())
    print('Max data date: ', history_pd['ds'].max())

    # model parameters could be loaded dynamically, if needed
    # perhaps by looking up existing models in MLflow
    model_settings = {
        'changepoint_prior_scale': 0.05,
        'changepoint_range': 0.95,
        'seasonality_prior_scale': 10,
        'weekly_seasonality': 5,
    }

    model = Prophet(**model_settings)
    model.fit(history_pd)

    return model.predict(future_frame)

## Main loop to build predictions for each symbol

In [None]:
# loop through all the symbols,
# filter the data by symbol, generate predictions
# write predictions to table
#
# Symbols are processed one after another; an exception raised for one symbol
# is not caught here, so it stops the loop before the remaining symbols run.

for row in symbols_list:
    print(f'Starting: {row.Symbol} at {datetime.datetime.utcnow()}')
    # history for this symbol only, already limited to rows before cutoff_date
    df_stocks_filtered_symbol = filterStocksBySymbol(df_stocks_filtered, row.Symbol)
    # fit a fresh model for the symbol and forecast starting at cutoff_date
    forecast = build_and_predict(df_stocks_filtered_symbol, cutoff_date)
    # every symbol in this run is written with the same generated_date
    write_predictions(forecast, row.Symbol, generated_date)
    print(f'Completed: {row.Symbol} at {datetime.datetime.utcnow()}')
    print('-' * 50)

In [None]:
# spark.sql("DELETE FROM stocks_prediction")

In [None]:
# show the 1000 rows with the latest prediction times (across all symbols)
df = spark.sql("SELECT * FROM stocks_prediction ORDER BY Predict_time DESC LIMIT 1000")
display(df)