## Introduction

This notebook is a follow-up to DS 1 and DS 2. Here, building the models and generating the predictions are combined into a single process that can easily be scheduled as needed.

In [None]:
!pip install prophet
import pyspark.sql.functions as F
import pandas as pd
import datetime
from datetime import timedelta
from pyspark.sql.functions import concat, col, lit, when, substring 
from pyspark.sql.types import *

In [None]:
def create_prediction_table():
    """Create the ``stock_predictions`` Delta table if it does not exist yet.

    The table holds one row per (predict_time, symbol) with the forecast
    value, its lower/upper bounds and the time the forecast was generated.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS stock_predictions (
            predict_time TIMESTAMP
            ,symbol VARCHAR(5)
            ,yhat DOUBLE
            ,yhat_lower DOUBLE
            ,yhat_upper DOUBLE
            ,generated TIMESTAMP)
        USING DELTA
        """
    spark.sql(ddl)

In [None]:
import os

# Configuration for the stock-history download.
# DATA_FOLDER is the lakehouse-relative folder used by the cells below:
# the archive is stored under <DATA_FOLDER>/tar and the CSV files are
# extracted to and read from <DATA_FOLDER>/csv.

DATA_FOLDER = "Files/stockhistory/raw"

def downloadHistoryIfNotExists():
    """Download and unpack the stock-history archive into the default lakehouse.

    Nothing is done when the archive is already present under
    ``<lakehouse>/<DATA_FOLDER>/tar``. Otherwise the archive is fetched with
    ``wget`` and extracted with ``tar`` into ``<lakehouse>/<DATA_FOLDER>/csv``.

    Raises:
        FileNotFoundError: the notebook has no default lakehouse attached.
        subprocess.CalledProcessError: ``wget`` or ``tar`` exited non-zero.
        OSError: ``wget`` or ``tar`` could not be started.
    """
    # Local import keeps this cell self-contained; only `os` is imported above.
    import subprocess

    # url to source tar file
    FULL_URL = "https://fabricrealtimelab.blob.core.windows.net/public/AbboCost_Stock_History/stockhistory-2023-2024.tgz?sp=r&st=2023-11-26T23:59:09Z&se=2027-11-27T07:59:09Z&spr=https&sv=2022-11-02&sr=b&sig=70w%2BT6ZVGpdTd6YJr%2FzPhKUFk9JYJ2ezu6%2BBBr9ahxc%3D"
    # lakehouse location -- assumes default lakehouse
    LAKEHOUSE_FOLDER = "/lakehouse/default"
    TAR_FILE_NAME = "stockhistory-2023-2024.tgz"

    if not os.path.exists(LAKEHOUSE_FOLDER):
        # add a lakehouse if the notebook has no default lakehouse
        # a new notebook will not link to any lakehouse by default
        raise FileNotFoundError(
            "Lakehouse not found, please add a lakehouse for the notebook."
        )

    # os.path.join avoids the doubled leading slash of manual concatenation
    tar_dir = os.path.join(LAKEHOUSE_FOLDER, DATA_FOLDER, "tar")
    csv_dir = os.path.join(LAKEHOUSE_FOLDER, DATA_FOLDER, "csv")
    tar_file = os.path.join(tar_dir, TAR_FILE_NAME)

    # the archive's presence is the marker for "already downloaded and unpacked"
    if os.path.exists(tar_file):
        return

    os.makedirs(tar_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)
    try:
        # argument lists (no shell) and check=True so failures are not ignored
        subprocess.run(["wget", FULL_URL, "-O", tar_file], check=True)
        subprocess.run(["tar", "-zxvf", tar_file, "-C", csv_dir], check=True)
    except (subprocess.CalledProcessError, OSError):
        # `wget -O` creates the output file even when the download fails;
        # remove it so the next run retries instead of skipping the download
        if os.path.exists(tar_file):
            os.remove(tar_file)
        raise

In [None]:
def readStockHistory():
    """Load every stock-history CSV under ``DATA_FOLDER`` into one Spark DataFrame.

    Files are matched with ``<DATA_FOLDER>/csv/*/*/*.csv`` and the first line
    of each file is used as the header. No schema or ``inferSchema`` option
    is set on the reader.

    Returns:
        The Spark DataFrame containing all rows from the matched files.
    """
    # The former `tail(8)` preview was dropped: inside a function its result
    # is discarded, yet it still ran a Spark job over the data.
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"{DATA_FOLDER}/csv/*/*/*.csv")
    )

In [None]:
# remove all but specified stock symbol
# individual models can be built for each stock

def filterStocksBySymbol(df_stocks, symbol):
    """Return the rows of ``df_stocks`` for one stock symbol, sorted by time.

    Args:
        df_stocks: Spark DataFrame with at least ``symbol`` and ``timestamp``
            columns.
        symbol: the stock symbol to keep.

    Returns:
        A Spark DataFrame with only the matching rows, ordered by
        ``timestamp``.
    """
    # A column expression replaces the concatenated SQL string, so a symbol
    # containing quote characters cannot break or alter the predicate.
    # The former `tail(4)` preview was removed: its result was discarded but
    # it still forced the full sort to execute.
    return df_stocks.where(df_stocks["symbol"] == symbol).sort("timestamp")

In [None]:
def filterStocksByDate(date, df=None):
    """Return the stock rows whose ``timestamp`` is strictly before ``date``.

    Args:
        date: cutoff value; it is converted with ``str()`` before comparison.
        df: Spark DataFrame to filter. When omitted, the notebook-level
            ``df_stocks`` DataFrame is used, which is what existing
            single-argument callers rely on.

    Returns:
        A Spark DataFrame containing only rows earlier than the cutoff.
    """
    source_df = df_stocks if df is None else df

    # A column expression replaces the concatenated SQL string. The former
    # `tail(4)` preview was removed because its result was discarded while
    # still triggering a Spark job.
    return source_df.where(source_df["timestamp"] < str(date))

In [None]:
# merge the predictions with the table in the lakehouse

from delta.tables import *

def write_predicitions(predicitions_pd, symbol, generated):
    """Upsert one symbol's forecast into the ``stock_predictions`` Delta table.

    Rows are matched on (predict_time, symbol). Matching rows get their
    forecast values and ``generated`` time overwritten; all other rows are
    inserted.

    Args:
        predicitions_pd: pandas DataFrame with at least the columns ``ds``,
            ``yhat``, ``yhat_lower`` and ``yhat_upper``.
        symbol: stock symbol the forecast belongs to.
        generated: time the forecast was produced; stored in the
            ``generated`` column.
    """
    # Only these columns are written, so the rest of the forecast frame is
    # dropped before it is converted to a Spark DataFrame.
    forecast_columns = ["ds", "yhat", "yhat_lower", "yhat_upper"]
    source_df = (
        spark.createDataFrame(predicitions_pd[forecast_columns])
        .withColumn("symbol", lit(symbol))
        .withColumn("generated", lit(generated))
    )

    target_table = DeltaTable.forName(spark, "stock_predictions")

    # symbol and generated are referenced as source columns instead of being
    # quoted into the SQL text, so their values never need escaping.
    (
        target_table.alias("target")
        .merge(
            source_df.alias("source"),
            "target.predict_time = source.ds AND target.symbol = source.symbol",
        )
        .whenMatchedUpdate(
            set={
                "yhat": "source.yhat",
                "yhat_lower": "source.yhat_lower",
                "yhat_upper": "source.yhat_upper",
                "generated": "source.generated",
            }
        )
        .whenNotMatchedInsert(
            values={
                "predict_time": "source.ds",
                "symbol": "source.symbol",
                "yhat": "source.yhat",
                "yhat_lower": "source.yhat_lower",
                "yhat_upper": "source.yhat_upper",
                "generated": "source.generated",
            }
        )
        .execute()
    )

In [None]:
# gets all symbols to process.
# symbols can be specified explicitly or by code

def get_symbols(df):
    """Return the stock symbols to process as a list of Spark rows.

    Each returned row exposes the symbol as ``row.symbol``.

    Args:
        df: Spark DataFrame with a ``symbol`` column. When it has rows, its
            distinct symbols (sorted) are returned.

    Returns:
        The collected rows. When ``df`` is empty, a single fallback row for
        the symbol ``WHO`` is returned.
    """
    # by default, get all symbols from the current dataframe
    if not df.rdd.isEmpty():
        symbol_df = df.select('symbol').distinct().sort('symbol')
    else:
        # Fallback when the dataframe has no rows. The column is named
        # `symbol` (lower case) because callers read `row.symbol` and Row
        # attribute lookup is case-sensitive.
        #
        # Alternatives:
        #   - use the dim_symbol table if the lakehouse module is completed
        #     (alias the column so it is still exposed as `symbol`):
        #       spark.sql("SELECT Symbol AS symbol FROM StocksLakehouse.dim_symbol")
        #   - list all symbols manually:
        #       [['BCUZ'], ['IDGD'], ['IDK'], ['TDY'], ['TMRW'], ['WHAT'], ['WHO'], ['WHY']]
        symbol_df = spark.createDataFrame([['WHO']], ['symbol'])

    return symbol_df.collect()

In [None]:
# establish begin/end dates for prediction
# returns a dataframe with a single 'ds' column of minute-spaced timestamps (no predictions yet)

def make_prediction_dataframe(fromdate=None):
    """Build the frame of timestamps to forecast: one row per minute for 7 days.

    Args:
        fromdate: first timestamp of the forecast window. When omitted, the
            current UTC time (as a naive datetime) is taken at call time.

    Returns:
        A pandas DataFrame with a single ``ds`` column running from
        ``fromdate`` to ``fromdate + 7 days`` inclusive, in one-minute steps.
    """
    if fromdate is None:
        # Resolved per call: a default of `utcnow()` in the signature would be
        # evaluated only once, when the function is defined, and then go stale.
        fromdate = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    enddate = fromdate + datetime.timedelta(days=7)

    print(f'Beginning of forecast: {fromdate}')
    print(f'End of forecast: {enddate}')

    # 'min' is the minute alias; the older 'T' spelling is deprecated in pandas
    future = pd.DataFrame({'ds': pd.date_range(start=fromdate, end=enddate, freq='min')})
    return future

In [None]:
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot

def build_and_predict(dfStocks, predicition_begin_date):
    """Fit a Prophet model on one symbol's history and forecast from a start time.

    Args:
        dfStocks: Spark DataFrame with ``timestamp`` and ``price`` columns;
            it is converted to pandas in full.
        predicition_begin_date: datetime at which the forecast window starts;
            seconds and microseconds are zeroed before use.

    Returns:
        The DataFrame returned by ``Prophet.predict`` for the forecast window.
    """
    # Truncate to the whole minute so repeated runs land on the same timestamps.
    begin = predicition_begin_date.replace(second=0, microsecond=0)

    # predict_df = loaded_model.make_future_dataframe(periods=60*24*7, freq='min', include_history = False)
    predict_df = make_prediction_dataframe(begin)

    # Prophet expects the time column to be called `ds` and the value column `y`.
    history_pd = dfStocks.toPandas().rename(columns={'timestamp': 'ds', 'price': 'y'})
    print('Min data date: ', history_pd['ds'].min())
    print('Max data date: ', history_pd['ds'].max())

    # Model parameters are fixed here; they could be loaded dynamically if
    # needed, perhaps by looking up existing models in MLflow.
    model_params = {
        'changepoint_prior_scale': 0.05,
        'changepoint_range': 0.95,
        'seasonality_prior_scale': 10,
        'weekly_seasonality': 5,
    }

    model = Prophet(**model_params)
    model.fit(history_pd)

    return model.predict(predict_df)

In [None]:
# generated_date is passed to write_predicitions as the `generated` value for
# every row written by this run; cutoff_date bounds the history used for
# training and is also the start of the forecast window.
generated_date = datetime.datetime.utcnow()
cutoff_date = datetime.datetime.utcnow()

# manually specify a cutoff date
# cutoff_date = '2023-12-21 01:23:45'
# cutoff_date = datetime.datetime.strptime(cutoff_date, '%Y-%m-%d %H:%M:%S')

# truncate the cutoff to the whole minute (seconds and microseconds set to zero)
cutoff_date = cutoff_date.replace(second=0, microsecond=0)
print(f'Cutoff date: {cutoff_date}')

# fetch and unpack the history archive unless it is already in the lakehouse
downloadHistoryIfNotExists()

# read all data into a dataframe
# (filterStocksByDate below reads this module-level df_stocks variable)
df_stocks = readStockHistory()

# keep only data up until current date (no future looking data)
df_stocks_filtered = filterStocksByDate(cutoff_date)

# create the stocks prediction table if needed
create_prediction_table()

# get a list of all stock symbols in data
symbols_list = get_symbols(df_stocks_filtered)

In [None]:
# For every symbol: narrow the history to that symbol, fit a model and
# forecast from the cutoff date, then upsert the forecast into the
# predictions table. Start and end times are printed per symbol.

for row in symbols_list:
    symbol = row.symbol
    print(f'Starting: {symbol} at {datetime.datetime.utcnow()}')

    df_stocks_filtered_symbol = filterStocksBySymbol(df_stocks_filtered, symbol)
    forecast = build_and_predict(df_stocks_filtered_symbol, cutoff_date)
    write_predicitions(forecast, symbol, generated_date)

    print(f'Completed: {symbol} at {datetime.datetime.utcnow()}')

In [None]:
# show the symbol rows that were collected for processing
print(symbols_list)

In [None]:
# WARNING: this DELETE has no WHERE clause, so it removes every row from
# stock_predictions, including the rows written by the loop above.
# NOTE(review): presumably a manual reset cell -- confirm it is excluded
# when this notebook is run on a schedule.
spark.sql("DELETE FROM stock_predictions")

In [None]:
# preview the 1000 earliest predictions by predict_time
# NOTE(review): the table is qualified as StocksLakehouse.stock_predictions
# here, while the other cells use the unqualified name stock_predictions --
# confirm both resolve to the same table.
df = spark.sql("SELECT * FROM StocksLakehouse.stock_predictions ORDER BY predict_time ASC LIMIT 1000")
display(df)