## Introduction

This notebook is a follow-up to DS 1 and DS 2. Here, building models and generating predictions is consolidated into a single process that can easily be scheduled as needed.

In this notebook, live data from the stocks_minute_agg table is used to build predictions instead of the downloaded CSV files.

In [None]:
!pip install prophet
import pyspark.sql.functions as F
import pandas as pd

from pyspark.sql.functions import concat, col, lit, when, substring 
from pyspark.sql.types import *

import datetime
import time
from datetime import datetime
from datetime import timedelta

In [None]:
def create_prediction_table():
    """Create the `stocks_prediction` Delta table if it does not exist yet.

    Columns: the predicted time, the stock symbol, the forecast value with
    its lower/upper bounds, and the time the prediction was generated.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS stocks_prediction (
            Predict_time TIMESTAMP
            ,Symbol VARCHAR(5)
            ,yhat DOUBLE
            ,yhat_lower DOUBLE
            ,yhat_upper DOUBLE
            ,Generated TIMESTAMP)
        USING DELTA
        """
    spark.sql(create_table_sql)

In [None]:
def readStockHistoryLive():
    """Load the live per-minute stock data as a Spark DataFrame.

    Reads every row of `stocks_minute_agg`, adds a `timestamp` column built
    from the datestamp, hour and minute columns, drops the Datestamp, Hour,
    Minute, MinPrice and MaxPrice columns, and renames `Symbol` / `LastPrice`
    to `symbol` / `lastprice`.

    Returns:
        The resulting Spark DataFrame, sorted by `timestamp`.
    """
    # datestamp converted to a timestamp, plus an hour/minute interval
    minute_timestamp = F.expr(
        "to_timestamp(datestamp) + make_interval(0, 0, 0, 0, hour, minute, 0)")

    return (
        spark.sql("SELECT * FROM stocks_minute_agg")
        .withColumn('timestamp', minute_timestamp)
        .drop('Datestamp', 'Hour', 'Minute', 'MinPrice', 'MaxPrice')
        .withColumnRenamed('Symbol', 'symbol')
        .withColumnRenamed('LastPrice', 'lastprice')
        .sort("timestamp")
    )

In [None]:
# remove all but specified stock symbol
# individual models can be built for each stock

def filterStocksBySymbol(df_stocks, symbol):
    """Return the rows of `df_stocks` for a single stock symbol.

    Args:
        df_stocks: Spark DataFrame with `symbol` and `timestamp` columns.
        symbol: stock symbol to keep (exact match).

    Returns:
        Spark DataFrame restricted to `symbol`, sorted by `timestamp`.
    """
    # Compare the column against the value itself instead of splicing the
    # symbol into a SQL string, so quotes in the symbol cannot break the filter.
    # The former tail(4) call is gone: its result was discarded, yet it forced
    # a Spark job on every call.
    return df_stocks.where(col("symbol") == symbol).sort("timestamp")

In [None]:
def filterStocksByDate(df, date):
    """Return the rows of `df` whose `timestamp` is strictly before `date`.

    Args:
        df: Spark DataFrame with a `timestamp` column.
        date: cutoff, e.g. a `datetime` or a timestamp string; it is handed
            to Spark in its `str()` form.

    Returns:
        Spark DataFrame containing only rows with `timestamp < date`.
    """
    # The cutoff is still compared as a string literal (as before), but it is
    # passed as a value rather than concatenated into the predicate text.
    # The former tail(4) call is gone: its result was discarded, yet it forced
    # a Spark job on every call.
    return df.where(col("timestamp") < lit(str(date)))

In [None]:
# merge the predictions with the table in the lakehouse

from delta.tables import *

def write_predictions(predictions_pd, symbol, generated):
    """Upsert one symbol's forecast into the `stocks_prediction` Delta table.

    Rows are matched on (Predict_time, Symbol): a prediction that already
    exists for the same time and symbol is updated, otherwise it is inserted.

    Args:
        predictions_pd: pandas DataFrame with at least the columns `ds`,
            `yhat`, `yhat_lower` and `yhat_upper`.
        symbol: stock symbol the forecast belongs to.
        generated: time the forecast was generated (e.g. a `datetime`);
            stored in the `Generated` column.
    """
    # Only these columns are written, so only they are converted to Spark.
    forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']

    predictions_df = (
        spark.createDataFrame(predictions_pd[forecast_columns])
        .withColumn("symbol", lit(symbol))
        # same string form as before, but cast explicitly so the value
        # matches the TIMESTAMP type of the target column
        .withColumn("generated", lit(str(generated)).cast("timestamp"))
    )

    stock_predictions_table = DeltaTable.forName(spark, "stocks_prediction")

    # The merge refers to the symbol/generated columns of the source frame
    # instead of splicing the Python values into SQL text as quoted literals.
    (
        stock_predictions_table.alias('table')
        .merge(
            predictions_df.alias('predictions'),
            'table.Predict_time = predictions.ds'
            ' AND table.Symbol = predictions.symbol'
        )
        .whenMatchedUpdate(set={
            "yhat": "predictions.yhat",
            "yhat_lower": "predictions.yhat_lower",
            "yhat_upper": "predictions.yhat_upper",
            "Generated": "predictions.generated",
        })
        .whenNotMatchedInsert(values={
            "Predict_time": "predictions.ds",
            "Symbol": "predictions.symbol",
            "yhat": "predictions.yhat",
            "yhat_lower": "predictions.yhat_lower",
            "yhat_upper": "predictions.yhat_upper",
            "Generated": "predictions.generated",
        })
        .execute()
    )

In [None]:
# gets all symbols to process.
# symbols can be specified explicitly or by filtering from dataframe

def get_symbols(df):
    """Return the stock symbols to process.

    Args:
        df: Spark DataFrame with a `symbol` column.

    Returns:
        A list of Rows, each with a `Symbol` field. When `df` has rows, these
        are its distinct symbols in sorted order; when `df` is empty, a
        hard-coded fallback list is returned instead.
    """
    # by default, get all symbols from the current dataframe
    if not df.rdd.isEmpty():
        symbol_df = (
            df.select('symbol')
            .distinct()
            .sort('symbol')
            .withColumnRenamed('symbol', 'Symbol')
        )
    else:
        # The fallback is only built when it is actually used.

        # # get the symbols from the dim_symbol table if lakehouse module is completed
        # symbol_df = spark.sql("SELECT Symbol FROM StocksLakehouse.dim_symbol")

        # # can specify a single symbol like so:
        # symbol_df = spark.createDataFrame( \
        #     [['WHO']],['Symbol'])

        # create symbols manually if dim_symbol (from lakehouse module) does not exist
        symbol_df = spark.createDataFrame(
            [['BCUZ'], ['IDGD'], ['IDK'], ['TDY'], ['TMRW'], ['WHAT'], ['WHO'], ['WHY']],
            ['Symbol'])

    return symbol_df.collect()

In [None]:
# establish begin/end dates for prediction (begin date through 7 days later)
# returns a pandas dataframe holding only the 'ds' timestamps to predict, one row per minute

def make_prediction_dataframe(fromdate=None):
    """Build the frame of timestamps to forecast: 7 days at one-minute steps.

    Args:
        fromdate: first timestamp of the forecast. Defaults to the current
            UTC time, taken when the function is called.

    Returns:
        pandas DataFrame with a single `ds` column running from `fromdate`
        to `fromdate + 7 days` (both ends included) in one-minute steps.
    """
    # Resolve the default per call: a `datetime.utcnow()` default in the
    # signature would be evaluated once, when the function is defined, and
    # every later default call would reuse that stale time.
    if fromdate is None:
        fromdate = datetime.utcnow()

    enddate = fromdate + timedelta(days=7)

    print(f'Beginning of forecast: {fromdate}')
    print(f'End of forecast: {enddate}')

    # 'min' is the minute frequency alias; 'T' is deprecated in recent pandas
    future = pd.DataFrame({'ds': pd.date_range(start=fromdate, end=enddate, freq='min')})
    return future

In [None]:
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot

def build_and_predict(dfStocks, prediction_begin_date):
    """Fit a Prophet model on the given history and forecast from a start time.

    Args:
        dfStocks: Spark DataFrame with `timestamp` and `lastprice` columns;
            it is converted to pandas in full before fitting.
        prediction_begin_date: `datetime` at which the forecast begins; its
            seconds and microseconds are zeroed before use.

    Returns:
        The DataFrame returned by `Prophet.predict` for the timestamps
        produced by `make_prediction_dataframe`.
    """
    # zero the seconds/microseconds so forecast times line up across runs
    forecast_start = prediction_begin_date.replace(second=0, microsecond=0)
    future_frame = make_prediction_dataframe(forecast_start)

    # Prophet expects the time column as 'ds' and the value column as 'y'
    history_pd = dfStocks.toPandas().rename(
        columns={'timestamp': 'ds', 'lastprice': 'y'})
    print('Min data date: ', history_pd['ds'].min())
    print('Max data date: ', history_pd['ds'].max())

    # model parameters are fixed here; they could be loaded dynamically,
    # perhaps by looking up existing models in MLflow
    model_params = {
        'changepoint_prior_scale': 0.05,
        'changepoint_range': 0.95,
        'seasonality_prior_scale': 10,
        'weekly_seasonality': 5,
    }

    model = Prophet(**model_params)
    model.fit(history_pd)
    return model.predict(future_frame)

In [None]:
# time of this run (UTC); passed to write_predictions for every symbol below
generated_date = datetime.utcnow()
# start of the forecast (UTC); passed to build_and_predict for every symbol below
cutoff_date = datetime.utcnow()

# manually specify a cutoff date
# cutoff_date = '2023-12-28 01:23:45'
# cutoff_date = datetime.strptime(cutoff_date, '%Y-%m-%d %H:%M:%S')

# truncate the cutoff to the whole minute (seconds and microseconds set to zero)
cutoff_date = cutoff_date.replace(second=0, microsecond=0)
print(f'Cutoff date: {cutoff_date}')

# read all data into a dataframe
df_stocks = readStockHistoryLive()

# keep only data up until current date (no future looking data)
# (currently disabled: uncomment to drop rows at or after the cutoff)
# df_stocks = filterStocksByDate(df_stocks, cutoff_date)

# create the stocks prediction table if needed
create_prediction_table()

# get a list of all stock symbols in data
symbols_list = get_symbols(df_stocks)
print(symbols_list)

In [None]:
# preview the last 8 rows of the loaded data (sorted by timestamp in readStockHistoryLive)
df_stocks.tail(8)

In [None]:
# For each symbol: filter the data down to that symbol, build a model and
# forecast, then write the forecast to the predictions table.
# The model and write durations are printed in seconds.

for row in symbols_list:
    start_time = datetime.utcnow()
    print(f'Starting: {row.Symbol} at {start_time}')

    # model phase: filter + fit + predict
    df_stocks_filtered_symbol = filterStocksBySymbol(df_stocks, row.Symbol)
    forecast = build_and_predict(df_stocks_filtered_symbol, cutoff_date)
    forecast_finish_time = datetime.utcnow()
    forecast_elap = forecast_finish_time - start_time

    # write phase: merge the forecast into the table
    write_predictions(forecast, row.Symbol, generated_date)
    write_finish_time = datetime.utcnow()
    write_elap = write_finish_time - forecast_finish_time

    print(
        f'Completed: {row.Symbol} at {datetime.utcnow()}. '
        f'Model: {forecast_elap.total_seconds()} '
        f'Write: {write_elap.total_seconds()}'
    )

In [None]:
# spark.sql("DELETE FROM stocks_prediction")

In [None]:
# show the 1000 rows of stocks_prediction with the latest Predict_time values
df = spark.sql("SELECT * FROM stocks_prediction ORDER BY Predict_time DESC LIMIT 1000")
display(df)