## Introduction

This notebook is a follow-up to DS 1 - Stock Analysis. In this notebook, the models stored in MLflow are downloaded and used to generate future predictions; those predictions are stored in the lakehouse, where they can then be leveraged in Power BI.

In [None]:
!pip install prophet
import pyspark.sql.functions as F
import mlflow
import pandas as pd
import datetime
from datetime import timedelta
from pyspark.sql.functions import concat, col, lit, when, substring 
from pyspark.sql.types import *
from mlflow import MlflowClient
from mlflow.entities import ViewType

In [None]:
def create_prediction_table():
    """Create the ``stock_predictions`` Delta table if it does not exist yet.

    The table holds one row per predicted timestamp and symbol:
    ``predict_time``, ``symbol``, the forecast value ``yhat`` with its
    ``yhat_lower`` / ``yhat_upper`` bounds, and the ``generated`` timestamp.
    Uses the ``spark`` session that is in scope in the notebook.
    """
    # CREATE TABLE IF NOT EXISTS makes this safe to run on every execution.
    ddl = """
        CREATE TABLE IF NOT EXISTS stock_predictions (
            predict_time TIMESTAMP
            ,symbol VARCHAR(5)
            ,yhat DOUBLE
            ,yhat_lower DOUBLE
            ,yhat_upper DOUBLE
            ,generated TIMESTAMP)
        USING DELTA
        """
    spark.sql(ddl)

In [None]:
class StockPrediction:
    """Describes one pending forecast: which model to load and for which symbol.

    Instance fields:
        model_uri: URI of the model to load for this symbol.
        symbol: Stock symbol the model belongs to.
        generated: Naive UTC datetime captured when the instance is created.
    """

    def __init__(self, model_uri, symbol) -> None:
        # Capture the creation time up front; it is stored as a naive UTC value.
        self.generated = datetime.datetime.utcnow()
        self.model_uri, self.symbol = model_uri, symbol

In [None]:
# merge the predictions with the table in the lakehouse

from delta.tables import *

def write_predicitions(predicitions_pd, symbol, generated):
    """Upsert one symbol's forecast into the ``stock_predictions`` Delta table.

    Rows are matched on (``predict_time``, ``symbol``). Matching rows get their
    forecast values and ``generated`` stamp overwritten; all other rows are
    inserted.

    Args:
        predicitions_pd: pandas DataFrame of forecast rows. It must provide the
            columns ``ds``, ``yhat``, ``yhat_lower`` and ``yhat_upper``.
        symbol: Stock symbol the forecast belongs to.
        generated: Time the forecast was produced; written to the table via
            its ``str()`` representation.
    """
    # Attach the symbol as a source column so the merge can reference it as a
    # column. Splicing the raw value into the SQL condition breaks on symbols
    # containing quote characters and mixed two different quoting styles.
    predicitions_df = spark.createDataFrame(predicitions_pd).withColumn(
        "symbol", lit(symbol)
    )

    # Same literal is used for both the update and the insert branch.
    generated_literal = f"'{str(generated)}'"

    stock_predictions_table = DeltaTable.forName(spark, "stock_predictions")

    (
        stock_predictions_table.alias('table')
        .merge(
            predicitions_df.alias('predicitions'),
            'table.predict_time = predicitions.ds '
            'and table.symbol = predicitions.symbol',
        )
        .whenMatchedUpdate(
            set={
                "yhat": "predicitions.yhat",
                "yhat_lower": "predicitions.yhat_lower",
                "yhat_upper": "predicitions.yhat_upper",
                "generated": generated_literal,
            }
        )
        .whenNotMatchedInsert(
            values={
                "predict_time": "predicitions.ds",
                "symbol": "predicitions.symbol",
                "yhat": "predicitions.yhat",
                "yhat_lower": "predicitions.yhat_lower",
                "yhat_upper": "predicitions.yhat_upper",
                "generated": generated_literal,
            }
        )
        .execute()
    )

In [None]:
def get_symbols():
    """Return the stock symbols to forecast as collected Spark rows.

    Each returned row exposes the symbol under the ``Symbol`` field.
    """
    # If the lakehouse module has been completed, the symbols can be read
    # from the dim_symbol table instead of the hard-coded list below:
    # symbol_df = spark.sql("SELECT Symbol FROM StocksLakehouse.dim_symbol")

    # Manual list, used when dim_symbol (from the lakehouse module) is absent.
    symbol_names = ['BCUZ', 'IDGD', 'IDK', 'TDY', 'TMRW', 'WHAT', 'WHO', 'WHY']
    single_column_rows = [[name] for name in symbol_names]

    return spark.createDataFrame(single_column_rows, ['Symbol']).collect()

In [None]:
# looks up, per stock symbol, the newest finished MLflow run and its model
def create_predicition_list(symbols):
    """Build a ``StockPrediction`` for every symbol that has a usable run.

    For each symbol the experiment ``<symbol>-stock-prediction`` is searched
    for active runs with status 'Finished', newest first. Symbols without any
    such run are skipped.

    Args:
        symbols: Iterable of rows that expose the symbol under ``'Symbol'``.

    Returns:
        List of ``StockPrediction`` objects, one per symbol with a matching run.
    """
    found = []

    for symbol in (entry['Symbol'] for entry in symbols):
        matching_runs = mlflow.search_runs(
            experiment_names=[f"{symbol}-stock-prediction"],
            run_view_type=ViewType.ACTIVE_ONLY,
            filter_string="attributes.status = 'Finished'",
            order_by=["attributes.start_time DESC"],
        )

        # Nothing trained (or finished) for this symbol yet.
        if matching_runs.empty:
            continue

        # Results are sorted newest-first, so row 0 is the most recent run.
        newest_run_id = matching_runs.iloc[0].run_id
        uri = f"runs:/{newest_run_id}/{symbol}-stock-prediction-model"
        print(uri)
        found.append(StockPrediction(uri, symbol))

    return found

In [None]:
# establish begin/end dates for prediction
# returns a dataframe with one 'ds' timestamp per minute of the forecast window

def make_prediction_dataframe(fromdate=None):
    """Build the frame of timestamps to forecast, covering the next 7 days.

    Args:
        fromdate: Start of the forecast window. When omitted (``None``), the
            current UTC time is used, evaluated at call time.

    Returns:
        pandas DataFrame with a single ``ds`` column holding one timestamp per
        minute from ``fromdate`` through ``fromdate`` + 7 days (both inclusive
        when they fall on the minute grid).
    """
    # A default of utcnow() in the signature would be evaluated only once, when
    # the function is defined, and go stale; resolve it on every call instead.
    if fromdate is None:
        fromdate = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    enddate = fromdate + datetime.timedelta(days=7)

    print(f'Beginning of forecast: {fromdate}')
    print(f'End of forecast: {enddate}')

    # 'min' is the current pandas alias for minute frequency ('T' is deprecated).
    future = pd.DataFrame({'ds': pd.date_range(start=fromdate, end=enddate, freq='min')})
    return future

In [None]:
import mlflow

def load_and_predict(prediction):
    """Load the model referenced by *prediction* and forecast the coming week.

    Args:
        prediction: Object exposing ``model_uri``, the URI of the model to load.

    Returns:
        The result of the loaded model's ``predict`` call on the frame built
        by ``make_prediction_dataframe``.
    """
    model = mlflow.prophet.load_model(prediction.model_uri)

    # Start on a whole minute: drop seconds and microseconds from "now" (UTC).
    window_start = datetime.datetime.utcnow().replace(second=0, microsecond=0)

    # Alternative left for reference, using the model's own helper:
    # predict_df = loaded_model.make_future_dataframe(periods=60*24*7, freq='min', include_history = False)
    timestamps = make_prediction_dataframe(window_start)

    return model.predict(timestamps)

In [None]:
# Driver cell: runs the whole pipeline end to end.

# create the stocks prediction table if needed (no-op when it already exists)
create_prediction_table()

# get all stock symbols; despite the name, symbol_df holds the collected rows
# returned by get_symbols(), not a Spark DataFrame
symbol_df = get_symbols()

# search mlflow for the latest finished model of each symbol;
# symbols without a matching run are left out of the list
prediction_list = create_predicition_list(symbol_df)

# build the predictions for each model and merge them into the lakehouse table
for prediction in prediction_list:
    print(f"{prediction.symbol} {prediction.model_uri}")
    forecast = load_and_predict(prediction)
    write_predicitions(forecast, prediction.symbol, prediction.generated)

In [None]:
# WARNING: DELETE without a WHERE clause removes every row from
# stock_predictions. Run this cell only to reset the table.
spark.sql("DELETE FROM stock_predictions")

In [None]:
# Preview the 1000 earliest predictions, ordered by predict_time.
# NOTE(review): this query qualifies the table as StocksLakehouse.stock_predictions,
# while the other cells use the unqualified name stock_predictions -- confirm
# both resolve to the same table in the attached lakehouse.
df = spark.sql("SELECT * FROM StocksLakehouse.stock_predictions ORDER BY predict_time ASC LIMIT 1000")
display(df)