In [1]:
from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
from tensorflow import keras
import uvicorn

# Locate the local Spark installation and make pyspark importable.
# findspark.init() has to run before any of the pyspark imports below.
import findspark
findspark.init()

print("detected a spark instance!")

# Stop <pre> blocks from wrapping (presumably so wide DataFrame output stays
# on one line). `display` is imported explicitly from the public
# IPython.display module instead of relying on the builtin that the notebook
# kernel injects.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

print("set display settings!")
# import all the pyspark dependencies
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType, DateType
# NOTE(review): the star import is kept because later cells call `col` and
# `lag` unqualified; prefer the explicit `F.` alias in new code.
from pyspark.sql.functions import *
import pyspark.sql.functions as F

print("installed all keras and tensorflow dependencies for the LSTM model!")

# Reuse the running SparkContext if there is one, otherwise start a local one
# that uses every available core.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

# Session used for all Spark SQL DataFrame work in this notebook
# (SparkSession is already imported above).
spark = SparkSession.builder.getOrCreate()

print("intialized a spark context!")


# Silence every Python warning for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

print("all dependencies installed!")

detected a spark instance!


set display settings!
installed all keras and tensorflow dependencies for the LSTM model!
intialized a spark context!
all dependencies installed!


In [None]:
from flask import Flask, render_template, request
import numpy as np
import pandas as pd 
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, StandardScaler, RobustScaler
import tensorflow as tf
from pyspark.sql.window import Window
from tensorflow.keras.models import load_model
import io
import base64
import json 
    
# Flask application object; the /predict route is registered on it.
app = Flask(__name__)

# The saved Keras model is loaded once, when this cell runs, and the same
# object is reused by every request.
MODEL_PATH = 'LSTM_model_build_v0.h5'
model = load_model(MODEL_PATH)

def build_pipeline(df, features, sequence_length=12):
    """Turn the monthly stock DataFrame into per-row lagged feature sequences.

    Steps:
      1. Select and cast the columns used downstream.
      2. Keep only rows from September 2022 through September 2023.
      3. Assemble ``features`` into a ``feature_vectors`` column and fit a
         RobustScaler on it, producing ``scaled_features``.
      4. For every row add ``lag_1`` .. ``lag_<sequence_length>``, where
         ``lag_k`` is the ``feature_vectors`` value found ``k`` rows earlier
         for the same instrument, ordered by (Year, Month). ``lag_k`` is null
         when fewer than ``k`` earlier rows exist.

    Args:
        df: Spark DataFrame containing at least the columns Instrument, Year,
            Month, Avg_Price_Open, Avg_Price_Close, Avg_Price_High and
            Avg_Daily_Return.
        features: list of column names (from the numeric columns above) to
            assemble into the feature vector.
        sequence_length: number of lag columns to create. Defaults to 12,
            the value that used to be hard-coded.

    Returns:
        Spark DataFrame with the selected columns plus ``feature_vectors``,
        ``scaled_features`` and the lag columns.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    df_model_set = df.select(
        F.col("Instrument").cast(StringType()).alias("Instrument"),
        F.col("Year").cast(IntegerType()).alias("Year"),
        F.col("Month").cast(IntegerType()).alias("Month"),
        F.col("Avg_Price_Open").cast(DoubleType()).alias("Avg_Price_Open"),
        F.col("Avg_Price_Close").cast(DoubleType()).alias("Avg_Price_Close"),
        F.col("Avg_Price_High").cast(DoubleType()).alias("Avg_Price_High"),
        F.col("Avg_Daily_Return").cast(DoubleType()).alias("Avg_Daily_Return"),
    )

    # Fixed window: September 2022 through September 2023, both inclusive.
    in_window = (
        ((F.col("Year") == 2022) & (F.col("Month") >= 9)) |
        ((F.col("Year") == 2023) & (F.col("Month") <= 9))
    )
    df_model_set = df_model_set.filter(in_window).sort('Year', 'Month')

    vector_assembler = VectorAssembler(inputCols=features, outputCol="feature_vectors")
    df_assembled = vector_assembler.transform(df_model_set)

    r_scaler = RobustScaler(
        inputCol="feature_vectors",
        outputCol="scaled_features",
        withScaling=True,
        withCentering=True,
    )
    df_scaled = r_scaler.fit(df_assembled).transform(df_assembled)

    instrument_window = Window.partitionBy("Instrument").orderBy("Year", "Month")

    # NOTE(review): the lags are taken from the *unscaled* "feature_vectors"
    # column, so "scaled_features" is computed but never fed into the
    # sequences. Confirm against how the model was trained before switching.
    #
    # All lag columns are added in a single select: calling withColumn in a
    # loop adds one projection per call and inflates the query plan.
    lag_columns = [
        F.lag(F.col("feature_vectors"), offset)
        .over(instrument_window)
        .alias("lag_{}".format(offset))
        for offset in range(1, sequence_length + 1)
    ]

    return df_scaled.select("*", *lag_columns)


@app.route('/predict', methods=['GET','POST'])
def predict():
    """Serve the prediction form and, on POST, the model output for one instrument.

    GET renders ``index.html`` with no context.

    POST reads the instrument code from the ``text_input`` form field, loads
    ``kaggle_stock_data_transformed.csv`` through Spark, looks up the
    instrument's sector metadata, builds the 12-step lag sequence attached to
    the instrument's September 2023 row and passes it to the loaded model.

    Returns:
        The rendered ``index.html`` page. On success the template receives
        ``ric``, ``prediction`` (first row of ``model.predict`` as a list) and
        ``sector_info``. If the instrument is not in the CSV, or it has no
        complete lag sequence for 2023-09, the page is rendered with only an
        ``error`` message and HTTP status 404 instead of failing with a 500.

    Raises:
        A Flask 400 Bad Request error if the POST form has no ``text_input``
        field.
    """
    if request.method != 'POST':
        return render_template('index.html')

    # Must equal the number of lag_* columns build_pipeline creates by default.
    sequence_length = 12

    # Surrounding whitespace in the submitted code would never match a row.
    ric = request.form['text_input'].strip()

    # NOTE(review): the CSV is re-read and the scaler re-fitted on every
    # request; consider caching if latency matters.
    df = spark.read.csv('kaggle_stock_data_transformed.csv', header=True)
    df = df.drop('Instrument9').withColumnRenamed('Instrument0','Instrument')

    # Any one row of the instrument supplies the descriptive fields
    # (presumably they repeat on every monthly row). first() returns None
    # when the instrument is unknown.
    sector_data = df.filter(col('Instrument')==ric).first()
    if sector_data is None:
        # Rendered with only `error`, mirroring the context-free GET render,
        # which the template already has to tolerate. Whether index.html
        # displays `error` is not visible from this cell.
        return render_template('index.html', error="Unknown instrument: {}".format(ric)), 404

    sector_info = {
        "Company_Common_Name": sector_data['Company_Common_Name'],
        "Economic_Sector_Name": sector_data['TRBC_Economic_Sector_Name'],
        "Business_Sector_Name": sector_data['TRBC_Business_Sector_Name'],
        "Industry_Group_Name": sector_data['TRBC_Industry_Group_Name'],
        "Industry_Name": sector_data['TRBC_Industry_Name'],
        "Activity_Name": sector_data['TRBC_Activity_Name'],
    }

    print(sector_info)

    df_pipeline = build_pipeline(df, ['Avg_Price_Open','Avg_Price_Close'])
    df_last = df_pipeline.filter(
        (col('Instrument')==ric) & (col('Year')==2023) & (col('Month')==9)
    )

    lag_names = ["lag_{}".format(i) for i in range(1, sequence_length+1)]
    rows = df_last.select(lag_names).collect()

    # No 2023-09 row, or a null lag (too little history), cannot be turned
    # into a numeric model input.
    if not rows or any(value is None for row in rows for value in row):
        return render_template(
            'index.html',
            error="Not enough history to predict for instrument: {}".format(ric),
        ), 404

    X_val = np.array(rows)
    next_6_months_predictions = model.predict(X_val).tolist()[0]

    return render_template('index.html', ric=ric, prediction=next_6_months_predictions, sector_info=sector_info)

if __name__ == '__main__':
    # Start Flask's built-in server. Host 0.0.0.0 makes it listen on every
    # network interface, not just localhost; the call blocks until interrupted.
    server_options = {'host': '0.0.0.0', 'port': 5000}
    app.run(**server_options)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.62:5000
Press CTRL+C to quit
127.0.0.1 - - [13/Oct/2023 08:45:58] "GET /predict HTTP/1.1" 200 -


{'Company_Common_Name': 'Covestro AG', 'Economic_Sector_Name': 'Basic Materials', 'Business_Sector_Name': 'Chemicals', 'Industry_Group_Name': 'Chemicals', 'Industry_Name': 'Commodity Chemicals', 'Activity_Name': 'Plastics'}


127.0.0.1 - - [13/Oct/2023 08:46:03] "POST /predict HTTP/1.1" 200 -


{'Company_Common_Name': '1&1 AG', 'Economic_Sector_Name': 'Technology', 'Business_Sector_Name': 'Telecommunications Services', 'Industry_Group_Name': 'Telecommunications Services', 'Industry_Name': 'Wireless Telecommunications Services', 'Activity_Name': 'Wireless Telecoms Service Providers'}


127.0.0.1 - - [13/Oct/2023 08:46:27] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [13/Oct/2023 08:46:55] "GET /predict HTTP/1.1" 200 -
