# IMPORTING LIBRARIES

In [371]:
import pandas as pd
import numpy as np
from pyESN import ESN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import datetime
import pickle
import math

# data preprocessing

In [372]:
#ETL-> extract, Transform & Load Pipeline 
# This function will read the input dataframe and perform some necessary functions:
# 1. Dropping rows with null values
# 2. Converting dates/times column to date_time format under column name "Date"
# 3. Remove the percentage symbols from the values of column(like Change% etc.) to make if float[for calculations]

def etl(df):
    for i in df.columns: # iterating over columns of input dataframe
        if df[i].isna().sum() == df.shape[0]: #if all the values of certain column in null, drop that
            df.drop(i,axis=1,inplace=True)
            continue
        if i=="Time": # if column with name "Time" is found, convert its values to date_time format
            df["Date"] = pd.to_datetime(df[i])
            df.drop("Time",axis=1,inplace=True)
        elif i=="Date": # if column with name "Date" is found, convert its values to date_time format
            df[i] = pd.to_datetime(df[i])
        elif df[i].dtype == "object": # if some column is found with datatype other than float/int etc.
            #convert its values to str
            df[i] = df[i].astype(str)
            df[i] = df[i].apply(lambda x:re.sub('[%]',"",x))
            df[i] = df[i].astype(float)
    return df

In [373]:
data = etl(pd.read_csv("SPY.csv"))

In [374]:
data.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1993-01-29,43.96875,43.96875,43.75,43.9375,25.218218,1003200
1,1993-02-01,43.96875,44.25,43.96875,44.25,25.397591,480500


In [375]:
data.sort_values(by="Date",inplace=True)
data["Target"] = data["Close"].shift(-1)
data.dropna(inplace=True)
# df = data
# data = data.drop(["Date"],axis=1)

data.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Target
0,1993-01-29,43.96875,43.96875,43.75,43.9375,25.218218,1003200,44.25
1,1993-02-01,43.96875,44.25,43.96875,44.25,25.397591,480500,44.34375
2,1993-02-02,44.21875,44.375,44.125,44.34375,25.451399,201300,44.8125


In [376]:
FEATURES = ['Open', 'High', 'Close', 'Low']
TARGET = ["Target"]

# PRE-PROCESSING FUNTION

In [377]:
def get_data_for_model(data):
    FEATURES = ['Open', 'High', 'Close', 'Low']
    TARGET = ["Target"]
    return data[FEATURES].values,data[TARGET].values

In [378]:
def data_split(
    X : np.ndarray,
    y : np.ndarray,
):
    """Trai-test aplit

    Args:
        X (np.ndarray): feature array
        y (np.ndarray): target array
    """
    return train_test_split(X,y,train_size=0.8,random_state=101)

In [379]:
def scale_data(
    X_train : np.ndarray,
    X_test : np.ndarray,
    y_train : np.ndarray,
    y_test : np.ndarray,
):
    """Data standarizing

    Args:
        X (np.ndarray): feature array
        y (np.ndarray): target array
    """
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    pred_scaler = StandardScaler()
    y_train = pred_scaler.fit_transform(y_train)
    y_test = pred_scaler.transform(y_test)
    
    return scaler,pred_scaler,X_train,X_test,y_train,y_test

# PRE-PROCESSING

In [380]:
X,y = get_data_for_model(data)

In [381]:
X_train,X_test,y_train,y_test = data_split(X,y)

In [382]:
scaler,pred_scaler,X_train,X_test,y_train,y_test = scale_data(X_train,X_test,y_train,y_test)

# ECHO STATE NETWORK

### MODEL HELPER FUNCTIONS

In [414]:
def ESN_Model(
    n_inputs : int,
    n_outputs : int,
    n_reservoir :int = 5,
    sparsity : float = 0.2,
    rand_seed : int = 23,
    spectral_radius : float = 2.9,
    noise : float = .0003
) -> ESN :
    """
RNN updated model gives better time-series prediction by use or reservior neurons
Args:
    n_reservoir (_type_, optional): neurons in reservior. Defaults to 5:int.
    sparsity (_type_, optional): sparsity of reservior . Defaults to 0.2:float.
    rand_seed (_type_, optional): seed. Defaults to 23:int.
    spectral_radius (_type_, optional): r. Defaults to 2.9:float.
    noise (_type_, optional): regularization effect. Defaults to .0003:float.

Returns:
    pyESN.ESN: pyESN model
"""
    return ESN(n_inputs=n_inputs, n_outputs=n_outputs, n_reservoir=n_reservoir, sparsity=sparsity, spectral_radius=spectral_radius, random_state=42)

In [419]:
def train_and_evaluate(
    model : ESN,
    X_train : np.ndarray,
    X_test : np.ndarray,
    y_train : np.ndarray,
    y_test : np.ndarray,
):
    """ESN training and evaluation

    Args:
        model (ESN): Echo state network model object
        X_train (np.ndarray): feature array
        y_train (np.ndarray): target array
        X_test (np.ndarray): feature array
        y_test (np.ndarray): target array
    """
    print("Fitting ESN on train data")
    train_preds = model.fit( X_train , y_train )
    print("Making ESN predict on test data")
    y_hat = model.predict(X_test)
    
    print("ERRORS:")
    print(f"Mean Absolute Error[MAE] = {mean_absolute_error(y_test,y_hat) : .5f}")
    print(f"Mean Squared Error[MSE] = {mean_squared_error(y_test,y_hat) : .5f}")

In [433]:
def get_comparison_plot(
    data : pd.DataFrame,
    features : list,
    target : list,
    model : ESN,
    scaler : StandardScaler,
    pred_scaler : StandardScaler
    ):
    
    INPUT_FEATURES_VALUES = data[features].values #feature value from actual data frame
    ACTUAL_TARGET_VALUES = data[target].values #target column from actual data
    scaled_input = scaler.transform(INPUT_FEATURES_VALUES) #scaling features to feed model to get results
    
    # GETTING MODEL'S PREDICTION OVER FEATURES FROM DATA
    PREDICTED_MODEL_OUTPUT = model.predict( scaled_input)
    
    PREDICTED_MODEL_OUTPUT = pred_scaler.inverse_transform(PREDICTED_MODEL_OUTPUT.reshape(-1,1))

    resultant_dataframe = pd.DataFrame({'ACTUAL_TARGET_VALUES':ACTUAL_TARGET_VALUES.ravel(), 
                                        'PREDICTED_MODEL_OUTPUT':PREDICTED_MODEL_OUTPUT.ravel()})

    resultant_dataframe.insert(1, "residuals",
                               resultant_dataframe["PREDICTED_MODEL_OUTPUT"] - resultant_dataframe["ACTUAL_TARGET_VALUES"],
                               True)
    df_sub = ["#2BC97A" if x > 0 else "#C92B2B" for x in resultant_dataframe["residuals"].dropna()]

    resultant_dataframe.index = data.Date

    fig = go.Figure()
    fig.add_trace(go.Scatter(y = resultant_dataframe["ACTUAL_TARGET_VALUES"] , x = resultant_dataframe.index,
                          name = "ACTUAL" , mode="lines"))
    fig.add_trace(go.Scatter(y = resultant_dataframe["PREDICTED_MODEL_OUTPUT"] , x = resultant_dataframe.index,
                         name = "PREDICTED",mode = "lines"))
    fig.add_trace(
        go.Bar(y = resultant_dataframe["residuals"].dropna() ,  x = resultant_dataframe.index,
               marker = {'color' : df_sub}, name = "Difference")
    )
    fig.update_layout(
            xaxis = dict(rangeslider=dict(
                visible=True
            ),
            type="date"),
            hovermode="x unified" , width = 1000,
            legend_title= "TRACES",
            yaxis_title= "VALUE OF TARGET LAST" ,xaxis_title="TIME",
            hoverlabel = dict(
                bgcolor="white",
                font_size=16,
                font_family="Rockwell"
            )
        )
    return fig,resultant_dataframe

# MODEL

## ESN MODEL 

In [421]:
esn = ESN_Model(n_inputs=4 , n_outputs=1)

In [422]:
train_and_evaluate(esn,X_train,X_test,y_train,y_test)

Fitting ESN on train data
Making ESN predict on test data
ERRORS:
Mean Absolute Error[MAE] =  0.01289
Mean Squared Error[MSE] =  0.00052


In [434]:
comparison_fig , comparison_df = get_comparison_plot(
    data,
    FEATURES,
    TARGET,
    model = esn,
    scaler = scaler,
    pred_scaler = pred_scaler
    )

In [435]:
comparison_df

Unnamed: 0_level_0,ACTUAL_TARGET_VALUES,residuals,PREDICTED_MODEL_OUTPUT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1993-01-29,44.250000,-0.462470,43.787530
1993-02-01,44.343750,-0.033331,44.310419
1993-02-02,44.812500,-0.419840,44.392660
1993-02-03,45.000000,-0.123142,44.876858
1993-02-04,44.968750,-0.027580,44.941170
...,...,...,...
2023-02-10,412.829987,-4.875175,407.954812
2023-02-13,412.640015,0.102191,412.742206
2023-02-14,413.980011,-1.252461,412.727550
2023-02-15,408.279999,5.464722,413.744721


In [436]:
comparison_fig

In [465]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Target
0,1993-01-29,43.968750,43.968750,43.750000,43.937500,25.218218,1003200,44.250000
1,1993-02-01,43.968750,44.250000,43.968750,44.250000,25.397591,480500,44.343750
2,1993-02-02,44.218750,44.375000,44.125000,44.343750,25.451399,201300,44.812500
3,1993-02-03,44.406250,44.843750,44.375000,44.812500,25.720453,529400,45.000000
4,1993-02-04,44.968750,45.093750,44.468750,45.000000,25.828056,531500,44.968750
...,...,...,...,...,...,...,...,...
7563,2023-02-10,405.859985,408.440002,405.010010,408.040009,408.040009,70738000,412.829987
7564,2023-02-13,408.720001,412.970001,408.239990,412.829987,412.829987,64913500,412.640015
7565,2023-02-14,411.239990,415.049988,408.510010,412.640015,412.640015,88389300,413.980011
7566,2023-02-15,410.350006,414.059998,409.470001,413.980011,413.980011,61685300,408.279999


# PREDICTIONS ON INPUT DATA

In [469]:
def predict_new_data(
    file_name : str,
    model : ESN,
    scaler : StandardScaler,
    pred_scaler : StandardScaler
):
    new_data = etl(pd.read_csv(file_name))
    try:
        in_data = new_data[FEATURES]
    except Exception:
        in_data = new_data[["SPY Open","SPY High","SPY Last" , "SPY Low"]]
    in_data = scaler.transform(in_data)
    out_data = model.predict(in_data)
    out_data = pred_scaler.inverse_transform(out_data)
    out_data = out_data.ravel()[0]
    
    preds = pd.DataFrame({"SPY Last" : [new_data["SPY Last"].iloc[0] ,out_data]})
    new_data.Date = pd.to_datetime(new_data.Date)
    today = new_data.Date[0].date()
    tom = today + datetime.timedelta(days= 7-today.weekday() if today.weekday()>3 else 1)
    preds.index = [today,tom]
    
    return preds

In [470]:
predict_new_data("data.csv",esn,scaler,pred_scaler)


X has feature names, but StandardScaler was fitted without feature names



Unnamed: 0,SPY Last
2023-02-15,413.98
2023-02-16,413.77497


# ANALYZING DATE WHERE DIFFERENCE IS GREATER

In [463]:
comparison_df[abs(comparison_df.residuals)>5]

Unnamed: 0_level_0,ACTUAL_TARGET_VALUES,residuals,PREDICTED_MODEL_OUTPUT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-10-24,87.187500,6.918606,94.106106
1998-08-26,103.750000,5.234479,108.984479
1998-08-28,96.000000,7.410604,103.410604
1998-09-04,103.000000,-5.538158,97.461842
2000-01-03,139.750000,5.287541,145.037541
...,...,...,...
2023-01-25,404.750000,-5.038072,399.711928
2023-01-27,400.589996,6.014886,406.604882
2023-02-01,416.779999,-5.811126,410.968873
2023-02-06,415.190002,-5.282893,409.907109
