In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta


In [63]:
weight_dict = {'2008-09-24' :0.273, '2014-11-26' : 0.727, '2020-03-06' : np.nan}

In [65]:
# fix random seed for reproducibility
tf.random.set_seed(7)

In [66]:
# plt.plot(donor5)
# plt.show()

In [67]:
COP = yf.Ticker("COP")

In [68]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

In [133]:
residual_dict = {}

for date in weight_dict.keys():

    date_to_use = datetime.strptime(date, '%Y-%m-%d')
    print('We will now use ', date_to_use)
    
    date_to_start = date_to_use - timedelta(days=200)
    print('Training period begins ', date_to_start)

    date_start_test = date_to_use + timedelta(days=1)
    print('Period to predict begins ', date_start_test)
    
    date_end_test = date_to_use + timedelta(days=8)
    print('Prediction period ends ', date_end_test)

    COP_dat = COP.history(period="max")

    COP_dat.index = COP_dat.index.tz_convert(None)

    COP_dat.reset_index(inplace = True)

    
    COP_dat['log_close'] = np.log(COP_dat['Close'])
    COP_dat['log_diff'] = COP_dat['log_close'].diff()
    COP_dat['Date'] =  pd.to_datetime(COP_dat['Date'], format='%Y-%m-%d')

    training_set = COP_dat.loc[ (COP_dat['Date'] >= date_to_start) & (COP_dat['Date'] <= date_to_use)  ].Close

    test = COP_dat.loc[ (COP_dat['Date'] >= date_start_test) & (COP_dat['Date'] <= date_end_test)  ].Close

    training_set = training_set.to_numpy()
    training_set = training_set.reshape((training_set.shape[0], 1))
    test = test.to_numpy()
    test = test.reshape((test.shape[0], 1))

    print('Our training set has size ', training_set.shape)
    print('Our testing set has size ', test.shape)

    # normalize the datasets
    scaler = MinMaxScaler(feature_range=(0, 1))
    training_set = scaler.fit_transform(training_set)
    
    test = scaler.transform(test)

    # reshape into X=t and Y=t+1
    look_back = 1
    trainX, trainY = create_dataset(training_set, look_back)

    testX, testY = create_dataset(test, look_back)

    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(5, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=25, batch_size=4, verbose=2)

    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)

    # invert predictions
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])

    print('We print the predictions', testPredict)
    print('We print the ground truth', testY)


    residuals = testY - testPredict

    print('The dimension of the residuals is ', residuals.shape)

    residual_dict.update({ date : residuals })

We will now use  2008-09-24 00:00:00
Training period begins  2008-03-08 00:00:00
Period to predict begins  2008-09-25 00:00:00
Prediction period ends  2008-10-02 00:00:00
Our training set has size  (138, 1)
Our testing set has size  (5, 1)
Epoch 1/25


  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  super().__init__(**kwargs)


34/34 - 0s - 9ms/step - loss: 0.2818
Epoch 2/25
34/34 - 0s - 404us/step - loss: 0.2178
Epoch 3/25
34/34 - 0s - 389us/step - loss: 0.1655
Epoch 4/25
34/34 - 0s - 354us/step - loss: 0.1238
Epoch 5/25
34/34 - 0s - 352us/step - loss: 0.0918
Epoch 6/25
34/34 - 0s - 370us/step - loss: 0.0685
Epoch 7/25
34/34 - 0s - 378us/step - loss: 0.0524
Epoch 8/25
34/34 - 0s - 377us/step - loss: 0.0420
Epoch 9/25
34/34 - 0s - 371us/step - loss: 0.0356
Epoch 10/25
34/34 - 0s - 389us/step - loss: 0.0318
Epoch 11/25
34/34 - 0s - 377us/step - loss: 0.0294
Epoch 12/25
34/34 - 0s - 361us/step - loss: 0.0278
Epoch 13/25
34/34 - 0s - 379us/step - loss: 0.0265
Epoch 14/25
34/34 - 0s - 373us/step - loss: 0.0254
Epoch 15/25
34/34 - 0s - 379us/step - loss: 0.0243
Epoch 16/25
34/34 - 0s - 360us/step - loss: 0.0232
Epoch 17/25
34/34 - 0s - 374us/step - loss: 0.0221
Epoch 18/25
34/34 - 0s - 373us/step - loss: 0.0210
Epoch 19/25
34/34 - 0s - 373us/step - loss: 0.0199
Epoch 20/25
34/34 - 0s - 388us/step - loss: 0.0188
Ep

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  super().__init__(**kwargs)


35/35 - 0s - 10ms/step - loss: 0.3147
Epoch 2/25
35/35 - 0s - 429us/step - loss: 0.2405
Epoch 3/25
35/35 - 0s - 401us/step - loss: 0.1749
Epoch 4/25
35/35 - 0s - 383us/step - loss: 0.1197
Epoch 5/25
35/35 - 0s - 404us/step - loss: 0.0776
Epoch 6/25
35/35 - 0s - 410us/step - loss: 0.0500
Epoch 7/25
35/35 - 0s - 413us/step - loss: 0.0348
Epoch 8/25
35/35 - 0s - 416us/step - loss: 0.0279
Epoch 9/25
35/35 - 0s - 437us/step - loss: 0.0250
Epoch 10/25
35/35 - 0s - 431us/step - loss: 0.0234
Epoch 11/25
35/35 - 0s - 421us/step - loss: 0.0222
Epoch 12/25
35/35 - 0s - 414us/step - loss: 0.0211
Epoch 13/25
35/35 - 0s - 430us/step - loss: 0.0200
Epoch 14/25
35/35 - 0s - 417us/step - loss: 0.0189
Epoch 15/25
35/35 - 0s - 423us/step - loss: 0.0178
Epoch 16/25
35/35 - 0s - 403us/step - loss: 0.0167
Epoch 17/25
35/35 - 0s - 398us/step - loss: 0.0157
Epoch 18/25
35/35 - 0s - 402us/step - loss: 0.0146
Epoch 19/25
35/35 - 0s - 407us/step - loss: 0.0136
Epoch 20/25
35/35 - 0s - 415us/step - loss: 0.0126
E

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  super().__init__(**kwargs)


34/34 - 0s - 8ms/step - loss: 0.4390
Epoch 2/25
34/34 - 0s - 439us/step - loss: 0.3562
Epoch 3/25
34/34 - 0s - 365us/step - loss: 0.2850
Epoch 4/25
34/34 - 0s - 346us/step - loss: 0.2202
Epoch 5/25
34/34 - 0s - 362us/step - loss: 0.1626
Epoch 6/25
34/34 - 0s - 372us/step - loss: 0.1148
Epoch 7/25
34/34 - 0s - 385us/step - loss: 0.0794
Epoch 8/25
34/34 - 0s - 387us/step - loss: 0.0567
Epoch 9/25
34/34 - 0s - 383us/step - loss: 0.0443
Epoch 10/25
34/34 - 0s - 386us/step - loss: 0.0386
Epoch 11/25
34/34 - 0s - 389us/step - loss: 0.0359
Epoch 12/25
34/34 - 0s - 376us/step - loss: 0.0344
Epoch 13/25
34/34 - 0s - 385us/step - loss: 0.0333
Epoch 14/25
34/34 - 0s - 392us/step - loss: 0.0322
Epoch 15/25
34/34 - 0s - 382us/step - loss: 0.0311
Epoch 16/25
34/34 - 0s - 363us/step - loss: 0.0300
Epoch 17/25
34/34 - 0s - 369us/step - loss: 0.0289
Epoch 18/25
34/34 - 0s - 369us/step - loss: 0.0278
Epoch 19/25
34/34 - 0s - 373us/step - loss: 0.0267
Epoch 20/25
34/34 - 0s - 371us/step - loss: 0.0256
Ep

In [None]:
# dot product

In [138]:
list(weight_dict.values())[0:1]

[0.273]

In [141]:
np.dot(list(weight_dict.values())[0:2], [(33.20644379 - 34.683964)/33.20644379, (49.82816696-51.57873)/49.82816696] )

-0.03768809306171332

In [142]:
np.dot(list(weight_dict.values())[0:2], [(33.20644379 - 34.683964), (49.82816696-51.57873)] )

-1.6760223474100027

In [None]:
#And now here is the thing we are correcting

In [143]:
42.685432 - 1.6760223474100027

41.009409652589994

In [129]:
list(residual_dict.keys())

['2008-09-24', '2014-11-26', '2020-03-06']

In [130]:
residual_dict['2008-09-24']

array([-1.12805557, -4.14643478, -2.43036079])

In [131]:
residual_dict.items()

dict_items([('2008-09-24', array([-1.12805557, -4.14643478, -2.43036079])), ('2014-11-26', array([-3.65476608, -2.59600449])), ('2020-03-06', array([ -7.93042374,  -9.89914513, -13.59899712]))])

In [95]:
len(residual_list)

3

In [96]:
residual_list[0].shape

(10734,)

In [49]:

residual_list

[array([ -0.2918092 ,  -0.33033115,  -0.34959191, ..., 108.59415233,
        109.88414562, 107.07414806]),
 array([ -0.2529037 ,  -0.29142547,  -0.31068671, ..., 108.63305807,
        109.92305136, 107.1130538 ]),
 array([ -0.2541182 ,  -0.29264051,  -0.31190109, ..., 108.63184333,
        109.92183661, 107.11183906])]

In [16]:
np.dot(residual_list, 

datetime.datetime(2014, 8, 20, 0, 0)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [1335]:
testPredict.shape

(7, 1)

In [1336]:
testPredict

array([[0.00098109],
       [0.00047128],
       [0.00071508],
       [0.00062283],
       [0.00214965],
       [0.00214986],
       [0.00206605]], dtype=float32)

In [1309]:
testY

array([[49.82816696, 50.88693237, 52.05597305, 50.96044922, 49.88698578,
        47.81357956, 47.76211166]])

In [1311]:
residuals

array([[ 1.20053864,  2.25930405,  3.42834473,  2.33282089,  1.25935745,
        -0.81404877, -0.86551666],
       [-0.10146332,  0.95730209,  2.12634277,  1.03081894, -0.0426445 ,
        -2.11605072, -2.16751862],
       [-1.20051956, -0.14175415,  1.02728653, -0.0682373 , -1.14170074,
        -3.21510696, -3.26657486],
       [-2.40596771, -1.3472023 , -0.17816162, -1.27368546, -2.3471489 ,
        -4.42055511, -4.47202301],
       [-1.27660751, -0.2178421 ,  0.95119858, -0.14432526, -1.2177887 ,
        -3.29119492, -3.34266281],
       [-0.16265106,  0.89611435,  2.06515503,  0.9696312 , -0.10383224,
        -2.17723846, -2.22870636],
       [ 1.99699402,  3.05575943,  4.22480011,  3.12927628,  2.05581284,
        -0.01759338, -0.06906128]])

In [1117]:
# Weights from Lin and Eck 2021 W∗ = (0.000, 0.000, 0.000, 0.273, 0.727) 

# ("2008-03-14" , "2008-09-05", "2008-09-12", "2008-09-25"  ,"2014-11-26")

# https://arxiv.org/pdf/2008.11756



In [1118]:
trainPredict

array([[ 2.93182151e-04],
       [ 1.97510628e-04],
       [-4.93526284e-04],
       [ 3.02714398e-05],
       [-1.12913665e-03],
       [-2.96893140e-05],
       [-4.93408064e-04],
       [-1.41926212e-04],
       [ 4.71628329e-04],
       [-4.53065644e-04],
       [ 4.53223533e-04],
       [-6.43673848e-05],
       [ 5.72077581e-04],
       [-1.89835016e-04],
       [ 2.54734798e-04],
       [-3.89885012e-04],
       [ 1.19183322e-04],
       [ 1.59838688e-04],
       [ 2.20494883e-04],
       [ 3.70553229e-04],
       [-2.39608763e-03],
       [ 4.55129048e-04],
       [-5.94123812e-05],
       [-4.16596828e-04],
       [-8.17058433e-04],
       [-1.47478067e-05],
       [-7.89818121e-04],
       [ 1.59602234e-04],
       [-8.55731836e-04],
       [-5.41970599e-04],
       [-7.76674307e-04],
       [-6.32440264e-04],
       [-1.80699502e-03],
       [ 7.26666709e-04],
       [-2.11399677e-03],
       [ 1.37803238e-03],
       [-2.36518937e-03],
       [-3.65868036e-04],
       [-7.6

In [1255]:
# LSTM for international airline passengers problem with memory
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)
    
# fix random seed for reproducibility
tf.random.set_seed(7)

# load the dataset
dataset = donor5
dataset = dataset.astype('float32')
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# split into train and test sets
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]

# reshape into X=t and Y=t+1
look_back = 3
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))

# create and fit the LSTM network
batch_size = 1
model = Sequential()
model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
for i in range(100):
    model.fit(trainX, trainY, epochs=1, batch_size=batch_size, verbose=2, shuffle=False)
    model.reset_states()
    
# make predictions
trainPredict = model.predict(trainX, batch_size=batch_size)
model.reset_states()
testPredict = model.predict(testX, batch_size=batch_size)

# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

# shift train predictions for plotting
trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict

# shift test predictions for plotting
testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict

# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

ValueError: Unrecognized keyword arguments passed to LSTM: {'batch_input_shape': (1, 3, 1)}