In [1]:
import copy
import json
from dataclasses import dataclass
from pathlib import Path
from typing import List

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import keras_tuner
from keras import layers, ops
from keras.utils import timeseries_dataset_from_array
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, Normalizer
from sklearn.linear_model import LinearRegression
%matplotlib inline

from processing import split_indices, make_stationary, inverse_stationary, scale, unscale, sequence, Stationarizer, Normalizer
#from tuner import LSTMLayerHyperparams, ModelHyperparams
from plotting import visualize_loss, show_plot
from model_builder import build_model, get_tuner

2024-08-03 17:37:36.022342: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Adjusted close prices as a single-feature column of shape (n_samples, 1).
adj_close = pd.read_csv("data/YF_AAPL.csv")["Adj Close"]
data = adj_close.to_numpy().reshape(-1, 1)

# Split boundaries: first 70% -> train, next 15% -> validation, last 15% -> test.
# The split is positional, so it is chronological only if the CSV rows are in
# date order (TODO confirm against the data file).
n_samples = len(data)
train_idx = int(0.7 * n_samples)
val_idx = int(0.85 * n_samples)

train_data, val_data, test_data = (
    data[:train_idx],
    data[train_idx:val_idx],
    data[val_idx:],
)
# Train and validation together; used later to fit the scaler.
non_test_data = data[:val_idx]

In [3]:
# Peek at the first 20 validation values (raw prices; scaling happens in a later cell).
val_data[:20]

array([[31.91248322],
       [31.92647171],
       [32.57902908],
       [32.38559341],
       [32.5766983 ],
       [32.47415924],
       [32.51610565],
       [32.39491653],
       [32.32033157],
       [32.42753601],
       [32.44152069],
       [32.39258194],
       [32.7351799 ],
       [32.78879166],
       [32.6256485 ],
       [32.9682312 ],
       [32.5906868 ],
       [32.9589119 ],
       [32.84238052],
       [32.77712631]])

In [4]:
# Standardize the series with a StandardScaler fitted on train + validation.
#
# NOTE(review): fitting on non_test_data lets validation statistics shape the
# training inputs; fit on train_data alone if strict separation is required.
# NOTE(review): train_data / val_data are overwritten here, so this cell must run
# exactly once after the split cell -- re-running it scales already-scaled data.
#
# Disabled preprocessing alternatives, kept for reference:
#train_stater = Stationarizer()
#val_stater = Stationarizer()
#train_data = train_stater.fit_transform(train_data)
#val_data = val_stater.fit_transform(val_data)
#normer = Normalizer()
#normer = PowerTransformer(standardize=False)
normer = StandardScaler().fit(non_test_data)
train_data, val_data = (normer.transform(part) for part in (train_data, val_data))

In [5]:
# Inspect the training values after scaling (numpy abbreviates the long array).
train_data

array([[-0.73677596],
       [-0.74013057],
       [-0.73959742],
       ...,
       [ 0.72812825],
       [ 0.72176291],
       [ 0.72318917]])

In [6]:
# Persist the fitted scaler to disk; joblib picks gzip compression from the
# ".gz" suffix. The call returns the list of written file names.
joblib.dump(normer, "AAPL_normer.gz")

['AAPL_normer.gz']

In [7]:
# Sliding-window setup: every sample holds `window` consecutive scaled values and
# its target is the value right after that window (hence the matching offsets on
# the input and target slices).
window = 20
window_options = dict(sequence_length=window, batch_size=16, shuffle=True)

train_ds = timeseries_dataset_from_array(
    train_data[:-window], train_data[window:], **window_options
)
val_ds = timeseries_dataset_from_array(
    val_data[:-window], val_data[window:], **window_options
)

# Pull a single batch to sanity-check the tensor shapes.
for batch in train_ds.take(1):
    inputs, targets = batch

for label, tensor in (("Input shape:", inputs), ("Target shape:", targets)):
    print(label, tensor.numpy().shape)

Input shape: (16, 20, 1)
Target shape: (16, 1)


2024-08-03 17:37:37.621126: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
# Write both windowed datasets to disk under data/AAPL_ds/.
train_ds.save("data/AAPL_ds/train")
val_ds.save("data/AAPL_ds/val")

In [None]:
# Build the tuner from model_builder.get_tuner and run the hyperparameter search:
# each trial is fit on train_ds with epochs=15 and validated on val_ds.
# NOTE(review): the search space, objective and trial budget are defined inside
# get_tuner() and are not visible in this notebook -- check model_builder first.
tuner = get_tuner()
tuner.search(train_ds, epochs=15, validation_data=val_ds)

Trial 84 Complete [00h 01m 09s]
val_mse: 2.3316116333007812

Best val_mse So Far: 1.3993076086044312
Total elapsed time: 01h 31m 08s

Search: Running Trial #85

Value             |Best Value So Far |Hyperparameter
3                 |3                 |num_layers
112               |32                |units_0
tanh              |tanh              |activation_0
sigmoid           |tanh              |recurrent_activation_0
0                 |0.15              |dropout_0
0.6               |0.6               |recurrent_dropout_0
0.0061575         |0.00047207        |lr
112               |112               |units_1
tanh              |sigmoid           |activation_1
tanh              |tanh              |recurrent_activation_1
0.15              |0.45              |dropout_1
0.15              |0.45              |recurrent_dropout_1
80                |48                |units_2
tanh              |sigmoid           |activation_2
sigmoid           |tanh              |recurrent_activation_2
0.3       

In [None]:
def load_trials(tuning_dir):
    """Load every scored tuner trial found under ``tuning_dir``.

    Trial files are discovered with the pattern ``trial_*/trial.json`` instead of
    assuming a fixed number of trials, so an interrupted or differently sized
    search can still be analysed.

    Args:
        tuning_dir: Directory that contains the ``trial_XXX`` sub-directories.

    Returns:
        A list with one parsed ``trial.json`` dict per trial, ordered by
        directory name. Trials whose ``"score"`` is missing or ``None`` are
        skipped, because the analysis cells below sort on the score and take its
        square root. The list is empty when no trial file is found.
    """
    loaded = []
    for trial_file in sorted(Path(tuning_dir).glob("trial_*/trial.json")):
        with trial_file.open(mode="r") as f:
            trial = json.load(f)
        # A trial without a score cannot be ranked or plotted.
        if trial.get("score") is None:
            continue
        loaded.append(trial)
    return loaded


trials = load_trials("tuning/stock_predictor")

# Number of usable trials -- compare with the tuner's trial budget.
len(trials)

In [None]:
def sort(array):
    """Return the records of ``array`` ordered by ascending ``"score"``.

    Uses the built-in ``sorted``, so there is no recursion-depth limit on large
    or already-sorted inputs and no record is ever dropped.

    Args:
        array: Iterable of mappings that each provide a ``"score"`` entry.

    Returns:
        A new list with the same records; records with equal scores keep their
        input order (the sort is stable). The input is not modified.

    Raises:
        KeyError: If a record has no ``"score"`` entry.
        TypeError: If two scores cannot be compared (for example ``None``).
    """
    return sorted(array, key=lambda record: record["score"])

In [None]:
# Rank the trials by score, then take the square root of each score on a deep
# copy so that `trials` / `sorted_trials` keep their original values.
# (The tuner log reports the score as val_mse, so the result is an RMSE.)
sorted_trials = sort(trials)
tmp = copy.deepcopy(sorted_trials)
for trial in tmp:
    trial["score"] = np.sqrt(trial["score"])
rmse_trials = list(tmp)


In [None]:
# 25 evenly spaced cut-offs from the smallest to the largest observed RMSE.
score_values = [trial_record["score"] for trial_record in rmse_trials]
bins = np.linspace(min(score_values), max(score_values), 25)

In [None]:
# Manual histogram: counts[k] is [cut-off, number of trials whose RMSE is <= that
# cut-off but above the previous one]. The two-index walk below is only correct
# because rmse_trials is already sorted by ascending score.
counts = [[edge, 0] for edge in bins]
t = c = 0
while t < len(rmse_trials) and c < len(counts):
    fits_current_bin = rmse_trials[t]["score"] <= counts[c][0]
    if not fits_current_bin:
        # Current trial is beyond this cut-off: advance to the next bin.
        c += 1
        continue
    counts[c][1] += 1
    t += 1

# Sanity check: the total should equal the number of trials.
np.sum([n_hits for _edge, n_hits in counts])

In [None]:
# Split the [cut-off, count] pairs into bar positions and bar heights.
scx = [edge for edge, _n_hits in counts]
heights = [n_hits for _edge, n_hits in counts]

plt.bar(scx, heights, width=0.016, edgecolor="black")
plt.xlabel("RMSE")
plt.ylabel("number of occurrences")
plt.title("Score distribution")

In [None]:
# RMSE against the number of LSTM layers, with a least-squares trend line.
rmse_scores = np.array([trial["score"] for trial in rmse_trials])
num_layers = [trial["hyperparameters"]["values"]["num_layers"] for trial in rmse_trials]

plt.scatter(num_layers, rmse_scores)
plt.xlabel("number of LSTM layers")
plt.ylabel("RMSE")

# scikit-learn expects 2-D inputs, hence the single-column reshapes.
layers_column = np.array(num_layers).reshape((-1, 1))
rmse_column = rmse_scores.reshape((-1, 1))
reg = LinearRegression()
reg.fit(layers_column, rmse_column)

x = np.linspace(min(num_layers), max(num_layers), 100)
trend = reg.coef_ * x + reg.intercept_
plt.plot(x, trend.reshape((-1, 1)))

In [None]:
# RMSE against log10(learning rate), with a least-squares trend line.
# Reuses `reg` and `rmse_scores` created in the number-of-layers cell.
learning_rates = [trial["hyperparameters"]["values"]["lr"] for trial in rmse_trials]
log_lrs = np.log10(learning_rates)
rmse_column = rmse_scores.reshape((-1, 1))

reg.fit(log_lrs.reshape((-1, 1)), rmse_column)
x = np.linspace(log_lrs.min(), log_lrs.max(), 100)
y = reg.coef_ * x + reg.intercept_

plt.scatter(log_lrs, rmse_column)
plt.plot(x, y.reshape((-1, 1)))
plt.xlabel("log(learning rate)")
plt.ylabel("RMSE")

In [None]:
# One row per trial: its hyperparameter values plus the trial id and RMSE.
hyperparameter_rows = [trial["hyperparameters"]["values"] for trial in rmse_trials]
df = pd.DataFrame(hyperparameter_rows)
for column, source_key in (("trial id", "trial_id"), ("rmse", "score")):
    df[column] = [trial[source_key] for trial in rmse_trials]
df

In [None]:
# Persist the hyperparameter / RMSE table built in the previous cell.
# NOTE(review): pickle files should only be read back from trusted locations.
df.to_pickle("tuning/dataframe.pkl")