In [1]:
import json
import copy
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import keras_tuner
from dataclasses import dataclass
from typing import List
from keras import layers, ops
from keras.utils import timeseries_dataset_from_array
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
%matplotlib inline

from processing import Stationarizer, Normalizer
#from tuner import LSTMLayerHyperparams, ModelHyperparams
from plotting import visualize_loss, show_plot
from model_builder import build_model, build_spec_model, get_tuner

2024-08-03 21:19:38.920919: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("data/YF_TSLA.csv")["Adj Close"].to_numpy().reshape(-1, 1)

In [3]:
normer = Normalizer()
normer.fit(data)

In [4]:
train_idx = int(0.7 * np.shape(data)[0])
val_idx = int(0.85 * np.shape(data)[0])

train_data = data[:train_idx]
val_data = data[train_idx:val_idx]
non_test_data = data[:val_idx]
test_data = data[val_idx:]

In [5]:
train_data = normer.transform(train_data)
val_data = normer.transform(val_data)

In [6]:
train_data

array([[-0.71611151],
       [-0.71615073],
       [-0.71737301],
       ...,
       [-0.25468943],
       [-0.24463663],
       [-0.23896314]])

In [7]:
joblib.dump(normer, "TSLA_normer.gz")

['TSLA_normer.gz']

In [8]:
train_ds = timeseries_dataset_from_array(
    train_data[:-20],
    train_data[20:],
    sequence_length=20,
    batch_size=64,
    shuffle=True
)
val_ds = timeseries_dataset_from_array(
    val_data[:-20],
    val_data[20:],
    sequence_length=20,
    batch_size=64,
    shuffle=True
)

for batch in train_ds.take(1):
    inputs, targets = batch
    print(inputs[0])
    print(targets[0])

print("Input shape:", inputs.numpy().shape)
print("Target shape:", targets.numpy().shape)

tf.Tensor(
[[-0.70981054]
 [-0.71021579]
 [-0.70998702]
 [-0.7095687 ]
 [-0.70943797]
 [-0.70925496]
 [-0.70916345]
 [-0.70872552]
 [-0.70819608]
 [-0.70754899]
 [-0.70755553]
 [-0.70686922]
 [-0.7069215 ]
 [-0.70720257]
 [-0.7072091 ]
 [-0.70669274]
 [-0.70705877]
 [-0.70680385]
 [-0.70612408]
 [-0.70592145]], shape=(20, 1), dtype=float64)
tf.Tensor([-0.70607832], shape=(1,), dtype=float64)
Input shape: (64, 20, 1)
Target shape: (64, 1)


2024-08-03 21:19:40.536909: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
train_ds.save("data/TSLA_ds/train")
val_ds.save("data/TSLA_ds/val")

In [10]:
tuner = get_tuner()
tuner.search(train_ds, epochs=50, validation_data=val_ds)

Trial 100 Complete [00h 00m 40s]
val_mse: 1.9687508344650269

Best val_mse So Far: 0.1469946950674057
Total elapsed time: 01h 14m 56s


In [None]:
model = build_spec_model(
    2, 
    [32, 32], 
    ["tanh", "tanh"], 
    ["sigmoid", "sigmoid"], 
    [0.2, 0.2], 
    [0.0, 0.0],
    0.001
)
history = model.fit(
    train_ds,
    batch_size=64,
    epochs=100,
    validation_data=val_ds
)

In [None]:
trials = list()
for n in range(100):
    path = f"tuning/stock_predictor/trial_{n:03d}/trial.json"
    with open(path, mode="r") as f:
        trial = json.load(f)
        trials.append(trial)

#trials[0]

In [None]:
def sort(array):
    """Sort the array by using quicksort."""

    less = []
    equal = []
    greater = []

    if len(array) > 1:
        pivot = array[0]["score"]
        for x in array:
            if x["score"] < pivot:
                less.append(x)
            elif x["score"] == pivot:
                equal.append(x)
            elif x["score"] > pivot:
                greater.append(x)
        return sort(less)+equal+sort(greater)
    else:
        return array

In [None]:
sorted_trials = sort(trials)
tmp = copy.deepcopy(sorted_trials)
rmse_trials = list()
for trial in tmp:
    trial.update({"score" : np.sqrt(trial["score"])})
    rmse_trials.append(trial)


In [None]:
bins = np.linspace(min([x["score"] for x in rmse_trials]), max([x["score"] for x in rmse_trials]), 25)

In [None]:
counts = [[cutoff, 0] for cutoff in bins]
t = 0
c = 0
while t < len(rmse_trials) and c < len(counts):
    if rmse_trials[t]["score"] <= counts[c][0]:
        counts[c][1] += 1
        t += 1
    else:
        c += 1

np.sum([el[1] for el in counts])

In [None]:
scx = list()
heights = list()
for el in counts:
    scx.append(el[0])
    heights.append(el[1])

plt.bar(scx, heights, edgecolor="black", width=0.016)
plt.xlabel("RMSE")
plt.ylabel("number of occurrences")
plt.title("Score distribution")

In [None]:
num_layers = [trial["hyperparameters"]["values"]["num_layers"] for trial in rmse_trials]
rmse_scores = np.array([trial["score"] for trial in rmse_trials])
plt.scatter(num_layers, rmse_scores)
plt.xlabel("number of LSTM layers")
plt.ylabel("RMSE")

reg = LinearRegression()
reg.fit(np.array(num_layers).reshape((-1, 1)), np.array(rmse_scores).reshape((-1, 1)))
x = np.linspace(min(num_layers), max(num_layers), 100)
plt.plot(x, np.array(reg.coef_ * x + reg.intercept_).reshape((-1, 1)))

In [None]:
log_lrs = np.array([np.log10(trial["hyperparameters"]["values"]["lr"]) for trial in rmse_trials])
reg.fit(log_lrs.reshape((-1, 1)), rmse_scores.reshape((-1, 1)))
x = np.linspace(min(log_lrs), max(log_lrs), 100)
y = reg.coef_ * x + reg.intercept_
plt.scatter(log_lrs, rmse_scores.reshape((-1, 1)))
plt.plot(x, y.reshape((-1, 1)))
plt.xlabel("log(learning rate)")
plt.ylabel("RMSE")

In [None]:
df = pd.DataFrame([trial["hyperparameters"]["values"] for trial in rmse_trials])
df["trial id"] = [trial["trial_id"] for trial in rmse_trials]
df["rmse"] = [trial["score"] for trial in rmse_trials]
df

In [None]:
df.to_pickle("tuning/dataframe.pkl")