In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import MAPE
from tensorflow.keras import callbacks
from google.cloud import storage
import joblib

def subsample_sequence(df, length):
    '''
    function that return a random slice of features and targets
    len(X) = lenght and len(y) = 3
    '''
    last_possible = df.shape[0] - length - 3
    random_start = np.random.randint(0, last_possible)
    X = df[random_start: random_start+length].values
    y = df.iloc[random_start+length:random_start+length+3][['price_usd']]
    return X, y

def get_X_y(df, length_of_observations):
    '''
    function that returns a list of random slices of features and targets
    len(X[0]) = lenght and len(y[0]) = 3
    '''
    X, y = [], []
    for length in length_of_observations:
        xi, yi = subsample_sequence(df, length)
        X.append(xi)
        y.append(yi)
    return X, y

def split_tr_te(df, horizon=3, ratio=0.8):
    '''
    function that returns a training and test set
    arguments are:
    the horizon of prediction
    the ratio of the train/test split
    '''
    # the gap to avoid data leakage
    gap = horizon - 1
    len_ = int(ratio*df.shape[0])
    data_train = df[:len_]
    data_test = df[len_+gap:]
    return data_train, data_test

def extract_xy_tr_te(train,
                     test,
                     train_splits = 300,
                     train_time_min = 79,
                     train_time_max = 81):
    '''
    function returns a serie of train and test data
    train splits is the number of selections of our dataset
    train_time_min is the minimum number of days that are randomly choosen by the get_X_y function
    train_time_max is the maximum number of days that are randomly choosen by the get_X_y function
    '''
    length_of_observations = np.random.randint(train_time_min, train_time_max, train_splits)
    X_train, y_train = get_X_y(train, length_of_observations)
    length_of_observations = np.random.randint(train_time_min, train_time_max, train_splits)
    X_test, y_test = get_X_y(test, length_of_observations)
    return X_train, y_train, X_test, y_test

def padding_seq(train):
    '''
    function that return the padded version of the train dataset
    to uniform the size of the model imput
    '''
    return pad_sequences(train, dtype='float32', value=-1)

def baseline_model(X_train_pad, y_train):
    '''
    function that return a trained baseline model and its fitting history
    and save locally the trained model file basemodel.joblib
    '''
    normalizer = Normalization()
    normalizer.adapt(X_train_pad)
    model = Sequential()
    model.add(normalizer)
    model.add(layers.Masking(mask_value=-1))
    model.add(layers.LSTM(10, activation='tanh'))
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))
    model.compile(loss='mse', optimizer=RMSprop(learning_rate=0.01), metrics=MAPE)
    es = callbacks.EarlyStopping(patience=60, restore_best_weights=True)
    history = model.fit(X_train_pad,
                np.array(y_train),
                epochs=250,
                batch_size=64,
                validation_split=0.3,
                callbacks=[es],
                verbose=1)
    joblib.dump(model, 'basemodel.joblib')
    return model, history

def plot_history(history, title='', axs=None, exp_name=""):
    '''
    return the loss and metric plots of train and test fit process
    '''
    if axs is not None:
        ax1, ax2 = axs
    else:
        f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    if len(exp_name) > 0 and exp_name[0] != '_':
        exp_name = '_' + exp_name
    ax1.plot(history.history['loss'], label = 'train' + exp_name)
    ax1.plot(history.history['val_loss'], label = 'val' + exp_name)
    #ax1.set_ylim(0., 2.2)
    ax1.set_title('loss')
    ax1.legend()

    ax2.plot(history.history['mean_absolute_percentage_error'], label='train mape'  + exp_name)
    ax2.plot(history.history['val_mean_absolute_percentage_error'], label='val mape'  + exp_name)
    #ax2.set_ylim(0.25, 1.)
    ax2.set_title('mape')
    ax2.legend()
    return (ax1, ax2)

def pred_3d_price(model, test):
    '''
    return the prediction of three days after the test data
    '''
    return model.predict(test)

# PARAMETERS FOR GCP BASEMODEL UPLOAD

STORAGE_LOCATION = 'models/basemodel.joblib'
BUCKET_NAME='crypto913'

def upload_model_to_gcp():
    '''
    function that upload the trained model to gcp
    '''
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    blob = bucket.blob(STORAGE_LOCATION)
    blob.upload_from_filename('basemodel.joblib')


In [99]:
data_dl = pd.read_csv('../raw_data/data_advanced.csv') # TO RUN LOCALLY

data_dl.shape
data_dl.set_index('datetime', inplace=True)
data_dl.drop(columns='Unnamed: 0', inplace=True)
data_train, data_test = split_tr_te(data_dl)
X_train, y_train, X_test, y_test = extract_xy_tr_te(train=data_train,
                                                     test=data_test)
# X_train = np.array(X_train)
# X_train = np.resize(X_train, (300, 80, 44))

# print(np.shape(X_train), np.shape(y_train))
X_train_pad = padding_seq(X_train)

model, history = baseline_model(X_train_pad, y_train)

2022-07-14 17:52:06.898795: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-14 17:52:06.898857: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-14 17:52:06.898873: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP4): /proc/driver/nvidia/version does not exist
2022-07-14 17:52:06.899133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78



INFO:tensorflow:Assets written to: ram://72d02d30-27d1-43f9-90fb-720728c89fc1/assets


INFO:tensorflow:Assets written to: ram://72d02d30-27d1-43f9-90fb-720728c89fc1/assets


In [102]:
X_predict = data_dl.tail(80)
y_pred = pred_3d_price(model, X_predict)
y_pred



array([[1103.7607],
       [ 797.0423],
       [1516.581 ]], dtype=float32)

In [None]:
plt.figure(figsize=(15, 8))
plt.plot(data_dl['price_usd'], label="Ethereum Price")
plt.plot(test['ds'], test['y'], label="Test")
plt.plot(forecast[-horizon:]['ds'], forecast[-horizon:]['yhat'], label="predictions")
plt.legend(loc="best")
plt.show()

In [7]:

#data_dl['datetime'] = pd.to_datetime(data_dl['datetime'])
#data_dl.set_index('datetime', inplace=True)
#data_train, data_test = split_tr_te(data_dl)


#X_train, y_train, X_test, y_test = extract_xy_tr_te(train=data_train,
#                                                      test=data_test)
# print(np.shape(X_train), np.shape(y_train), np.shape(X_test), np.shape(y_test))
# X_train_pad = padding_seq(X_train)
# print('after padding')
# print(np.shape(X_train), np.shape(y_train), np.shape(X_test), np.shape(y_test))
    # model, history = baseline_model(X_train_pad, y_train)
    # X_predict = data_dl.tail(80)
    # y_pred = pred_3d_price(model, X_predict)
    # upload_model_to_gcp()
    # print(y_pred)

KeyError: 'datetime'