In [86]:
## Imports libs
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ["SM_FRAMEWORK"] = "tf.keras"
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
print(tf.__version__) 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
import itertools
import random
import os
import math # Mathematical functions 
import time

# Timestamp string captured when this cell runs. The plotting helpers in this
# notebook assign their own local `ts`, so they do not depend on this value.
ts = str(time.time())

2.9.2


### Data Pre-Processing

In [87]:
def load_time_series_data_and_preprocessing(filename,f_date,t_date):
    """Load a CSV time series, report basic data-quality facts and return a date window.

    Args:
        filename: Path of the CSV file. The first row is the header and a
            ``Date`` column must be present.
        f_date: Inclusive lower bound of the window, compared against the
            parsed ``Date`` index.
        t_date: Inclusive upper bound of the window, compared against the
            parsed ``Date`` index.

    Returns:
        DataFrame indexed by ``Date`` (parsed with ``pd.to_datetime``), sorted
        ascending, restricted to ``f_date <= Date <= t_date``.
    """
    # Load data into dataframe
    data = pd.read_csv(filename, header=0)

    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() only appended a stray "None" line to the output.
    data.info()

    print("Missing Values:")
    print(data.isnull().sum())

    # Rows that are exact duplicates of an earlier row
    print("Duplicate Values:")
    print(data[data.duplicated()])

    # Convert Date from object to datetime so sorting and range filtering are chronological
    data['Date'] = pd.to_datetime(data['Date'])

    # Sort by date and use it as the index (no in-place mutation)
    data = data.sort_values(by="Date", ascending=True).set_index("Date")

    # Keep only the requested window, both ends inclusive
    in_window = (data.index >= f_date) & (data.index <= t_date)
    return data[in_window]


### Feature Engineering and Scaling 

In [88]:
def feature_selection_scaling(data):
    """Select the model features and scale them with MinMaxScaler.

    Args:
        data: DataFrame containing at least the ``Close`` and ``SMAVG_100d``
            columns.

    Returns:
        Tuple ``(data_prices, data_prices_ext, np_Close_scaled, np_data_scaled,
        scaler_pred)``:
        ``data_prices`` is ``data`` without ``SMAVG_100d``;
        ``data_prices_ext`` is a copy with an extra ``Prediction`` column
        initialised from ``Close``;
        ``np_Close_scaled`` is the scaled ``Close`` column;
        ``np_data_scaled`` is the scaled feature matrix;
        ``scaler_pred`` is the scaler fitted on ``Close`` only, kept so that
        predictions can be inverse-transformed.
    """
    # Feature frame: everything except the 100-period moving average
    data_prices = data.drop(['SMAVG_100d'], axis=1)

    # Extended copy with a placeholder prediction column seeded from Close
    data_prices_ext = data_prices.copy()
    data_prices_ext['Prediction'] = data_prices_ext['Close']

    # Raw feature matrix; report its (rows, columns) shape
    row_count = data_prices.shape[0]
    raw_matrix = np.array(data_prices)
    print('np_data.shape:', np.reshape(raw_matrix, (row_count, -1)).shape)

    # Scaler fitted over all feature columns
    feature_scaler = MinMaxScaler()
    np_data_scaled = feature_scaler.fit_transform(raw_matrix)

    # Separate single-column scaler used later to un-scale predictions
    scaler_pred = MinMaxScaler()
    close_frame = pd.DataFrame(data_prices_ext['Close'])
    np_Close_scaled = scaler_pred.fit_transform(close_frame)

    return data_prices, data_prices_ext, np_Close_scaled, np_data_scaled, scaler_pred

### Partition Dataset Using sliding window Technique

In [89]:
# The RNN needs data with the format of [samples, time steps, features]
# Here, we create N samples, sequence_length time steps per sample, and 6 features
def partition_dataset(sequence_length, data, index_Close):
    """Cut a 2-D array into sliding windows with single-step targets.

    Args:
        sequence_length: Number of consecutive rows in each input window.
        data: 2-D numpy array indexed as ``[row, column]``.
        index_Close: Column index of the value to predict.

    Returns:
        Tuple ``(x, y)`` of numpy arrays. ``x[k]`` holds rows
        ``k .. k + sequence_length - 1`` of ``data`` (all columns) and ``y[k]``
        is ``data[k + sequence_length, index_Close]``, i.e. the target value of
        the row immediately after the window.
    """
    # Each target row is preceded by exactly `sequence_length` rows of history
    target_rows = range(sequence_length, data.shape[0])
    windows = [data[row - sequence_length:row, :] for row in target_rows]
    targets = [data[row, index_Close] for row in target_rows]
    return np.array(windows), np.array(targets)

### Transform data for Model Prediction

In [90]:
def transform_multivariate_data(data,np_data_scaled,sequence_length,train_fraction=0.8):
    """Split the scaled feature matrix chronologically and window both parts.

    Args:
        data: DataFrame whose column order matches the columns of
            ``np_data_scaled``; it is only used to look up the position of the
            ``Close`` column.
        np_data_scaled: 2-D numpy array of scaled features, one row per time step.
        sequence_length: Number of time steps fed to the model for one prediction.
        train_fraction: Share of rows (rounded up) assigned to the training
            split. Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, train_data_len)`` where the
        x/y pairs come from ``partition_dataset`` and ``train_data_len`` is the
        number of rows in the training split.

    Raises:
        ValueError: If ``train_fraction`` is not in the interval (0, 1].
    """
    if not 0 < train_fraction <= 1:
        raise ValueError("train_fraction must be in the interval (0, 1]")

    # Column position of the prediction target
    index_Close = data.columns.get_loc("Close")

    # Number of rows used for training
    train_data_len = math.ceil(np_data_scaled.shape[0] * train_fraction)

    # The test split starts `sequence_length` rows early so that its first
    # window ends right where the training rows end. Clamp at 0: a negative
    # start would be read by numpy as "count from the end" and silently select
    # the wrong rows.
    test_start = max(train_data_len - sequence_length, 0)
    train_data = np_data_scaled[0:train_data_len, :]
    test_data = np_data_scaled[test_start:, :]

    # Generate training data and test data
    x_train, y_train = partition_dataset(sequence_length, train_data,index_Close)
    x_test, y_test = partition_dataset(sequence_length, test_data,index_Close)

    # Print the shapes: the result is: (rows, training_sequence, features) (prediction value, )
    print('x_train.shape:',x_train.shape,'y_train.shape:', y_train.shape)
    print('x_test.shape:',x_test.shape,'y_test.shape:', y_test.shape)

    # Sanity check: the last Close value of the second input window must equal
    # the first target. It needs at least two training windows, so skip it for
    # tiny inputs instead of raising IndexError.
    if len(x_train) > 1:
        print(x_train[1][sequence_length-1][index_Close])
        print(y_train[0])
    return  x_train, y_train, x_test, y_test, train_data_len

### Evaluate Model Performance

In [91]:
def evalute_model_performance(history,output):
    """Plot per-epoch training/validation RMSE and loss side by side and save the figure.

    Args:
        history: Object returned by ``model.fit``. ``history.epoch`` supplies the
            x values and ``history.history`` must contain the keys
            ``root_mean_squared_error``, ``val_root_mean_squared_error``,
            ``loss`` and ``val_loss``.
        output: String prefix of the image path. The file is written to
            ``output + "evalute_model_performance" + <timestamp> + ".png"``, so
            a directory prefix needs its trailing separator.

    Returns:
        None. The figure is saved to disk and then shown.
    """
    ts = str(time.time())
    filename= output+ "evalute_model_performance" + ts + ".png"

    # savefig does not create missing directories; make sure the target exists
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    fig, (ax_rmse, ax_loss) = plt.subplots(1, 2, figsize=(20,7))

    # Left panel: RMSE on the training and validation data
    ax_rmse.plot(history.epoch, history.history['root_mean_squared_error'], label = "rmse")
    ax_rmse.plot(history.epoch, history.history['val_root_mean_squared_error'], label = "val_rmse")
    ax_rmse.set_title("RMSE", fontsize=18)
    ax_rmse.set_xlabel("Epochs", fontsize=15)
    ax_rmse.set_ylabel("RMSE", fontsize=15)
    ax_rmse.grid(alpha=0.3)
    ax_rmse.legend()

    # Right panel: loss on the training and validation data
    ax_loss.plot(history.epoch, history.history['loss'], label="loss")
    ax_loss.plot(history.epoch, history.history['val_loss'], label="val_loss")
    ax_loss.set_title("Loss", fontsize=18)
    ax_loss.set_xlabel("Epochs", fontsize=15)
    ax_loss.set_ylabel("Loss", fontsize=15)
    ax_loss.grid(alpha=0.3)
    ax_loss.legend()

    fig.savefig(filename)
    plt.show()
    
      

### Train Model with Initial Parameters

In [92]:
def train_multivariate_prediction_model(x_train, y_train,filename, x_test=None, y_test=None):
    """Train the baseline two-layer LSTM regressor and evaluate it on the test set.

    Args:
        x_train: Training windows; ``x_train.shape[1]`` and ``x_train.shape[2]``
            define the LSTM input shape.
        y_train: Training targets, one per window.
        filename: Path prefix handed to ``evalute_model_performance`` for the
            saved training-curve image (despite the name it is a prefix, not a
            complete file name).
        x_test: Test windows. When omitted, the notebook-level ``x_test`` is
            used, which is what this function always did before the parameter
            existed.
        y_test: Test targets. When omitted, the notebook-level ``y_test`` is used.

    Returns:
        Tuple ``(history, results)``: the object returned by ``fit`` and the
        value returned by ``evaluate`` on the test set (loss first, then the
        RMSE metric).
    """
    # Backward compatibility: earlier callers relied on notebook globals
    if x_test is None:
        x_test = globals()["x_test"]
    if y_test is None:
        y_test = globals()["y_test"]

    # ------------------LSTM-----------------------
    regressor = Sequential()
    regressor.add(LSTM(units=16, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
    regressor.add(Dropout(0.2))

    regressor.add(LSTM(units=16, return_sequences=False))
    regressor.add(Dropout(0.2))
    regressor.add(Dense(units=1, activation='linear'))
    regressor.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

    print('LSTM Regression Summary :')
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line.
    regressor.summary()

    # Stop once the validation loss has not improved for 5 consecutive epochs
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

    # The last 30% of the training samples are held out for validation
    history = regressor.fit(x_train, y_train, validation_split=0.3, epochs=40, batch_size=64, callbacks=[es])

    # Plot RMSE and loss curves
    evalute_model_performance(history,filename)

    # evaluate() returns [loss, metric]: MSE loss and the compiled RMSE metric
    results = regressor.evaluate(x_test, y_test)
    print("test loss (MSE), test RMSE:", np.round(results, 4))
    return history, results

### Train Model With Optimized Hyperparameters

In [93]:
def train_model_with_optimized_hyperparam(n_neurons, n_batch_size, dropout,output,
                                          x_train=None, y_train=None, x_test=None, y_test=None):
    """Train the two-layer LSTM regressor with the selected hyperparameters.

    Args:
        n_neurons: Number of units in each LSTM layer.
        n_batch_size: Batch size passed to ``fit``.
        dropout: Dropout rate applied after each LSTM layer.
        output: Output location. Checkpoints are written inside it and it is
            also used as the path prefix of the training-curve image.
        x_train, y_train, x_test, y_test: Training and test arrays. Any that are
            omitted are read from the notebook-level variable of the same name,
            which is what this function always did before the parameters
            existed.

    Returns:
        Tuple ``(regressor, history)``: the trained model and the object
        returned by ``fit``.
    """
    # Backward compatibility: earlier callers relied on notebook globals
    notebook_scope = globals()
    if x_train is None:
        x_train = notebook_scope["x_train"]
    if y_train is None:
        y_train = notebook_scope["y_train"]
    if x_test is None:
        x_test = notebook_scope["x_test"]
    if y_test is None:
        y_test = notebook_scope["y_test"]

    regressor = Sequential()
    regressor.add(LSTM(units=n_neurons, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
    regressor.add(Dropout(dropout))

    regressor.add(LSTM(units=n_neurons, return_sequences=False))
    regressor.add(Dropout(dropout))
    regressor.add(Dense(units=1, activation='linear'))
    regressor.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

    # Stop once the validation loss has not improved for 5 consecutive epochs
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

    # Keep checkpoints under the output location. The previous hard-coded
    # "/weights-..." pattern pointed at the filesystem root.
    if output:
        os.makedirs(output, exist_ok=True)
    file_path = os.path.join(output, "weights-{epoch:03d}-{val_loss:.4f}.hdf5")

    mc = ModelCheckpoint(file_path, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

    history = regressor.fit(x_train, y_train, validation_split=0.3, epochs=40, batch_size=n_batch_size, callbacks=[es, mc], verbose=0)

    # Save the curves under `output`. The previous code passed the notebook
    # global `filename` (the dataset CSV path) as the image prefix.
    evalute_model_performance(history,output)

    # evaluate() returns [loss, metric]: MSE loss and the compiled RMSE metric
    results=regressor.evaluate(x_test, y_test)
    print("test loss (MSE), test RMSE:", np.round(results, 4))
    return regressor,history

### Hyperparam Tuning

In [94]:
def tune_model_hyperparam(config, x_train, y_train, x_test, y_test,output):
    """Grid-search the LSTM regressor over neurons, batch size and dropout.

    Args:
        config: Sequence of three iterables, in this order: candidate LSTM unit
            counts, candidate batch sizes, candidate dropout rates.
        x_train: Training windows; ``x_train.shape[1]`` and ``x_train.shape[2]``
            define the LSTM input shape.
        y_train: Training targets.
        x_test: Test windows.
        y_test: Test targets.
        output: Directory in which the per-epoch checkpoints are written.

    Returns:
        List with one entry per combination:
        ``[n_neurons, n_batch_size, dropout, train_metrics, test_metrics]``,
        where the two metric entries are the values returned by ``evaluate``
        (loss first, then the RMSE metric).
    """
    # Distinct names for the candidate lists so the loop variables below do not
    # shadow them
    neuron_options, batch_size_options, dropout_options = config

    possible_combinations = list(itertools.product(neuron_options, batch_size_options, dropout_options))

    print(possible_combinations)
    print('\n')

    # Keep checkpoints under the output location. The previous hard-coded
    # "/weights-..." pattern pointed at the filesystem root.
    if output:
        os.makedirs(output, exist_ok=True)
    file_path = os.path.join(output, "weights-{epoch:03d}-{val_loss:.4f}.hdf5")

    hist = []

    for position, combination in enumerate(possible_combinations, start=1):

        print(f'{position}th combination: \n')
        print('--------------------------------------------------------------------')

        n_neurons, n_batch_size, dropout = combination

        regressor = Sequential()
        regressor.add(LSTM(units=n_neurons, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
        regressor.add(Dropout(dropout))

        regressor.add(LSTM(units=n_neurons, return_sequences=False))
        regressor.add(Dropout(dropout))
        regressor.add(Dense(units=1, activation='linear'))
        regressor.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # "val_loss" is available because fit() below is given validation_split
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
        mc = ModelCheckpoint(file_path, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

        regressor.fit(x_train, y_train, validation_split=0.3, epochs=40, batch_size=n_batch_size, callbacks=[es, mc], verbose=0)

        # NOTE(review): the metrics below come from the model as it stands
        # after the last epoch that ran, not from the best saved checkpoint.
        train_metrics = regressor.evaluate(x_train, y_train, verbose=0)
        test_metrics = regressor.evaluate(x_test, y_test, verbose=0)

        hist.append([n_neurons, n_batch_size, dropout, train_metrics, test_metrics])

        # Same 1-based position as the header line above (it used to be off by one)
        print(f'{position}th combination = {combination} \n train [loss, rmse]: {train_metrics} and test [loss, rmse]: {test_metrics}')

        print('--------------------------------------------------------------------')
        print('--------------------------------------------------------------------')
        print('--------------------------------------------------------------------')
        print('--------------------------------------------------------------------')


    return hist

### Predict Cryptocurrency Price

In [95]:
def predict_future_price(x_test,y_test,train_data_len,from_date,cryto_name,output,regressor,
                         scaler_pred=None,data_prices_ext=None):
    """Predict the test period, plot predictions against actual values and export them.

    Args:
        x_test: Test windows fed to ``regressor.predict``.
        y_test: Scaled test targets, plotted against the scaled predictions.
        train_data_len: Number of rows in the training split; rows from this
            position on form the validation frame.
        from_date: Only rows with an index later than this value are printed,
            exported and drawn in the second figure.
        cryto_name: Y-axis label of the second figure.
        output: Output location. Used as a string prefix for the two PNG files
            and as the directory of ``predicted_price.csv``.
        regressor: Trained model exposing ``predict``.
        scaler_pred: Scaler whose ``inverse_transform`` maps predictions back to
            prices. When omitted, the notebook-level ``scaler_pred`` is used.
        data_prices_ext: DataFrame with a ``Close`` column. When omitted, the
            notebook-level ``data_prices_ext`` is used.

    Returns:
        None. Two PNG files and one CSV file are written and the figures shown.
    """
    # Backward compatibility: earlier callers relied on notebook globals
    if scaler_pred is None:
        scaler_pred = globals()["scaler_pred"]
    if data_prices_ext is None:
        data_prices_ext = globals()["data_prices_ext"]

    ts = str(time.time())
    filename1 = output + 'real_price_pred_price_'+ts+".png"
    filename2 = output + 'y_pred_vs_y_test_'+ts+".png"
    csv_path = os.path.join(output, "predicted_price.csv")

    # Neither savefig nor to_csv creates missing directories
    for target in (filename1, csv_path):
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    y_pred_scaled = regressor.predict(x_test)
    # Flatten to 1-D so the values can be inserted as a single DataFrame column
    y_pred = scaler_pred.inverse_transform(y_pred_scaled).ravel()

    # Figure 1: targets and predictions, both still in scaled units
    fig_scaled = plt.figure(figsize=(16,8), dpi= 100, facecolor='w', edgecolor='k')
    ax_scaled = fig_scaled.add_subplot(111)
    ax_scaled.plot(y_test, color='red', label = 'Real Close Price')
    ax_scaled.plot(y_pred_scaled, color='green', label = 'Predicted Close Price')
    ax_scaled.legend(loc='best')
    fig_scaled.savefig(filename1)

    # Actual prices for the train and test periods. `train` deliberately keeps
    # one row of overlap with `valid` (presumably so the plotted lines join).
    close_prices = data_prices_ext['Close']
    train = pd.DataFrame(close_prices[:train_data_len + 1]).rename(columns={'Close': 'y_train'})
    valid = pd.DataFrame(close_prices[train_data_len:]).rename(columns={'Close': 'y_test'})
    valid.insert(1, "y_pred", y_pred, True)
    # Add the difference between the predicted and actual prices
    valid.insert(1, "residuals", valid["y_pred"] - valid["y_test"], True)
    df_union = pd.concat([train, valid])

    # Zoom in to a closer timeframe
    df_union_zoom = df_union[df_union.index > from_date]
    print(df_union_zoom)
    df_union_zoom.to_csv(csv_path)

    # Figure 2: line plot of prices plus a bar plot of the residuals
    fig, ax1 = plt.subplots(figsize=(16, 8))
    ax1.set_title("y_pred vs y_test")
    ax1.set_ylabel(cryto_name, fontsize=18)
    sns.set_palette(["#090364", "#1960EF", "#EF5919"])
    sns.lineplot(data=df_union_zoom[['y_pred', 'y_train', 'y_test']], linewidth=1.0, dashes=False, ax=ax1)

    # Green bars where the prediction is above the actual price, red otherwise
    residuals = df_union_zoom['residuals'].dropna()
    bar_colors = ["#2BC97A" if value > 0 else "#C92B2B" for value in residuals]
    ax1.bar(height=residuals, x=residuals.index, width=3, label='residuals', color=bar_colors)
    ax1.legend()
    fig.savefig(filename2)
    plt.show()

    print('filename1:',filename1)
    print('filename2:',filename2)
    

### Dataset, output path and date configuration

In [96]:
#-------------------------------------------------------------------------------------
# Input dataset, labels, analysis window and output location used by the cells below.
filename = "./dataset/bitcoin/bitcoin_60min_dataset.csv"
cryto_name = "bitcoin"
cryto_data = "bitcoin_60min"
from_date = "2022-04-01"
to_date = "2022-09-30"
output="./output/"+cryto_name+"/"+cryto_data+"/"
# Create the nested output directory up front; exist_ok keeps the cell re-runnable.
os.makedirs(output, exist_ok=True)
#-------------------------------------------------------------------------------------

### Execution of Prediction Model

In [97]:
# Look-back window: number of consecutive rows used to make a single prediction
sequence_length = 50

# Read the CSV and keep only the rows dated between from_date and to_date
data = load_time_series_data_and_preprocessing(filename, f_date=from_date, t_date=to_date)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25953 entries, 0 to 25952
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        25953 non-null  object 
 1   Open        25953 non-null  float64
 2   High        25953 non-null  float64
 3   Low         25953 non-null  float64
 4   Close       25953 non-null  float64
 5   SMAVG_50d   25904 non-null  float64
 6   SMAVG_100d  25854 non-null  float64
 7   SMAVG_200d  25754 non-null  float64
dtypes: float64(7), object(1)
memory usage: 1.6+ MB
None
Missing Values:
Date            0
Open            0
High            0
Low             0
Close           0
SMAVG_50d      49
SMAVG_100d     99
SMAVG_200d    199
dtype: int64
Duplicate Values:
Empty DataFrame
Columns: [Date, Open, High, Low, Close, SMAVG_50d, SMAVG_100d, SMAVG_200d]
Index: []


In [98]:
# Select the feature columns and scale them; scaler_pred is kept to un-scale predictions later
(
    data_prices,
    data_prices_ext,
    np_Close_scaled,
    np_data_scaled,
    scaler_pred,
) = feature_selection_scaling(data=data)

np_data.shape: (4369, 6)


In [99]:
# Look up the Close column on data_prices, the frame np_data_scaled was built from.
# `data` still contains SMAVG_100d, so its column positions only match the scaled
# array as long as Close happens to come before that dropped column.
x_train, y_train, x_test, y_test,train_data_len = transform_multivariate_data(data_prices,np_data_scaled,sequence_length)

x_train.shape: (3446, 50, 6) y_train.shape: (3446,)
x_test.shape: (873, 50, 6) y_test.shape: (873,)
0.9625339111150382
0.9625339111150382


In [None]:
# Baseline run with the fixed initial hyperparameters; `output` is the prefix for the saved curves
history, results = train_multivariate_prediction_model(
    x_train=x_train,
    y_train=y_train,
    filename=output,
)

LSTM Regression Summary :
Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_68 (LSTM)              (None, 50, 16)            1472      
                                                                 
 dropout_68 (Dropout)        (None, 50, 16)            0         
                                                                 
 lstm_69 (LSTM)              (None, 16)                2112      
                                                                 
 dropout_69 (Dropout)        (None, 16)                0         
                                                                 
 dense_34 (Dense)            (None, 1)                 17        
                                                                 
Total params: 3,601
Trainable params: 3,601
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/40
Epoch 2/4

In [None]:
# Search grid in the order tune_model_hyperparam unpacks it:
# [n_neurons options, n_batch_size options, dropout options] -> 2 * 3 * 2 = 12 combinations
config = [[16, 32], [32, 64, 128], [0.1,0.2]]  

# One row per combination: [n_neurons, n_batch_size, dropout, train metrics, test metrics]
hist = tune_model_hyperparam(config, x_train, y_train, x_test, y_test,output)
# Wrap the rows in a DataFrame; no column names are given, so columns are labelled 0..4
hist = pd.DataFrame(hist)


In [None]:
# Show every tuning run in the order the combinations were tried
hist

In [None]:
# Column 4 holds each run's test-set evaluate() result. Those are sequences that
# start with the loss, so ascending order presumably ranks by test loss first.
hist = hist.sort_values(by=[4], ascending=True)
# The grid above yields 12 combinations, so this shows all of them, best first
hist.head(12)

In [None]:


# Row 0 of the sorted table is the best run: columns 0-2 are its hyperparameters
# and the last column is its test-set evaluate() result.
print(f'Best Combination: \n n_neurons = {hist.iloc[0, 0]}\n n_batch_size = {hist.iloc[0, 1]}\n dropout = {hist.iloc[0, 2]}')
print('**************************')
# Index [1] of an evaluate() result is the RMSE metric (index [0] is the loss)
print(f'Results Before Tunning:\n Test Set RMSE: {np.round(results, 4)[1]}\n')
print(f'Results After Tunning:\n Test Set RMSE: {np.round(hist.iloc[0, -1], 4)[1]}\n')
# Relative RMSE reduction versus the baseline run, in percent, rounded to a whole number
print(f'{np.round((results[1] - hist.iloc[0, -1][1])*100/np.round(results, 4)[1])}% Improvement') 

In [None]:
# Best row minus its two metric columns leaves the three hyperparameters
n_neurons, n_batch_size, dropout = list(hist.iloc[0, :-2])
print([n_neurons, n_batch_size, dropout])

In [None]:
# Retrain with the best hyperparameters; note that `history` from the baseline run is replaced
regressor, history = train_model_with_optimized_hyperparam(
    n_neurons=n_neurons,
    n_batch_size=n_batch_size,
    dropout=dropout,
    output=output,
)

In [None]:
# Plot and save the training curves of the tuned model under the output prefix
evalute_model_performance(history=history, output=output)

In [None]:
# Predict the test period with the tuned model, then plot and export the comparison
predict_future_price(
    x_test=x_test,
    y_test=y_test,
    train_data_len=train_data_len,
    from_date=from_date,
    cryto_name=cryto_name,
    output=output,
    regressor=regressor,
)