In [5]:
import math
import statistics as stats

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from keras.models import Sequential
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [6]:
def get_data(path='../../raw_data/bitstampUSD.csv', start_row=2798176, stop_row=4727776):
    """Load the Bitstamp minute-bar CSV and return a positional slice of its rows.

    Parameters
    ----------
    path : str
        Location of the CSV. Defaults to the local copy used by this notebook.
        NOTE(review): the author also experimented with a ``gs://`` bucket
        URI; passing one here presumably works if GCS support is installed --
        confirm before relying on it.
    start_row, stop_row : int or None
        Positional row bounds (start inclusive, stop exclusive). The defaults
        reproduce the slice this notebook has always used; ``None`` leaves
        that side open.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, keeping the original row
        labels.
    """
    data = pd.read_csv(path)
    # Copy so later column assignments never write back into the full frame.
    return data.iloc[start_row:stop_row].copy()


In [None]:
# Colab-only cell: mounts Google Drive under /content/drive/ so the CSV read in
# the next cell is reachable.
# NOTE(review): this import sits outside the notebook's import cell and
# `google.colab` is presumably unavailable outside Colab -- skip this cell when
# running locally.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Colab alternative: read the full CSV from the mounted Drive.
# NOTE(review): the following cell reassigns `data` from get_data(), so only one
# of the two loading cells should be run.
data = pd.read_csv('/content/drive/MyDrive/bitstampUSD.csv')

In [7]:
# Load the row-sliced dataset via get_data(); this replaces any `data` loaded earlier.
data = get_data()

In [8]:
# Show the loaded frame (raw Unix-second timestamps plus price/volume columns).
data

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
2798176,1493596800,1348.88,1354.80,1348.88,1354.80,3.087374,4173.722673,1351.868203
2798177,1493596860,1352.41,1352.41,1352.41,1352.41,0.261954,354.269412,1352.410000
2798178,1493596920,1349.49,1354.86,1349.49,1354.86,0.096311,130.384815,1353.783259
2798179,1493596980,1350.11,1351.25,1350.11,1351.25,0.260284,351.553973,1350.655803
2798180,1493597040,1351.25,1351.25,1349.52,1349.52,1.089116,1470.056120,1349.769693
...,...,...,...,...,...,...,...,...
4727771,1609372500,28809.07,28825.50,28800.01,28810.08,0.087391,2517.799582,28810.597267
4727772,1609372560,28801.47,28829.42,28785.64,28829.42,0.965221,27804.572129,28806.429798
4727773,1609372620,28829.42,28863.90,28829.42,28857.06,2.368831,68332.350629,28846.441863
4727774,1609372680,28850.49,28900.52,28850.49,28882.82,2.466590,71232.784464,28879.056266


In [9]:
def select_date(data, date_start, date_end):
    """Return the 'Open' prices indexed by datetime, optionally cut to a date range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Timestamp' column holding Unix seconds and an 'Open'
        column. The frame is not modified.
    date_start, date_end : label or None
        Bounds of a label slice on the datetime index (both ends inclusive,
        partial date strings such as "2020" are accepted by pandas). ``None``
        leaves that side of the range open, so either bound may be given
        alone.

    Returns
    -------
    pandas.DataFrame
        A single 'Open' column indexed by 'Timestamp', with missing prices
        forward-filled before the range is applied.
    """
    # Work on a two-column copy so the caller's 'Timestamp' column stays numeric.
    opens = data[['Open', 'Timestamp']].copy()
    opens['Timestamp'] = pd.to_datetime(opens['Timestamp'], unit='s', origin='unix')
    opens = opens.set_index('Timestamp').ffill()

    # A label slice treats None as "unbounded", which covers all four
    # combinations of present/absent bounds in one expression.
    return opens.loc[date_start:date_end].copy()

In [10]:
def preprocessing_data(data, features_size, h):
    """Build the lagged-feature / future-move table from a frame with an 'Open' column.

    Columns of the result, in order:

    * ``Open``       -- the original price.
    * ``diff_Open``  -- ``Open[t] - Open[t-h]``.
    * ``t+{h}``      -- ``diff_Open`` shifted back by ``h`` rows, i.e.
                        ``Open[t+h] - Open[t]`` (the move to be predicted).
    * ``t-0 .. t-{features_size-1}`` -- ``Open`` lagged by 0..features_size-1 rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'Open' column; it is not modified.
    features_size : int
        Number of lag columns to create.
    h : int
        Horizon, in rows, used for both the difference and the target shift.

    Returns
    -------
    pandas.DataFrame
        The table with every row that contains a NaN removed (the leading
        rows lacking history and the trailing rows lacking a future value).
    """
    frame = data.copy()
    frame['diff_Open'] = frame['Open'].diff(h)
    frame[f"t+{h}"] = frame['diff_Open'].shift(-h)

    # Create all lag columns first and attach them in a single concat; adding
    # them one by one fragments the frame and is slow for large features_size.
    lagged = [frame['Open'].shift(i).rename(f't-{i}') for i in range(features_size)]
    frame = pd.concat([frame, *lagged], axis=1)

    return frame.dropna()

In [11]:
def features_target(data_shifted, h):
    """Split the preprocessed table into lag features and a binary up/down label.

    Parameters
    ----------
    data_shifted : pandas.DataFrame
        Table holding 'Open', 'diff_Open', the ``t+{h}`` target column and the
        feature columns.
    h : int
        Horizon used to name the target column.

    Returns
    -------
    tuple
        ``(X, y, data_size)`` where ``X`` is ``data_shifted`` without 'Open',
        'diff_Open' and the target column, ``y`` is the target with values
        above zero set to 1 and values at or below zero set to 0, and
        ``data_size`` is the number of rows in ``data_shifted``.
    """
    target_col = f"t+{h}"

    features = data_shifted.drop(columns=['Open', 'diff_Open', target_col])

    # Two successive masks reproduce the "positive -> 1, otherwise -> 0" labelling
    # without touching the input frame.
    labels = (
        data_shifted[target_col]
        .mask(lambda s: s > 0, 1)
        .mask(lambda s: s <= 0, 0)
    )

    n_rows = len(data_shifted)
    return features, labels, n_rows

In [12]:
def input_data(X, y, sample_size, train_fraction, features_size, data_size, train_size, test_size, h=1, w=0):
    """Cut one train/test window out of ``X`` and ``y`` by row position.

    The window is ``sample_size`` rows long and ends ``test_size * w`` rows
    before row ``data_size``. Its first ``train_size`` rows form the training
    part; the test part starts ``h - 1`` rows after the training part ends and
    runs to the end of the window.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and labels, sliced identically.
    sample_size, data_size, train_size, test_size : int
        Sizes, in rows, as described above.
    train_fraction, features_size
        Accepted but not used by this function.
    h : int
        Horizon; leaves a gap of ``h - 1`` rows between train and test.
    w : int
        Window number counted back from the end of the data (0 = latest).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    window_stop = data_size - test_size * w
    window_start = window_stop - sample_size
    test_start = train_size + h - 1

    def _split(frame):
        # Same positional cut for features and labels.
        window = frame.iloc[window_start:window_stop]
        return window.iloc[:train_size], window.iloc[test_start:sample_size]

    X_train, X_test = _split(X)
    y_train, y_test = _split(y)

    return X_train, X_test, y_train, y_test

In [13]:
def deep_test(model_deep, sample_size=1000, train_fraction=0.7, features_size=60, h=1, RNN=True, date_start=None, date_end=None):
    """Score a deep model over successive train/test windows of the price data.

    The data is loaded with ``get_data``, reduced with ``select_date``, turned
    into lag features and labels, then cut into ``r`` windows of
    ``sample_size`` rows that step back from the end of the data by the test
    size. Windows are evaluated from the earliest to the latest, and
    ``predict_score_deep`` is called once per window.

    Parameters
    ----------
    model_deep
        Model selector forwarded to ``predict_score_deep``.
    sample_size : int
        Rows per window (train + test).
    train_fraction : float
        Share of each window used for training; must leave at least one test row.
    features_size : int
        Number of lagged 'Open' columns used as features.
    h : int
        Prediction horizon in rows.
    RNN : bool
        When truthy, inputs are converted to arrays and features reshaped to
        ``(rows, features, 1)`` before scoring.
    date_start, date_end
        Optional bounds forwarded to ``select_date``.

    Returns
    -------
    dict
        ``mean_score``, ``std``, ``score_min``, ``score_max`` (rounded to two
        decimals) and ``n_fold``. ``std`` is NaN when only one window fits.

    Raises
    ------
    ValueError
        If the split leaves no test rows, or the data is too short to hold a
        single window.
    """
    data = get_data()
    data = select_date(data, date_start, date_end)
    data_shifted = preprocessing_data(data, features_size, h)
    X, y, data_size = features_target(data_shifted, h)
    train_size = int(train_fraction * sample_size)
    test_size = sample_size - train_size

    if test_size <= 0:
        raise ValueError("train_fraction leaves no rows for testing")

    # Number of windows that fit when stepping back by test_size from the end.
    r = math.floor((data_size - train_size) / test_size)
    if r < 1:
        raise ValueError("not enough rows for a single window of sample_size")

    results = []
    # Highest w is the earliest window, so this walks forward in time.
    for i in reversed(range(r)):
        X_train, X_test, y_train, y_test = input_data(X, y, sample_size, train_fraction, features_size, data_size, train_size, test_size, h, w=i)

        if RNN:
            X_train, y_train = np.array(X_train), np.array(y_train)
            X_test, y_test = np.array(X_test), np.array(y_test)
            # Recurrent layers expect a trailing channel axis.
            X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

        score = predict_score_deep(model_deep, X_train, X_test, y_train, y_test)
        results.append(score)

    # A sample standard deviation needs at least two scores.
    spread = stats.stdev(results) if len(results) > 1 else float('nan')

    return {'mean_score': round(stats.mean(results), 2),
            'std': round(spread, 2),
            'score_min': round(min(results), 2),
            'score_max': round(max(results), 2),
            'n_fold': r}
    

In [14]:
def predict_score_deep(model_deep, X_train, X_test, y_train, y_test):
    """Train a fresh network on one window and return its test accuracy.

    Parameters
    ----------
    model_deep
        Architecture selector: the ``LSTM`` layer class selects the stacked
        LSTM classifier built below. ``GRU`` is recognised but not implemented.
    X_train, X_test
        Inputs whose second axis is the sequence length and whose last axis
        has size 1 (the model is built with ``input_shape=(X_train.shape[1], 1)``).
    y_train, y_test
        Binary labels (0/1).

    Returns
    -------
    float
        The accuracy metric from ``model.evaluate`` on the test data (index 1
        of its result; index 0 is the loss, which is not returned).

    Raises
    ------
    NotImplementedError
        If ``model_deep`` is ``GRU``.
    ValueError
        If ``model_deep`` is any other unsupported value.
    """
    # Reject unsupported selectors up front instead of failing later on an
    # unassigned `score`.
    if model_deep is GRU:
        raise NotImplementedError("GRU model is not implemented yet")
    if model_deep is not LSTM:
        raise ValueError(f"Unsupported model_deep: {model_deep!r}")

    # NOTE(review): an EarlyStopping callback was drafted for this function but
    # left disabled; training always runs the full 100 epochs per call.
    model = Sequential()
    model.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units = 30, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(units = 15))
    model.add(Dropout(0.2))
    # Output layer: sigmoid keeps the prediction in [0, 1], which is what
    # binary_crossentropy and the accuracy metric expect by default.
    model.add(Dense(units = 1, activation = 'sigmoid'))
    # Compiling the RNN
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fitting the RNN to the training set
    model.fit(X_train, y_train, validation_split=0.2, epochs = 100, batch_size = 32)

    score = model.evaluate(X_test, y_test, verbose=0)

    # Note: score[0] is the loss; consider returning it as well.
    return score[1]

In [19]:
def Xtrain_1(data, sample_size=1000, train_fraction=0.7, features_size=60, h=2, w=0, date_start=None, date_end=None):
    """Prepare features, labels and split sizes from an already-loaded frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as expected by ``select_date``.
    sample_size : int
        Rows per train/test window.
    train_fraction : float
        Share of ``sample_size`` assigned to training.
    features_size : int
        Number of lag columns built by ``preprocessing_data``.
    h : int
        Prediction horizon in rows.
    w
        Accepted but not used by this function.
    date_start, date_end
        Optional bounds forwarded to ``select_date``.

    Returns
    -------
    tuple
        ``(X, y, data_size, train_size, test_size)``.
    """
    # Operate on a private copy so the caller's frame is left alone by the
    # steps below.
    working = data.copy()
    opens = select_date(working, date_start, date_end)
    shifted = preprocessing_data(opens, features_size, h)
    X, y, data_size = features_target(shifted, h)

    train_size = int(train_fraction*sample_size)
    test_size = sample_size - train_size

    return X, y, data_size, train_size, test_size

def Xtrain_2(X, y, data_size, train_size, test_size, sample_size=None, h=2, w=0):
    """Cut one train/test window out of ``X`` and ``y`` by row position.

    The window is ``sample_size`` rows long and ends ``test_size * w`` rows
    before row ``data_size``. Its first ``train_size`` rows are the training
    part; the test part starts ``h - 1`` rows after the training part ends.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc``
        Features and labels, sliced identically.
    data_size, train_size, test_size : int
        Sizes as returned by ``Xtrain_1``.
    sample_size : int or None
        Window length. ``None`` (default) uses ``train_size + test_size``,
        which is how ``Xtrain_1`` derives the two sizes.
    h : int
        Horizon; defaults to 2 to match ``Xtrain_1``'s default.
    w : int
        Window number counted back from the end of the data (0 = latest).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # These three values used to be read from undefined globals; they are now
    # explicit, defaulted parameters.
    if sample_size is None:
        sample_size = train_size + test_size

    window_stop = data_size - test_size * w
    window_start = window_stop - sample_size
    test_start = train_size + h - 1

    sample_X = X.iloc[window_start:window_stop]
    sample_y = y.iloc[window_start:window_stop]

    X_train = sample_X.iloc[:train_size]
    y_train = sample_y.iloc[:train_size]
    X_test = sample_X.iloc[test_start:sample_size]
    y_test = sample_y.iloc[test_start:sample_size]

    return X_train, X_test, y_train, y_test
   

In [20]:
# Build features/labels for calendar year 2020 with 4320 lag columns and a
# 43200-row window.
# NOTE(review): the recorded run of this cell ended in the MemoryError shown
# below; one float column per lag over every row of the year is very large.
# Also note `w` is accepted by Xtrain_1 but not used there.
X, y, data_size, train_size, test_size = Xtrain_1(data, sample_size=43200, train_fraction=0.7, features_size=4320, h=2, w=0, date_start="2020", date_end="2020") 

MemoryError: Unable to allocate 5.49 GiB for an array with shape (1401, 525600) and data type float64

In [None]:
# Inspect the split shapes.
# NOTE(review): no visible cell assigns X_train/X_test/y_train/y_test at
# notebook level (Xtrain_2 is defined but never called) -- this relies on
# state from a cell not shown here.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Stand-alone four-layer LSTM stack (50/40/30/10 units, dropout 0.2 after each)
# followed by two dense layers; the model is built here but not compiled.
# Requires X_train to exist so the sequence length can be read from its shape.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=40, return_sequences=True),
    Dropout(0.2),
    LSTM(units=30, return_sequences=True),
    Dropout(0.2),
    LSTM(units=10),
    Dropout(0.2),
    Dense(units=20),
    Dense(units=1),
])

In [None]:
# Print the layer-by-layer summary of the model built in the previous cell.
model.summary()

In [None]:
# 1440 rows per day (the data is spaced 60 seconds apart) x 30 days = 43200,
# the sample_size passed to Xtrain_1 in this notebook.
1440*30

In [None]:
# NOTE(review): init_X_train is not defined in any visible cell; presumably an
# earlier name for Xtrain_1 (same keyword arguments, but Xtrain_1 also takes
# `data` and returns a 5-tuple) -- confirm or remove this cell.
data_shifted = init_X_train(sample_size=43200, train_fraction=0.7, features_size=4320, h=2, w=0, date_start="2020", date_end="2020")

In [25]:
# NOTE(review): the output recorded below is an 'Open'-only frame indexed by
# Timestamp for 2020, not the raw frame from get_data(); `data` was reassigned
# in a cell that is not part of the visible notebook.
data

Unnamed: 0_level_0,Open
Timestamp,Unnamed: 1_level_1
2020-01-01 00:00:00,7160.69
2020-01-01 00:01:00,7161.51
2020-01-01 00:02:00,7158.82
2020-01-01 00:03:00,7158.82
2020-01-01 00:04:00,7158.50
...,...
2020-12-30 23:55:00,28809.07
2020-12-30 23:56:00,28801.47
2020-12-30 23:57:00,28829.42
2020-12-30 23:58:00,28850.49


In [None]:
# Show the preprocessed table assigned a few cells above.
data_shifted

In [None]:
# Inspect the split shapes again.
# NOTE(review): as with the earlier identical cell, no visible cell assigns
# these four names at notebook level.
X_train.shape, X_test.shape, y_train.shape, y_test.shape