# 1. Download Training Data

In [None]:
# import libraries
import numpy as np
import pandas as pd
import requests

In [None]:
# Connect to Binance api to download training data, no need for keys or authentication, 
# but only accessible from physical machines(does not work on Google Cloud)

root_url = 'https://api.binance.com/api/v3/'

check_url = root_url + 'ping'

if requests.get(check_url).ok != True:
    print('!= 200')


In [None]:
# set api parameters including trading pairs, intervals, and desired timeframes

symbol = 'BTCUSDT' # 'BTCUSDT', 'ETHUSDT', etc.

interval = '1m' #'1d' , '1h', '1d'

kline_url = root_url + 'klines'

params = {'interval':interval,
          'symbol':symbol,
          'endTime':1698796800000,
          #'limit':5 # Binance default: 500, max: 1000
         }

In [None]:
# get api data as json

if requests.get(url=kline_url, params=params).ok != True:
    print('Issue with Binance kline API connectivity, did not fetch data')

api_data = requests.get(url=kline_url, params=params).json()

In [None]:
# save json to dataframe

df = pd.DataFrame(api_data)
df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume',
          'k_close_time', 'quote_asset_volume', 'num_trades',
          'taker_base_vol', 'taker_quote_vol', 'ignore']

In [None]:
# use for loop to overcome Binance api max limit of 1000 rows per call

for i in range(150):
    prev = df['Date'][0]
    # url_next = url + '&endTime=' + str(prev)
    params['endTime'] = str(prev)
    next_data = requests.get(url=kline_url, params=params).json()
    df_next = pd.DataFrame(next_data)
    df_next.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume',
          'k_close_time', 'quote_asset_volume', 'num_trades',
          'taker_base_vol', 'taker_quote_vol', 'ignore']
    df = pd.concat([df_next, df]).drop_duplicates()


In [None]:
for col in ['Open', 'High', 'Low', 'Close', 'Volume',
            'quote_asset_volume', 'num_trades',
            'taker_base_vol', 'taker_quote_vol', 'ignore']:
    df[col] = df[col].astype(float)
    
df['Date'] = pd.to_datetime(df['Date'],unit='ms')

In [None]:
df

In [None]:
# saving to csv

filename = f'{symbol}_{interval}.csv'
df.to_csv(f'../raw_data/{filename}')

# 2. Load Data & Packages

In [1]:
# Import libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM, Flatten
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error,  r2_score, accuracy_score
from sklearn.model_selection import train_test_split


2024-02-17 16:01:40.369662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read data from the CSV file into a DataFrame
data = pd.read_csv("../raw_data/BTCUSDT_1d.csv")

# Select specific columns from the DataFrame
data = data.loc[:,['Date','Open','High','Low','Close','Volume']]

In [3]:
# Set the 'Date' column as the index of the DataFrame
data = data.set_index('Date')

data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-02-06,38289.32,40955.51,38215.94,39186.94,98757.311183
2021-02-07,39181.01,39700.00,37351.00,38795.69,84363.679763
2021-02-08,38795.69,46794.45,37988.89,46374.87,138597.536914
2021-02-09,46374.86,48142.19,44961.09,46420.42,115499.861712
2021-02-10,46420.42,47310.00,43727.00,44807.58,97154.182200
...,...,...,...,...,...
2023-10-28,33892.01,34493.33,33860.00,34081.00,16880.131440
2023-10-29,34081.01,34750.11,33930.00,34525.89,20685.521760
2023-10-30,34525.88,34856.00,34062.84,34474.73,33657.959760
2023-10-31,34474.74,34720.49,34025.00,34639.77,32737.898220


# 3. Preprocess Data

In [4]:
#Set the Target column
aim = 'Close'

In [5]:
# 90-10 test split
test_split_index = int(len(data) * 0.9)
test_data = data.iloc[test_split_index:]
available_data = data.iloc[:test_split_index]

# 80-20 train-val split
split_index = int(len(available_data) * 0.8)
train_data = available_data.iloc[:split_index]
val_data = available_data.iloc[split_index:]


def line_plot(line1, line2, line3, label1=None, label2=None, label3=None, title='', lw=2):
    """
    Create a line plot with two lines.
    Parameters:
    - line1 (array-like): Data for the first line.
    - line2 (array-like): Data for the second line.
    - label1 (str, optional): Label for the first line (default is None).
    - label2 (str, optional): Label for the second line (default is None).
    - title (str, optional): Title of the plot (default is an empty string).
    - lw (int, optional): Line width for both lines (default is 2).
    """
    # Create a subplot with specified size
    fig, ax = plt.subplots(1, figsize=(13, 7))

    # Plot the first line with its label
    ax.plot(line1, label=label1, linewidth=lw)

    # Plot the second line with its label
    ax.plot(line2, label=label2, linewidth=lw)
    
    # Plot the 3rd line with its label
    ax.plot(line3, label=label3, linewidth=lw)

    # Set y-axis label
    ax.set_ylabel('BTC/USDT', fontsize=14)

    # Set the title
    ax.set_title(title, fontsize=16)

    # Add a legend at the best location with the specified font size
    ax.legend(loc='best', fontsize=16)

    # Display the plot
    plt.show()


In [None]:
# Plotting the training and testing data for the 'Close' column
Dont plot if using a very large dataset

line_plot(train_data[aim], val_data[aim], test_data[aim], label1='Train', label2='Val', label3='Test', title='Data Split')


In [6]:
# Function to normalize a continuous variable to a zero-base scale
def normalise_zero_base(continuous):
    """
    Normalize a continuous variable to a zero-base scale.
    Parameters:
    - continuous (pandas.Series): The continuous variable to be normalized.
    Returns:
    - pandas.Series: The normalized continuous variable.
    """
    # Normalize by dividing each value by the first value and subtracting 1
    return continuous / continuous.iloc[0] - 1


# Function to normalize a continuous variable to a min-max scale
def normalise_min_max(continuous):
    """
    Normalize a continuous variable to a min-max scale.
    Parameters:
    - continuous (pandas.Series): The continuous variable to be normalized.
    Returns:
    - pandas.Series: The normalized continuous variable.
    """
    # Normalize using min-max scaling formula
    return (continuous - continuous.min()) / (continuous.max() - continuous.min())


In [7]:
# Function to extract windowed data from a continuous variable
def extract_window_data(continuous, window_len=5, zero_base=True):
    """
    Extract windowed data from a continuous variable.
    Parameters:
    - continuous (pandas.Series): The continuous variable to extract windows from.
    - window_len (int, optional): The length of each window (default is 5).
    - zero_base (bool, optional): Whether to normalize each window to a zero-base scale (default is True).
    Returns:
    - numpy.ndarray: Array of windowed data.
    Example:
    >>> windowed_data = extract_window_data(data['Close'], window_len=10, zero_base=True)
    """
    # Initialize an empty list to store windowed data
    window_data = []

    # Iterate over the continuous variable to extract windows
    for idx in range(len(continuous) - window_len):
        # Extract a window of data
        tmp = continuous[idx: (idx + window_len)].copy()

        # Normalize the window to a zero-base scale if specified
        if zero_base:
            tmp = normalise_zero_base(tmp)

        # Append the window data to the list
        window_data.append(tmp.values)

    # Convert the list of windowed data to a numpy array
    return np.array(window_data)


In [8]:
# Function to prepare data for time series analysis
def prepare_data(continuous, aim, window_len=10, zero_base=True, test_size=0.2):
    """
    Prepare data for time series analysis.
    Parameters:
    - continuous (pandas.Series): The continuous variable for time series analysis.
    - aim (str): The target variable to predict.
    - window_len (int, optional): The length of each window (default is 10).
    - zero_base (bool, optional): Whether to normalize each window to a zero-base scale (default is True).
    - test_size (float, optional): The proportion of data to be used as the test set (default is 0.2).
    Returns:
    - tuple: A tuple containing train_data, test_data, X_train, X_test, y_train, y_test.
    """
    # Extract windowed data for training and testing sets
    X_train = extract_window_data(train_data, window_len, zero_base)
    X_val = extract_window_data(val_data, window_len, zero_base)

    # Extract target variable for training and validation sets
    y_train = train_data[aim][window_len:].values
    y_val = val_data[aim][window_len:].values

    # Normalize the target variable to a zero-base scale if specified
    if zero_base:
        y_train = y_train / train_data[aim][:-window_len].values - 1
        y_val = y_val / val_data[aim][:-window_len].values - 1

    # Return the prepared data
    return train_data, val_data, X_train, X_val, y_train, y_val


In [9]:
# Function to prepare data for time series analysis
def prepare_test_data(continuous, aim, window_len=10):#, zero_base=True):
    """
    Prepare data for time series analysis.
    Parameters:
    - continuous (pandas.Series): The continuous variable for time series analysis.
    - aim (str): The target variable to predict.
    - window_len (int, optional): The length of each window (default is 10).
    - zero_base (bool, optional): Whether to normalize each window to a zero-base scale (default is True).
    - test_size (float, optional): The proportion of data to be used as the test set (default is 0.2).
    Returns:
    - tuple: A tuple containing train_data, test_data, X_train, X_test, y_train, y_test.
    """
    # Extract windowed data for validation set
    X_test = extract_window_data(test_data, window_len, zero_base)

    # Extract target variable for validation set
    y_test = test_data[aim][window_len:].values

    # Normalize the target variable to a zero-base scale if specified
    # if zero_base:
    #     y_train = y_train / train_data[aim][:-window_len].values - 1
    #     y_test = y_test / test_data[aim][:-window_len].values - 1

    # Return the prepared data
    return test_data, X_test, y_test


In [10]:
print(train_data.head(3))
print(val_data.head(3))
print(test_data.head(3))

                Open      High       Low     Close         Volume
Date                                                             
2021-02-06  38289.32  40955.51  38215.94  39186.94   98757.311183
2021-02-07  39181.01  39700.00  37351.00  38795.69   84363.679763
2021-02-08  38795.69  46794.45  37988.89  46374.87  138597.536914
                Open      High       Low     Close        Volume
Date                                                            
2023-01-26  23060.42  23282.47  22850.01  23009.65  288924.43581
2023-01-27  23009.65  23500.00  22534.88  23074.16  280833.86315
2023-01-28  23074.16  23189.00  22878.46  23022.60  148115.71085
                Open      High       Low     Close       Volume
Date                                                           
2023-07-25  29176.50  29376.00  29047.65  29228.91  21565.74780
2023-07-26  29228.91  29690.00  29096.94  29351.96  33931.63366
2023-07-27  29351.95  29567.49  29083.85  29222.78  22476.47626


In [38]:
def get_X(df: pd.DataFrame, fold_length: int, fold_stride: int) -> list[pd.DataFrame]:
    '''
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold
    
    Returns a list of folds, each as a DataFrame
    '''
    folds = []
    for idx in range(0, len(df)-fold_length, fold_stride): # --> also possible to get rid of the break 
    #for idx in range(0, len(df), fold_stride):   # range(start, stop, step): for each idx in our rows at every 91 days
        # Exits the loop as soon as the last fold index would exceed the last index
        # if (idx + fold_length) > len(df):
        #     break
        fold = df.iloc[idx:idx + fold_length, :]  # select from row idx til last row of the fold, all the columns
        # set last position in iloc[] to fit the column that we want
        folds.append(fold)   # append the fold to folds
    # return np.array(folds)
    return folds

In [40]:
folds = get_X(df=train_data, fold_length=5, fold_stride=1)
test_x = folds[:3]

In [103]:
test_x

[                Open      High       Low     Close         Volume
 Date                                                             
 2021-02-06  38289.32  40955.51  38215.94  39186.94   98757.311183
 2021-02-07  39181.01  39700.00  37351.00  38795.69   84363.679763
 2021-02-08  38795.69  46794.45  37988.89  46374.87  138597.536914
 2021-02-09  46374.86  48142.19  44961.09  46420.42  115499.861712
 2021-02-10  46420.42  47310.00  43727.00  44807.58   97154.182200,
                 Open      High       Low     Close         Volume
 Date                                                             
 2021-02-07  39181.01  39700.00  37351.00  38795.69   84363.679763
 2021-02-08  38795.69  46794.45  37988.89  46374.87  138597.536914
 2021-02-09  46374.86  48142.19  44961.09  46420.42  115499.861712
 2021-02-10  46420.42  47310.00  43727.00  44807.58   97154.182200
 2021-02-11  44807.58  48678.90  43994.02  47969.51   89561.081454,
                 Open      High       Low     Close         

In [131]:
test_x[0].values - test_x[0].values.mean(0, keepdims=True)


array([[ -3522.94     ,  -3624.92     ,  -2232.844    ,  -3930.16     ,
         -8117.2031714],
       [ -2631.25     ,  -4880.43     ,  -3097.784    ,  -4321.41     ,
        -22510.8345914],
       [ -3016.57     ,   2214.02     ,  -2459.894    ,   3257.77     ,
         31723.0225596],
       [  4562.6      ,   3561.76     ,   4512.306    ,   3303.32     ,
          8625.3473576],
       [  4608.16     ,   2729.57     ,   3278.216    ,   1690.48     ,
         -9720.3321544]])

In [49]:
test_x[0].values

array([[ 38289.32    ,  40955.51    ,  38215.94    ,  39186.94    ,
         98757.311183],
       [ 39181.01    ,  39700.      ,  37351.      ,  38795.69    ,
         84363.679763],
       [ 38795.69    ,  46794.45    ,  37988.89    ,  46374.87    ,
        138597.536914],
       [ 46374.86    ,  48142.19    ,  44961.09    ,  46420.42    ,
        115499.861712],
       [ 46420.42    ,  47310.      ,  43727.      ,  44807.58    ,
         97154.1822  ]])

In [153]:
test_x[0].values[0,:]

array([38289.32    , 40955.51    , 38215.94    , 39186.94    ,
       98757.311183])

In [155]:
(test_x[0].values - test_x[0].values[0,:]) / test_x[0].values

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.02275822, -0.03162494, -0.02315708, -0.01008488, -0.17061408],
       [ 0.01305222,  0.12477847, -0.00597675,  0.15499623,  0.28745262],
       [ 0.17435179,  0.14928029,  0.15002194,  0.15582539,  0.14495732],
       [ 0.17516214,  0.134316  ,  0.12603334,  0.12543949, -0.01650087]])

In [156]:
(test_x[1].values - test_x[1].values[0,:]) / test_x[1].values

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.00993203,  0.15160879,  0.01679149,  0.16343291,  0.39130462],
       [ 0.15512392,  0.17535949,  0.16925946,  0.16425379,  0.26957766],
       [ 0.15595313,  0.16085394,  0.1458138 ,  0.13417127,  0.13165159],
       [ 0.12557183,  0.18445158,  0.15099825,  0.19124273,  0.05803192]])

In [161]:
# zero index, every first day in the 5-day window is zero, the rest are percentage change from the first day
x = []
for i in range(len(test_x)):
    chg = (test_x[i].values - test_x[i].values[0,:]) / test_x[i].values
    x.append(chg)
x

[array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.02275822, -0.03162494, -0.02315708, -0.01008488, -0.17061408],
        [ 0.01305222,  0.12477847, -0.00597675,  0.15499623,  0.28745262],
        [ 0.17435179,  0.14928029,  0.15002194,  0.15582539,  0.14495732],
        [ 0.17516214,  0.134316  ,  0.12603334,  0.12543949, -0.01650087]]),
 array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [-0.00993203,  0.15160879,  0.01679149,  0.16343291,  0.39130462],
        [ 0.15512392,  0.17535949,  0.16925946,  0.16425379,  0.26957766],
        [ 0.15595313,  0.16085394,  0.1458138 ,  0.13417127,  0.13165159],
        [ 0.12557183,  0.18445158,  0.15099825,  0.19124273,  0.05803192]]),
 array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.16343273,  0.02799499,  0.15507186,  0.00098125, -0.19998011],
        [ 0.16425379,  0.01089727,  0.13122579, -0.03497823, -0.42657304],
        [ 0.13417127,

In [59]:
def get_y(df: pd.DataFrame, fold_length: int, fold_stride: int, x_len_stride):
    '''
    This function gets target y ('price_up column') from the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold
    - x_len_stride is length minus stride in the X (when predicting the next period)
    - add an extra d to x_len_stride for extra d days into the future when predicting
    
    Returns a y as an numpy array
    '''
    folds = []
    for idx in range(fold_stride, len(df)-fold_length, fold_stride): # --> also possible to get rid of the break 
    #for idx in range(0, len(df), fold_stride):   # range(start, stop, step): for each idx in our rows at every 91 days
        # Exits the loop as soon as the last fold index would exceed the last index
        # if (idx + fold_length) > len(df):
        #     break
        # fold = df['price_up'].iloc[idx:idx + fold_length]  # select from row idx til last row of the fold (3 years), all the columns
        fold = df.iloc[:,-2].iloc[idx + x_len_stride:idx + fold_length + x_len_stride]
        folds.append(fold)
    # return np.array(folds)
    return folds

In [71]:
test_x[0].loc[:,'Close']

Date
2021-02-06    39186.94
2021-02-07    38795.69
2021-02-08    46374.87
2021-02-09    46420.42
2021-02-10    44807.58
Name: Close, dtype: float64

In [83]:
folds_y = get_y(df=train_data, fold_length=5, fold_stride=1, x_len_stride=4)
test_y = folds_y[:3]
test_y

[Date
 2021-02-11    47969.51
 2021-02-12    47287.60
 2021-02-13    47153.69
 2021-02-14    48577.79
 2021-02-15    47911.10
 Name: Close, dtype: float64,
 Date
 2021-02-12    47287.60
 2021-02-13    47153.69
 2021-02-14    48577.79
 2021-02-15    47911.10
 2021-02-16    49133.45
 Name: Close, dtype: float64,
 Date
 2021-02-13    47153.69
 2021-02-14    48577.79
 2021-02-15    47911.10
 2021-02-16    49133.45
 2021-02-17    52119.71
 Name: Close, dtype: float64]

In [88]:
test_x[0].loc[:,'Close'][-1]

44807.58

In [162]:
# y is the 5-day percentage change in closing price 1 day after the x window
# !!!!!! y is the relative change to the last x date, not first!!!!!!
# y1
(test_y[0] - test_x[0].loc[:,'Close'][-1]) / test_x[0].loc[:,'Close'][-1]

Date
2021-02-11    0.070567
2021-02-12    0.055348
2021-02-13    0.052360
2021-02-14    0.084142
2021-02-15    0.069263
Name: Close, dtype: float64

In [92]:
# y2
(test_y[1] - test_x[1].loc[:,'Close'][-1]) / test_x[1].loc[:,'Close'][-1]

Date
2021-02-12   -0.014215
2021-02-13   -0.017007
2021-02-14    0.012681
2021-02-15   -0.001218
2021-02-16    0.024264
Name: Close, dtype: float64

In [108]:
y = []
for i, j in zip(test_x, test_y):
    chg = (j - i.loc[:,'Close'][-1]) / i.loc[:,'Close'][-1]
    y.append(chg)

In [111]:
y

[Date
 2021-02-11    0.070567
 2021-02-12    0.055348
 2021-02-13    0.052360
 2021-02-14    0.084142
 2021-02-15    0.069263
 Name: Close, dtype: float64,
 Date
 2021-02-12   -0.014215
 2021-02-13   -0.017007
 2021-02-14    0.012681
 2021-02-15   -0.001218
 2021-02-16    0.024264
 Name: Close, dtype: float64,
 Date
 2021-02-13   -0.002832
 2021-02-14    0.027284
 2021-02-15    0.013185
 2021-02-16    0.039035
 2021-02-17    0.102186
 Name: Close, dtype: float64]

In [110]:
np.array(y)

array([[ 0.07056685,  0.05534822,  0.05235967,  0.08414224,  0.06926328],
       [-0.01421549, -0.01700705,  0.01268055, -0.00121765,  0.02426416],
       [-0.00283182,  0.0272839 ,  0.01318527,  0.03903455,  0.10218556]])

# 4. LSTM Model

In [None]:
# Function to build an LSTM (Long Short-Term Memory) model# Function to build an LSTM (Long Short-Term Memory) model
def build_lstm_model(input_data, output_size, neurons, activ_func='linear',
                     dropout=0.2, loss='mse', optimizer='adam', metrics='mae'):
    """
    Build an LSTM (Long Short-Term Memory) model.
    Parameters:
    - input_data (numpy.ndarray): The input data for the model.
    - output_size (int): The size of the output layer.
    - neurons (int): The number of neurons in the LSTM layer.
    - activ_func (str, optional): Activation function for the output layer (default is 'linear').
    - dropout (float, optional): Dropout rate to prevent overfitting (default is 0.2).
    - loss (str, optional): Loss function for model training (default is 'mse' - Mean Squared Error).
    - optimizer (str, optional): Optimization algorithm for model training (default is 'adam').
    Returns:
    - tensorflow.keras.models.Sequential: The constructed LSTM model.
    """
    # Create a Sequential model
    model = Sequential()

    # Add an LSTM layer with the specified number of neurons and input shape
    model.add(LSTM(neurons, input_shape=(input_data.shape[1], input_data.shape[2])))

    # Add a Dropout layer to prevent overfitting
    model.add(Dropout(dropout))

    # Add a Dense layer with the specified number of units
    model.add(Dense(units=output_size))

    # Add an Activation layer with the specified activation function
    model.add(Activation(activ_func))

    # Compile the model with the specified loss function and optimizer
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    # Return the constructed LSTM model
    return model


In [None]:
# Define parameters for data preparation and LSTM model
window_len = 15
test_size = 0.2
zero_base = True
lstm_neurons = 50
epochs = 100
batch_size = 32
loss = 'mse'
dropout = 0.24
optimizer = 'adam'
output_size = 1

# Prepare data for time series analysis
train_data, val_data, X_train, X_val, y_train, y_val = prepare_data(
    available_data, aim, window_len=window_len, zero_base=zero_base, test_size=test_size)


In [None]:
# Prepare test set
test_data, X_test, y_test = prepare_test_data(test_data, aim=aim, window_len=window_len)

In [None]:
available_data.shape

In [None]:
val_data.shape

In [None]:
# train_data, test_data, X_train_val, X_test_val, y_train_val, y_test_val = prepare_data(
#     val_data, aim, window_len=window_len, zero_base=True, test_size=0)

In [None]:
# Build an LSTM model
model = build_lstm_model(
    X_train, output_size=output_size, neurons=lstm_neurons, dropout=dropout, loss=loss, optimizer=optimizer)

# Train the LSTM model
modelfit = model.fit(
    X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1, shuffle=False)


In [None]:
# Need to put preprocessing into model building pipeline, or check preprocessing for testing data

In [None]:
# Function to build an LSTM (Long Short-Term Memory) model# Function to build an LSTM (Long Short-Term Memory) model
def build_lstm_big(input_data, output_size, neurons, activ_func='linear',
                     dropout=0.2, loss='mse', optimizer='adam', metrics='mae'):
    """
    Build another LSTM with different architecture
    """
    model = Sequential()

    model.add(LSTM(neurons, input_shape=(input_data.shape[1], input_data.shape[2]), return_sequences=True))
    
    model.add(LSTM(neurons, return_sequences=True))
    # model.add(Dropout(dropout))
    
#     model.add(LSTM(neurons, return_sequences=True))
#     model.add(Dropout(dropout))
    
    model.add(Flatten())
    
    model.add(Dense(128, activation='relu'))

    # Add a Dense layer with the specified number of units
    model.add(Dense(units=output_size, activation=activ_func))
    

    # Compile the model with the specified loss function and optimizer
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    # Return the constructed LSTM model
    return model


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
es = EarlyStopping(patience=2)

# Build big LSTM model
model_big = build_lstm_big(
    X_train, output_size=output_size, neurons=lstm_neurons, dropout=dropout, loss=loss, optimizer=optimizer)

# Train the LSTM model
history_big = model_big.fit(
    X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1, shuffle=False, callbacks=es)


In [None]:
model_big.summary()

In [None]:
# modelfit.history

Prepare the data so that the model looks at past 15 minutes of data and predicts the price in 5 minutes, instead of in 1 minute.

# 4.1 BERT

In [None]:
from transformers import TFAutoModel

tiny_bert = TFAutoModel.from_pretrained('prajjwal1/bert-tiny', from_pt=True)

In [None]:
tiny_bert.compile(optimizer='adam',
                  metrics='accuracy'
                 )
tiny_bert.fit(X_train, y_train, validation_data=(X_val, y_val), shuffle=False, batch_size=32, epochs=3)

# 5. Results

In [None]:
def plot_loss_accuracy(history, title=None):
    fig, ax = plt.subplots(1,2, figsize=(20,7))

    # --- LOSS --- 

    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])

    ax[0].set_title('Model loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')

    # ax[0].set_ylim((0,3))

    ax[0].legend(['Train', 'Validation'], loc='best')

    ax[0].grid(axis="x",linewidth=0.5)
    ax[0].grid(axis="y",linewidth=0.5)

    # --- ACCURACY

    ax[1].plot(history.history['mae'])
    ax[1].plot(history.history['val_mae'])

    ax[1].set_title('Model Accuracy')
    ax[1].set_ylabel('MAE')
    ax[1].set_xlabel('Epoch')

    ax[1].legend(['Train', 'Validation'], loc='best')

    # ax[1].set_ylim((0,1))

    ax[1].grid(axis="x",linewidth=0.5)
    ax[1].grid(axis="y",linewidth=0.5)

    if title:
        fig.suptitle(title)

In [None]:
plot_loss_accuracy(modelfit)

In [None]:
plot_loss_accuracy(history_big)

In [None]:
# Plotting the training and validation loss during model training
plt.plot(modelfit.history['loss'], 'r', linewidth=2, label='Training loss')
plt.plot(modelfit.history['val_loss'], 'g', linewidth=2, label='Validation loss')

# Set plot title and axis labels
plt.title('LSTM Neural Networks - BTC Model')
plt.xlabel('Epochs numbers')
plt.ylabel('MSE numbers')

# Display legend to distinguish between training and validation loss
plt.legend()

# Show the plot
plt.show()


In [None]:
X_test.shape

In [None]:
val_data[aim][window_len:]

In [None]:
# Extract the target values from the test dataset
targets = test_data[aim][window_len:]

# Make predictions using the trained LSTM model on the validation data
preds = model_big.predict(X_test).squeeze()

# Calculate the Mean Absolute Error (MAE) between predictions and actual targets
mae = mean_absolute_error(preds, y_test)

# Display the actual target values
targets.round()


In [None]:
# Generate final predictions by reversing the normalization process
preds = test_data[aim].values[:-window_len] * (preds + 1)

# Create a pandas Series with index and data for predictions
preds = pd.Series(index=targets.index, data=preds)

In [None]:
# Plotting the actual and predicted values
Cannot plot if dataset is large
line_plot(targets, preds, 'actual', 'prediction', lw=3)

In [None]:
# Turn preds to numpy array for MSE and R2 calculations
preds = preds.values

In [None]:
# Calculate the Mean Squared Error (MSE) between predictions and actual targets
SCORE_MSE = mean_squared_error(preds, y_test)
SCORE_RMSE = mean_squared_error(preds, y_test, squared=False)

# Display the calculated MSE score
SCORE_MSE


In [None]:
SCORE_RMSE

In [None]:
# Calculate the R-squared (R2) score between actual targets and predicted values
r2 = r2_score(y_test, preds)

# Multiply the R2 score by 100 for percentage representation
r2 * 100


In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_test

In [None]:
len(y_test)

In [None]:
model_big.evaluate(X_test, y_test)

In [None]:
-- END OF NOTEBOOK --
-- CELLS BELOW ARE STILL IN PRODUCTION --

# 6. Preprocess Data for Binary Classification 

In [None]:
# Assuming 'data' is the DataFrame containing time series data
data['aim_binary'] = (data['Close'] > data['Close'].shift(1)).astype(int)

# Drop NaN values introduced by the shift operation
data = data.dropna()
data.head()


In [None]:
# def prepare_data_binary(continuous, aim, window_len=10, zero_base=True, test_size=0.2):
#     X_train = extract_window_data(train_data[continuous.columns], window_len, zero_base)
#     X_test = extract_window_data(test_data[continuous.columns], window_len, zero_base)
#     y_train = train_data[aim][window_len:].values
#     y_test = test_data[aim][window_len:].values
#     if zero_base:
#         y_train = y_train / train_data[aim][:-window_len].values - 1
#         y_test = y_test / test_data[aim][:-window_len].values - 1

#     return train_data, test_data, X_train, X_test, y_train, y_test


In [None]:
# Prepare data for binary classification
aim_binary = 'aim_binary'
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=[aim_binary]), data[aim_binary], test_size=0.2, shuffle=False)


# 7. Binary Classification Model

In [None]:
# Function to build a binary classification model
def build_binary_classification_model(input_data, output_size, neurons, activ_func='sigmoid', dropout=0.3, loss='binary_crossentropy', optimizer='adam'):
    model2 = Sequential()
    # Assuming your time series data has only one feature (e.g., 'Close' column)
    model2.add(LSTM(neurons, input_shape=(X_train.shape[1], 1)))  # Adjust the input shape
    model2.add(Dropout(dropout))
    model2.add(Dense(units=output_size))
    model2.add(Activation(activ_func))
    model2.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model2


In [None]:
# Specify parameters for the binary classification model
binary_lstm_neurons = 50
binary_epochs = 20
batch_size = 32


In [None]:
# Build the binary classification model
binary_model = build_binary_classification_model(X_train, output_size=1, neurons=binary_lstm_neurons)

# Train the binary classification model
binary_modelfit = binary_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=binary_epochs, batch_size=batch_size, verbose=1, shuffle=False)


# 8. Binary Classification 

In [None]:
# Plotting the training and validation loss during model training
plt.plot(binary_modelfit.history['loss'], 'r', linewidth=2, label='Training loss')
plt.plot(binary_modelfit.history['val_loss'], 'g', linewidth=2, label='Validation loss')

# Set plot title and axis labels
plt.title('LSTM Neural Networks - BTC Model')
plt.xlabel('Epochs numbers')
plt.ylabel('MSE numbers')

# Display legend to distinguish between training and validation loss
plt.legend()

# Show the plot
plt.show()


In [None]:
# Make predictions using the trained binary classification model on the test data
binary_preds = (binary_model.predict(X_test) > 0.5).astype(int)

# Extract the binary target values from the test dataset
binary_targets = y_test.astype(int)

# Display the actual binary target values
print("Actual Binary Targets:")
print(binary_targets)

# Display the predicted binary target values
print("Predicted Binary Targets:")
print(binary_preds)


In [None]:
# Evaluate the binary classification model
binary_preds = (binary_model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, binary_preds)
print(f'Accuracy: {accuracy}')
