## Imports

In [2]:
!pip install --quiet pytorch-lightning==1.2.5
!pip install --quiet tqdm==4.59.0
!pip install --q seaborn

In [3]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict

In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10

tqdm.pandas()

In [5]:
# Random Seed Pytorch Lightning
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [6]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/

data_path = "./data/Binance_BTCUSDT_2023_minute.csv"

df = pd.read_csv(data_path, parse_dates = ["Date"]).sort_values(by = "Date", ignore_index = True)

df

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
0,1.672530e+12,2023-01-01 00:00:00,BTCUSDT,16541.77,16544.76,16538.45,16543.67,83.08143,1.374269e+06,2687
1,1.672530e+12,2023-01-01 00:01:00,BTCUSDT,16543.04,16544.41,16538.48,16539.31,80.45300,1.330773e+06,2890
2,1.672530e+12,2023-01-01 00:02:00,BTCUSDT,16539.31,16541.17,16534.52,16536.43,62.90197,1.040248e+06,1930
3,1.672530e+12,2023-01-01 00:03:00,BTCUSDT,16536.43,16537.28,16531.00,16533.65,115.71894,1.913268e+06,2956
4,1.672530e+12,2023-01-01 00:04:00,BTCUSDT,16534.12,16536.08,16527.51,16535.38,144.45369,2.388081e+06,3795
...,...,...,...,...,...,...,...,...,...,...
493833,1.702170e+12,2023-12-09 23:55:00,BTCUSDT,43701.23,43701.23,43662.24,43689.55,25.22802,1.101798e+06,885
493834,1.702170e+12,2023-12-09 23:56:00,BTCUSDT,43689.56,43715.07,43681.44,43713.99,42.81061,1.870294e+06,731
493835,1.702170e+12,2023-12-09 23:57:00,BTCUSDT,43713.99,43714.00,43681.53,43700.00,28.19555,1.231927e+06,915
493836,1.702170e+12,2023-12-09 23:58:00,BTCUSDT,43699.99,43705.92,43699.99,43704.43,8.12997,3.553084e+05,492


## Pre-Processing Data

In [7]:
df.describe()

Unnamed: 0,Unix,Date,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
count,493838.0,493838,493838.0,493838.0,493838.0,493838.0,493838.0,493838.0,493838.0
mean,1687350000000.0,2023-06-21 12:19:15.425058048,27912.522484,27920.14952,27904.830037,27912.577538,72.533507,1795796.0,1834.423362
min,1672530000000.0,2023-01-01 00:00:00,16506.04,16508.73,16499.01,16505.87,0.0,0.0,0.0
25%,1679940000000.0,2023-03-27 18:59:15,25833.9,25839.19,25828.225,25833.9,9.393755,274310.1,354.0
50%,1687350000000.0,2023-06-21 12:38:30,27452.49,27460.22,27444.32,27452.495,22.44522,661871.4,632.0
75%,1694760000000.0,2023-09-15 06:17:45,29894.245,29899.58,29889.515,29894.27,76.406245,1929207.0,2071.0
max,1702170000000.0,2023-12-09 23:59:00,44687.79,44700.0,44634.52,44687.8,5877.77545,145955700.0,107315.0
std,8555054000.0,,4857.864211,4858.594944,4857.047111,4857.888023,141.966013,3350121.0,2899.529969


In [8]:
df["prev_close"] = df["Close"].shift(1)
df["close_change"] = df["Close"] - df["prev_close"]

## Converting DataFrame into features

In [9]:
features_df = pd.DataFrame()

features_df["day_of_week"] = df.Date.dt.dayofweek
features_df["day_of_month"] = df.Date.dt.day
features_df["week_of_year"] = df.Date.dt.isocalendar().week
features_df["month"] = df.Date.dt.month
features_df["open"] = df.Open
features_df["high"] = df.High
features_df["low"] = df.Low
features_df["close_change"] = df.close_change
features_df["close"] = df.Close

features_df.dropna(inplace = True)

In [10]:
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
1,6,1,52,1,16543.04,16544.41,16538.48,-4.36,16539.31
2,6,1,52,1,16539.31,16541.17,16534.52,-2.88,16536.43
3,6,1,52,1,16536.43,16537.28,16531.00,-2.78,16533.65
4,6,1,52,1,16534.12,16536.08,16527.51,1.73,16535.38
5,6,1,52,1,16534.91,16537.80,16533.94,1.32,16536.70
...,...,...,...,...,...,...,...,...,...
493833,5,9,49,12,43701.23,43701.23,43662.24,-11.68,43689.55
493834,5,9,49,12,43689.56,43715.07,43681.44,24.44,43713.99
493835,5,9,49,12,43713.99,43714.00,43681.53,-13.99,43700.00
493836,5,9,49,12,43699.99,43705.92,43699.99,4.43,43704.43


In [11]:
features_df.to_csv("./data/binance_btc_usd_dataset_processed.csv", index=False)

## Train-Test split

In [12]:
split_ratio = 0.9

train_size = int(features_df.shape[0] * split_ratio)

print("The size of the training set is %i" %train_size)
print("The size of the test set is %i" %(features_df.shape[0] - train_size))

train_df, test_df = features_df[:train_size], features_df[train_size:]

assert len(train_df) == train_size

The size of the training set is 444453
The size of the test set is 49384


In [13]:
#Normalize the data

scaler = MinMaxScaler()
scaler = scaler.fit(train_df)

train_df = pd.DataFrame(scaler.transform(train_df), columns = train_df.columns, index = train_df.index)

## Cutting DataFrame into sequence for LSTM

In [14]:
def create_sequences(input_data, target, sequence_length):

    output = []
    n = len(input_data)

    for i in tqdm(range(n-sequence_length)):
        x = input_data[i:i+sequence_length]

        y = input_data.iloc[i+sequence_length][target]

        output.append([x,y])

    return output    

## Creating Training and Testing Sequences

In [15]:
SEQUENCE_LENGTH = 120

train_sequences = create_sequences(train_df, "close", sequence_length=SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", sequence_length=SEQUENCE_LENGTH)

  0%|          | 0/444333 [00:00<?, ?it/s]

  0%|          | 0/49264 [00:00<?, ?it/s]

In [16]:
print("The dimensions of each feature df is :", train_sequences[0][0].shape)

print("The length of the training sequence is %i" %len(train_sequences))
print("The length of the test sequences is %i" %len(test_sequences))

The dimensions of each feature df is : (120, 9)
The length of the training sequence is 444333
The length of the test sequences is 49264


# Pytorch Datasets

In [17]:
class BTCDataset(Dataset):

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]
        return dict(
                sequence = torch.Tensor(sequence.to_numpy()),
                label = torch.tensor(label).float())

In [18]:
class BTCPriceDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size=8):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self):
        self.train_dataset = BTCDataset(self.train_sequences)
        self.test_dataset = BTCDataset(self.test_sequences)

    def train_dataloader(self):
        return DataLoader(
                self.train_dataset,
                batch_size = self.batch_size,
                shuffle = False,
                num_workers = 2)

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers = 1)

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers = 1)

## Model Parameters

In [19]:
N_EPOCHS = 8
BATCH_SIZE = 64

data_module = BTCPriceDataModule(train_sequences, test_sequences, batch_size = BATCH_SIZE)
data_module.setup()

In [20]:
# Testing our dataloader
train_dataset = BTCDataset(train_sequences)

a = iter(train_dataset)
b = next(a)
print("Sequence Shape: ", b["sequence"].shape)
print("Label: {} and Label Shape: {}".format(b["label"], b["label"].shape) )

Sequence Shape:  torch.Size([120, 9])
Label: 0.002091138856485486 and Label Shape: torch.Size([])


## Model

In [21]:
class PricePredictionModel(nn.Module):
    def __init__(self, n_features, n_hidden = 128, n_layers = 2):
        super().__init__()

        self.n_features = n_features

        self.lstm = nn.LSTM(
            input_size = n_features,
            hidden_size = n_hidden,
            batch_first = True,
            num_layers = n_layers,
            dropout = 0.2)

        self.regressor = nn.Linear(n_hidden, 1)

        def forward(self, x):

            self.lstm.flatten_parameters()

            _, (hidden, _)  = self.lstm(x)
            out = hidden[-1]

            return self.regressor(out)            

In [22]:
class BTCPricePredictor(pl.LightningModule):
    def __init__(self, n_features: int):
        super().__init__()
        self.modle = PricePredictionModel(n_features)
        self.criterion = nn.MSELoss()

    def forward(self, sequence, labels = None):
        output = self.model(sequence)
        loss = 0
        if labels:
            loss -= self.criterion(output, labels.unsqueeze(dim=1))
        return loss, output

    def training_setp(self, batch, batch_idx):
        sequences = batch["sequences"]
        labels = batch["labels"]

        loss, output = self(sequences, labels = labels)

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
    
        loss, output = self.forward(sequences, labels)
    
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
  
    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
    
        loss, output = self.forward(sequences, labels)
    
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.model.parameters(), lr=0.001)
               

In [23]:
model = BTCPricePredictor(n_features = train_df.shape[1])

In [24]:
# for item in data_module.train_dataloader():
#   print(item["sequence"].shape)
#   print(item["label"].shape)
#   break

In [25]:
# Starting tensorboard
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [26]:
checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

logger = TensorBoardLogger("lightning-logs", name = "btc-price")

early_stopping_callback = EarlyStopping(
    monitor = 'val_loss',
    patience = 2)

trainer = pl.Trainer(
    logger = logger,
    checkpoint_callback = checkpoint_callback,
    callbacks = [early_stopping_callback],
    max_epochs = N_EPOCHS,
    gpus = 1,
    progress_bar_refresh_rate = 30)

MisconfigurationException: You requested GPUs: [0]
 But your machine only has: []

In [None]:
trainer.fit(model, data_module)