## Imports

In [1]:
!pip install --quiet pytorch-lightning==1.2.5

In [2]:
!pip install --quiet tqdm==4.59.0

In [3]:
!pip install --q seaborn

In [4]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [5]:
# Render matplotlib figures inline in the notebook, at high-DPI ("retina") resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Seaborn theme applied to all subsequent plots.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom six-colour palette; set_palette below replaces the 'muted' palette chosen above.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size in inches (width, height).
rcParams['figure.figsize'] = 14, 10

# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [6]:
# Fix the global random seed through PyTorch Lightning so runs are reproducible.
# The call returns the seed it set, which is why 42 is echoed as the cell output.
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [7]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/
data_path = "./data/Binance_BTCUSDT_2023_minute.csv"

# Parse "Date" as datetimes, then order the rows chronologically and renumber
# the index 0..n-1 so positional and label order agree.
df = (
    pd.read_csv(data_path, parse_dates=["Date"])
    .sort_values("Date", ignore_index=True)
)

df

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
0,1.672530e+12,2023-01-01 00:00:00,BTCUSDT,16541.77,16544.76,16538.45,16543.67,83.08143,1.374269e+06,2687
1,1.672530e+12,2023-01-01 00:01:00,BTCUSDT,16543.04,16544.41,16538.48,16539.31,80.45300,1.330773e+06,2890
2,1.672530e+12,2023-01-01 00:02:00,BTCUSDT,16539.31,16541.17,16534.52,16536.43,62.90197,1.040248e+06,1930
3,1.672530e+12,2023-01-01 00:03:00,BTCUSDT,16536.43,16537.28,16531.00,16533.65,115.71894,1.913268e+06,2956
4,1.672530e+12,2023-01-01 00:04:00,BTCUSDT,16534.12,16536.08,16527.51,16535.38,144.45369,2.388081e+06,3795
...,...,...,...,...,...,...,...,...,...,...
493833,1.702170e+12,2023-12-09 23:55:00,BTCUSDT,43701.23,43701.23,43662.24,43689.55,25.22802,1.101798e+06,885
493834,1.702170e+12,2023-12-09 23:56:00,BTCUSDT,43689.56,43715.07,43681.44,43713.99,42.81061,1.870294e+06,731
493835,1.702170e+12,2023-12-09 23:57:00,BTCUSDT,43713.99,43714.00,43681.53,43700.00,28.19555,1.231927e+06,915
493836,1.702170e+12,2023-12-09 23:58:00,BTCUSDT,43699.99,43705.92,43699.99,43704.43,8.12997,3.553084e+05,492


## Pre-Processing Data

In [8]:
# Summary statistics (count, mean, min, quartiles, max, std) per column, as a sanity check of the raw data.
df.describe()

Unnamed: 0,Unix,Date,Open,High,Low,Close,Volume BTC,Volume USDT,tradecount
count,493838.0,493838,493838.0,493838.0,493838.0,493838.0,493838.0,493838.0,493838.0
mean,1687350000000.0,2023-06-21 12:19:15.425058048,27912.522484,27920.14952,27904.830037,27912.577538,72.533507,1795796.0,1834.423362
min,1672530000000.0,2023-01-01 00:00:00,16506.04,16508.73,16499.01,16505.87,0.0,0.0,0.0
25%,1679940000000.0,2023-03-27 18:59:15,25833.9,25839.19,25828.225,25833.9,9.393755,274310.1,354.0
50%,1687350000000.0,2023-06-21 12:38:30,27452.49,27460.22,27444.32,27452.495,22.44522,661871.4,632.0
75%,1694760000000.0,2023-09-15 06:17:45,29894.245,29899.58,29889.515,29894.27,76.406245,1929207.0,2071.0
max,1702170000000.0,2023-12-09 23:59:00,44687.79,44700.0,44634.52,44687.8,5877.77545,145955700.0,107315.0
std,8555054000.0,,4857.864211,4858.594944,4857.047111,4857.888023,141.966013,3350121.0,2899.529969


In [9]:
# Previous-minute OHLC values: shift(1) moves each column down by one row, so
# row t holds the value observed at row t-1 and the very first row becomes NaN.
for price_col in ("Close", "Open", "High", "Low"):
    df["prev_" + price_col.lower()] = df[price_col].shift(1)

In [10]:
# Minute-over-minute change of the closing price (current close minus previous
# close); the first row has no predecessor and is therefore NaN.
df["close_change"] = df["Close"].diff()

### Converting DataFrame into features

In [50]:
# Assemble the model inputs in one pass: calendar fields derived from the
# timestamp, the raw open/high/low prices, the close-to-close change and the
# close itself (used later as the prediction target).
# week_of_year is the ISO-8601 week number, so 2023-01-01 is reported as week 52.
# dropna() removes rows with missing values, i.e. the first row, whose
# close_change has no previous close to be computed from.
features_df = pd.DataFrame(
    {
        "day_of_week": df["Date"].dt.dayofweek,
        "day_of_month": df["Date"].dt.day,
        "week_of_year": df["Date"].dt.isocalendar().week,
        "month": df["Date"].dt.month,
        "open": df["Open"],
        "high": df["High"],
        "low": df["Low"],
        "close_change": df["close_change"],
        "close": df["Close"],
    }
).dropna()

In [51]:
# Display the engineered feature table; it starts at index 1 because row 0 (NaN close_change) was dropped.
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
1,6,1,52,1,16543.04,16544.41,16538.48,-4.36,16539.31
2,6,1,52,1,16539.31,16541.17,16534.52,-2.88,16536.43
3,6,1,52,1,16536.43,16537.28,16531.00,-2.78,16533.65
4,6,1,52,1,16534.12,16536.08,16527.51,1.73,16535.38
5,6,1,52,1,16534.91,16537.80,16533.94,1.32,16536.70
...,...,...,...,...,...,...,...,...,...
493833,5,9,49,12,43701.23,43701.23,43662.24,-11.68,43689.55
493834,5,9,49,12,43689.56,43715.07,43681.44,24.44,43713.99
493835,5,9,49,12,43713.99,43714.00,43681.53,-13.99,43700.00
493836,5,9,49,12,43699.99,43705.92,43699.99,4.43,43704.43


In [52]:
# Persist the engineered features; index=False keeps the row index out of the CSV.
features_df.to_csv("./data/binance_btc_usd_dataset_processed.csv", index=False)

## Train-Test split

In [53]:
# Chronological split (no shuffling): the first `split_ratio` share of the rows
# is used for training, the remaining most recent rows for testing.
split_ratio = 0.9

n_rows = len(features_df)
train_size = int(n_rows * split_ratio)

print("The size of the training set is %i" %train_size)
print("The size of the test set is %i" %(n_rows - train_size))

train_df = features_df.iloc[:train_size]
test_df = features_df.iloc[train_size:]

assert len(train_df) == train_size

The size of the training set is 444453
The size of the test set is 49384


In [54]:
# Normalize the data
#
# The scaler is fitted on the training rows only, so no statistics from the
# test period enter the scaling.
#
# The unscaled rows are re-sliced from features_df rather than read back from
# train_df. This keeps the cell idempotent: re-running it previously re-fitted
# the scaler on already-scaled values, which silently turned `scaler` into an
# identity mapping for any later transform / inverse_transform call.
train_unscaled = features_df[:train_size]

scaler = MinMaxScaler()
scaler = scaler.fit(train_unscaled)

train_df = pd.DataFrame(
    scaler.transform(train_unscaled),
    columns = train_unscaled.columns,
    index = train_unscaled.index,
)

## Cutting the DataFrame into sequences for the LSTM

In [55]:
def create_sequences(input_data, target, sequence_len):
    """Cut a row-ordered DataFrame into sliding windows with next-step targets.

    For every start position ``i`` a pair ``[x, y]`` is produced where ``x``
    holds rows ``i .. i + sequence_len - 1`` of ``input_data`` without the
    ``target`` column, and ``y`` is the ``target`` value of the row that
    immediately follows the window (position ``i + sequence_len``).

    Rows are addressed by position, so the index labels of ``input_data`` do
    not need to start at 0 or be contiguous.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Source rows, already in the order the windows should follow.
    target : label
        Column to predict; it is removed from every window.
    sequence_len : int
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    list
        ``len(input_data) - sequence_len`` pairs ``[x, y]``, or an empty list
        when ``input_data`` has no more than ``sequence_len`` rows. Each ``x``
        is a slice of one shared feature frame and may share memory with the
        other windows; copy it before modifying it in place.

    Raises
    ------
    ValueError
        If ``sequence_len`` is smaller than 1.
    """
    if sequence_len < 1:
        raise ValueError("sequence_len must be a positive integer, got %r" % (sequence_len,))

    # Loop-invariant work, done once instead of once per window: dropping the
    # target column and selecting the target values.
    features = input_data.drop(target, axis = 1)
    targets = input_data[target]

    output = []
    for i in range(len(input_data) - sequence_len):
        window_end = i + sequence_len
        x = features.iloc[i:window_end]
        # The label is the target value of the first row after the window.
        y = targets.iloc[window_end]
        output.append([x, y])

    return output

In [60]:
# Smoke test on the first 10 training rows: with a window length of 1, ten
# rows must yield 10 - 1 = 9 (window, target) pairs.
temp = create_sequences(
    train_df.iloc[:10],
    target="close",
    sequence_len=1,
)

assert len(temp) == 9

In [57]:
# Inspect the first pair: a one-row feature window (without 'close') and the scaled close that follows it.
temp[0]

[   day_of_week  day_of_month  week_of_year  month      open      high  \
 1          1.0           0.0           1.0    0.0  0.001908  0.001832   
 
         low  close_change  
 1  0.002037      0.477832  ,
 0.0015755720342151003]