In [51]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Bitfinex

### Collecting data

In [52]:
# Directory holding the raw crypto CSV files (e.g. 1inch-usd.csv).
mypath = "../raw_data/crypto_data"
# listdir() returns every directory entry in arbitrary order; sorting keeps
# the listing stable across runs and machines. The original list
# comprehension only copied the list, so it is dropped.
onlyfiles = sorted(listdir(mypath))


In [53]:
# Load a single pair (1inch/USD) from the CSV directory to inspect its layout.
inch_usd_csv = "../raw_data/crypto_data/1inch-usd.csv"
df = pd.read_csv(inch_usd_csv)

In [54]:
# Preview the first five rows to check how the CSV columns were parsed.
df.head(n=5)

Unnamed: 0.1,time,open,close,high,low,volume,Unnamed: 0
0,1628623620000,2.7621,2.7662,2.7662,2.7621,1259.9376,
1,1628623980000,2.7668,2.7668,2.7668,2.7668,12.660914,
2,1628624100000,2.761,2.7664,2.7664,2.761,0.199698,
3,1628624220000,2.771,2.771,2.771,2.771,0.001921,
4,1628625300000,2.7706,2.7709,2.7709,2.7706,133.6339,


In [55]:
# List the column labels of the loaded CSV frame.
df.columns

Index(['time', 'open', 'close', 'high', 'low', 'volume', 'Unnamed: 0'], dtype='object')

### Cleaning data

In [56]:
# "time" holds epoch values interpreted here as milliseconds; turn them into
# pandas timestamps and show the converted column.
epoch_ms = df["time"]
df["time"] = pd.to_datetime(epoch_ms, unit="ms")
df["time"]


0      2021-08-10 19:27:00
1      2021-08-10 19:33:00
2      2021-08-10 19:35:00
3      2021-08-10 19:37:00
4      2021-08-10 19:55:00
               ...        
7135   2021-08-29 13:06:00
7136   2021-08-29 13:08:00
7137   2021-08-29 13:10:00
7138   2021-08-29 13:12:00
7139   2021-08-29 13:14:00
Name: time, Length: 7140, dtype: datetime64[ns]

In [57]:
# 'Unnamed: 0' shows no values in the preview rows, so it is removed.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Null counts per remaining column. This must be the last expression of the
# cell, otherwise the notebook silently discards the result.
df.isnull().sum()

## Dataset Binance

### Collecting data

In [58]:
# Read the ETH-USDT parquet file, letting pandas choose the parquet engine.
path = "../raw_data/ETH-USDT.parquet"
eth = pd.read_parquet(path=path, engine="auto")

In [59]:
# Preview the first five rows of the ETH-USDT frame.
eth.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-08-17 04:00:00,301.130005,301.130005,301.130005,301.130005,0.42643,128.410873,2,0.42643,128.410873
2017-08-17 04:01:00,301.130005,301.130005,301.130005,301.130005,2.75787,830.477417,4,2.75787,830.477417
2017-08-17 04:02:00,300.0,300.0,300.0,300.0,0.0993,29.790001,2,0.0993,29.790001
2017-08-17 04:03:00,300.0,300.0,300.0,300.0,0.31389,94.167,3,0.0,0.0
2017-08-17 04:04:00,301.130005,301.130005,301.130005,301.130005,0.23202,69.868179,1,0.23202,69.868179


### Cleaning data

In [60]:
# (rows, columns) of the frame as loaded, before any column is dropped.
eth.shape

(2753105, 9)

In [61]:
# Keep only open/high/low/close/volume. errors="ignore" lets this cell be
# re-run after the columns are already gone instead of raising KeyError.
binance_extra_columns = [
    'quote_asset_volume',
    'number_of_trades',
    'taker_buy_base_asset_volume',
    'taker_buy_quote_asset_volume',
]
eth = eth.drop(columns=binance_extra_columns, errors="ignore")

In [62]:
# Earliest row: shows where the series starts.
eth.head(n=1)

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-17 04:00:00,301.130005,301.130005,301.130005,301.130005,0.42643


In [63]:
# Latest row: shows where the series ends.
eth.tail(n=1)

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-11-16 22:43:00,1205.150024,1206.5,1205.150024,1206.5,143.3013


In [64]:
# Count missing values per column (isna is the same check as isnull).
eth.isna().sum()

open      0
high      0
low       0
close     0
volume    0
dtype: int64

In [65]:
# Pairwise Pearson correlation between the remaining columns.
eth.corr(method="pearson")

Unnamed: 0,open,high,low,close,volume
open,1.0,0.999999,0.999999,0.999998,0.067169
high,0.999999,1.0,0.999998,0.999999,0.067632
low,0.999999,0.999998,1.0,0.999999,0.066665
close,0.999998,0.999999,0.999999,1.0,0.067143
volume,0.067169,0.067632,0.066665,0.067143,1.0


In [66]:
# Every index label should occur exactly once. Index.is_unique answers that
# directly, without building and sorting an array of unique values first.
eth.index.is_unique

True

In [67]:
# Lowest closing price over the whole series.
eth.loc[:, "close"].min()

82.03

In [68]:
# Highest closing price over the whole series.
eth.loc[:, "close"].max()

4865.22

In [69]:
# Per-row price range: high minus low.
eth["high-low"] = eth["high"].sub(eth["low"])

In [70]:
# Show the newly added range column.
eth.loc[:, "high-low"]

open_time
2017-08-17 04:00:00    0.000000
2017-08-17 04:01:00    0.000000
2017-08-17 04:02:00    0.000000
2017-08-17 04:03:00    0.000000
2017-08-17 04:04:00    0.000000
                         ...   
2022-11-16 22:39:00    0.440063
2022-11-16 22:40:00    0.429932
2022-11-16 22:41:00    0.819946
2022-11-16 22:42:00    0.910034
2022-11-16 22:43:00    1.349976
Name: high-low, Length: 2753105, dtype: float32

In [71]:
# Per-row open minus close; negative when the row closed above its open.
eth["open-close"] = eth["open"].sub(eth["close"])

In [72]:
# Show the newly added open-minus-close column.
eth.loc[:, "open-close"]

open_time
2017-08-17 04:00:00    0.000000
2017-08-17 04:01:00    0.000000
2017-08-17 04:02:00    0.000000
2017-08-17 04:03:00    0.000000
2017-08-17 04:04:00    0.000000
                         ...   
2022-11-16 22:39:00   -0.420044
2022-11-16 22:40:00   -0.209961
2022-11-16 22:41:00    0.809937
2022-11-16 22:42:00    0.390015
2022-11-16 22:43:00   -1.349976
Name: open-close, Length: 2753105, dtype: float32

# Preprocess data

## Scaling 

In [73]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [74]:
# Feature matrix and target vector for the model.
# NOTE(review): "close" is both a feature and the target, so y is a plain copy
# of one X column — confirm whether the target should be a future close.
feature_columns = ["high", "low", "volume", "open", "close"]
X = eth.loc[:, feature_columns]
y = eth["close"]

In [75]:
# The rows appear to be ordered by open_time, so split chronologically: the
# first 70% train, the last 30% test. The default shuffled split would mix
# later rows into the training set and, without a seed, differ on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)


In [76]:
# Learn each feature's min/max from the training rows only, then rescale
# those same rows with the fitted scaler.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X_train).transform(X_train)

0