In [1]:
from pathlib import Path
import plotly.express as px
import pandas as pd
import os
from pprint import pprint
import numpy as np

from datetime import datetime
# Run from the repository root so the relative './data/...' paths used below
# resolve. Only the final path component is compared: a substring test on the
# whole path would also match an ancestor directory whose name contains
# "notebooks", and every re-run of this cell would then climb one more level.
if Path.cwd().name == "notebooks":
    os.chdir('..')

# pprint on a str prints its repr, so the path is shown wrapped in quotes.
pprint(f'Working Directory: {os.getcwd()}')


'Working Directory: /home/nic/git/crypto-forecasting'


In [2]:
def load_data(path: str = './data/01_raw/Binance_ADAUSDT_minute.csv', aggregation = 'mean', resample = '15T'):
    """Load an OHLCV CSV and resample it onto a regular time grid.

    The first line of the file is skipped and the second is used as the
    header. Rows are indexed by the parsed ``date`` column, sorted
    chronologically, resampled into ``resample``-wide bins and aggregated.
    Values the aggregation leaves as NaN (for example the mean of a bin with
    no rows) are forward-filled from the previous bin.

    Only numeric columns are aggregated. Text columns such as ``symbol`` are
    dropped before resampling instead of relying on the aggregation to
    discard them silently, which newer pandas versions refuse to do and which
    never worked for aggregations other than the numeric reductions.

    Args:
        path: CSV file to read.
        aggregation: Aggregation handed to ``Resampler.agg`` (e.g. ``'mean'``,
            ``'last'``, ``'max'``).
        resample: pandas offset alias for the bin width (``'15T'`` and
            ``'15min'`` both mean 15 minutes; newer pandas prefers the
            latter spelling).

    Returns:
        DataFrame indexed by bin timestamp (index name ``date``) with every
        remaining column cast to float32. Note that float32 cannot represent
        the millisecond ``unix`` values exactly.
    """
    data = pd.read_csv(
        path,
        dtype={'unix':int, 'symbol': str, 'open': float, 'high': float, 'low': float, 'close': float, 'Volume ADA':         float, 'Volume USDT': float, 'tradecount': int},
        skiprows=1,
    )
    # Replace the column outright so it really becomes datetime64; writing
    # through ``.loc[:, 'date']`` may keep the original object dtype.
    data['date'] = pd.to_datetime(data['date'])
    numeric = data.set_index('date').sort_index().select_dtypes(include='number')
    return numeric.resample(resample).agg(aggregation).ffill().astype('float32')

In [3]:
# Daily bars: resample='1D' overrides the function's '15T' default; the
# aggregation stays at its default, 'mean'.
data = load_data(resample='1D')

In [4]:
# Preview the first rows of the daily frame.
data.head()

Unnamed: 0_level_0,unix,open,high,low,close,Volume ADA,Volume USDT,tradecount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-09-11,1599863000000.0,0.09663,0.096684,0.096581,0.096638,147057.625,14221.708008,32.291458
2020-09-12,1599912000000.0,0.096583,0.096634,0.096529,0.096585,118968.53125,11502.777344,31.817362
2020-09-13,1599998000000.0,0.09617,0.096237,0.0961,0.096172,167744.078125,16108.962891,43.836807
2020-09-14,1600085000000.0,0.095618,0.095688,0.09555,0.095622,154802.890625,14840.880859,39.927776
2020-09-15,1600171000000.0,0.095509,0.095577,0.09544,0.095511,158953.546875,15184.172852,36.961113


In [5]:
# Midpoint of each row's high and low. The value for the following period
# becomes the prediction target once this column is shifted.
target = data['high'].add(data['low']).div(2)
data['target_landing'] = target

In [6]:
# Number of forecast steps; the target loop creates one column per step,
# named y_0 .. y_{n_preds - 1}.
n_preds = 1


In [7]:
# Build one target column per forecast step: y_i holds target_landing from
# i + 1 rows later, so the last i + 1 rows of y_i are NaN.
targets = []
for i in range(n_preds):
    col = f"y_{i}"
    targets.append(col)
    # Assign the shifted Series directly. Pre-filling the column with NaN and
    # then writing through iloc leaves the column dtype dependent on the
    # pandas version (in-place write into float64 vs. column replacement).
    data[col] = data.target_landing.shift(-i-1)  # negative shift pulls later rows up; index is sorted ascending

In [8]:
# Check the shift: y_0 on each row should equal target_landing on the next row.
data.head()

Unnamed: 0_level_0,unix,open,high,low,close,Volume ADA,Volume USDT,tradecount,target_landing,y_0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-09-11,1599863000000.0,0.09663,0.096684,0.096581,0.096638,147057.625,14221.708008,32.291458,0.096633,0.096582
2020-09-12,1599912000000.0,0.096583,0.096634,0.096529,0.096585,118968.53125,11502.777344,31.817362,0.096582,0.096169
2020-09-13,1599998000000.0,0.09617,0.096237,0.0961,0.096172,167744.078125,16108.962891,43.836807,0.096169,0.095619
2020-09-14,1600085000000.0,0.095618,0.095688,0.09555,0.095622,154802.890625,14840.880859,39.927776,0.095619,0.095508
2020-09-15,1600171000000.0,0.095509,0.095577,0.09544,0.095511,158953.546875,15184.172852,36.961113,0.095508,0.091198


In [9]:
# Columns kept as features; 'unix', 'Volume USDT' and 'target_landing' are left out.
features = ['open', 'high', 'low', 'close' ,'Volume ADA', 'tradecount']

In [10]:
# Keep only the features followed by the target columns. This rebinds `data`
# and drops `target_landing`, so the target-building cell cannot be re-run
# afterwards without reloading the data first.
data = data[[*features, *targets]]

In [11]:
# Confirm that only the feature and target columns remain.
data.head()

Unnamed: 0_level_0,open,high,low,close,Volume ADA,tradecount,y_0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-09-11,0.09663,0.096684,0.096581,0.096638,147057.625,32.291458,0.096582
2020-09-12,0.096583,0.096634,0.096529,0.096585,118968.53125,31.817362,0.096169
2020-09-13,0.09617,0.096237,0.0961,0.096172,167744.078125,43.836807,0.095619
2020-09-14,0.095618,0.095688,0.09555,0.095622,154802.890625,39.927776,0.095508
2020-09-15,0.095509,0.095577,0.09544,0.095511,158953.546875,36.961113,0.091198


In [13]:
def prepare_raw_data(path: str = './data/01_raw/Binance_ADAUSDT_minute.csv', aggregation = 'mean', resample = '15T', n_preds: int = 1) -> pd.DataFrame:
    """Load the resampled data and attach shifted midpoint targets.

    The target series is the per-row midpoint ``(high + low) / 2``. Target
    column ``y_i`` holds that midpoint from ``i + 1`` rows later, so the last
    ``i + 1`` rows of ``y_i`` are NaN; they are not dropped here.

    Args:
        path: CSV file forwarded to ``load_data``.
        aggregation: Aggregation forwarded to ``load_data``.
        resample: Resampling frequency forwarded to ``load_data``.
        n_preds: Number of target columns to create (``y_0`` ..
            ``y_{n_preds - 1}``). Defaults to 1, the previously hard-coded
            value.

    Returns:
        A new DataFrame with the feature columns ``open``, ``high``, ``low``,
        ``close``, ``Volume ADA`` and ``tradecount`` followed by the target
        columns.

    Raises:
        ValueError: If ``n_preds`` is negative.
    """
    if n_preds < 0:
        raise ValueError(f"n_preds must be non-negative, got {n_preds}")

    df = load_data(path, aggregation, resample)
    features = ['open', 'high', 'low', 'close' ,'Volume ADA', 'tradecount']
    target_landing = (df.high + df.low) / 2

    # Assign the shifted Series directly so the targets keep the dtype of the
    # source columns; a NaN pre-fill followed by an iloc write makes the
    # resulting dtype depend on the pandas version.
    # A negative shift pulls later rows up; load_data returns the index in
    # ascending order.
    targets = {f"y_{i}": target_landing.shift(-(i + 1)) for i in range(n_preds)}
    return df[features].assign(**targets)

In [14]:
# Same pipeline through the function, using its defaults
# (resample='15T', aggregation='mean').
data2 = prepare_raw_data()

In [15]:
# Preview the 15-minute frame returned by prepare_raw_data().
data2.head()

Unnamed: 0_level_0,open,high,low,close,Volume ADA,tradecount,y_0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-09-11 20:30:00,0.096222,0.096293,0.096195,0.096262,151668.40625,34.25,0.096256
2020-09-11 20:45:00,0.096246,0.096292,0.09622,0.096258,66052.34375,18.933332,0.096456
2020-09-11 21:00:00,0.096455,0.096487,0.096425,0.096459,43088.125,12.133333,0.096365
2020-09-11 21:15:00,0.096363,0.096383,0.096348,0.096365,11634.886719,9.133333,0.096427
2020-09-11 21:30:00,0.096422,0.096457,0.096397,0.096431,53965.679688,12.133333,0.096302
