In [1]:
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
import math

# Load dataset

In [2]:
available_coins = ['ADA', 'BNB', 'BTC', 'DASH', 'ETH', 'LINK', 'LTC', 'XRP']
print("Available coins: ", available_coins)

Available coins:  ['ADA', 'BNB', 'BTC', 'DASH', 'ETH', 'LINK', 'LTC', 'XRP']


In [8]:
def read_coin_data(coin_name: str) -> pd.DataFrame:
    raw_data_path = join("io", "input", "data_raw", "Crypto_July_2019_2023", "4H_2019", coin_name ,f"{coin_name.lower()}_2019.csv")
    data_df = pd.read_csv(raw_data_path, index_col=False)
    return data_df


# Feature engineering

## Time & date features

In [9]:
def append_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Week_of_Year'] = df['Date'].dt.isocalendar().week
    return df

In [10]:
def create_trigonometric_columns(df) -> pd.DataFrame:
    # Create sine and cosine columns for Year, Month and Day
    df['Year_sin'] = df['Year'].apply(lambda x: math.sin(2*math.pi*x/2023))
    df['Year_cos'] = df['Year'].apply(lambda x: math.cos(2*math.pi*x/2023))
    df['Month_sin'] = df['Month'].apply(lambda x: math.sin(2*math.pi*x/12))
    df['Month_cos'] = df['Month'].apply(lambda x: math.cos(2*math.pi*x/12))
    df['Day_sin'] = df['Day'].apply(lambda x: math.sin(2*math.pi*x/31))
    df['Day_cos'] = df['Day'].apply(lambda x: math.cos(2*math.pi*x/31))
    return df

## Create target column

In [21]:
def create_target_variable(df: pd.DataFrame, forecast_lead: int = 1) -> pd.DataFrame:    
    target_column = "Close"
    features = list(df.columns.difference([target_column]))
    
    target = f"{target_column}_lead_{forecast_lead}"

    df[target] = df[target_column].shift(-forecast_lead)
    df = df.iloc[:-forecast_lead]
    # display("Target added to dataframe", df.shape, df[['Close', target]].head())
    return df

In [24]:
def get_coins_data(coins: list) -> pd.DataFrame:
    df = pd.DataFrame(columns=["Open", "High","Low", "Close", "Volume"])
    for coin in coins:
        coin_df = read_coin_data(coin_name=coin)
        coin_df["Asset_id"] = coin
        coin_df = coin_df.rename(columns={"Time":"Date"})
        coin_df = append_date_features(df=coin_df)
        coin_df = create_trigonometric_columns(df=coin_df)
        coin_df = create_target_variable(df=coin_df, forecast_lead= 1)
        # Set date as index
        coin_df.set_index('Date', inplace=True)
        df = pd.concat([df, coin_df])
    display("Consolidated dataframe shape",df.shape)
    return df

In [30]:
consolidated_df = get_coins_data(coins=available_coins)
consolidated_df.head()

'Consolidated dataframe shape'

(63976, 17)

Unnamed: 0,Open,High,Low,Close,Volume,Asset_id,Year,Month,Day,Week_of_Year,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Close_lead_1
2019-07-20 16:00:00,0.06222,0.06509,0.06203,0.06474,35347035.0,ADA,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,0.06262
2019-07-20 20:00:00,0.06475,0.06484,0.06221,0.06262,17106974.9,ADA,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,0.06147
2019-07-21 00:00:00,0.06262,0.064,0.06136,0.06147,15707034.1,ADA,2019.0,7.0,21.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.897805,-0.440394,0.06237
2019-07-21 04:00:00,0.06145,0.06253,0.06136,0.06237,7069070.1,ADA,2019.0,7.0,21.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.897805,-0.440394,0.06179
2019-07-21 08:00:00,0.0623,0.06315,0.06136,0.06179,10194081.5,ADA,2019.0,7.0,21.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.897805,-0.440394,0.06048


In [31]:
consolidated_df.loc['2019-07-20 16:00:00', :]	

Unnamed: 0,Open,High,Low,Close,Volume,Asset_id,Year,Month,Day,Week_of_Year,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Close_lead_1
2019-07-20 16:00:00,0.06222,0.06509,0.06203,0.06474,35347040.0,ADA,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,0.06262
2019-07-20 16:00:00,30.5686,31.3376,30.5,31.2755,538758.8,BNB,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,31.0932
2019-07-20 16:00:00,10613.43,10995.0,10565.01,10898.66,12428.23,BTC,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,10740.23
2019-07-20 16:00:00,117.9,120.31,117.49,119.78,1091.986,DASH,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,116.19
2019-07-20 16:00:00,229.7,235.09,228.86,231.33,87468.72,ETH,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,228.2
2019-07-20 16:00:00,2.6085,2.7052,2.5984,2.667,994159.4,LINK,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,2.637
2019-07-20 16:00:00,100.06,105.88,99.8,103.4,136302.6,LTC,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,100.25
2019-07-20 16:00:00,0.33094,0.33782,0.32867,0.33548,16960410.0,XRP,2019.0,7.0,20.0,29,-0.012423,0.999923,-0.5,-0.866025,-0.790776,-0.612106,0.33279


# Split dataset

In [40]:
def split_train_valid_test(data: pd.DataFrame):    
    # Split the data into training and testing sets
    split_date_1 = datetime(2022, 1, 1)
    split_date_2 = datetime(2022, 12, 1)
    train_data = data.loc[data.index < split_date_1]
    valid_data = data.loc[(split_date_1<= data.index) & (data.index <= split_date_2)]
    test_data = data.loc[data.index > split_date_2]
    
    print("Train set fraction:", round((len(train_data) / len(data)), 2),'%', "- train shape -> ", train_data.shape)
    print("Valid set fraction:", round((len(valid_data) / len(data)), 2),'%', "- valid shape -> ", valid_data.shape)
    print("Test set fraction:", round((len(valid_data) / len(data)), 2),'%', "- test shape -> ", test_data.shape)
    return train_data, valid_data, test_data

In [41]:
train_data, valid_data, test_data = split_train_valid_test(data=consolidated_df)

Train set fraction: 0.67 % - train shape ->  (42960, 17)
Valid set fraction: 0.25 % - valid shape ->  (16040, 17)
Test set fraction: 0.25 % - test shape ->  (4976, 17)


In [42]:
output_path = join("io", "input", "base_data")
train_data.to_csv(join(output_path, "train.csv"))
valid_data.to_csv(join(output_path, "valid.csv"))
test_data.to_csv(join(output_path, "test.csv"))
print("Datasets saved!")

Datasets saved!
