In [1]:
import math
import os
from datetime import datetime
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load dataset

In [2]:
# Ticker symbols of the coins handled by this notebook; each one is passed to
# read_coin_data() to locate its raw CSV.
available_coins = [
    'ADA',
    'BNB',
    'BTC',
    'DASH',
    'ETH',
    'LINK',
    'LTC',
    'XRP',
]
print("Available coins: ", available_coins)

Available coins:  ['ADA', 'BNB', 'BTC', 'DASH', 'ETH', 'LINK', 'LTC', 'XRP']


In [3]:
def read_coin_data(coin_name: str) -> pd.DataFrame:
    """Read the raw CSV of a single coin into a DataFrame.

    The file is looked up relative to the current working directory at
    ``io/input/data_raw/Crypto_July_2019_2023/4H_2019/<coin_name>/<coin_name lowercased>_2019.csv``.

    Args:
        coin_name: Coin ticker; used verbatim as the folder name and
            lower-cased for the file name.

    Returns:
        The CSV content with a default integer index (``index_col=False``
        prevents pandas from using the first column as the index).
    """
    csv_name = f"{coin_name.lower()}_2019.csv"
    csv_path = join(
        "io", "input", "data_raw", "Crypto_July_2019_2023", "4H_2019", coin_name, csv_name
    )
    return pd.read_csv(csv_path, index_col=False)


# Feature engineering

## Time & date features

In [4]:
# Nikos Karantaglis: added the 'Hour' column.
def append_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar columns derived from ``Date``.

    The input frame is left untouched so that re-running a notebook cell on
    the same object always produces the same result.

    Args:
        df: Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new frame in which ``Date`` is converted to datetimes and the
        columns ``Year``, ``Month``, ``Day``, ``Week_of_Year`` (ISO week
        number) and ``Hour`` are appended, in that order.
    """
    out = df.copy()
    dates = pd.to_datetime(out['Date'])
    out['Date'] = dates
    out['Year'] = dates.dt.year
    out['Month'] = dates.dt.month
    out['Day'] = dates.dt.day
    out['Week_of_Year'] = dates.dt.isocalendar().week
    out['Hour'] = dates.dt.hour
    return out

In [5]:
# Nikos Karantaglis: added sin(hour) and cos(hour).
def create_trigonometric_columns(df) -> pd.DataFrame:
    """Replace the integer calendar columns with sine/cosine encodings.

    For each of ``Year``, ``Month``, ``Day`` and ``Hour`` the columns
    ``<name>_sin`` and ``<name>_cos`` are appended, computed as
    ``sin(2*pi*value/period)`` and ``cos(2*pi*value/period)``. The raw
    ``Year``, ``Month``, ``Day``, ``Week_of_Year`` and ``Hour`` columns are
    then dropped. The input frame is not modified.

    Args:
        df: Frame containing the columns ``Year``, ``Month``, ``Day``,
            ``Week_of_Year`` and ``Hour``.

    Returns:
        A new frame with the eight encoded columns and without the five raw
        calendar columns.

    Raises:
        KeyError: If any of the five calendar columns is missing.
    """
    # Divisor used for each encoded column.
    # NOTE(review): a year is not a cyclical quantity, so dividing by 2023 only
    # maps the year to a small angle; kept as-is so the produced features do not
    # change. Likewise 31 is used for every month regardless of its length.
    periods = {'Year': 2023, 'Month': 12, 'Day': 31, 'Hour': 24}

    out = df.copy()
    for name, period in periods.items():
        # Vectorised over the whole column instead of a Python call per row.
        angle = 2 * math.pi * out[name].astype(float) / period
        out[f'{name}_sin'] = np.sin(angle)
        out[f'{name}_cos'] = np.cos(angle)

    # Week_of_Year is dropped without having been encoded.
    return out.drop(columns=['Year', 'Month', 'Day', 'Week_of_Year', 'Hour'])

## Create target column

In [6]:
def create_target_variable(df: pd.DataFrame, forecast_lead: int = 1) -> pd.DataFrame:
    """Append the future ``Close`` value as a new column.

    A column named ``Close_lead_<forecast_lead>`` is added that holds the
    ``Close`` value found ``forecast_lead`` rows later. The last
    ``forecast_lead`` rows have no future value and are removed. The input
    frame is not modified.

    Args:
        df: Frame with a ``Close`` column, ordered so that later rows are
            later observations.
        forecast_lead: Number of rows to look ahead; must be >= 0. With 0 the
            new column equals ``Close`` and no row is removed.

    Returns:
        A new frame with ``max(len(df) - forecast_lead, 0)`` rows and the
        extra lead column appended last.

    Raises:
        ValueError: If ``forecast_lead`` is negative.
    """
    if forecast_lead < 0:
        raise ValueError(f"forecast_lead must be >= 0, got {forecast_lead}")

    target_column = "Close"
    target = f"{target_column}_lead_{forecast_lead}"

    # An explicit row count is used instead of ``iloc[:-forecast_lead]``,
    # because ``iloc[:-0]`` would select nothing at all.
    keep = max(len(df) - forecast_lead, 0)
    out = df.iloc[:keep].copy()
    # Assigned positionally so a non-unique index cannot break the alignment.
    out[target] = df[target_column].shift(-forecast_lead).iloc[:keep].to_numpy()
    return out

In [7]:
# Nikos Karantaglis: added moving-average and lagged-target features, and defined the Target column.
def get_coins_data(coins: list) -> pd.DataFrame:
    """Load every requested coin and build one consolidated feature frame.

    Per coin the raw CSV is read, tagged with ``Asset_id``, its ``Time``
    column renamed to ``Date``, date and trigonometric features are added, the
    one-step-ahead close is attached and ``Date`` becomes the index. The coin
    frames are then stacked in the order given by ``coins`` (so the ``Date``
    index repeats across coins) and these columns are appended, in order:

    * one indicator column per distinct ``Asset_id``;
    * ``Close_ma_1d`` / ``Close_ma_2d`` / ``Close_ma_3d``: per-coin rolling
      means of ``Close`` over 6 / 18 / 42 rows;
    * ``Target``: ``(Close - next Close) / next Close``;
    * ``Target_lag1`` .. ``Target_lag4``: ``Target`` shifted by 1..4 rows
      *within each coin*, with the missing leading values filled by the
      median of ``Target`` over the whole frame;
    * ``Target_ma_1d`` / ``Target_ma_2d`` / ``Target_ma_3d``: per-coin rolling
      means of ``Target_lag1`` over 6 / 18 / 42 rows.

    Args:
        coins: Coin tickers accepted by ``read_coin_data``.

    Returns:
        The consolidated frame, indexed by ``Date``, with ``Open``, ``High``,
        ``Low``, ``Close`` and ``Volume`` as the leading columns.

    Raises:
        ValueError: If ``coins`` is empty.
    """
    if not coins:
        raise ValueError("coins must contain at least one coin name")

    # Collect the per-coin frames and concatenate once; concatenating inside
    # the loop re-copies all previously loaded rows on every iteration.
    coin_frames = []
    for coin in coins:
        coin_df = read_coin_data(coin_name=coin)
        coin_df["Asset_id"] = coin
        coin_df = coin_df.rename(columns={"Time": "Date"})
        coin_df = append_date_features(df=coin_df)
        coin_df = create_trigonometric_columns(df=coin_df)
        coin_df = create_target_variable(df=coin_df, forecast_lead=1)
        # Set date as index
        coin_frames.append(coin_df.set_index('Date'))
    df = pd.concat(coin_frames)

    # Keep the price/volume columns in front regardless of the CSV layout.
    price_columns = ["Open", "High", "Low", "Close", "Volume"]
    other_columns = [col for col in df.columns if col not in price_columns]
    df = df.reindex(columns=price_columns + other_columns)

    one_hot = pd.get_dummies(df['Asset_id'])
    df = pd.concat([df, one_hot], axis=1)

    # Column suffix -> rolling window length in rows.
    # NOTE(review): the data path suggests 4-hour bars, in which case 6/18/42
    # rows span 1/3/7 days, so the "2d"/"3d" suffixes look misleading. The
    # names are kept because files written downstream use them -- confirm.
    ma_windows = {'1d': 6, '2d': 18, '3d': 42}

    def rolling_mean_per_asset(column: str, window: int):
        # transform() returns the values in the original row order, so they can
        # be assigned positionally even though the Date index is not unique and
        # whatever order the coins were passed in.
        return (
            df.groupby('Asset_id', sort=False)[column]
            .transform(lambda values: values.rolling(window, min_periods=1).mean())
            .to_numpy()
        )

    for suffix, window in ma_windows.items():
        df[f'Close_ma_{suffix}'] = rolling_mean_per_asset('Close', window)

    # NOTE(review): this is the relative change measured against the *next*
    # close, so it is positive when the price is about to fall -- confirm that
    # this sign convention is intended.
    df['Target'] = (df['Close'] - df['Close_lead_1']) / df['Close_lead_1']

    # Lags are taken per coin; a plain shift on the stacked frame would hand the
    # first rows of each coin the last targets of the coin stacked above it.
    target_median = df['Target'].median()
    for lag in range(1, 5):
        df[f'Target_lag{lag}'] = (
            df.groupby('Asset_id', sort=False)['Target']
            .shift(lag)
            .fillna(target_median)
            .to_numpy()
        )

    for suffix, window in ma_windows.items():
        df[f'Target_ma_{suffix}'] = rolling_mean_per_asset('Target_lag1', window)

    df = df.drop(['Close_lead_1'], axis=1)
    display("Consolidated dataframe shape", df.shape)
    return df

In [8]:
# Build the consolidated feature frame for every coin in available_coins and
# preview its first rows.
consolidated_df = get_coins_data(coins=available_coins)
consolidated_df.head()

'Consolidated dataframe shape'

(63976, 33)

Unnamed: 0,Open,High,Low,Close,Volume,Asset_id,Year_sin,Year_cos,Month_sin,Month_cos,...,Close_ma_2d,Close_ma_3d,Target,Target_lag1,Target_lag2,Target_lag3,Target_lag4,Target_ma_1d,Target_ma_2d,Target_ma_3d
2019-07-20 16:00:00,0.06222,0.06509,0.06203,0.06474,35347035.0,ADA,-0.012423,0.999923,-0.5,-0.866025,...,0.06474,0.06474,0.033855,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224,-0.000224
2019-07-20 20:00:00,0.06475,0.06484,0.06221,0.06262,17106974.9,ADA,-0.012423,0.999923,-0.5,-0.866025,...,0.06368,0.06368,0.018708,0.033855,-0.000224,-0.000224,-0.000224,0.016815,0.016815,0.016815
2019-07-21 00:00:00,0.06262,0.064,0.06136,0.06147,15707034.1,ADA,-0.012423,0.999923,-0.5,-0.866025,...,0.062943,0.062943,-0.01443,0.018708,0.033855,-0.000224,-0.000224,0.017446,0.017446,0.017446
2019-07-21 04:00:00,0.06145,0.06253,0.06136,0.06237,7069070.1,ADA,-0.012423,0.999923,-0.5,-0.866025,...,0.0628,0.0628,0.009387,-0.01443,0.018708,0.033855,-0.000224,0.009477,0.009477,0.009477
2019-07-21 08:00:00,0.0623,0.06315,0.06136,0.06179,10194081.5,ADA,-0.012423,0.999923,-0.5,-0.866025,...,0.062598,0.062598,0.02166,0.009387,-0.01443,0.018708,0.033855,0.009459,0.009459,0.009459


In [9]:
# Spot check: the Date index repeats across coins, so selecting a single
# timestamp returns one row per asset.
consolidated_df.loc['2021-08-29 12:00:00', :]	

Unnamed: 0,Open,High,Low,Close,Volume,Asset_id,Year_sin,Year_cos,Month_sin,Month_cos,...,Close_ma_2d,Close_ma_3d,Target,Target_lag1,Target_lag2,Target_lag3,Target_lag4,Target_ma_1d,Target_ma_2d,Target_ma_3d
2021-08-29 12:00:00,2.868,2.92,2.834,2.864,55161600.8,ADA,-0.006212,0.999981,-0.866025,-0.5,...,2.768556,2.758831,-0.010366,0.001397,-0.015342,-0.027266,0.036403,-0.001673,-0.006072,-0.002263
2021-08-29 12:00:00,484.5,487.5,480.0,483.3,127803.011,BNB,-0.006212,0.999981,-0.866025,-0.5,...,484.35,484.573571,0.002489,0.002483,0.006192,-0.011897,0.009342,0.000986,-0.000207,-0.001878
2021-08-29 12:00:00,48159.98,48561.46,48105.03,48474.95,4372.83599,BTC,-0.006212,0.999981,-0.866025,-0.5,...,48181.657222,48465.367619,-0.006646,-0.006498,0.006647,-0.005214,0.013852,0.00118,-0.001923,8.2e-05
2021-08-29 12:00:00,237.3,239.0,230.1,231.7,18892.656,DASH,-0.006212,0.999981,-0.866025,-0.5,...,235.7,248.993571,0.022507,0.024169,0.0059,-0.005027,-0.004632,0.003971,0.001339,0.000908
2021-08-29 12:00:00,3181.83,3206.83,3165.57,3186.99,41187.1476,ETH,-0.006212,0.999981,-0.866025,-0.5,...,3194.218333,3215.947381,-0.010322,-0.001619,0.007691,-0.008115,0.020187,0.003113,-0.001402,3.6e-05
2021-08-29 12:00:00,25.48,25.78,25.14,25.38,482111.66,LINK,-0.006212,0.999981,-0.866025,-0.5,...,25.366111,26.350857,-0.012067,0.00394,0.011774,-0.007758,0.005864,0.001867,-0.001885,0.00198
2021-08-29 12:00:00,175.9,177.6,173.1,174.5,77680.989,LTC,-0.006212,0.999981,-0.866025,-0.5,...,173.027778,176.805238,-0.003427,0.008023,0.018192,-0.015075,-0.004535,-0.00125,-0.001975,0.001157
2021-08-29 12:00:00,1.1354,1.1458,1.1192,1.1322,51669052.0,XRP,-0.006212,0.999981,-0.866025,-0.5,...,1.131961,1.165057,-0.01974,0.002915,0.008014,-0.017735,0.019212,0.002555,-0.002261,0.001688


# Split dataset

In [10]:
def split_train_valid_test(
    data: pd.DataFrame,
    split_date_1: datetime = datetime(2022, 1, 1),
    split_date_2: datetime = datetime(2022, 12, 1),
):
    """Split ``data`` chronologically into train, validation and test sets.

    Rows are assigned by comparing the index with the two cut-off dates:

    * train: ``index < split_date_1``
    * valid: ``split_date_1 <= index <= split_date_2``
    * test:  ``index > split_date_2``

    The share and shape of each part are printed.

    Args:
        data: Frame whose index can be compared with ``datetime`` objects.
        split_date_1: First cut-off; start of the validation range.
        split_date_2: Second cut-off; last timestamp of the validation range.

    Returns:
        The tuple ``(train_data, valid_data, test_data)``.

    Raises:
        ValueError: If ``split_date_1`` is later than ``split_date_2``.
    """
    if split_date_1 > split_date_2:
        raise ValueError("split_date_1 must not be later than split_date_2")

    # Split the data into training, validation and testing sets
    train_data = data.loc[data.index < split_date_1]
    valid_data = data.loc[(split_date_1 <= data.index) & (data.index <= split_date_2)]
    test_data = data.loc[data.index > split_date_2]

    total_rows = len(data)
    for label, part in (("Train", train_data), ("Valid", valid_data), ("Test", test_data)):
        # Guard against an empty frame and print a real percentage; the value
        # used to be a 0..1 fraction followed by a '%' sign.
        share = len(part) / total_rows if total_rows else 0.0
        print(f"{label} set fraction: {share:.0%} - {label.lower()} shape -> {part.shape}")
    return train_data, valid_data, test_data

In [11]:
# Split the consolidated frame by date into the train, validation and test sets.
train_data, valid_data, test_data = split_train_valid_test(data=consolidated_df)

Train set fraction: 0.67 % - train shape ->  (42960, 33)
Valid set fraction: 0.25 % - valid shape ->  (16040, 33)
Test set fraction: 0.08 % - test shape ->  (4976, 33)


In [12]:
# Persist the three splits as CSV files (the Date index is written as the
# first column).
output_path = join("io", "input", "base_data")
# Create the target folder when it is missing so the cell also runs on a
# fresh checkout.
os.makedirs(output_path, exist_ok=True)
train_data.to_csv(join(output_path, "train.csv"))
valid_data.to_csv(join(output_path, "valid.csv"))
test_data.to_csv(join(output_path, "test.csv"))
print("Datasets saved!")

Datasets saved!


Nikos Karantaglis: Plot the Close time series of each asset for review purposes.

In [14]:
# df=consolidated_df
# import pandas as pd
# import matplotlib.pyplot as plt

# # assume your dataframe is called df and has columns 'Asset_id' and 'Close'
# grouped = df.groupby('Asset_id')

# for asset_id, group in grouped:
#     # create a new figure for each plot
#     fig, ax = plt.subplots(figsize=(12, 6))
    
#     # plot the timeseries for this asset_id
#     group.plot(y='Close', ax=ax)
    
#     # set title and axis labels
#     ax.set_title(f"Asset {asset_id} Close Timeseries")
#     ax.set_xlabel('Date')
#     ax.set_ylabel('Close')
    
#     # show the plot
#     plt.show()