In [6]:
import os
from typing import Tuple

import pandas as pd
import numpy as np


In [7]:
def z_score_scaling(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Standardize every column of ``data`` with its own mean and standard deviation.

    Each column is transformed as ``(x - mean) / std`` where ``mean`` and
    ``std`` come from ``Series.mean()`` / ``Series.std()`` of that column.
    A column whose standard deviation is exactly zero (a constant column) is
    only centered, so it becomes all zeros instead of all NaN.

    Args:
        data: Frame whose columns are scaled independently. It is not
            modified; the function works on a copy.

    Returns:
        A tuple ``(scaled, scalars)``. ``scaled`` has the same shape and
        column labels as ``data``. ``scalars`` has one row per column with
        the fields ``column``, ``mean`` and ``std`` (the raw statistics,
        including a ``std`` of 0 for constant columns) so the same
        transformation can be applied to other data later.
    """
    out = data.copy()
    scalars = []
    for column in out.columns.tolist():
        curr = out[column]
        mean = curr.mean()
        std = curr.std()
        # Dividing the centered values by a zero std is 0/0 and would fill the
        # column with NaN; divide by 1 instead so a constant column maps to 0.
        divisor = 1.0 if (pd.notna(std) and std == 0) else std
        out[column] = (curr - mean) / divisor
        scalars.append({"column": column, "mean": mean, "std": std})
    # Passing the column names keeps the schema stable even when ``data`` has
    # no columns and ``scalars`` is therefore an empty list.
    return out, pd.DataFrame(scalars, columns=["column", "mean", "std"])

In [8]:
def min_max_scaling(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Rescale every column of ``data`` using its own minimum and maximum.

    Each column is transformed as ``(x - min) / (max - min)``, which maps the
    column's minimum to 0 and its maximum to 1. A column whose minimum equals
    its maximum (a constant column) is only shifted, so it becomes all zeros
    instead of all NaN.

    Args:
        data: Frame whose columns are scaled independently. It is not
            modified; the function works on a copy.

    Returns:
        A tuple ``(scaled, scalars)``. ``scaled`` has the same shape and
        column labels as ``data``. ``scalars`` has one row per column with
        the fields ``column``, ``min`` and ``max`` so the same transformation
        can be applied to other data later.
    """
    out = data.copy()
    scalars = []
    for column in out.columns.tolist():
        curr = out[column]
        cmin = curr.min()
        cmax = curr.max()
        span = cmax - cmin
        # A zero range would make this 0/0 (NaN for the whole column); divide
        # by 1 instead so a constant column maps to 0.
        divisor = 1.0 if (pd.notna(span) and span == 0) else span
        out[column] = (curr - cmin) / divisor
        scalars.append({"column": column, "min": cmin, "max": cmax})
    # Passing the column names keeps the schema stable even when ``data`` has
    # no columns and ``scalars`` is therefore an empty list.
    return out, pd.DataFrame(scalars, columns=["column", "min", "max"])

In [9]:
def min_max_scaling_test(data: pd.DataFrame, scalars: pd.DataFrame) -> pd.DataFrame:
    """Apply previously computed min-max statistics to ``data``.

    Each column is transformed as ``(x - min) / (max - min)`` using the
    ``min`` / ``max`` stored for that column in ``scalars`` rather than the
    column's own extremes, so values outside the stored range fall outside
    ``[0, 1]``. When the stored range is zero the column is only shifted by
    ``min`` instead of being divided by zero.

    Args:
        data: Frame to scale. It is not modified; the function works on a copy.
        scalars: Frame with the fields ``column``, ``min`` and ``max``
            containing exactly one row for every column of ``data``.

    Returns:
        A scaled copy of ``data`` with the same shape and column labels.

    Raises:
        ValueError: If ``scalars`` has no row, or more than one row, for a
            column of ``data``.
    """
    out = data.copy()
    for column in out.columns.tolist():
        curr = out[column]
        curr_scalars = scalars.loc[scalars["column"] == column]
        # ``.item()`` below needs exactly one row; fail with a message that
        # names the offending column instead of a generic conversion error.
        if len(curr_scalars) != 1:
            raise ValueError(
                f"expected exactly one scalars row for column {column!r}, "
                f"found {len(curr_scalars)}"
            )
        cmin = curr_scalars["min"].item()
        cmax = curr_scalars["max"].item()
        span = cmax - cmin
        # A zero stored range would turn every value into NaN or +/-inf.
        divisor = 1.0 if (pd.notna(span) and span == 0) else span
        out[column] = (curr - cmin) / divisor
    return out

In [10]:

def z_score_scaling_test(data: pd.DataFrame, scalars: pd.DataFrame) -> pd.DataFrame:
    """Apply previously computed z-score statistics to ``data``.

    Each column is transformed as ``(x - mean) / std`` using the ``mean`` /
    ``std`` stored for that column in ``scalars`` rather than the column's
    own statistics. When the stored ``std`` is zero the column is only
    centered instead of being divided by zero.

    Args:
        data: Frame to scale. It is not modified; the function works on a copy.
        scalars: Frame with the fields ``column``, ``mean`` and ``std``
            containing exactly one row for every column of ``data``.

    Returns:
        A scaled copy of ``data`` with the same shape and column labels.

    Raises:
        ValueError: If ``scalars`` has no row, or more than one row, for a
            column of ``data``.
    """
    out = data.copy()
    for column in out.columns.tolist():
        curr = out[column]
        curr_scalars = scalars.loc[scalars["column"] == column]
        # ``.item()`` below needs exactly one row; fail with a message that
        # names the offending column instead of a generic conversion error.
        if len(curr_scalars) != 1:
            raise ValueError(
                f"expected exactly one scalars row for column {column!r}, "
                f"found {len(curr_scalars)}"
            )
        mean = curr_scalars["mean"].item()
        std = curr_scalars["std"].item()
        # A zero stored std would turn every value into NaN or +/-inf.
        divisor = 1.0 if (pd.notna(std) and std == 0) else std
        out[column] = (curr - mean) / divisor
    return out

In [11]:
# Path to the raw data, built relative to the current working directory:
# <cwd>/experiments/data/AAPL/raw/AAPL.csv. The notebook therefore has to be
# run from the directory that contains the "experiments" folder.
cwd = os.getcwd()
data_path = os.path.join(cwd, "experiments", "data", "AAPL", "raw", "AAPL.csv")

In [12]:
# Share of the rows used for training, given as a fraction between 0 and 1
# (0.8 = 80 %); the remaining rows form the testing split.
training_split = 0.8

# Load the raw CSV and remember its row count for computing the split sizes.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; confirm whether it should be
# read with index_col=0 so it is not treated as a feature.
raw_data = pd.read_csv(data_path)
data_length = raw_data.shape[0]

In [13]:
# Preview the loaded frame (the notebook display truncates rows and columns).
raw_data

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,bid_count,bid_mean,...,twitter_negative_mention_75%,twitter_negative_mention_max,twitter_score_count,twitter_score_mean,twitter_score_std,twitter_score_min,twitter_score_25%,twitter_score_50%,twitter_score_75%,twitter_score_max
0,1628713260,146.49,146.52,146.46,146.49,42,1,19532,87.0,146.499655,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
1,1628713320,146.49,146.54,146.47,146.53,37,1,19856,83.0,146.490000,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
2,1628713380,146.53,146.53,146.44,146.49,36,1,17146,69.0,146.513188,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
3,1628713440,146.50,146.51,146.46,146.50,32,1,10923,67.0,146.474328,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
4,1628713500,146.49,146.54,146.47,146.54,31,1,12150,59.0,146.486441,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97482,1662155700,156.47,156.54,156.46,156.53,46,1,10507,93.0,156.495376,...,79.0,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97483,1662155760,156.53,156.55,156.27,156.28,50,1,14353,91.0,156.493736,...,79.0,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97484,1662155820,156.26,156.38,156.26,156.38,42,1,12245,99.0,156.399596,...,79.0,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97485,1662155880,156.37,156.50,156.37,156.48,43,1,13920,84.0,156.318810,...,79.0,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687


In [14]:
# Number of rows in the training split, rounded up to a whole row; every
# remaining row goes to the testing split, so the two lengths sum to data_length.
train_length = int(np.ceil(data_length * training_split))
test_length = data_length - train_length

In [15]:
# Training split: the first train_length rows in file order (no shuffling).
training_data = raw_data.iloc[:train_length]
training_data

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,bid_count,bid_mean,...,twitter_negative_mention_75%,twitter_negative_mention_max,twitter_score_count,twitter_score_mean,twitter_score_std,twitter_score_min,twitter_score_25%,twitter_score_50%,twitter_score_75%,twitter_score_max
0,1628713260,146.49,146.52,146.46,146.49,42,1,19532,87.0,146.499655,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
1,1628713320,146.49,146.54,146.47,146.53,37,1,19856,83.0,146.490000,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
2,1628713380,146.53,146.53,146.44,146.49,36,1,17146,69.0,146.513188,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
3,1628713440,146.50,146.51,146.46,146.50,32,1,10923,67.0,146.474328,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
4,1628713500,146.49,146.54,146.47,146.54,31,1,12150,59.0,146.486441,...,95.0,159.0,23.0,-0.306946,0.205778,-0.656902,-0.457212,-0.283227,-0.183190,0.156984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77985,1655412780,131.22,131.22,131.05,131.19,61,1,11561,124.0,131.177419,...,223.5,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287
77986,1655412840,131.16,131.24,131.06,131.24,60,1,10228,122.0,131.121475,...,223.5,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287
77987,1655412900,131.21,131.31,131.13,131.28,53,1,6532,120.0,131.134917,...,223.5,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287
77988,1655412960,131.27,131.34,131.15,131.15,62,1,9967,106.0,131.220472,...,223.5,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287


In [16]:
# Testing split: every row after the training split, in file order. The
# original row labels are kept, so they do not start at 0.
testing_data = raw_data.iloc[train_length:]
testing_data

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,bid_count,bid_mean,...,twitter_negative_mention_75%,twitter_negative_mention_max,twitter_score_count,twitter_score_mean,twitter_score_std,twitter_score_min,twitter_score_25%,twitter_score_50%,twitter_score_75%,twitter_score_max
77990,1655413080,131.09,131.15,131.03,131.09,64,1,10073,115.0,131.109130,...,223.50,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287
77991,1655413140,131.11,131.13,130.93,130.93,63,1,11166,128.0,131.093516,...,223.50,377.0,23.0,-0.405900,0.110264,-0.668722,-0.460773,-0.402749,-0.335507,-0.197287
77992,1655413200,130.93,130.97,130.80,130.80,55,1,8886,126.0,131.025794,...,210.75,377.0,24.0,-0.398816,0.113287,-0.668722,-0.460347,-0.393301,-0.325747,-0.197287
77993,1655413260,130.82,130.83,130.74,130.80,51,1,8694,110.0,130.886273,...,223.50,377.0,23.0,-0.396299,0.115145,-0.668722,-0.460773,-0.383854,-0.325527,-0.197287
77994,1655413320,130.77,130.80,130.63,130.66,55,1,10239,102.0,130.790000,...,223.50,377.0,23.0,-0.396299,0.115145,-0.668722,-0.460773,-0.383854,-0.325527,-0.197287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97482,1662155700,156.47,156.54,156.46,156.53,46,1,10507,93.0,156.495376,...,79.00,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97483,1662155760,156.53,156.55,156.27,156.28,50,1,14353,91.0,156.493736,...,79.00,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97484,1662155820,156.26,156.38,156.26,156.38,42,1,12245,99.0,156.399596,...,79.00,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687
97485,1662155880,156.37,156.50,156.37,156.48,43,1,13920,84.0,156.318810,...,79.00,79.0,1.0,-0.372687,0.000000,-0.372687,-0.372687,-0.372687,-0.372687,-0.372687


In [17]:
# Largest closing price in the training split.
training_data["close"].max()

182.84

In [18]:
# Smallest closing price in the training split.
training_data["close"].min()

130.01

In [19]:
# Min-max scale the training close prices by hand with (x - min) / (max - min),
# the same formula min_max_scaling applies to each column.
train_close = training_data["close"]
# The right-hand side is evaluated before the name is rebound, so min() and
# max() here are still those of the unscaled prices.
train_close = (train_close - train_close.min()) / (train_close.max() - train_close.min())
train_close

0        0.311944
1        0.312701
2        0.311944
3        0.312133
4        0.312890
           ...   
77985    0.022336
77986    0.023282
77987    0.024039
77988    0.021579
77989    0.021200
Name: close, Length: 77990, dtype: float64