In [2]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib
from sklearn.preprocessing import StandardScaler


In [3]:
# Path components; they are joined (in this order) to locate the input file.
DATA_DIR = './data'
RESOLUTION = 'daily'
REGION = 'us'
INSTRUMENT = './nyse stocks/2/unh.us.txt'

# Raw price columns; they are dropped once the delta features have been derived.
ohlc = ['open', 'high', 'low', 'close']
# Column whose next-row change is used to build the `<target_col>_delta` column.
target_col = 'close'


In [4]:
# Join the configured path components; normpath collapses the './' segments
# so the result is a clean relative path.
data_file = os.path.normpath(os.path.join(DATA_DIR, RESOLUTION, REGION, INSTRUMENT))
data_file

'data/daily/us/nyse stocks/2/unh.us.txt'

In [5]:
# Load the raw file; the first line of the file is consumed as the header row.
df = pd.read_csv(data_file)
# Column names are replaced positionally, so this raises if the file does not
# have exactly seven columns.
# NOTE(review): assumes the file's column order matches this list — confirm against the raw header.
df.columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'openint']
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,19900326,0.25264,0.25264,0.25264,0.25264,473990,0
1,19900327,0.25264,0.26996,0.25264,0.26996,4493732,0
2,19900328,0.26996,0.26996,0.26119,0.26119,1793083,0
3,19900329,0.26996,0.26996,0.26119,0.26119,617291,0
4,19900330,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,20200116,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,20200117,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,20200121,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,20200122,301.99000,302.13000,297.35000,300.59000,3117568,0


In [6]:
# Parse the integer YYYYMMDD values into proper datetimes.
# Item access is used so the assignment always targets the column.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,2020-01-16,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,2020-01-17,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,2020-01-21,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,2020-01-22,301.99000,302.13000,297.35000,300.59000,3117568,0


In [7]:
# Pairwise price differences, one new column per (name, minuend, subtrahend).
for delta_name, left, right in [
    ('high_open_delta', 'high', 'open'),
    ('low_close_delta', 'low', 'close'),
    ('high_low_delta', 'high', 'low'),
    ('open_close_delta', 'open', 'close'),
    ('high_close_delta', 'high', 'close'),
    ('low_open_delta', 'low', 'open'),
]:
    df[delta_name] = df[left] - df[right]

# Target: the next row's value minus the current one. The final row has no
# successor, so its target is NaN.
df[f'{target_col}_delta'] = df[target_col].shift(-1) - df[target_col]

# Standardise volume using statistics computed over the whole column.
# NOTE(review): if a train/test split is applied later, fitting on all rows
# leaks information from the held-out part — confirm.
df[['volume']] = StandardScaler().fit_transform(df[['volume']])

# Keep only volume and the derived columns, then cut the final row (NaN target).
df = df.drop(columns=ohlc + ['date', 'openint'])
df = df[:-1]

df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.116097,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01732
1,-0.560923,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,-0.00877
2,-0.933915,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
3,-1.096305,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
4,-0.902958,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00877
...,...,...,...,...,...,...,...,...
7509,-0.341618,10.62000,-7.41000,10.64000,-7.39000,3.23000,-0.02000,4.33000
7510,-0.669118,2.99000,-4.81000,5.06000,-2.74000,0.25000,-2.07000,-2.27000
7511,-0.541805,0.69000,-2.59000,4.82000,1.54000,2.23000,-4.13000,2.06000
7512,-0.496093,5.65000,-4.52000,6.53000,-3.64000,2.01000,-0.88000,0.06000


In [8]:
# Input columns for the tensor; `close_delta` is deliberately not listed here.
features = [
    'volume',
    'high_open_delta',
    'low_close_delta',
    'high_low_delta',
    'open_close_delta',
    'high_close_delta',
    'low_open_delta',
]
torch_tensor = torch.tensor(df[features].values)

# Show the tensor followed by its size, one per line.
print(torch_tensor, torch_tensor.size(), sep='\n')


tensor([[-1.1161,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5609,  0.0173, -0.0173,  ..., -0.0173,  0.0000,  0.0000],
        [-0.9339,  0.0000,  0.0000,  ...,  0.0088,  0.0088, -0.0088],
        ...,
        [-0.5418,  0.6900, -2.5900,  ...,  1.5400,  2.2300, -4.1300],
        [-0.4961,  5.6500, -4.5200,  ..., -3.6400,  2.0100, -0.8800],
        [-0.7510,  0.1400, -3.2400,  ...,  1.4000,  1.5400, -4.6400]],
       dtype=torch.float64)
torch.Size([7514, 7])


In [11]:

def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` that are ``n`` items long.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    Nothing is yielded for an empty ``lst``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


def _get_ops(op):
    if op == 'avg':
        return np.mean

    raise Exception(f'Unknown operation {op}')


def _aggregate_collection(lst, op):
    if len(lst) == 0 or len(lst) == 1:
        return lst
    else:
        aggrgated_values_dict = {}
        for entry in lst:
            for key, value in entry.items():
                if key in aggrgated_values_dict:
                    aggrgated_values_dict[key].append(value)
                else:
                    aggrgated_values_dict[key] = [value]

        final_aggrgated_values_dict = {}
        for key, values in aggrgated_values_dict.items():
            values = [e for e in values if e]
            final_aggrgated_values_dict[key] = op(values)

        return final_aggrgated_values_dict


def _day_aggregator_handler(dataset, **kwargs):
    length = kwargs['length']
    return dataset[0:length], dataset[length:]


def _week_aggregator_handler(dataset, **kwargs):
    """Aggregate the leading rows of ``dataset`` in groups of 7 rows.

    Delegates to ``_universal_aggregator_handler`` and returns its
    ``(aggregated_rows, remaining_rows)`` result unchanged.
    """
    # NOTE(review): the month handler groups 21 rows, which suggests rows are
    # trading days; 7 rows per "week" may then be unintended — confirm.
    rows_per_group = 7
    return _universal_aggregator_handler(dataset, rows_per_group, **kwargs)


def _month_aggregator_handler(dataset, **kwargs):
    """Aggregate the leading rows of ``dataset`` in groups of 21 rows.

    Delegates to ``_universal_aggregator_handler`` and returns its
    ``(aggregated_rows, remaining_rows)`` result unchanged.
    """
    rows_per_group = 21
    return _universal_aggregator_handler(dataset, rows_per_group, **kwargs)


def _quarterly_aggregator_handler(dataset, **kwargs):
    """Aggregate the leading rows of ``dataset`` in groups of 63 rows (3 x 21).

    Delegates to ``_universal_aggregator_handler`` and returns its
    ``(aggregated_rows, remaining_rows)`` result unchanged.
    """
    rows_per_group = 21 * 3
    return _universal_aggregator_handler(dataset, rows_per_group, **kwargs)


def _yearly_aggregator_handler(dataset, **kwargs):
    """Aggregate the leading rows of ``dataset`` in groups of 252 rows (12 x 21).

    Delegates to ``_universal_aggregator_handler`` and returns its
    ``(aggregated_rows, remaining_rows)`` result unchanged.
    """
    rows_per_group = 21 * 12
    return _universal_aggregator_handler(dataset, rows_per_group, **kwargs)


def _universal_aggregator_handler(dataset, period_range, **kwargs):
    """Reduce the first ``length`` rows of ``dataset`` in groups of ``period_range``.

    Expects ``kwargs['length']`` (how many leading rows to consume) and
    ``kwargs['op']`` (name resolved through ``_get_ops``).

    Returns:
        ``(aggregated_rows, remaining_rows)``: one aggregation result per
        group of ``period_range`` rows, plus the rows after the first
        ``length``.

    Both splits are plain slices, so a negative ``length`` follows slice
    semantics (``-1`` leaves the final row in ``remaining_rows``).
    """
    window = kwargs['length']
    reducer = _get_ops(kwargs['op'])

    consumed = dataset[:window]
    aggregated_rows = [
        _aggregate_collection(group, reducer)
        for group in chunks(consumed, period_range)
    ]

    return aggregated_rows, dataset[window:]


# Aggregation plan, applied in list order by `features_aggregator`.
# Each step's handler consumes `length` rows from the front of what is left;
# `length=-1` marks the step that takes everything still remaining.
# Row counts: 21 * 12 * 1 = 252, 21 * 12 * 6 = 1512.
_features_descriptors = [
    dict(index=0, length=21 * 12 * 1, handler=_day_aggregator_handler, params={}),
    dict(index=1, length=21 * 12 * 6, handler=_week_aggregator_handler, params={'op': 'avg'}),
    dict(index=2, length=21 * 12 * 6, handler=_month_aggregator_handler, params={'op': 'avg'}),
    dict(index=3, length=21 * 12 * 6, handler=_quarterly_aggregator_handler, params={'op': 'avg'}),
    dict(index=4, length=21 * 12 * 6, handler=_yearly_aggregator_handler, params={'op': 'avg'}),
    dict(index=5, length=-1, handler=_universal_aggregator_handler, params={'op': 'avg'}),
]


def features_aggregator(dataset, features_descriptors):
    """Run a sequence of aggregation steps over ``dataset``, front to back.

    Args:
        dataset: a ``pandas.DataFrame`` (converted to one dict per row) or
            any iterable of row dicts.
        features_descriptors: iterable of dicts with the keys ``'index'``,
            ``'length'``, ``'handler'`` and ``'params'``. Each handler is
            called with the rows not yet consumed and must return
            ``(aggregated_rows, remaining_rows)``. A ``'length'`` of ``-1``
            means "all remaining rows"; that handler also receives the
            remaining row count as its second positional argument.

    Returns:
        The concatenation of every step's ``aggregated_rows``, in step order.
    """
    # Work on the argument that was passed in, never on notebook globals.
    if isinstance(dataset, pd.DataFrame):
        remaining = dataset.to_dict('records')
    else:
        remaining = list(dataset)

    aggregated_dataset = []
    for descriptor in features_descriptors:
        index = descriptor['index']
        length = descriptor['length']
        handler = descriptor['handler']
        params = descriptor['params']

        if length == -1:
            if not remaining:
                # Nothing left to consume; a zero-sized period would make the
                # handler's chunking fail.
                continue
            # Resolve the sentinel to a real row count. Forwarding -1 as a
            # slice bound would silently drop the final row.
            period_range = len(remaining)
            step_rows, remaining = handler(remaining, period_range, **{
                'index': index,
                'length': period_range,
                **params
            })
        else:
            step_rows, remaining = handler(remaining, **{
                'index': index,
                'length': length,
                **params
            })

        aggregated_dataset += step_rows

    return aggregated_dataset



# Apply the aggregation plan to the engineered frame and preview the first
# five resulting rows.
aggregated_dataset = features_aggregator(
    dataset=df,
    features_descriptors=_features_descriptors,
)
aggregated_dataset[:5]

[{'volume': -1.1160968214833211,
  'high_open_delta': 0.0,
  'low_close_delta': 0.0,
  'high_low_delta': 0.0,
  'open_close_delta': 0.0,
  'high_close_delta': 0.0,
  'low_open_delta': 0.0,
  'close_delta': 0.017319999999999947},
 {'volume': -0.5609230949325865,
  'high_open_delta': 0.017319999999999947,
  'low_close_delta': -0.017319999999999947,
  'high_low_delta': 0.017319999999999947,
  'open_close_delta': -0.017319999999999947,
  'high_close_delta': 0.0,
  'low_open_delta': 0.0,
  'close_delta': -0.008769999999999945},
 {'volume': -0.9339145380240871,
  'high_open_delta': 0.0,
  'low_close_delta': 0.0,
  'high_low_delta': 0.008769999999999945,
  'open_close_delta': 0.008769999999999945,
  'high_close_delta': 0.008769999999999945,
  'low_open_delta': -0.008769999999999945,
  'close_delta': 0.0},
 {'volume': -1.0963052651624312,
  'high_open_delta': 0.0,
  'low_close_delta': 0.0,
  'high_low_delta': 0.008769999999999945,
  'open_close_delta': 0.008769999999999945,
  'high_close_delta

In [13]:
# Turn the aggregated row dicts back into a DataFrame.
# NOTE: this rebinds `df`; the per-row frame built earlier is no longer
# reachable under that name, so re-running earlier cells afterwards will see
# this aggregated frame instead.
df = pd.DataFrame(aggregated_dataset)
df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.116097,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017320
1,-0.560923,0.017320,-0.017320,0.017320,-0.017320,0.000000,0.000000,-0.008770
2,-0.933915,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.000000
3,-1.096305,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.000000
4,-0.902958,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.008770
...,...,...,...,...,...,...,...,...
566,-0.007944,0.551894,-0.549100,1.024456,-0.054888,0.489665,-0.510162,0.036761
567,-0.193129,0.507193,-0.477228,0.934968,-0.029800,0.467088,-0.451036,0.013722
568,-0.397698,0.607451,-0.532357,1.053599,-0.071746,0.531794,-0.473771,0.091920
569,-0.581725,0.758402,-0.756292,1.405655,-0.084980,0.660608,-0.678777,0.141347


In [55]:

# NOTE(review): the recorded output (2.5) is not what this expression
# evaluates to (it is just a list); presumably a mean was applied when the
# cell was run — confirm, or remove this scratch cell.
([1,2,3,4])

2.5