# Deal or No Deal

### Dependency

In [20]:
import os
import glob
import collections
import numpy as np
import pandas as pd
import matplotlib
from pathlib import Path
from datetime import datetime
from enum import Enum

import torch
from torch import nn
from torchsummary import summary
import torch.tensor
import torch.optim as optim
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.mode.chained_assignment = None

### Configs

In [21]:
TRAINING_DATE_FROM = datetime.strptime('1990-03-26', '%Y-%m-%d')
TRAINING_DATE_TO = datetime.strptime('2019-07-01', '%Y-%m-%d')

VALIDATION_DATE_FROM = datetime.strptime('2019-07-01', '%Y-%m-%d')
VALIDATION_DATE_TO = datetime.strptime('2019-12-31', '%Y-%m-%d')

TESTING_DATE_FROM = datetime.strptime('2020-01-01', '%Y-%m-%d')
TESTING_DATE_TO = datetime.strptime('2020-02-13', '%Y-%m-%d')

PREDICT_UP_TO = 21

In [22]:
DATA_DIR = '.'
REGION = 'us'
INSTRUMENT = 'unh'

RAW_DATA_FULL_FEATURES_SET = ['date', 'open', 'high', 'low', 'close', 'volume', 'openint']
RAW_DATA_REMOVED_FEATURES_SET = ['open', 'high', 'low', 'close']
RAW_DATA_ADDED_FEATURES_SET = ['volume']
RAW_DATA_FEATURES_SET = RAW_DATA_REMOVED_FEATURES_SET + RAW_DATA_ADDED_FEATURES_SET

RAW_DATA_FEATURES_SET = ['open', 'high', 'low', 'close', 'volume']
RAW_TARGET = 'close'

FEATURES_SET = ['volume',
                'high_open_delta',
                'low_close_delta',
                'high_low_delta',
                'open_close_delta',
                'high_close_delta',
                'low_open_delta']
TARGET = 'close_delta'

## Helpers

### Market Data

In [23]:
def fetch_instrument_file(instrument, region, base_dir):
    cwd = os.getcwd()
    os.chdir(base_dir)
    
    instrument_file_list = result = list(Path(".").rglob(f"*{instrument}*.txt"))
    if not instrument_file_list:
        os.chdir(cwd)
        raise Exception(f'Cannot find file for instrument {instrument}')
    if len(instrument_file_list) > 1:
        os.chdir(cwd)
        raise Exception(f'Found multiple file for instrument {instrument}')
        
    os.chdir(cwd)
    return str(os.path.join(os.getcwd(), instrument_file_list[0]))

instrument_data_file = fetch_instrument_file(INSTRUMENT, REGION, DATA_DIR)
instrument_data_file

'/Users/myousse3/data/dev/personal/be-mil/data/daily/us/nyse stocks/2/unh.us.txt'

# Data Preparation 

In [24]:
instrument_data_file = fetch_instrument_file(INSTRUMENT, REGION, DATA_DIR)
instrument_data_file

'/Users/myousse3/data/dev/personal/be-mil/data/daily/us/nyse stocks/2/unh.us.txt'

In [25]:
df = pd.read_csv(instrument_data_file)
df.columns = RAW_DATA_FULL_FEATURES_SET
df.date = pd.to_datetime(df.date, format='%Y%m%d')
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,2020-01-16,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,2020-01-17,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,2020-01-21,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,2020-01-22,301.99000,302.13000,297.35000,300.59000,3117568,0


#### Split dataset - (Training / Validation / Testing)

In [26]:
training_df = df[(df['date'] >= TRAINING_DATE_FROM) & (df['date'] <= TRAINING_DATE_TO)] 
training_df = training_df.reset_index()
training_df.drop(['index'], axis=1, inplace=True)
training_df

validation_df = df[(df['date'] >= VALIDATION_DATE_FROM) & (df['date'] <= VALIDATION_DATE_TO)] 
validation_df = validation_df.reset_index()
validation_df.drop(['index'], axis=1, inplace=True)
validation_df

testing_df = df[(df['date'] >= TESTING_DATE_FROM) & (df['date'] <= TESTING_DATE_TO)] 
testing_df = testing_df.reset_index()
testing_df.drop(['index'], axis=1, inplace=True)
testing_df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7368,2019-06-25,247.18000,248.90000,245.32000,245.56000,3210315,0
7369,2019-06-26,245.89000,245.97000,241.06000,241.40000,3885971,0
7370,2019-06-27,242.18000,246.04000,241.09000,244.24000,3395049,0
7371,2019-06-28,244.40000,244.65000,236.62000,241.93000,6785360,0


Unnamed: 0,date,open,high,low,close,volume,openint
0,2019-07-01,243.86,244.07,238.670,240.58,4144328,0
1,2019-07-02,240.53,241.91,237.500,241.89,3897938,0
2,2019-07-03,242.24,245.42,241.960,243.13,2581404,0
3,2019-07-05,242.31,246.64,241.980,244.89,2990530,0
4,2019-07-08,244.31,245.87,242.800,245.74,1918951,0
...,...,...,...,...,...,...,...
123,2019-12-24,295.26,295.83,293.560,294.54,714032,0
124,2019-12-26,295.30,296.26,294.540,295.65,1050688,0
125,2019-12-27,296.01,296.54,295.014,295.97,1547211,0
126,2019-12-30,296.05,296.54,293.450,293.85,1512088,0


Unnamed: 0,date,open,high,low,close,volume,openint
0,2020-01-02,293.98,295.7,289.79,292.5,2544306,0
1,2020-01-03,287.27,291.875,284.36,289.54,2712006,0
2,2020-01-06,288.0,291.66,287.22,291.55,3079134,0
3,2020-01-07,290.7,291.49,287.58,289.79,2497613,0
4,2020-01-08,291.5,297.38,290.04,295.9,3379840,0
5,2020-01-09,294.29,297.2,293.717,294.22,2838587,0
6,2020-01-10,296.07,296.07,292.21,295.13,2477989,0
7,2020-01-13,294.37,294.45,285.015,285.85,6996743,0
8,2020-01-14,285.6,288.24,284.25,288.24,6067709,0
9,2020-01-15,289.02,299.64,289.0,296.41,6081615,0


# Data Preparation /Training

#### Trim dataframe

In [27]:
training_df = training_df[RAW_DATA_FEATURES_SET]
training_df

Unnamed: 0,open,high,low,close,volume
0,0.25264,0.25264,0.25264,0.25264,473990
1,0.25264,0.26996,0.25264,0.26996,4493732
2,0.26996,0.26996,0.26119,0.26119,1793083
3,0.26996,0.26996,0.26119,0.26119,617291
4,0.26996,0.26996,0.26119,0.26119,2017221
...,...,...,...,...,...
7368,247.18000,248.90000,245.32000,245.56000,3210315
7369,245.89000,245.97000,241.06000,241.40000,3885971
7370,242.18000,246.04000,241.09000,244.24000,3395049
7371,244.40000,244.65000,236.62000,241.93000,6785360


In [28]:
class features_extraction_ops(Enum):
    calc = 'calc'
    clean = 'clean'
    statistics = 'statistics'
    standardize = 'standardize'

def features_extraction(df, ops = []):
    def _calc(df):
        df['high_open_delta'] = df['high'] - df['open']
        df['low_close_delta'] = df['low'] - df['close']

        df['high_low_delta'] = df['high'] - df['low']
        df['open_close_delta'] = df['open'] - df['close']

        df['high_close_delta'] = df['high'] - df['close']
        df['low_open_delta'] = df['low'] - df['open']

        df[f'{RAW_TARGET}_t1'] = df[RAW_TARGET].shift(-1)
        df[TARGET] = df[f'{RAW_TARGET}_t1'] - df[RAW_TARGET]

        df = df[:-1]
        
        return df
    
    def _clean(df):
        df.drop([f'{RAW_TARGET}_t1'], axis=1, inplace=True)
        df.drop(RAW_DATA_REMOVED_FEATURES_SET, axis=1, inplace=True)
        
        return df

    def _statistics(df):
        return df.describe()
    
    def _standardize(df):
        for feature in FEATURES_SET:
            df[feature] = StandardScaler().fit_transform(df[[feature]])
        df[TARGET] = StandardScaler().fit_transform(df[[TARGET]])
        
        return df

    if features_extraction_ops.calc in ops:
        df = _calc(df)
    elif features_extraction_ops.clean in ops:
        df = _clean(df)
    elif features_extraction_ops.statistics in ops:
        df = _statistics(df)
    elif features_extraction_ops.standardize in ops:
        df = _standardize(df)
        
    return df
        
training_df = features_extraction(training_df, [features_extraction_ops.calc])
training_df

Unnamed: 0,open,high,low,close,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_t1,close_delta
0,0.25264,0.25264,0.25264,0.25264,473990,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.26996,0.01732
1,0.25264,0.26996,0.25264,0.26996,4493732,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,0.26119,-0.00877
2,0.26996,0.26996,0.26119,0.26119,1793083,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26119,0.00000
3,0.26996,0.26996,0.26119,0.26119,617291,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26119,0.00000
4,0.26996,0.26996,0.26119,0.26119,2017221,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26996,0.00877
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7367,249.19000,250.48000,245.76000,247.52000,3190897,1.29000,-1.76000,4.72000,1.67000,2.96000,-3.43000,245.56000,-1.96000
7368,247.18000,248.90000,245.32000,245.56000,3210315,1.72000,-0.24000,3.58000,1.62000,3.34000,-1.86000,241.40000,-4.16000
7369,245.89000,245.97000,241.06000,241.40000,3885971,0.08000,-0.34000,4.91000,4.49000,4.57000,-4.83000,244.24000,2.84000
7370,242.18000,246.04000,241.09000,244.24000,3395049,3.86000,-3.15000,4.95000,-2.06000,1.80000,-1.09000,241.93000,-2.31000


In [29]:
training_df = features_extraction(training_df, [features_extraction_ops.clean])
training_df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,473990,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01732
1,4493732,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,-0.00877
2,1793083,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
3,617291,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
4,2017221,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00877
...,...,...,...,...,...,...,...,...
7367,3190897,1.29000,-1.76000,4.72000,1.67000,2.96000,-3.43000,-1.96000
7368,3210315,1.72000,-0.24000,3.58000,1.62000,3.34000,-1.86000,-4.16000
7369,3885971,0.08000,-0.34000,4.91000,4.49000,4.57000,-4.83000,2.84000
7370,3395049,3.86000,-3.15000,4.95000,-2.06000,1.80000,-1.09000,-2.31000


In [30]:
training_statistics_df = features_extraction(training_df, [features_extraction_ops.statistics])
training_statistics_df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
count,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0
mean,8646364.0,0.454762,-0.480708,0.930512,-0.004958,0.449803,-0.47575,0.0326
std,7276773.0,0.795738,0.873407,1.369311,0.97654,0.856307,0.982283,1.075511
min,0.0,0.0,-14.53,0.0,-12.61,0.0,-21.91,-12.69
25%,4554940.0,0.0436,-0.54,0.1566,-0.191,0.0522,-0.52525,-0.165
50%,6890318.0,0.183,-0.189,0.525,0.0,0.174,-0.1715,0.0086
75%,10460200.0,0.53825,-0.0524,1.11075,0.157,0.51325,-0.0433,0.2266
max,185809400.0,13.89,0.0,23.91,16.83,16.83,0.0,12.09


In [31]:
training_df = features_extraction(training_df, [features_extraction_ops.standardize])
training_df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.123153,-0.571535,0.550420,-0.679593,0.005078,-0.525318,0.484363,-0.014208
1,-0.570708,-0.549768,0.530589,-0.666944,-0.012660,-0.525318,0.484363,-0.038468
2,-0.941866,-0.571535,0.550420,-0.673188,0.014059,-0.515076,0.475435,-0.030313
3,-1.103459,-0.571535,0.550420,-0.673188,0.014059,-0.515076,0.475435,-0.030313
4,-0.911062,-0.571535,0.550420,-0.673188,0.014059,-0.515076,0.475435,-0.022158
...,...,...,...,...,...,...,...,...
7367,-0.749761,1.049710,-1.464815,2.767629,1.715313,2.931619,-3.007738,-1.852826
7368,-0.747092,1.590126,0.275616,1.935037,1.664109,3.375415,-1.409312,-3.898504
7369,-0.654235,-0.470993,0.161114,2.906394,4.603256,4.811912,-4.433085,2.610471
7370,-0.721703,4.279634,-3.056392,2.935608,-2.104554,1.576873,-0.625371,-2.178275


In [32]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def _get_ops(op):
    if op == 'avg':
        return np.mean

    raise Exception(f'Unknown operation {op}')


def _aggregate_collection(lst, op):
    if len(lst) == 0:
        return {}
    elif len(lst) == 1:
        return lst[0]
    else:
        aggrgated_values_dict = {}
        for entry in lst:
            for key, value in entry.items():
                if key in aggrgated_values_dict:
                    aggrgated_values_dict[key].append(value)
                else:
                    aggrgated_values_dict[key] = [value]

        final_aggrgated_values_dict = {}
        for key, values in aggrgated_values_dict.items():
            values = [e for e in values if e]
            final_aggrgated_values_dict[key] = op(values)

        return final_aggrgated_values_dict


def _day_aggregator_handler(dataset, **kwargs):
    length = kwargs['length']
    return dataset[0:length], dataset[length:]


def _week_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 5, **kwargs)


def _month_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21, **kwargs)


def _quarterly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 3, **kwargs)


def _yearly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 12, **kwargs)


def _universal_aggregator_handler(dataset, period_range, **kwargs):
    length = kwargs['length']
    ops = _get_ops(kwargs['op'])

    if length == -1:
        return [_aggregate_collection(dataset, ops)], []

    else:
        aggregated_dataset = []

        raw_dataset = dataset[0:length]
        raw_chunked_dataset = chunks(raw_dataset, period_range)
        for week in raw_chunked_dataset:
            aggregated_dataset.append(_aggregate_collection(week, ops))

        return aggregated_dataset, dataset[length:]


def features_aggregator(dataset, features_descriptors):
    aggregated_dataset = []
    for feature_set in features_descriptors:
        index, length, handler, params = feature_set['index'], feature_set['length'], feature_set['handler'], feature_set['params']

        if length == -1:
            period_range = len(dataset)
            step_aggregated_dataset, dataset = handler(dataset, period_range, **{
                'index': index,
                'length': length,

                **params
            })
        else:
            step_aggregated_dataset, dataset = handler(dataset, **{
                'index': index,
                'length': length,

                **params
            })

        if len(step_aggregated_dataset) > 0:
            aggregated_dataset += step_aggregated_dataset

        if len(dataset) == 0:
            return aggregated_dataset

    return aggregated_dataset


def train_dataset_generator(df, shift_range, repeat_out):
    _features_descriptors = [
        {'index': 0, 'length': 21 * 12 * 1, 'handler': _day_aggregator_handler, 'params': {}},
        {'index': 1, 'length': 21 * 12 * 6, 'handler': _week_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 2, 'length': 21 * 12 * 6, 'handler': _month_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 3, 'length': 21 * 12 * 6, 'handler': _quarterly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 4, 'length': 21 * 12 * 6, 'handler': _yearly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 5, 'length': -1, 'handler': _universal_aggregator_handler, 'params': {'op': 'avg'}},
    ]

    dataset = list(df.T.to_dict().values())
    train_dataset = []
    while repeat_out > 0:
        train_subset = features_aggregator(dataset, _features_descriptors)
        train_dataset.append(train_subset)
        dataset = dataset[shift_range:]
        repeat_out -= 1

    return train_dataset


train_dataset = train_dataset_generator(training_df, shift_range=5, repeat_out=100)
len(train_dataset[0])
train_dataset[0][0]

658

{'volume': -1.1231529367986126,
 'high_open_delta': -0.5715353021241117,
 'low_close_delta': 0.5504203843396211,
 'high_low_delta': -0.6795932014523284,
 'open_close_delta': 0.005077743794642328,
 'high_close_delta': -0.525318346250986,
 'low_open_delta': 0.48436344318589303,
 'close_delta': -0.01420818377085028}

In [33]:
def pad_dataset_sequence(dataset):
    if len(dataset) == 0:
        return dataset

    max_sequence = max([len(e) for e in dataset])

    input_example_dict = dataset[0][0]
    pad_example_dict = {k: 0.0 for k, _ in input_example_dict.items()}

    for example in dataset:
        residual_sequence_length = max_sequence - len(example)
        if residual_sequence_length > 0:
            for e in [pad_example_dict] * residual_sequence_length:
                example.append(e)

    return dataset


print(collections.Counter([len(e) for e in train_dataset]))
train_dataset = pad_dataset_sequence(train_dataset)
print(collections.Counter([len(e) for e in train_dataset]))

Counter({658: 100})
Counter({658: 100})


# Data Preparation /Validation

# Modeling

### Hyper-parameters

In [34]:
class HyperParams:
    
    class training:
        epochs = 100
        batch_size = 1
        learning_rate = 0.01
        
        
    class Model:
        class RNN:
            input_size = len(FEATURES_SET)
            sequence_size = len(train_dataset[0])
            layers_size = 1
            hidden_size = 1
            dropout_rate = 0.05

        class Dense:
            layer_1 = 1024
            layer_2 = 1024
            layer_3 = 1024
            
            dropout = 0.05

            input_size = len(train_dataset[0])
            output_size = len(train_dataset[0])

gpu_enabled = torch.cuda.is_available()
device = torch.device("cuda") if gpu_enabled else torch.device("cpu")


In [35]:
class SingleInstrumentPredictorRNN(nn.Module):

    def __init__(self):
        super(SingleInstrumentPredictorRNN, self).__init__()

        self._rnn = nn.LSTM(input_size=HyperParams.Model.RNN.input_size, 
                            hidden_size=HyperParams.Model.RNN.hidden_size, 
                            num_layers=HyperParams.Model.RNN.layers_size, 
                            batch_first=True)

        self._fc_1 = nn.Linear(HyperParams.Model.Dense.input_size, HyperParams.Model.Dense.layer_1)
        self._fc_2 = nn.Linear(HyperParams.Model.Dense.layer_1, HyperParams.Model.Dense.layer_2)
        self._fc_3 = nn.Linear(HyperParams.Model.Dense.layer_2, HyperParams.Model.Dense.layer_3)
        self._fc_4 = nn.Linear(HyperParams.Model.Dense.layer_3, HyperParams.Model.Dense.output_size)
        
        self._drop_layer = nn.Dropout(p=HyperParams.Model.Dense.dropout)

    def forward(self, input):
        batch_size = input.size(0)

        hidden = self.init_hidden(batch_size)
        out, hidden = self._rnn(input.double(), hidden)

        out = out.view(-1, HyperParams.Model.RNN.sequence_size)

        out = self._drop_layer(F.relu(self._fc_1(out)))
        out = self._drop_layer(F.relu(self._fc_2(out)))
        out = self._drop_layer(F.relu(self._fc_3(out)))
        out = self._fc_4(out)

        return out, hidden


    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(HyperParams.Model.RNN.layers_size, 
                             batch_size, 
                             HyperParams.Model.RNN.hidden_size).zero_().double(),
                  weight.new(HyperParams.Model.RNN.layers_size, 
                             batch_size, HyperParams.Model.RNN.hidden_size).zero_().double())
        return hidden
    

## Training

In [36]:
model = SingleInstrumentPredictorRNN().double()
print(model)

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=HyperParams.training.learning_rate)

if gpu_enabled:
    model.cuda()
    print('GPU Enabled Model')
else:
    print('GPU Disabled Model')

SingleInstrumentPredictorRNN(
  (_rnn): LSTM(7, 1, batch_first=True)
  (_fc_1): Linear(in_features=658, out_features=1024, bias=True)
  (_fc_2): Linear(in_features=1024, out_features=1024, bias=True)
  (_fc_3): Linear(in_features=1024, out_features=1024, bias=True)
  (_fc_4): Linear(in_features=1024, out_features=658, bias=True)
  (_drop_layer): Dropout(p=0.05, inplace=False)
)
GPU Disabled Model


In [37]:
def tensorify_example(example):
    example_df = pd.DataFrame(example)

    features_tensor = torch.tensor(example_df[FEATURES_SET].values, dtype=torch.double)
    features_tensor = features_tensor.unsqueeze(0)

    target_tensor = torch.tensor(example_df[TARGET].values, dtype=torch.double)
    target_tensor = target_tensor.view(1, -1)

    #  features_tensor.size() # torch.Size([1, 658, 7])
    # target_tensor.size() # torch.Size([1, 658])
    return features_tensor, target_tensor


def batch_tensorify(examples_batch):
    features_tensors_list = [tensorify_example(example)[0] for example in examples_batch]
    target_tensors_list = [tensorify_example(example)[1] for example in examples_batch]

    return torch.cat(features_tensors_list, 0), torch.cat(target_tensors_list, 0)


exmaple_1 = train_dataset[0]
exmaple_2 = train_dataset[1]
batch_features, batch_target = batch_tensorify([exmaple_1, exmaple_2])

batch_features.size()
batch_features

batch_target.size()
batch_target

torch.Size([2, 658, 7])

tensor([[[-1.1232, -0.5715,  0.5504,  ...,  0.0051, -0.5253,  0.4844],
         [-0.5707, -0.5498,  0.5306,  ..., -0.0127, -0.5253,  0.4844],
         [-0.9419, -0.5715,  0.5504,  ...,  0.0141, -0.5151,  0.4754],
         ...,
         [-0.4083,  0.1737, -0.0591,  ..., -0.0684,  0.0834,  0.0154],
         [-0.5914,  0.3589, -0.3087,  ..., -0.0819,  0.2401, -0.1930],
         [-0.6666,  1.3003, -1.3473,  ...,  0.0268,  1.2389, -1.2246]],

        [[-0.8015, -0.5605,  0.5306,  ..., -0.0039, -0.5253,  0.4757],
         [-0.7444, -0.5607,  0.5404,  ...,  0.0051, -0.5152,  0.4754],
         [-0.5566, -0.5715,  0.5504,  ...,  0.0139, -0.5152,  0.4755],
         ...,
         [-0.4292,  0.1531, -0.0496,  ..., -0.0550,  0.0796,  0.0106],
         [-0.5929,  0.3691, -0.3177,  ..., -0.0694,  0.2638, -0.2135],
         [-0.6669,  1.3041, -1.3523,  ...,  0.0232,  1.2384, -1.2255]]],
       dtype=torch.float64)

torch.Size([2, 658])

tensor([[-0.0142, -0.0385, -0.0303,  ...,  0.0548,  0.1006,  0.0834],
        [-0.0303, -0.0303, -0.0223,  ...,  0.0410,  0.0955,  0.0842]],
       dtype=torch.float64)

In [38]:

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]


for epoch in range(1, HyperParams.training.epochs + 1):
    epoch_loss = 0.0
    
    for batch_examples in batch(train_dataset, HyperParams.training.batch_size):

        batch_features, batch_target = batch_tensorify(batch_examples)
        batch_features, batch_target = batch_features.double(), batch_target.double()

        if gpu_enabled:
            batch_features = batch_features.cuda()
            batch_target = batch_target.cuda()

        output, hidden = model(batch_features.double())

        loss = criterion(output, batch_target)
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}/{HyperParams.training.epochs} ............. Loss: {epoch_loss/HyperParams.training.epochs}')

    
    
torch.save(model.state_dict(), f'./{INSTRUMENT}_{HyperParams.training.epochs}epochs_model_state.pt')

Epoch: 1/100 ............. Loss: 0.003367934197673745
Epoch: 2/100 ............. Loss: 0.002561407916926949


KeyboardInterrupt: 

# Evaludation

#### Load model

In [None]:
model = SingleInstrumentPredictorRNN().double()
model.load_state_dict(torch.load(f'./{INSTRUMENT}_{param_epochs}epochs_model_state.pt'))
model.eval()

#### Prepare data

In [None]:
testing_df = testing_df[RAW_DATA_FEATURES_SET]
testing_df = features_extraction(testing_df, [features_extraction_ops.calc])
testing_df = features_extraction(testing_df, [features_extraction_ops.clean])
testing_statistics_df = features_extraction(testing_df, [features_extraction_ops.statistics])
testing_df = features_extraction(testing_df, [features_extraction_ops.standardize])
train_dataset = train_dataset_generator(training_df, shift_range=5, repeat_out=1000)
len(train_dataset[0])
train_dataset[0][0]