# Deal or No Deal

### Dependency

In [17]:
import os
import glob
import collections
import numpy as np
import pandas as pd
import matplotlib
from pathlib import Path
from datetime import datetime
from enum import Enum

import torch
from torch import nn
import torch.tensor
import torch.optim as optim
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.mode.chained_assignment = None

### Configs

In [29]:
TRAINING_DATE_FROM = datetime.strptime('1990-03-26', '%Y-%m-%d')
TRAINING_DATE_TO = datetime.strptime('2019-07-01', '%Y-%m-%d')

VALIDATION_DATE_FROM = datetime.strptime('2019-07-01', '%Y-%m-%d')
VALIDATION_DATE_TO = datetime.strptime('2019-12-31', '%Y-%m-%d')

TESTING_DATE_FROM = datetime.strptime('2020-01-01', '%Y-%m-%d')
TESTING_DATE_TO = datetime.strptime('2020-02-13', '%Y-%m-%d')

PREDICT_UP_TO = 21

In [30]:
DATA_DIR = '.'
REGION = 'us'
INSTRUMENT = 'unh'

RAW_DATA_FULL_FEATURES_SET = ['date', 'open', 'high', 'low', 'close', 'volume', 'openint']
RAW_DATA_REMOVED_FEATURES_SET = ['open', 'high', 'low', 'close']
RAW_DATA_ADDED_FEATURES_SET = ['volume']
RAW_DATA_FEATURES_SET = RAW_DATA_REMOVED_FEATURES_SET + RAW_DATA_ADDED_FEATURES_SET

RAW_DATA_FEATURES_SET = ['open', 'high', 'low', 'close', 'volume']
RAW_TARGET = 'close'

FEATURES_SET = ['volume',
                'high_open_delta',
                'low_close_delta',
                'high_low_delta',
                'open_close_delta',
                'high_close_delta',
                'low_open_delta']
TARGET = 'close_delta'

## Helpers

### Market Data

In [44]:
def fetch_instrument_file(instrument, region, base_dir):
    cwd = os.getcwd()
    os.chdir(base_dir)
    
    instrument_file_list = result = list(Path(".").rglob(f"*{instrument}*.txt"))
    if not instrument_file_list:
        os.chdir(cwd)
        raise Exception(f'Cannot find file for instrument {instrument}')
    if len(instrument_file_list) > 1:
        os.chdir(cwd)
        raise Exception(f'Found multiple file for instrument {instrument}')
        
    os.chdir(cwd)
    return str(os.path.join(os.getcwd(), instrument_file_list[0]))

instrument_data_file = fetch_instrument_file(INSTRUMENT, REGION, DATA_DIR)
instrument_data_file

'/Users/myousse3/data/dev/personal/be-mil/data/daily/us/nyse stocks/2/unh.us.txt'

# Data Preparation 

In [45]:
instrument_data_file = fetch_instrument_file(INSTRUMENT, REGION, DATA_DIR)
instrument_data_file

'/Users/myousse3/data/dev/personal/be-mil/data/daily/us/nyse stocks/2/unh.us.txt'

In [46]:
df = pd.read_csv(instrument_data_file)
df.columns = RAW_DATA_FULL_FEATURES_SET
df.date = pd.to_datetime(df.date, format='%Y%m%d')
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,2020-01-16,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,2020-01-17,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,2020-01-21,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,2020-01-22,301.99000,302.13000,297.35000,300.59000,3117568,0


#### Split dataset - (Training / Validation / Testing)

In [47]:
training_df = df[(df['date'] >= TRAINING_DATE_FROM) & (df['date'] <= TRAINING_DATE_TO)] 
training_df = training_df.reset_index()
training_df.drop(['index'], axis=1, inplace=True)
training_df

validation_df = df[(df['date'] >= VALIDATION_DATE_FROM) & (df['date'] <= VALIDATION_DATE_TO)] 
validation_df = validation_df.reset_index()
validation_df.drop(['index'], axis=1, inplace=True)
validation_df

testing_df = df[(df['date'] >= TESTING_DATE_FROM) & (df['date'] <= TESTING_DATE_TO)] 
testing_df = testing_df.reset_index()
testing_df.drop(['index'], axis=1, inplace=True)
testing_df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7368,2019-06-25,247.18000,248.90000,245.32000,245.56000,3210315,0
7369,2019-06-26,245.89000,245.97000,241.06000,241.40000,3885971,0
7370,2019-06-27,242.18000,246.04000,241.09000,244.24000,3395049,0
7371,2019-06-28,244.40000,244.65000,236.62000,241.93000,6785360,0


Unnamed: 0,date,open,high,low,close,volume,openint
0,2019-07-01,243.86,244.07,238.670,240.58,4144328,0
1,2019-07-02,240.53,241.91,237.500,241.89,3897938,0
2,2019-07-03,242.24,245.42,241.960,243.13,2581404,0
3,2019-07-05,242.31,246.64,241.980,244.89,2990530,0
4,2019-07-08,244.31,245.87,242.800,245.74,1918951,0
...,...,...,...,...,...,...,...
123,2019-12-24,295.26,295.83,293.560,294.54,714032,0
124,2019-12-26,295.30,296.26,294.540,295.65,1050688,0
125,2019-12-27,296.01,296.54,295.014,295.97,1547211,0
126,2019-12-30,296.05,296.54,293.450,293.85,1512088,0


Unnamed: 0,date,open,high,low,close,volume,openint
0,2020-01-02,293.98,295.7,289.79,292.5,2544306,0
1,2020-01-03,287.27,291.875,284.36,289.54,2712006,0
2,2020-01-06,288.0,291.66,287.22,291.55,3079134,0
3,2020-01-07,290.7,291.49,287.58,289.79,2497613,0
4,2020-01-08,291.5,297.38,290.04,295.9,3379840,0
5,2020-01-09,294.29,297.2,293.717,294.22,2838587,0
6,2020-01-10,296.07,296.07,292.21,295.13,2477989,0
7,2020-01-13,294.37,294.45,285.015,285.85,6996743,0
8,2020-01-14,285.6,288.24,284.25,288.24,6067709,0
9,2020-01-15,289.02,299.64,289.0,296.41,6081615,0


# Data Preparation /Training

#### Trim dataframe

In [50]:
training_df = training_df[RAW_DATA_FEATURES_SET]
training_df

Unnamed: 0,open,high,low,close,volume
0,0.25264,0.25264,0.25264,0.25264,473990
1,0.25264,0.26996,0.25264,0.26996,4493732
2,0.26996,0.26996,0.26119,0.26119,1793083
3,0.26996,0.26996,0.26119,0.26119,617291
4,0.26996,0.26996,0.26119,0.26119,2017221
...,...,...,...,...,...
7368,247.18000,248.90000,245.32000,245.56000,3210315
7369,245.89000,245.97000,241.06000,241.40000,3885971
7370,242.18000,246.04000,241.09000,244.24000,3395049
7371,244.40000,244.65000,236.62000,241.93000,6785360


In [7]:
class features_extraction_ops(Enum):
    calc = 'calc'
    clean = 'clean'
    statistics = 'statistics'
    standardize = 'standardize'

def features_extraction(df, ops = []):
    def _calc(df):
        df['high_open_delta'] = df['high'] - df['open']
        df['low_close_delta'] = df['low'] - df['close']

        df['high_low_delta'] = df['high'] - df['low']
        df['open_close_delta'] = df['open'] - df['close']

        df['high_close_delta'] = df['high'] - df['close']
        df['low_open_delta'] = df['low'] - df['open']

        df[f'{RAW_TARGET}_t1'] = df[RAW_TARGET].shift(-1)
        df[TARGET] = df[f'{RAW_TARGET}_t1'] - df[RAW_TARGET]

        df = df[:-1]
        
        return df
    
    def _clean(df):
        df.drop([f'{RAW_TARGET}_t1'], axis=1, inplace=True)
        df.drop(RAW_DATA_REMOVED_FEATURES_SET, axis=1, inplace=True)
        
        return df

    def _statistics(df):
        return df.describe()
    
    def _standardize(df):
        for feature in FEATURES_SET:
            df[feature] = StandardScaler().fit_transform(df[[feature]])
        df[TARGET] = StandardScaler().fit_transform(df[[TARGET]])
        
        return df

    if features_extraction_ops.calc in ops:
        df = _calc(df)
    elif features_extraction_ops.clean in ops:
        df = _clean(df)
    elif features_extraction_ops.statistics in ops:
        df = _statistics(df)
    elif features_extraction_ops.statistics in ops:
        df = _standardize(df)
        
    return df
        
    

Unnamed: 0,open,high,low,close,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_t1,close_delta
0,0.25264,0.25264,0.25264,0.25264,473990,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.26996,0.01732
1,0.25264,0.26996,0.25264,0.26996,4493732,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,0.26119,-0.00877
2,0.26996,0.26996,0.26119,0.26119,1793083,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26119,0.00000
3,0.26996,0.26996,0.26119,0.26119,617291,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26119,0.00000
4,0.26996,0.26996,0.26119,0.26119,2017221,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.26996,0.00877
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7509,289.02000,299.64000,289.00000,296.41000,6081615,10.62000,-7.41000,10.64000,-7.39000,3.23000,-0.02000,300.74000,4.33000
7510,298.00000,300.99000,295.93000,300.74000,3710344,2.99000,-4.81000,5.06000,-2.74000,0.25000,-2.07000,298.47000,-2.27000
7511,300.01000,300.70000,295.88000,298.47000,4632160,0.69000,-2.59000,4.82000,1.54000,2.23000,-4.13000,300.53000,2.06000
7512,296.89000,302.54000,296.01000,300.53000,4963132,5.65000,-4.52000,6.53000,-3.64000,2.01000,-0.88000,300.59000,0.06000


In [None]:
features_extraction(training_df, )

In [8]:
df.drop([f'{RAW_TARGET}_t1'], axis=1, inplace=True)
df.drop(RAW_DATA_REMOVED_FEATURES_SET, axis=1, inplace=True)
df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,473990,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01732
1,4493732,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,-0.00877
2,1793083,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
3,617291,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
4,2017221,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00877
...,...,...,...,...,...,...,...,...
7509,6081615,10.62000,-7.41000,10.64000,-7.39000,3.23000,-0.02000,4.33000
7510,3710344,2.99000,-4.81000,5.06000,-2.74000,0.25000,-2.07000,-2.27000
7511,4632160,0.69000,-2.59000,4.82000,1.54000,2.23000,-4.13000,2.06000
7512,4963132,5.65000,-4.52000,6.53000,-3.64000,2.01000,-0.88000,0.06000


#### Collect statistical summary

In [9]:
df_summary = df.describe()
df_summary

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
count,7514.0,7514.0,7514.0,7514.0,7514.0,7514.0,7514.0,7514.0
mean,8555819.0,0.492946,-0.521362,1.005653,-0.008654,0.484292,-0.512708,0.03982
std,7241209.0,0.901377,0.963271,1.496808,1.064109,0.924253,1.048638,1.184307
min,0.0,0.0,-14.53,0.0,-13.24,0.0,-21.91,-12.69
25%,4464686.0,0.0438,-0.56475,0.157,-0.199675,0.0523,-0.556,-0.16775
50%,6791788.0,0.191,-0.1945,0.5435,0.0,0.18,-0.176,0.00866
75%,10384580.0,0.564,-0.053,1.17,0.163,0.54,-0.043415,0.235
max,185809400.0,19.38,0.0,23.91,16.83,16.83,0.0,17.93


#### Standardized dataset

In [10]:
for feature in FEATURES_SET:
    df[feature] = StandardScaler().fit_transform(df[[feature]])
df[TARGET] = StandardScaler().fit_transform(df[[TARGET]])

df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.116163,-0.546917,0.541277,-0.671910,0.008133,-0.524017,0.488960,-0.019000
1,-0.561005,-0.527701,0.523295,-0.660338,-0.008145,-0.524017,0.488960,-0.041031
2,-0.933986,-0.546917,0.541277,-0.666050,0.016375,-0.514527,0.480596,-0.033625
3,-1.096372,-0.546917,0.541277,-0.666050,0.016375,-0.514527,0.480596,-0.033625
4,-0.903031,-0.546917,0.541277,-0.666050,0.016375,-0.514527,0.480596,-0.026220
...,...,...,...,...,...,...,...,...
7509,-0.341707,11.235840,-7.151771,6.437022,-6.937104,2.970931,0.469886,3.622764
7510,-0.669197,2.770451,-4.452456,2.708842,-2.566962,-0.253510,-1.485160,-1.950485
7511,-0.541887,0.218629,-2.147657,2.548490,1.455449,1.888904,-3.449743,1.705904
7512,-0.496178,5.721687,-4.151379,3.690997,-3.412796,1.650858,-0.350280,0.017041


In [None]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def _get_ops(op):
    if op == 'avg':
        return np.mean

    raise Exception(f'Unknown operation {op}')


def _aggregate_collection(lst, op):
    if len(lst) == 0:
        return {}
    elif len(lst) == 1:
        return lst[0]
    else:
        aggrgated_values_dict = {}
        for entry in lst:
            for key, value in entry.items():
                if key in aggrgated_values_dict:
                    aggrgated_values_dict[key].append(value)
                else:
                    aggrgated_values_dict[key] = [value]

        final_aggrgated_values_dict = {}
        for key, values in aggrgated_values_dict.items():
            values = [e for e in values if e]
            final_aggrgated_values_dict[key] = op(values)

        return final_aggrgated_values_dict


def _day_aggregator_handler(dataset, **kwargs):
    length = kwargs['length']
    return dataset[0:length], dataset[length:]


def _week_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 5, **kwargs)


def _month_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21, **kwargs)


def _quarterly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 3, **kwargs)


def _yearly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 12, **kwargs)


def _universal_aggregator_handler(dataset, period_range, **kwargs):
    length = kwargs['length']
    ops = _get_ops(kwargs['op'])

    if length == -1:
        return [_aggregate_collection(dataset, ops)], []

    else:
        aggregated_dataset = []

        raw_dataset = dataset[0:length]
        raw_chunked_dataset = chunks(raw_dataset, period_range)
        for week in raw_chunked_dataset:
            aggregated_dataset.append(_aggregate_collection(week, ops))

        return aggregated_dataset, dataset[length:]


def features_aggregator(dataset, features_descriptors):
    aggregated_dataset = []
    for feature_set in features_descriptors:
        index, length, handler, params = feature_set['index'], feature_set['length'], feature_set['handler'], feature_set['params']

        if length == -1:
            period_range = len(dataset)
            step_aggregated_dataset, dataset = handler(dataset, period_range, **{
                'index': index,
                'length': length,

                **params
            })
        else:
            step_aggregated_dataset, dataset = handler(dataset, **{
                'index': index,
                'length': length,

                **params
            })

        if len(step_aggregated_dataset) > 0:
            aggregated_dataset += step_aggregated_dataset

        if len(dataset) == 0:
            return aggregated_dataset

    return aggregated_dataset


def train_dataset_generator(df, shift_range=21, repeat_out=2):
    _features_descriptors = [
        {'index': 0, 'length': 21 * 12 * 1, 'handler': _day_aggregator_handler, 'params': {}},
        {'index': 1, 'length': 21 * 12 * 6, 'handler': _week_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 2, 'length': 21 * 12 * 6, 'handler': _month_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 3, 'length': 21 * 12 * 6, 'handler': _quarterly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 4, 'length': 21 * 12 * 6, 'handler': _yearly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 5, 'length': -1, 'handler': _universal_aggregator_handler, 'params': {'op': 'avg'}},
    ]

    dataset = list(df.T.to_dict().values())
    train_dataset = []
    while repeat_out > 0:
        train_subset = features_aggregator(dataset, _features_descriptors)
        train_dataset.append(train_subset)
        dataset = dataset[shift_range:]
        repeat_out -= 1

    return train_dataset


train_dataset = train_dataset_generator(df, shift_range=5, repeat_out=100)
len(train_dataset[0])
train_dataset[0][0]

In [None]:
def pad_dataset_sequence(dataset):
    if len(dataset) == 0:
        return dataset

    max_sequence = max([len(e) for e in dataset])

    input_example_dict = dataset[0][0]
    pad_example_dict = {k: 0.0 for k, _ in input_example_dict.items()}

    for example in dataset:
        residual_sequence_length = max_sequence - len(example)
        if residual_sequence_length > 0:
            for e in [pad_example_dict] * residual_sequence_length:
                example.append(e)

    return dataset


print(collections.Counter([len(e) for e in train_dataset]))
train_dataset = pad_dataset_sequence(train_dataset)
print(collections.Counter([len(e) for e in train_dataset]))

# Modeling

### Hyper-parameters

In [None]:
class HyperParams:
    
    class training:
        epochs = 20
        batch_size = 1
        learning_rate = 0.01
        
        
    class Model:
        class RNN:
            input_size = len(features_set)
            sequence_size = len(train_dataset[0])
            layers_size = 1
            hidden_size = 1
            dropout_rate = 0.05

        class Dense:
            layer_1 = 1024
            layer_2 = 1024
            layer_3 = 1024
            
            dropout = 0.05

            input_size = len(train_dataset[0])
            output_size = len(train_dataset[0])

gpu_enabled = torch.cuda.is_available()
device = torch.device("cuda") if gpu_enabled else torch.device("cpu")

In [None]:
class SingleInstrumentPredictorRNN(nn.Module):

    def __init__(self):
        super(SingleInstrumentPredictorRNN, self).__init__()

        self._rnn = nn.RNN(param_rnn_input_size, param_hidden_size, param_layers_size, batch_first=True, dropout=param_rnn_dropout_rate)

        self._fc_1 = nn.Linear(param_dense_input_size, param_dense_1)
        self._fc_2 = nn.Linear(param_dense_1, param_dense_2)
        self._fc_3 = nn.Linear(param_dense_2, param_dense_3)
        self._fc_4 = nn.Linear(param_dense_3, param_dense_output_size)
        
        self._drop_layer = nn.Dropout(p=p)

    def forward(self, input):
        batch_size = input.size(0)

        hidden = self.init_hidden(batch_size)
        out, hidden = self._rnn(input.double(), hidden.double())

        out = out.view(-1, param_sequence_size)

        out = self._drop_layer(F.relu(self._fc_1(out)))
        out = self._drop_layer(F.relu(self._fc_2(out)))
        out = self._drop_layer(F.relu(self._fc_3(out)))
        out = self._fc_4(out)

        return out, hidden

    def init_hidden(self, batch_size):
        return torch.rand(param_layers_size, batch_size, param_hidden_size, dtype=torch.double)

## Training

In [None]:
model = SingleInstrumentPredictorRNN().double()

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=HyperParams.training.learning_rate)

if gpu_enabled:
    model.cuda()
    print('GPU Enabled Model')
else:
    print('GPU Disabled Model')

In [None]:
def tensorify_example(example):
    example_df = pd.DataFrame(example)

    features_tensor = torch.tensor(example_df[features_set].values, dtype=torch.double)
    features_tensor = features_tensor.unsqueeze(0)

    target_tensor = torch.tensor(example_df[target].values, dtype=torch.double)
    target_tensor = target_tensor.view(1, -1)

    #  features_tensor.size() # torch.Size([1, 658, 7])
    # target_tensor.size() # torch.Size([1, 658])
    return features_tensor, target_tensor


def batch_tensorify(examples_batch):
    features_tensors_list = [tensorify_example(example)[0] for example in examples_batch]
    target_tensors_list = [tensorify_example(example)[1] for example in examples_batch]

    return torch.cat(features_tensors_list, 0), torch.cat(target_tensors_list, 0)


exmaple_1 = train_dataset[0]
exmaple_2 = train_dataset[1]
batch_features, batch_target = batch_tensorify([exmaple_1, exmaple_2])

batch_features.size()
batch_features

batch_target.size()
batch_target

In [None]:

def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]


for epoch in range(1, param_epochs + 1):
    epoch_loss = 0.0
    
    for batch_examples in batch(train_dataset, param_batch_size):

        batch_features, batch_target = batch_tensorify(batch_examples)
        batch_features, batch_target = batch_features.double(), batch_target.double()

        if gpu_enabled:
            input_batch = input_batch.cuda()
            labels_batch = labels_batch.cuda()

        optimizer.zero_grad()

        batch_features.to(device)
        batch_target.to(device)

        output, hidden = model(batch_features.double())

        loss = criterion(output, batch_target)
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}/{param_epochs} ............. Loss: {epoch_loss}')

torch.save(model.state_dict(), f'./{INSTRUMENT}_{param_epochs}epochs_model_state.pt')

In [None]:
# Evaludation