# Deal or No Deal

## Dependency

In [76]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib
import collections
from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Configs

In [2]:
DATA_DIR = './data'
RESOLUTION = 'daily'
REGION = 'us'
INSTRUMENT = './nyse stocks/2/unh.us.txt'

ohlc = ['open', 'high', 'low', 'close']


In [3]:
data_file = os.path.normpath(os.path.join(DATA_DIR, RESOLUTION, REGION, INSTRUMENT))
data_file

'data/daily/us/nyse stocks/2/unh.us.txt'

In [4]:
df = pd.read_csv(data_file)
df.columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'openint']
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,19900326,0.25264,0.25264,0.25264,0.25264,473990,0
1,19900327,0.25264,0.26996,0.25264,0.26996,4493732,0
2,19900328,0.26996,0.26996,0.26119,0.26119,1793083,0
3,19900329,0.26996,0.26996,0.26119,0.26119,617291,0
4,19900330,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,20200116,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,20200117,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,20200121,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,20200122,301.99000,302.13000,297.35000,300.59000,3117568,0


In [5]:
df.date = pd.to_datetime(df.date, format='%Y%m%d')
df

Unnamed: 0,date,open,high,low,close,volume,openint
0,1990-03-26,0.25264,0.25264,0.25264,0.25264,473990,0
1,1990-03-27,0.25264,0.26996,0.25264,0.26996,4493732,0
2,1990-03-28,0.26996,0.26996,0.26119,0.26119,1793083,0
3,1990-03-29,0.26996,0.26996,0.26119,0.26119,617291,0
4,1990-03-30,0.26996,0.26996,0.26119,0.26119,2017221,0
...,...,...,...,...,...,...,...
7510,2020-01-16,298.00000,300.99000,295.93000,300.74000,3710344,0
7511,2020-01-17,300.01000,300.70000,295.88000,298.47000,4632160,0
7512,2020-01-21,296.89000,302.54000,296.01000,300.53000,4963132,0
7513,2020-01-22,301.99000,302.13000,297.35000,300.59000,3117568,0


In [6]:
df['high_open_delta'] = df['high'] - df['open']
df['low_close_delta'] = df['low'] - df['close']

df['high_low_delta'] = df['high'] - df['low']
df['open_close_delta'] = df['open'] - df['close']

df['high_close_delta'] = df['high'] - df['close']
df['low_open_delta'] = df['low'] - df['open']
        
df[f'{target_col}_t1'] = df[target_col].shift(-1)
df[f'{target_col}_delta'] = df[f'{target_col}_t1'] - df[target_col]

df[['volume']] = StandardScaler().fit_transform(df[['volume']])

df.drop([f'{target_col}_t1'], axis=1, inplace=True)
df.drop(ohlc + ['date', 'openint'], axis=1, inplace=True)
df = df[:-1]

df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.116097,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01732
1,-0.560923,0.01732,-0.01732,0.01732,-0.01732,0.00000,0.00000,-0.00877
2,-0.933915,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
3,-1.096305,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00000
4,-0.902958,0.00000,0.00000,0.00877,0.00877,0.00877,-0.00877,0.00877
...,...,...,...,...,...,...,...,...
7509,-0.341618,10.62000,-7.41000,10.64000,-7.39000,3.23000,-0.02000,4.33000
7510,-0.669118,2.99000,-4.81000,5.06000,-2.74000,0.25000,-2.07000,-2.27000
7511,-0.541805,0.69000,-2.59000,4.82000,1.54000,2.23000,-4.13000,2.06000
7512,-0.496093,5.65000,-4.52000,6.53000,-3.64000,2.01000,-0.88000,0.06000


In [71]:
features_set = ['volume',
            'high_open_delta',
            'low_close_delta',
            'high_low_delta',
            'open_close_delta',
            'high_close_delta',
            'low_open_delta']
target = ['close_delta']
torch_tensor = torch.tensor(df[features_set].values)

# printing out result
print(torch_tensor)
print(torch_tensor.size())


tensor([[-1.1161,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5609,  0.0173, -0.0173,  ..., -0.0173,  0.0000,  0.0000],
        [-0.9339,  0.0000,  0.0000,  ...,  0.0088,  0.0088, -0.0088],
        ...,
        [-0.5418,  0.6900, -2.5900,  ...,  1.5400,  2.2300, -4.1300],
        [-0.4961,  5.6500, -4.5200,  ..., -3.6400,  2.0100, -0.8800],
        [-0.7510,  0.1400, -3.2400,  ...,  1.4000,  1.5400, -4.6400]],
       dtype=torch.float64)
torch.Size([7514, 7])


In [72]:

def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def _get_ops(op):
    if op == 'avg':
        return np.mean

    raise Exception(f'Unknown operation {op}')


def _aggregate_collection(lst, op):
    if len(lst) == 0:
        return {}
    elif len(lst) == 1:
        return lst[0]
    else:
        aggrgated_values_dict = {}
        for entry in lst:
            for key, value in entry.items():
                if key in aggrgated_values_dict:
                    aggrgated_values_dict[key].append(value)
                else:
                    aggrgated_values_dict[key] = [value]

        final_aggrgated_values_dict = {}
        for key, values in aggrgated_values_dict.items():
            values = [e for e in values if e]
            final_aggrgated_values_dict[key] = op(values)

        return final_aggrgated_values_dict


def _day_aggregator_handler(dataset, **kwargs):
    length = kwargs['length']
    return dataset[0:length], dataset[length:]


def _week_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 5, **kwargs)


def _month_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21, **kwargs)


def _quarterly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 3, **kwargs)


def _yearly_aggregator_handler(dataset, **kwargs):
    return _universal_aggregator_handler(dataset, 21 * 12, **kwargs)


def _universal_aggregator_handler(dataset, period_range, **kwargs):
    length = kwargs['length']
    ops = _get_ops(kwargs['op'])

    if length == -1:
        return [_aggregate_collection(dataset, ops)], []

    else:
        aggregated_dataset = []

        raw_dataset = dataset[0:length]
        raw_chunked_dataset = chunks(raw_dataset, period_range)
        for week in raw_chunked_dataset:
            aggregated_dataset.append(_aggregate_collection(week, ops))

        return aggregated_dataset, dataset[length:]


def features_aggregator(dataset, features_descriptors):
    aggregated_dataset = []
    for feature_set in features_descriptors:
        index, length, handler, params = feature_set['index'], feature_set['length'], feature_set['handler'], feature_set['params']

        if length == -1:
            period_range = len(dataset)
            step_aggregated_dataset, dataset = handler(dataset, period_range, **{
                'index': index,
                'length': length,

                **params
            })
        else:
            step_aggregated_dataset, dataset = handler(dataset, **{
                'index': index,
                'length': length,

                **params
            })

        if len(step_aggregated_dataset) > 0:
            aggregated_dataset += step_aggregated_dataset

        if len(dataset) == 0:
            return aggregated_dataset

    return aggregated_dataset


def train_dataset_generator(df, shift_range=21, repeat_out=2):
    _features_descriptors = [
        {'index': 0, 'length': 21 * 12 * 1, 'handler': _day_aggregator_handler, 'params': {}},
        {'index': 1, 'length': 21 * 12 * 6, 'handler': _week_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 2, 'length': 21 * 12 * 6, 'handler': _month_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 3, 'length': 21 * 12 * 6, 'handler': _quarterly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 4, 'length': 21 * 12 * 6, 'handler': _yearly_aggregator_handler, 'params': {'op': 'avg'}},
        {'index': 5, 'length': -1, 'handler': _universal_aggregator_handler, 'params': {'op': 'avg'}},
    ]

    dataset = list(df.T.to_dict().values())
    train_dataset = []
    while repeat_out > 0:
        train_subset = features_aggregator(dataset, _features_descriptors)
        train_dataset.append(train_subset)
        dataset = dataset[shift_range:]
        repeat_out -= 1

    return train_dataset


train_dataset = train_dataset_generator(df, shift_range=5, repeat_out=1000)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [73]:
def pad_dataset_sequence(dataset):
    if len(dataset) == 0:
        return dataset

    max_sequence = max([len(e) for e in dataset])

    input_example_dict = dataset[0][0]
    pad_example_dict = {k: 0.0 for k, _ in input_example_dict.items()}

    for example in dataset:
        residual_sequence_length = max_sequence - len(example)
        if residual_sequence_length > 0:
            for e in [pad_example_dict] * residual_sequence_length:
                example.append(e)

    return dataset


print(collections.Counter([len(e) for e in train_dataset]))
train_dataset = pad_dataset_sequence(train_dataset)
print(collections.Counter([len(e) for e in train_dataset]))

Counter({658: 243, 657: 51, 654: 51, 652: 51, 656: 50, 655: 50, 653: 50, 650: 13, 648: 13, 647: 13, 645: 13, 643: 13, 642: 13, 640: 13, 638: 13, 637: 13, 635: 13, 633: 13, 632: 13, 630: 13, 628: 13, 651: 12, 649: 12, 646: 12, 644: 12, 641: 12, 639: 12, 636: 12, 634: 12, 631: 12, 629: 12, 625: 5, 620: 5, 615: 5, 610: 5, 605: 5, 600: 5, 595: 5, 627: 4, 626: 4, 624: 4, 623: 4, 622: 4, 621: 4, 619: 4, 618: 4, 617: 4, 616: 4, 614: 4, 613: 4, 612: 4, 611: 4, 609: 4, 608: 4, 607: 4, 606: 4, 604: 4, 603: 4, 602: 4, 601: 4, 599: 4, 598: 4, 597: 4, 596: 4, 594: 4, 593: 4, 592: 4, 591: 1})
Counter({658: 1000})


# Modeling

In [None]:
# Hyper-params

param_epochs = 2
param_batch_size = 1

param_input_size = 7
param_sequence_size = 658
param_layers_size = 1
param_hidden_size = 1
param_dropout = 0.05
param_dense_1 = 1024
param_dense_2 = 512
param_dense_3 = 128
param_output_size = 1

gpu_enabled = torch.cuda.is_available()
device = torch.device("cuda") if gpu_enabled else torch.device("cpu")

param_lr = 0.01

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=param_lr)

In [None]:
class SingleInstrumentPredictorRNN(nn.Model):
    
    def __init__(self):
        super(SingleInstrumentPredictorRNNim, self).__init__()
        
        self._rnn = nn.rnn(param_input_size, param_hidden_size, param_layers_size, batch_first=True, dropout=param_dropout)
        
        self._fc_1 = nn.Linear(param_dense_1, param_dense_2)
        self._fc_2 = nn.Linear(param_dense_2, param_dense_3)
        self._fc_3 = nn.Linear(param_dense_3, param_output_size)

    def forward(self, input):
        batch_size = input.size(0)
        

        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(input, hidden)
        
        out = out.view(-1, self.hidden_dim)
        
        out = self._fc_1(out)
        out = self._fc_2(out)
        out = self._fc_3(out)
        
        return out, hidden
    

    def init_hidden(self, batch_size):
        hidden = torch.zeros(param_layers_size, batch_size, param_hidden_size)
        return hidden


# Training

In [None]:
model = SimpleRNN(hidden_size)

if gpu_enabled:
    model.cuda()
    print('GPU Enabled Model')
else:
    print('GPU Disabled Model')

In [24]:
example_df

Unnamed: 0,volume,high_open_delta,low_close_delta,high_low_delta,open_close_delta,high_close_delta,low_open_delta,close_delta
0,-1.116097,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017320
1,-0.560923,0.017320,-0.017320,0.017320,-0.017320,0.000000,0.000000,-0.008770
2,-0.933915,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.000000
3,-1.096305,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.000000
4,-0.902958,0.000000,0.000000,0.008770,0.008770,0.008770,-0.008770,0.008770
...,...,...,...,...,...,...,...,...
653,-0.007944,0.551894,-0.549100,1.024456,-0.054888,0.489665,-0.510162,0.036761
654,-0.193129,0.507193,-0.477228,0.934968,-0.029800,0.467088,-0.451036,0.013722
655,-0.397698,0.607451,-0.532357,1.053599,-0.071746,0.531794,-0.473771,0.091920
656,-0.581725,0.758402,-0.756292,1.405655,-0.084980,0.660608,-0.678777,0.141347


In [97]:
def tensorify_example(example):
    example_df = pd.DataFrame(single_exmaple)

    features_tensor = torch.tensor(example_df[features_set].values)
    features_tensor = features_tensor.unsqueeze(0)

    target_tensor = torch.tensor(example_df[target].values)
    target_tensor = target_tensor.view(1, -1)

    # features_tensor.size() # torch.Size([1, 658, 7])
    # target_tensor.size() # torch.Size([1, 658])
    return features_tensor, target_tensor

def batch_tensorify(batch):
    

exmaple_1 = train_dataset[0]
exmaple_2 = train_dataset[1]
exmaple_3 = train_dataset[2]

exmaple_1_tensor, _ = tensorify_example(exmaple_1)
exmaple_2_tensor, _ = tensorify_example(exmaple_2)
exmaple_3_tensor, _ = tensorify_example(exmaple_3)

third_tensor = torch.cat([exmaple_1_tensor, exmaple_2_tensor], 0)
third_tensor.size()
third_tensor


torch.Size([2, 658, 7])

tensor([[[-1.1161,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.5609,  0.0173, -0.0173,  ..., -0.0173,  0.0000,  0.0000],
         [-0.9339,  0.0000,  0.0000,  ...,  0.0088,  0.0088, -0.0088],
         ...,
         [-0.3977,  0.6075, -0.5324,  ..., -0.0717,  0.5318, -0.4738],
         [-0.5817,  0.7584, -0.7563,  ..., -0.0850,  0.6606, -0.6788],
         [-0.6564,  1.6468, -1.7875,  ..., -0.0047,  1.6120, -1.8084]],

        [[-1.1161,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.5609,  0.0173, -0.0173,  ..., -0.0173,  0.0000,  0.0000],
         [-0.9339,  0.0000,  0.0000,  ...,  0.0088,  0.0088, -0.0088],
         ...,
         [-0.3977,  0.6075, -0.5324,  ..., -0.0717,  0.5318, -0.4738],
         [-0.5817,  0.7584, -0.7563,  ..., -0.0850,  0.6606, -0.6788],
         [-0.6564,  1.6468, -1.7875,  ..., -0.0047,  1.6120, -1.8084]]],
       dtype=torch.float64)

In [80]:
Row_list =[] 
  
for rows in df.itertuples(): 
    my_list =[rows.Date, rows.Event, rows.Cost] 
      
    # append the list to the final list 
    Row_list.append(my_list) 
  
# Print the list 
print(Row_list) 

AttributeError: 'Pandas' object has no attribute 'Date'

In [None]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
    

for epoch in range(1, epochs + 1):
    
    loss = 0
    for example in batch(train_dataset, param_batch_size):
        
        optimizer.zero_grad()

        input_seq.to(device)

        output, hidden = model(input_seq)

        loss = criterion(output, target_seq.view(-1).long())

        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}/{epochs} ............. Loss: {loss.item()}')

In [None]:
# Evaludation