In [5]:
from IPython.display import clear_output
from plotly import offline as plotly
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torch import nn
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import pandas as pd
import matplotlib
import torch
import tqdm
import os

plotly.init_notebook_mode(True)
np = pd.np

### Hyper Parameters
LEARNING_RATE = 0.01
BATCH_SIZE = 1000
NUM_EPOCHS = 10
INPUT_SIZE = NUM_FEATURES = 1
HIDDEN_SIZE = 200
NUM_CLASSES = 2
SEQ_LENGTH = 100
NUM_LAYERS = 1

def read_returns(pair):
    path = f'/media/mikeokslonger/data/returns.parquet/pair={pair}/part.0.parquet'
    path = f's3://mikeokslonger-ticks/returns.parquet/pair={pair}/part.0.parquet'
    df = pd.read_parquet(path, columns=['time', 'relative_returns', 'buy', 'quantity', 'price'])
    return df

def create_features(df):
    one_hot = np.array([[1, 0],
                       [0, 1]], dtype='float32')
    df['buy'] = df['buy'].astype(float)
    df['label'] = (df.shift(-1)['relative_returns'] > 0.).astype(int)
    df['label_onehot'] = df['label'].apply(lambda i: one_hot[i])
    df['time'] = (df['time'] - 20180101000000).astype(float)
    return df[1:]

pairs = ['ADX-USD', 'AIR-USD', 'AMM-USD', 'ATB-USD', 'ATM-USD', 'B2X-USD', 'BCC-USD', 'BCH-USD', 'BCN-USD', 'BMC-USD', 'BNT-USD', 'BQX-USD', 'BTCA-USD', 'BTC-USD', 'BTG-USD', 'BTM-USD', 'BTX-USD', 'CAT-USD', 'CDT-USD', 'CLD-USD', 'CL-USD', 'CND-USD', 'CTR-USD', 'CVC-USD', 'DASH-USD', 'DATA-USD', 'DCN-USD', 'DGB-USD', 'DIM-USD', 'DOGE-USD', 'EBTCOLD-USD', 'EDO-USD', 'EMGO-USD', 'ENJ-USD', 'EOS-USD', 'ETC-USD', 'ETH-USD', 'ETP-USD', 'EVX-USD', 'FUEL-USD', 'FUN-USD', 'ICOS-USD', 'ICX-USD', 'KMD-USD', 'LOC-USD', 'LSK-USD', 'LTC-USD', 'MAID-USD', 'MANA-USD', 'MCO-USD', 'NEO-USD', 'NGC-USD', 'NXT-USD', 'OAX-USD', 'OMG-USD', 'PLR-USD', 'PPC-USD', 'PRG-USD', 'QTUM-USD', 'SMART-USD', 'SMS-USD', 'SNC-USD', 'SNT-USD', 'STRAT-USD', 'STU-USD', 'STX-USD', 'SUB-USD', 'SUR-USD', 'SWFTC-USD', 'TNT-USD', 'TRX-USD', 'UGT-USD', 'UTT-USD', 'VEN-USD', 'VERI-USD', 'VIB-USD', 'WMGO-USD', 'WRC-USD', 'XDN-USD', 'XEM-USD', 'XMR-USD', 'XTZ-USD', 'XUC-USD', 'XVG-USD', 'ZEC-USD', 'ZRX-USD', 'ZSC-USD']

#pairs = !ls /media/mikeokslonger/data/returns.parquet/ | grep pair
#pairs = [p.split('=')[1] for p in pairs]

In [None]:
df = pd.concat([create_features(read_returns(pair)) for pair in tqdm.tqdm(pairs)]) # Full dataset
df = df[df.relative_returns.abs() > 0]


  0%|          | 0/87 [00:00<?, ?it/s][A

Returns



  1%|          | 1/87 [00:00<00:22,  3.90it/s][A

features
Returns
features



  2%|▏         | 2/87 [00:13<09:52,  6.97s/it][A

Returns



  3%|▎         | 3/87 [00:14<06:45,  4.83s/it][A

features
Returns



  5%|▍         | 4/87 [00:14<05:07,  3.70s/it][A

features
Returns



  6%|▌         | 5/87 [00:15<04:12,  3.09s/it][A

features
Returns
features



  7%|▋         | 6/87 [00:17<03:50,  2.85s/it][A

Returns
features



  8%|▊         | 7/87 [00:18<03:29,  2.62s/it][A

Returns


In [None]:
print(pairs)

In [4]:
pairs = !ls /media/mikeokslonger/data/returns.parquet/ | grep pair
pairs = [p.split('=')[1] for p in pairs]

In [4]:
class Ticks(Dataset):
    def __init__(self, data, batch_size):
        self.x_data = data[:, 1:2].copy().astype('float32')
        self.y_data = data[:, -1].copy()#.astype('int')
        
        indices = pd.DataFrame(data[:, -2] == 0,columns=['isnegative']).reset_index()
        positive_indices = indices[~indices.isnegative]['index']
        negative_indices = indices[indices.isnegative]['index']
        negative_indices_resampled = negative_indices.sample(frac=len(positive_indices)/len(negative_indices))
        self.new_indices = pd.np.concatenate([positive_indices, negative_indices_resampled])
        self.new_indices.sort()
        
        self.len = len(self.new_indices) - SEQ_LENGTH - ((len(self.new_indices) - SEQ_LENGTH) % batch_size)
        
    
    def __getitem__(self, index):
        new_index = self.new_indices[index]
        x = self.x_data[new_index: new_index + SEQ_LENGTH]
        return x, self.y_data[new_index]
    
    def __len__(self):
#        return 1000000
        return self.len


test_validation_split = int(len(df) * 0.8)
dataset_train = Ticks(df[['time', 'relative_returns', 'buy', 'quantity', 'label', 'label_onehot']].values[:test_validation_split], BATCH_SIZE)
dataset_test = Ticks(df[['time', 'relative_returns', 'buy', 'quantity', 'label', 'label_onehot']].values[test_validation_split:], BATCH_SIZE)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [6]:
### Create Model

class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = SEQ_LENGTH
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        ### (LAYERS, BATCH_SIZE, HIDDEN_SIZE)
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # Hidden
        c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # Cell state
        _, (h_out, _) = self.lstm(x, (h0, c0))
        h_out = h_out.view(-1, self.hidden_size)
        out = self.fc(h_out)
        activated = self.sig(out)
        return activated


lstm = LSTM(NUM_CLASSES, INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS)
lstm.load_state_dict(torch.load('/home/mikeokslonger/Downloads/180.pt', map_location=lambda storage, loc: storage))

In [15]:

    
# TRAIN

#loss_fn = torch.nn.MSELoss()
#loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=LEARNING_RATE)


In [3]:

losses = []
accuracies = []
for epoch in list(range(NUM_EPOCHS)):
    correct = 0
    
    num_positive_returns = 0
    predicted_positive_returns = 0
    num_observations = 0
    for i, (trainX, trainY) in tqdm.tqdm(enumerate(dataloader_train), total=len(dataset_train) // BATCH_SIZE):
        outputs = lstm(trainX)
        correct += torch.max(outputs.data, 1)[1].eq(torch.max(trainY.data, 1)[1]).sum()
        optimizer.zero_grad()
        loss = loss_fn(outputs, trainY)
        loss.backward()
        optimizer.step()
        predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
        num_positive_returns += trainY[:, [1]].sum()
        num_observations += len(trainY)

    accuracies.append(correct.item() / len(dataloader_train.dataset))
    losses.append(loss.item())
    
    clear_output(True)
    print(f'epoch {epoch + 1}/{NUM_EPOCHS}')
    print(f'actual positive returns {num_positive_returns}/{num_observations}')
    print(f'predicted positive_returns {predicted_positive_returns}/{num_observations}')
    
    plotly.iplot(go.Figure(data=[go.Scatter(x=list(range(len(losses))), y=losses, name='loss'),
                                 go.Scatter(x=list(range(len(accuracies))), y=accuracies, name='accuracy', yaxis='y2')],
                           layout=go.Layout(yaxis={'title': 'loss'}, yaxis2={'title': 'accuracy', 'overlaying': 'y', 'side': 'right'})))
    print(outputs)
    
correct = 0
for i, (testX, testY) in tqdm.tqdm(enumerate(dataloader_test), total=len(dataset_test) // BATCH_SIZE):
    outputs = lstm(testX)
    predicted_labels = torch.max(outputs.data, 1)[1]
    correct += predicted_labels.eq(testY[:, [1]].view(-1).long()).sum()

loss = loss_fn(outputs, trainY)
accuracy = correct.item() / len(dataloader_test.dataset)
print(f'loss: {loss}, accuracy: {accuracy}')

epoch 7/10
actual positive returns 233102.0/466000
predicted positive_returns 210004/466000


tensor([[ 0.5000,  0.5000],
        [ 0.5000,  0.5000],
        [ 0.5000,  0.5000],
        ...,
        [ 0.5000,  0.5000],
        [ 0.5000,  0.5000],
        [ 0.5000,  0.5000]])


  1%|          | 3/466 [00:52<2:14:27, 17.42s/it]Process Process-15:
Process Process-16:
Traceback (most recent call last):
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
    r = index_queue.get()
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/multiprocessing/queues.py", line 335, in get
    res = self._reader.recv_bytes()
Traceback (most recent call last):
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/mikeokslonger/miniconda2/envs/python3/lib/python3.6/multiprocessi

RuntimeError: DataLoader worker (pid 102517) exited unexpectedly with exit code 1.

In [None]:
correct = 0
num_positive_returns = 0
predicted_positive_returns = 0
num_observations = 0
for i, (testX, testY) in tqdm.tqdm(enumerate(dataloader_test), total=len(dataset_test) // BATCH_SIZE):
    testX, testY = Variable(testX), Variable(testY)
    outputs = lstm(testX)
    predicted_labels = torch.max(outputs.data, 1)[1]
    correct += predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).sum()
    predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
    num_positive_returns += testY[:, [1]].sum()



In [19]:
loss = loss_fn(outputs, testY)
accuracy = correct.item() / len(dataloader_test.dataset)
num_positive_returns = num_positive_returns.cpu().data[0]

print(f'loss: {loss.data[0]}, accuracy: {accuracy}')
print(f'actual positive returns {num_positive_returns}/{len(dataloader_test.dataset)}: {num_positive_returns/len(dataloader_test.dataset)}')
print(f'predicted positive_returns {predicted_positive_returns}/{len(dataloader_test.dataset)}: {predicted_positive_returns/len(dataloader_test.dataset)}')

loss: 1.0134743452072144, accuracy: 0.5325142857142857
actual positive returns 17191.0/35000: 0.4911714196205139
predicted positive_returns 2291/35000: 0



invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number


invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number

