In [1]:
import torch
torch.cuda.get_device_name(torch.cuda.current_device())

'Tesla V100-SXM2-16GB'

In [2]:
from IPython.display import clear_output
from plotly import offline as plotly
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torch import nn
import plotly.graph_objs as go
import pandas as pd
import torch
import tqdm
import os

plotly.init_notebook_mode(True)
np = pd.np

### Hyper Parameters
LEARNING_RATE = 0.00001 # 0.01 and 0.005 worked eventually
BATCH_SIZE = 25000
NUM_EPOCHS = 1
INPUT_SIZE = NUM_FEATURES = 1
HIDDEN_SIZE = 200
NUM_CLASSES = 2
SEQ_LENGTH = 10
NUM_LAYERS = 1

  (fname, cnt))
  (fname, cnt))


In [None]:
def read_returns(pair):
    path = f's3://mikeokslonger-ticks/returns.parquet/pair={pair}/part.0.parquet'
    df = pd.read_parquet(path, columns=['time', 'relative_returns', 'buy', 'quantity', 'price'])
    return df

def create_features(df):
    one_hot = np.array([[1, 0],
                       [0, 1]], dtype='float32')
    df['buy'] = df['buy'].astype(float)
    df['label'] = (df.shift(-1)['relative_returns'] > 0.).astype(int)
    df['label_onehot'] = df['label'].apply(lambda i: one_hot[i])
    df['time'] = (df['time'] - 20180101000000).astype(float)
    return df[1:]

pairs = ['ADX-USD', 'AIR-USD', 'AMM-USD', 'ATB-USD', 'ATM-USD', 'B2X-USD', 'BCC-USD', 'BCH-USD', 'BCN-USD', 'BMC-USD', 'BNT-USD', 'BQX-USD', 'BTCA-USD', 'BTC-USD', 'BTG-USD', 'BTM-USD', 'BTX-USD', 'CAT-USD', 'CDT-USD', 'CLD-USD', 'CL-USD', 'CND-USD', 'CTR-USD', 'CVC-USD', 'DASH-USD', 'DATA-USD', 'DCN-USD', 'DGB-USD', 'DIM-USD', 'DOGE-USD', 'EBTCOLD-USD', 'EDO-USD', 'EMGO-USD', 'ENJ-USD', 'EOS-USD', 'ETC-USD', 'ETH-USD', 'ETP-USD', 'EVX-USD', 'FUEL-USD', 'FUN-USD', 'ICOS-USD', 'ICX-USD', 'KMD-USD', 'LOC-USD', 'LSK-USD', 'LTC-USD', 'MAID-USD', 'MANA-USD', 'MCO-USD', 'NEO-USD', 'NGC-USD', 'NXT-USD', 'OAX-USD', 'OMG-USD', 'PLR-USD', 'PPC-USD', 'PRG-USD', 'QTUM-USD', 'SMART-USD', 'SMS-USD', 'SNC-USD', 'SNT-USD', 'STRAT-USD', 'STU-USD', 'STX-USD', 'SUB-USD', 'SUR-USD', 'SWFTC-USD', 'TNT-USD', 'TRX-USD', 'UGT-USD', 'UTT-USD', 'VEN-USD', 'VERI-USD', 'VIB-USD', 'WMGO-USD', 'WRC-USD', 'XDN-USD', 'XEM-USD', 'XMR-USD', 'XTZ-USD', 'XUC-USD', 'XVG-USD', 'ZEC-USD', 'ZRX-USD', 'ZSC-USD']
pairs = pairs
df = pd.concat([create_features(read_returns(pair)) for pair in tqdm.tqdm(pairs)]) # Full dataset
df = df[df.relative_returns.abs() > 0]

 41%|████▏     | 36/87 [00:49<01:10,  1.37s/it]

In [4]:
len(df)

38337223

In [5]:
class Ticks(Dataset):
    def __init__(self, data, batch_size):
        self.x_data = data[:, 1:2].copy().astype('float32')
        self.y_data = data[:, -1].copy()#.astype('int')
        
        indices = pd.DataFrame(data[:, -2] == 0,columns=['isnegative']).reset_index()
        positive_indices = indices[~indices.isnegative]['index']
        negative_indices = indices[indices.isnegative]['index']
        negative_indices_resampled = negative_indices.sample(frac=len(positive_indices)/len(negative_indices))
        self.new_indices = pd.np.concatenate([positive_indices, negative_indices_resampled])
        self.new_indices.sort()
        
        self.len = len(self.new_indices) - SEQ_LENGTH - ((len(self.new_indices) - SEQ_LENGTH) % batch_size)
        
    
    def __getitem__(self, index):
        new_index = self.new_indices[index]
        x = self.x_data[new_index: new_index + SEQ_LENGTH]
        return x, self.y_data[new_index]
    
    def __len__(self):
#        return 1000000
        return self.len


test_validation_split = int(len(df) * 0.8)
dataset_train = Ticks(df[['time', 'relative_returns', 'buy', 'quantity', 'label', 'label_onehot']].values[:test_validation_split], BATCH_SIZE)
dataset_test = Ticks(df[['time', 'relative_returns', 'buy', 'quantity', 'label', 'label_onehot']].values[test_validation_split:], BATCH_SIZE)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [10]:
### Create Model

class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = SEQ_LENGTH
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        ### (LAYERS, BATCH_SIZE, HIDDEN_SIZE)
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # Hidden
        c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # Cell state
        _, (h_out, _) = self.lstm(x, (h0, c0))
        h_out = h_out.view(-1, self.hidden_size)
        out = self.fc(h_out)
        activated = self.sig(out)
        return activated


losses = []
accuracies = []
test_accuracies = []
test_predicted_positives = []

# TRAIN

lstm = LSTM(NUM_CLASSES, INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS).cuda()
lstm.load_state_dict(torch.load('models/180.pt'))

In [11]:
#loss_fn = torch.nn.MSELoss()
#loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=LEARNING_RATE)

In [12]:
for epoch in list(range(300, 400)):
    correct = 0
    num_positive_returns = 0
    predicted_positive_returns = 0
    num_observations = 0
    for i, (trainX, trainY) in tqdm.tqdm(enumerate(dataloader_train), total=len(dataset_train) // BATCH_SIZE):
        trainX, trainY = Variable(trainX).cuda(), Variable(trainY).cuda()
        outputs = lstm(trainX)
        correct += torch.max(outputs.data, 1)[1].eq(torch.max(trainY.data, 1)[1]).sum()
        optimizer.zero_grad()
        loss = loss_fn(outputs, trainY)
        loss.backward()
        optimizer.step()
        predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
        num_positive_returns += trainY[:, [1]].sum()
        num_observations += len(trainY)

    accuracies.append(correct / len(dataloader_train.dataset))
    losses.append(loss.cpu().data[0])
    num_positive_returns = num_positive_returns.cpu().data[0]

    clear_output(True)
    print(f'epoch {epoch + 1}/{NUM_EPOCHS}')
    print(f'actual positive returns {num_positive_returns}/{num_observations}: {num_positive_returns/num_observations}')
    print(f'predicted positive_returns {predicted_positive_returns}/{num_observations}: {predicted_positive_returns/num_observations}')
    
    correct = 0
    num_positive_returns = 0
    predicted_positive_returns = 0
    num_observations = 0
    for i, (testX, testY) in tqdm.tqdm(enumerate(dataloader_test), total=len(dataset_test) // BATCH_SIZE):
        testX, testY = Variable(testX).cuda(), Variable(testY).cuda()
        outputs = lstm(testX)
        predicted_labels = torch.max(outputs.data, 1)[1]
        correct += predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).sum()
        predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
        num_positive_returns += testY[:, [1]].sum()

    loss = loss_fn(outputs, trainY)
    accuracy = correct / len(dataloader_test.dataset)
    test_accuracies.append(accuracy)
    num_positive_returns = num_positive_returns.cpu().data[0]
    test_predicted_positives.append(predicted_positive_returns/len(dataloader_test.dataset))
    
    print(f'loss: {loss.data[0]}, accuracy: {accuracy}')
    print(f'actual positive returns {num_positive_returns}/{len(dataloader_test.dataset)}: {num_positive_returns/len(dataloader_test.dataset)}')
    print(f'predicted positive_returns {predicted_positive_returns}/{len(dataloader_test.dataset)}: {predicted_positive_returns/len(dataloader_test.dataset)}')
    
    plotly.iplot(go.Figure(data=[go.Scatter(x=list(range(len(losses))), y=losses, name='loss'),
                                 go.Scatter(x=list(range(len(accuracies))), y=accuracies, name='train_accuracy', yaxis='y2'),
                                 go.Scatter(x=list(range(len(test_accuracies))), y=test_accuracies, name='test_accuracy', yaxis='y2'),
                                 go.Scatter(x=list(range(len(test_predicted_positives))), y=test_predicted_positives, name='test_predicted_positives', yaxis='y2')],
                           layout=go.Layout(yaxis={'title': 'loss'}, yaxis2={'title': 'accuracy', 'overlaying': 'y', 'side': 'right'})))
    plotly.plot(go.Figure(data=[go.Scatter(x=list(range(len(losses))), y=losses, name='loss'),
                                go.Scatter(x=list(range(len(accuracies))), y=accuracies, name='train_accuracy', yaxis='y2'),
                                go.Scatter(x=list(range(len(test_accuracies))), y=test_accuracies, name='test_accuracy', yaxis='y2'),
                                go.Scatter(x=list(range(len(test_predicted_positives))), y=test_predicted_positives, name='test_predicted_positives', yaxis='y2')],
                          layout=go.Layout(yaxis={'title': 'loss'}, yaxis2={'title': 'accuracy', 'overlaying': 'y', 'side': 'right'})),
               filename=f'models/training{epoch}.html')
    data = pd.DataFrame(outputs.cpu().data.numpy())
    plotly.plot(go.Figure(data=[go.Scatter(x=data.sample(n=1000).values[:, 0], y=data.sample(n=1000).values[:, 1], mode='markers')],
                       layout=go.Layout(width=600, height=600)), filename=f'models/scatter{epoch}.html')
    print(outputs)
    torch.save(lstm.state_dict(), f'models/{epoch}.pt')


epoch 218/1
actual positive returns 2511737.0/5025000: 4.7842609523809525
predicted positive_returns 3642226/5025000: 0.7248210945273632


100%|██████████| 21/21 [00:04<00:00,  4.66it/s]

loss: 1.4986577033996582, accuracy: 0.7441638095238096
actual positive returns 261045.0/525000: 0.4972285714285714
predicted positive_returns 378141/525000: 0.7202685714285715





Variable containing:
 9.8710e-01  1.3803e-02
 1.3796e-01  8.5563e-01
 3.8216e-01  6.2838e-01
           ⋮            
 2.9930e-01  6.9254e-01
 2.5692e-01  7.6269e-01
 3.3767e-01  6.7067e-01
[torch.cuda.FloatTensor of size 25000x2 (GPU 0)]



 20%|█▉        | 40/201 [00:10<00:41,  3.89it/s]Process Process-73:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-74:
  File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 55, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 135, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 135, in <listcomp>
    return [default_collate(samples) for samples in t

KeyboardInterrupt: 

In [13]:
correct = 0
num_positive_returns = 0
predicted_positive_returns = 0
num_observations = 0
for i, (testX, testY) in tqdm.tqdm(enumerate(dataloader_test), total=len(dataset_test) // BATCH_SIZE):
    testX, testY = Variable(testX).cuda(), Variable(testY).cuda()
    outputs = lstm(testX)
    predicted_labels = torch.max(outputs.data, 1)[1]
    correct += predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).sum()
    predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
    num_positive_returns += testY[:, [1]].sum()

loss = loss_fn(outputs, testY)
accuracy = correct / len(dataloader_test.dataset)
num_positive_returns = num_positive_returns.cpu().data[0]

print(f'loss: {loss.data[0]}, accuracy: {accuracy}')
print(f'actual positive returns {num_positive_returns}/{len(dataloader_test.dataset)}: {num_positive_returns/len(dataloader_test.dataset)}')
print(f'predicted positive_returns {predicted_positive_returns}/{len(dataloader_test.dataset)}: {predicted_positive_returns/len(dataloader_test.dataset)}')

100%|██████████| 21/21 [00:05<00:00,  3.55it/s]

loss: 0.5318581461906433, accuracy: 0.7052266666666667
actual positive returns 260951.0/525000: 0.49704952380952383
predicted positive_returns 368879/525000: 0.7026266666666666





In [11]:
lstm.load_state_dict(torch.load('models/2.pt'))