In [1]:
from IPython.display import clear_output
from plotly import offline as plotly
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torch import nn
import plotly.graph_objs as go
import pandas as pd
import torch
import tqdm
import os

plotly.init_notebook_mode(True)
np = pd.np

### Hyper Parameters
LEARNING_RATE = 0.001 # 0.01 and 0.005 worked eventually
BATCH_SIZE = 100000
NUM_EPOCHS = 1
INPUT_SIZE = NUM_FEATURES = 1
HIDDEN_SIZE = 200
NUM_CLASSES = 2
SEQ_LENGTH = 10
NUM_LAYERS = 1

In [2]:
def read_returns(pair):
    try:
        path = f's3://mikeokslonger-ticks/returns.parquet/pair={pair}/part.0.parquet'
        path = f'/home/mikeokslonger/data_unseen/returns.parquet/pair={pair}/part.0.parquet'
        path = f'/home/mikeokslonger/data/returns.parquet/pair={pair}/part.0.parquet'
        df = pd.read_parquet(path, columns=['time', 'relative_returns', 'buy', 'quantity', 'price'], engine='fastparquet')
        return df
    except FileNotFoundError:
        pass

def create_features(df):
    if df is None:
        return None
    one_hot = np.array([[1, 0],
                       [0, 1]], dtype='float32')
    return df.assign(buy=lambda x: x.buy.astype(float)) \
        .assign(label=lambda x: (x.shift(-1).relative_returns > 0.).astype(int)) \
        .assign(label_onehot=lambda x: x['label'].apply(lambda i: one_hot[i])) \
        .assign(time=lambda x: (x['time'] - 20180101000000).astype(float))[1:]

def filter_null_returns(df):
    return df[df.relative_returns.abs() > 0.]

In [3]:
pairs = ['ADX-USD', 'AIR-USD', 'AMM-USD', 'ATB-USD', 'ATM-USD', 'B2X-USD', 'BCC-USD', 'BCH-USD', 'BCN-USD', 'BMC-USD', 'BNT-USD', 'BQX-USD', 'BTCA-USD', 'BTC-USD', 'BTG-USD', 'BTM-USD', 'BTX-USD', 'CAT-USD', 'CDT-USD', 'CLD-USD', 'CL-USD', 'CND-USD', 'CTR-USD', 'CVC-USD', 'DASH-USD', 'DATA-USD', 'DCN-USD', 'DGB-USD', 'DIM-USD', 'DOGE-USD', 'EBTCOLD-USD', 'EDO-USD', 'EMGO-USD', 'ENJ-USD', 'EOS-USD', 'ETC-USD', 'ETH-USD', 'ETP-USD', 'EVX-USD', 'FUEL-USD', 'FUN-USD', 'ICOS-USD', 'ICX-USD', 'KMD-USD', 'LOC-USD', 'LSK-USD', 'LTC-USD', 'MAID-USD', 'MANA-USD', 'MCO-USD', 'NEO-USD', 'NGC-USD', 'NXT-USD', 'OAX-USD', 'OMG-USD', 'PLR-USD', 'PPC-USD', 'PRG-USD', 'QTUM-USD', 'SMART-USD', 'SMS-USD', 'SNC-USD', 'SNT-USD', 'STRAT-USD', 'STU-USD', 'STX-USD', 'SUB-USD', 'SUR-USD', 'SWFTC-USD', 'TNT-USD', 'TRX-USD', 'UGT-USD', 'UTT-USD', 'VEN-USD', 'VERI-USD', 'VIB-USD', 'WMGO-USD', 'WRC-USD', 'XDN-USD', 'XEM-USD', 'XMR-USD', 'XTZ-USD', 'XUC-USD', 'XVG-USD', 'ZEC-USD', 'ZRX-USD', 'ZSC-USD']
pairs = pairs
df = pd.concat([read_returns(pair).pipe(filter_null_returns).pipe(create_features) for pair in tqdm.tqdm(pairs)]) # Full dataset

100%|██████████| 87/87 [00:36<00:00,  2.40it/s]


In [7]:
class Ticks(Dataset):
    def __init__(self, data, batch_size):
        self.x_data = data[:, 1:2].copy().astype('float32')
        self.y_data = data[:, -1].copy()#.astype('int')
        
        indices = pd.DataFrame(data[:, -2] == 0,columns=['isnegative']).reset_index()
        positive_indices = indices[~indices.isnegative]['index']
        negative_indices = indices[indices.isnegative]['index']
        negative_indices_resampled = negative_indices.sample(frac=len(positive_indices)/len(negative_indices))
        self.new_indices = pd.np.concatenate([positive_indices, negative_indices_resampled])
        self.new_indices.sort()
        
        self.len = len(self.new_indices) - SEQ_LENGTH - ((len(self.new_indices) - SEQ_LENGTH) % batch_size)
        
    
    def __getitem__(self, index):
        new_index = self.new_indices[index]
        x = self.x_data[new_index: new_index + SEQ_LENGTH]
        return x, self.y_data[new_index]
    
    def __len__(self):
        return self.len


dataset_test = Ticks(df[['time', 'relative_returns', 'buy', 'quantity', 'label', 'label_onehot']].values, BATCH_SIZE)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [8]:
num_positive = 0
num_negative = 0

for i in tqdm.tqdm(list(range(len(dataset_test)))):
    positive = dataset_test[i][1][1] == 1
    if positive:
        num_positive += 1
    else:
        num_negative += 1

100%|██████████| 38300000/38300000 [02:21<00:00, 270854.36it/s]


In [9]:
print(num_positive)
print(num_negative)

19150021
19149979


In [19]:
print(num_positive)
print(num_negative)

2768239
2731761


In [77]:
### Create Model

class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = SEQ_LENGTH
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        ### (LAYERS, BATCH_SIZE, HIDDEN_SIZE)
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # Hidden
        c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # Cell state
        _, (h_out, _) = self.lstm(x, (h0, c0))
        h_out = h_out.view(-1, self.hidden_size)
        out = self.fc(h_out)
        activated = self.sig(out)
        return activated


losses = []
accuracies = []
test_accuracies = []
test_predicted_positives = []

# TRAIN

lstm = LSTM(NUM_CLASSES, INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS)
lstm.load_state_dict(torch.load('model202.pt', map_location=lambda a, b: a))

In [78]:
class BCEUniformLoss(torch.nn.BCELoss):
    def forward(self, input, target):
        normal_result = super(BCEUniformLoss, self).forward(input, target)
        skew = torch.abs(input.data[:, [1]].sum() - trainY[:, [1]].sum())
        adjustment_factor = (skew / trainY[:, [1]].sum()) ** 2
        return normal_result + normal_result * adjustment_factor

In [93]:
testX.shape

torch.Size([100000, 10, 1])

In [79]:
#loss_fn = torch.nn.MSELoss()
#loss_fn = torch.nn.CrossEntropyLoss()
#loss_fn = torch.nn.BCELoss()
loss_fn = BCEUniformLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=LEARNING_RATE)

In [80]:
correct = 0
num_positive_returns = 0
predicted_positive_returns = 0
num_observations = 0
for i, (testX, testY) in tqdm.tqdm(enumerate(dataloader_test), total=len(dataset_test) // BATCH_SIZE):
    testX, testY = Variable(testX), Variable(testY)
    outputs = lstm(testX)
    predicted_labels = torch.max(outputs.data, 1)[1]
    correct += predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).sum()
    predicted_positive_returns += torch.max(outputs.data, 1)[1].sum()
    num_positive_returns += testY[:, [1]].sum()

predicted_positive_returns = predicted_positive_returns.item()
accuracy = correct.item() / len(dataloader_test.dataset)
test_accuracies.append(accuracy)
num_positive_returns = num_positive_returns.cpu().data[0]
test_predicted_positives.append(predicted_positive_returns/len(dataloader_test.dataset))

print(f'accuracy: {accuracy}')
print(f'actual positive returns {num_positive_returns}/{len(dataloader_test.dataset)}: {num_positive_returns/len(dataloader_test.dataset)}')
print(f'predicted positive_returns {predicted_positive_returns}/{len(dataloader_test.dataset)}: {predicted_positive_returns/len(dataloader_test.dataset)}')

100%|██████████| 4/4 [00:58<00:00, 14.74s/it]

accuracy: 0.7198
actual positive returns 206133.0/400000: 0.5153325200080872
predicted positive_returns 252309/400000: 0.6307725




invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number



100%|██████████| 436/436 [01:59<00:00,  3.64it/s]

accuracy: 0.6644564220183486
actual positive returns 218047.0/436000: 0.5001078248023987
predicted positive_returns 256768/436000: 0.5889174311926606




invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number



In [18]:
accuracy

0.7171467889908257

In [None]:
#172.pt = 66% accurate, 59% positive
#202.pt = 71% accurate, 62% positive
#180.pt = 75% accurate, 73% positive

In [19]:
lstm

LSTM(
  (lstm): LSTM(1, 200, batch_first=True)
  (fc): Linear(in_features=200, out_features=2, bias=True)
  (sig): Sigmoid()
)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [21]:
count_parameters(lstm)

162802

In [81]:
correct_ones = (predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).long() & (predicted_labels.cpu().long() == 1).long()).sum()
incorrect_ones = ((predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).long() == 0).long() & (predicted_labels.cpu().long() == 1).long()).sum()
correct_zeroes = (predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).long() & (predicted_labels.cpu().long() == 0).long()).sum()
incorrect_zeroes = ((predicted_labels.cpu().eq(testY.cpu().data[:, [1]].view(-1).long()).long() == 0).long() & (predicted_labels.cpu().long() == 0).long()).sum()


In [82]:
correct_ones

tensor(43238)

In [83]:
incorrect_ones

tensor(19709)

In [84]:
correct_zeroes

tensor(28856)

In [85]:
incorrect_zeroes

tensor(8197)

In [86]:
correct_ones + incorrect_ones + correct_zeroes + incorrect_zeroes

tensor(1.0000e+05)