In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler

import torch
import torchvision
import torch.utils.data as tdata
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

np.random.seed(4)
torch.manual_seed(7);

# Data Preprocessing

In [2]:
train_df = pd.read_csv('train.csv')
train_df.head(5)

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [3]:
# Get train and test data as np arrays
X0, Y = train_df.values[:, :-1], train_df.values[:, -1]
X0.shape, Y.shape

((592380, 27), (592380,))

In [4]:
# Only columns 3 and 4 (opened_position_qty and closed_position_qty) have NaN's
# Have to decide how to handle NaN's at some point
print(f'Cols where train data is Nan: {np.where(np.any(np.isnan(X0), axis=0))[0]}')

Cols where train data is Nan: [3 4]


### Get a process function that takes some data and processes it for input to the model
Wrapper is a decorator that binds the paramaters from train data to the process function,
so process can be a one argument function that takes any data (e.g. test data), and 
normalizes / processes using saved paramaters from train data.

In [5]:
def wrapper(X0):
    def get_pars_for_processing(X0):
        keep_cols = np.all(~np.isnan(X0), axis=0) # Drop NaN columns
        keep_cols[0] = False # Don't keep id
        scaler = StandardScaler()
        scaler.fit(X0[:, keep_cols])
        return scaler, keep_cols
    pars = get_pars_for_processing(X0)
    def decorator(proc_func):
        return lambda X: proc_func(X, pars)
    return decorator

@wrapper(X0)
def process(X, params):
    '''
    Function that takes training / test data, 
    and process it for training / evaluation
    '''
    scaler, keep_cols = params
    return scaler.transform(X[:, keep_cols])

X = process(X0)
(N, D) = X.shape

# Model attempt 1

In [6]:
### Just gonna take a random 10th for validation
val_size = N // 10
inds = np.random.permutation(N)
X, valX = X[inds[:-val_size]], X[inds[-val_size:]]
Y, valY = Y[inds[:-val_size]], Y[inds[-val_size:]]

In [7]:
batch_size = 32
X, Y = torch.Tensor(X), torch.Tensor(Y)
valX, valY = torch.Tensor(valX), torch.Tensor(valY)


train_loader = tdata.DataLoader(tdata.TensorDataset(X, Y), \
                                     batch_size=batch_size,\
                                     shuffle=True)
val_loader = tdata.DataLoader(tdata.TensorDataset(valX, valY), \
                                     batch_size=batch_size,\
                                     shuffle=True)

### Notes on sigmoid
Either Option 1:

1) No nn.Sigmoid() layer, 2) Use nn.BCEWithLogitsLoss, and 3) apply torch.sigmoid() to output to
get probabilities.
This option is more numerically stable

OR Option 2:

1) Use nn.Sigmoid() layer, 2) Use nn.BCELoss(), and 3) Don't need torch.sigmoid() at the prediction step

In [8]:
# Simple one hidden layer (k units) logistic regression
k = 64
model = nn.Sequential(
        nn.Linear(X.shape[1], k),
        nn.ReLU(), 
        nn.Dropout(0.3),
        nn.Linear(k, k//4),
        nn.ReLU(), 
        nn.Dropout(0.3),
        nn.Linear(k//4, 1),
        #nn.Sigmoid(); #Option 2
    )

criterion = nn.BCEWithLogitsLoss()
#criterion = nn.BCELoss(); #Option 2
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [9]:
# Some layers, such as Dropout, behave differently during training
model.train()

for epoch in range(10):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Calculate loss (broadcast target to (B, 1), where B is batch size)
        loss = criterion(output, target.unsqueeze(1)) 

        # Backward pass
        loss.backward()
        
        # Weight update
        optimizer.step()

    # Track loss each epoch
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

Train Epoch: 1  Loss: 0.6226
Train Epoch: 2  Loss: 0.5733
Train Epoch: 3  Loss: 0.6239
Train Epoch: 4  Loss: 0.6599
Train Epoch: 5  Loss: 0.5208
Train Epoch: 6  Loss: 0.7705
Train Epoch: 7  Loss: 0.7291
Train Epoch: 8  Loss: 0.6272
Train Epoch: 9  Loss: 0.5971
Train Epoch: 10  Loss: 0.6092


In [10]:
# Putting layers like Dropout into evaluation mode
model.eval()

val_loss = 0
correct = 0

# Turning off automatic differentiation
with torch.no_grad():
    for i, (data, target) in enumerate(val_loader):
        output = torch.sigmoid(model(data))
        # output = model(data); #Option 2
        
        val_loss += criterion(output, target.unsqueeze(1)).item()  # Sum up batch loss
        pred = output.round()
        correct += (pred.eq(target.view_as(pred))).sum().item()

val_loss /= len(val_loader.dataset)

print('Val set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
      (val_loss, correct, len(val_loader.dataset),
       100. * correct / len(val_loader.dataset)))

Val set: Average loss: 0.0236, Accuracy: 38599/59238 (65.1592)


# Get Predictions on Test Set

In [11]:
test_df = pd.read_csv('test.csv')
tX = process(test_df.values)

with torch.no_grad():
    output = torch.sigmoid(model(torch.Tensor(tX)))
    #output = model(torch.Tensor(tX)); #Option 2
    
output_df = pd.DataFrame({'id':test_df['id'], 'Predicted':output.data.numpy().squeeze()})
output_df.to_csv('submission.csv', index=False)

# Old Code

In [None]:
# Get some data to make sure logistic regression worked lmao
import sys
sys.path.append('~/git/MLModels')
from MLModels import utils as u
from MLModels import linearModels as lm

f, line = u.genF(zero_one=True)
X, Y = u.genData(f, 10000)
(N, D) = X.shape