## Loading Packages

In [134]:
from deeplearning import *
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from preprocessing import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from utils import *
import warnings

# plt.style.use('dark_background')
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")
# Fix the random seed to 42; set_seed arrives via one of the star imports,
# presumably seeding numpy/torch — confirm in its definition.
set_seed(42)

True

### Ensuring that training is done on GPU if available

In [18]:
# Choose the compute device: CUDA is used only when at least one GPU is
# visible and the CUDA runtime reports itself as available; otherwise
# everything runs on the CPU.
if not (torch.cuda.device_count() > 0 and torch.cuda.is_available()):
    device = 'cpu'
    print("No GPU available!")
else:
    device = 'cuda'
    print("Cuda installed! Running on GPU!")

No GPU available!


## Loading Data

In [7]:
# Location handed to universe_select; presumably the folder holding the raw
# data files, relative to the notebook — confirm against universe_select.
path = "Data/"
# Build the instrument universe for the "Cu" selection (presumably copper).
universe_dict = universe_select(path, "Cu")

## Preprocessing

In [8]:
# Renaming the columns to price
universe_dict = price_rename(universe_dict)
# Cleaning the dataset of any erroneous datapoints
universe_dict = clean_dict_gen(universe_dict)
# Making sure that all the points in the window have consistent length
universe_dict = truncate_window_length(universe_dict)
# Assemble the modelling dataframe from the cleaned universe.
# lg_returns_only=True presumably restricts the features to log returns —
# confirm in generate_dataset.
df_full = generate_dataset(universe_dict, lg_returns_only=True)

Included Instrument:
cu_shfe
cu_lme
cu_comex_p
cu_comex_s
peso
sol
bdi
ted
vix
skew
gsci


The target column represents the log returns at one forecast length out in the future for the instrument of interest (aluminium or copper prices on the London Metal Exchange).

To normalise the independent variables, the 1 day log returns between closing prices have been used.

In [9]:
# Visualise the plots
# visualise_universe(universe_dict)
# Keep only the "target" column; the double brackets keep it as a
# single-column DataFrame rather than a Series.
df = df_full[["target"]]

In [10]:
# Preview the first five rows of the full dataset (features plus target)
df_full.head(5)
# df_full.tail(5)

Unnamed: 0_level_0,cu_shfe,cu_lme,cu_comex_p,cu_comex_s,peso,sol,bdi,ted,vix,skew,gsci,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006-09-13,0.001583,-0.013602,0.00059,0.055554,-0.000558,0.000307,0.029244,-0.004276,-0.064091,-0.048458,0.002624,-0.001404
2006-09-14,0.025695,-0.006098,-0.0031,-0.104881,0.000707,-0.001692,0.018715,0.005088,0.032559,-0.008086,-0.010694,0.021675
2006-09-15,-0.02771,-0.021468,-0.019557,0.041104,-0.001172,0.000154,0.01205,0.002636,0.018019,0.12406,-0.004952,0.043012
2006-09-18,0.006177,0.024153,0.030146,0.032495,0.0,-0.000924,0.00492,0.002023,0.001699,-0.060186,0.007824,0.01345
2006-09-19,0.023214,0.007613,-0.012512,0.138083,0.0,0.0,-0.000935,-0.000202,0.016835,-0.018385,-0.019642,0.025225


Normalise data

Split into training, validation and test data

Use validation to tune hyperparameters

Perform predictions on the test dataset

### Creating dataset for pytorch

In [52]:
# Features: every column of df_full whose name is not 'target'
df_X = df_full.loc[:, df_full.columns != 'target']
# Targets: only the 'target' column, kept as a DataFrame (boolean column mask)
df_y = df_full.loc[:, df_full.columns == 'target']

In [122]:
class DeepLearning():
    """Prepare data for, and train, a PyTorch regression model.

    The class takes a feature dataframe and a target dataframe, splits them
    in row order (no shuffling) into training / validation / test sets,
    scales the features, wraps the tensors in data loaders and runs
    training epochs.

    Typical call order::

        learning = DeepLearning(df_X, df_y, model=model, optimiser=optimiser)
        learning.train_val_test()
        learning.normalise()
        learning.create_data_loaders()
        epoch_loss = learning.train()

    Args:
        df_X (pd.DataFrame): features, one row per observation.
        df_y (pd.DataFrame): targets, same number of rows as ``df_X``.
        loss_function: callable ``loss(y_pred, y_true)``. Defaults to a
            freshly created ``torch.nn.MSELoss(reduction="sum")``.
        device (str): device the batches are moved to during training.
        seed (int): stored on the instance; not used by the class itself.
        model (torch.nn.Module, optional): model to train. It should
            already live on ``device``.
        optimiser (torch.optim.Optimizer, optional): optimiser built over
            ``model.parameters()``.
        batch_size (int): default batch size of the data loaders.

    Raises:
        AssertionError: if the inputs are not dataframes, differ in length
            or are empty.
    """
    def __init__(self, df_X, df_y,
                 loss_function=None,
                 device="cpu", seed=42,
                 model=None, optimiser=None, batch_size=64):

        # The default loss is created per instance instead of once in the
        # signature, so instances never share a module object.
        # reduction="sum" is the supported spelling of the deprecated
        # size_average=False.
        if loss_function is None:
            loss_function = torch.nn.MSELoss(reduction="sum")

        self.df_X = df_X
        self.df_y = df_y

        self.model = model
        self.optimiser = optimiser
        self.loss_function = loss_function
        self.device = device
        self.seed = seed
        self.batch_size = batch_size

        assert isinstance(self.df_X, pd.DataFrame), "df_X must be a pandas DataFrame"
        assert isinstance(self.df_y, pd.DataFrame), "df_y must be a pandas DataFrame"
        assert len(self.df_X.index) == len(self.df_y.index), "df_X and df_y differ in length"
        assert len(self.df_X.index) > 0, "df_X is empty"

        # Filled in by train_val_test()
        self.X_train = None
        self.X_val = None
        self.X_test = None

        self.y_train = None
        self.y_val = None
        self.y_test = None

        # Filled in by normalise()
        self.scaler = None

        # Filled in by create_data_loaders()
        self.train_loader = None
        self.val_loader = None
        self.test_loader = None

    @staticmethod
    def _frame_to_tensor(frame):
        """Returns the values of a dataframe as a float32 torch tensor."""
        return torch.from_numpy(frame.values).float()

    @staticmethod
    def _as_array(values):
        """Returns a tensor (or array-like) as a numpy array."""
        if torch.is_tensor(values):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    def train_val_test(self):
        """Splits the dataframes into a training, validation and test set
        and creates torch tensors from the underlying numpy arrays.

        The split keeps the row order (``shuffle=False``): the last 20% of
        the rows become the test set, and the last 25% of the remainder
        become the validation set, i.e. roughly 60/20/20.
        """
        # Hold out the test set first, then carve validation from the rest
        X_rest, X_test, y_rest, y_test = train_test_split(
            self.df_X, self.df_y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest, test_size=0.25, shuffle=False)

        # Report the lengths of this instance's own split (not of any
        # same-named notebook globals)
        print("Train Length: \t\t%i\nValidation Length: \t%i\nTest Length:\t\t%i" % (len(X_train), len(X_val), len(X_test)))

        # Tensors of training data and labels
        self.X_train = self._frame_to_tensor(X_train)
        self.y_train = self._frame_to_tensor(y_train)

        # Tensors of validation data and labels
        self.X_val = self._frame_to_tensor(X_val)
        self.y_val = self._frame_to_tensor(y_val)

        # Tensors of test data and labels
        self.X_test = self._frame_to_tensor(X_test)
        self.y_test = self._frame_to_tensor(y_test)

        # Size Check
        print("\nX Train Shape:\t\t", self.X_train.size())
        print("X Val Shape:\t\t", self.X_val.size())
        print("X Test Shape:\t\t", self.X_test.size())

        print("\ny Train Shape:\t\t", self.y_train.size())
        print("y Val Shape:\t\t", self.y_val.size())
        print("y Test Shape:\t\t", self.y_test.size())

    def normalise(self):
        """Scales the features with a MinMaxScaler fitted on the training set.

        The scaler is fitted on the training features only and then applied
        unchanged to the validation and test features, so no information
        from those sets leaks into the scaling. The results are stored back
        as float32 tensors and the fitted scaler is kept in ``self.scaler``.

        Raises:
            RuntimeError: if ``train_val_test`` has not been called yet.
        """
        # Local import: the notebook only gets this name, if at all, through
        # a star import, so it is resolved explicitly here.
        from sklearn.preprocessing import MinMaxScaler

        if self.X_train is None or self.X_val is None or self.X_test is None:
            raise RuntimeError("call train_val_test() before normalise()")

        scaler = MinMaxScaler()

        # scikit-learn returns numpy arrays, so each result is converted
        # back to a tensor; otherwise TensorDataset cannot wrap it later.
        self.X_train = torch.from_numpy(
            scaler.fit_transform(self._as_array(self.X_train))).float()
        self.X_val = torch.from_numpy(
            scaler.transform(self._as_array(self.X_val))).float()
        self.X_test = torch.from_numpy(
            scaler.transform(self._as_array(self.X_test))).float()

        self.scaler = scaler

        # TODO implement output scaling for MTL because otherwise the scale
        # of the different outputs will lead to dominance of some tasks

    def create_data_loaders(self, batch_size=None):
        """Forms iterators to pipeline in the data.

        Args:
            batch_size (int, optional): batch size for all three loaders;
                defaults to the ``batch_size`` given to the constructor.

        Returns:
            tuple: ``(train_loader, val_loader, test_loader)``, which are
            also stored on the instance under the same names.

        Raises:
            RuntimeError: if ``train_val_test`` has not been called yet.
        """
        if batch_size is None:
            batch_size = self.batch_size

        splits = (self.X_train, self.y_train,
                  self.X_val, self.y_val,
                  self.X_test, self.y_test)
        if any(split is None for split in splits):
            raise RuntimeError("call train_val_test() before create_data_loaders()")

        # Create tensor datasets
        train_dataset = TensorDataset(self.X_train, self.y_train)
        val_dataset = TensorDataset(self.X_val, self.y_val)
        test_dataset = TensorDataset(self.X_test, self.y_test)

        # Data loaders; shuffle stays off so the row order is preserved
        self.train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
        self.val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
        self.test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

        return self.train_loader, self.val_loader, self.test_loader

    def train(self):
        """Runs one training epoch over the training loader.

        Returns:
            float: the loss averaged over all training samples of the epoch.

        Raises:
            RuntimeError: if no model/optimiser was supplied or the data
                loaders have not been created.
        """
        if self.model is None or self.optimiser is None:
            raise RuntimeError("a model and an optimiser are required for training")
        if self.train_loader is None:
            raise RuntimeError("call create_data_loaders() before train()")

        # Sets the model to train mode
        self.model.train()

        # A "sum"-reduced loss is already a per-batch total; a mean-reduced
        # loss has to be multiplied by the batch length to get that total.
        loss_is_summed = getattr(self.loss_function, "reduction", "mean") == "sum"

        train_loss = 0.
        n_samples = 0

        # The data loader creates batches of data to train
        for X_train_batch, y_train_batch in self.train_loader:

            X_train_batch = X_train_batch.to(self.device)
            y_train_batch = y_train_batch.to(self.device)

            # Zeros the gradients
            self.optimiser.zero_grad()

            # Perform forward pass
            y_pred = self.model(X_train_batch)

            # A model returning (batch,) against targets of (batch, 1) would
            # silently broadcast inside the loss; align the shapes instead.
            if (y_pred.shape != y_train_batch.shape
                    and y_pred.numel() == y_train_batch.numel()):
                y_pred = y_pred.reshape(y_train_batch.shape)

            # Calculate loss for the batch
            loss = self.loss_function(y_pred, y_train_batch)

            # Perform backward pass
            loss.backward()

            # Accumulate the total training loss
            batch_len = X_train_batch.size(0)
            batch_loss = loss.item()
            train_loss += batch_loss if loss_is_summed else batch_loss * batch_len
            n_samples += batch_len

            # TODO Put in an accuracy function as well

            # Update Parameters
            self.optimiser.step()

        return train_loss / n_samples if n_samples else 0.0
    

In [123]:
# Hyperparameters
lr = 1e-2
momentum = 0.5
batch_size = 64
test_batch_size = 1000
n_epochs = 2
weight_decay=0

# Features are every column except 'target'; the target stays a DataFrame
df_X = df_full.loc[:, df_full.columns != 'target']
df_y = df_full.loc[:, df_full.columns == 'target']

# optimiser = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

# TODO: check whether the data needs to be declared explicitly in the dataset

# Forward the device detected at the top of the notebook so the batches are
# moved to the GPU when one is available (the class defaults to "cpu").
learning = DeepLearning(df_X, df_y, device=device)
# Splitting the data into the train, validation and test sets
learning.train_val_test()
# Scaling the data by the training dataset
learning.normalise()

Train Length: 		1999
Validation Length: 	667
Test Length:		667

X Train Shape:		 torch.Size([1999, 11])
X Val Shape:		 torch.Size([667, 11])
X Test Shape:		 torch.Size([667, 11])

y Train Shape:		 torch.Size([1999, 1])
y Val Shape:		 torch.Size([667, 1])
y Test Shape:		 torch.Size([667, 1])


In [None]:
class LSTMnet(nn.Module):
    """A Long Short Term Memory network model.

    A stack of LSTM layers followed by a single linear layer that maps the
    hidden state of the last time step to the output.

    Args:
        num_features (int): number of input features per time step.
        hidden_dim (int): size of the LSTM hidden state.
        num_layers (int): number of stacked LSTM layers.
        output_dim (int): size of the prediction per sample.
        batch_size (int): default batch size used by ``init_hidden`` when
            none is given.
        debug (bool): stored on the instance; not used by the class itself.
        dropout (float): dropout between stacked LSTM layers. Only applied
            when ``num_layers > 1``, since it acts between layers.
    """

    def __init__(self, num_features, hidden_dim, num_layers, output_dim,
                 batch_size, debug=True, dropout=0.0):
        super(LSTMnet, self).__init__()

        # Number of features
        self.num_features = num_features

        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.num_layers = num_layers

        # The output dimensions
        self.output_dim = output_dim

        # Batch Size
        self.batch_size = batch_size

        # Dropout rate
        self.dropout = dropout

        # Debug Mode
        self.debug = debug

        # Define the LSTM layer. batch_first is left at its default (False),
        # so inputs are laid out as (seq_len, batch, num_features).
        self.lstm = nn.LSTM(input_size=self.num_features,
                            hidden_size=self.hidden_dim,
                            num_layers=self.num_layers,
                            dropout=self.dropout if self.num_layers > 1 else 0.0)

        # Output layer
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)

    def init_hidden(self, batch_size=None, device=None):
        """Returns zero-initialised hidden and cell states.

        Args:
            batch_size (int, optional): batch dimension of the states;
                defaults to the ``batch_size`` given to the constructor.
            device (optional): device for the states; defaults to the
                device the model's own weights live on.

        Returns:
            tuple: ``(h0, c0)``, each of shape
            ``(num_layers, batch_size, hidden_dim)``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if device is None:
            device = self.fc.weight.device

        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(state_shape, device=device),
                torch.zeros(state_shape, device=device))

    def forward(self, x):
        """Forward pass through the neural network.

        Args:
            x (torch.Tensor): input of shape
                ``(seq_len, batch, num_features)``.

        Returns:
            torch.Tensor: predictions from the last time step, flattened to
            one dimension.
        """
        # Take the batch size from the input so a smaller final batch works
        batch_size = x.size(1)
        # Initialises the hidden states on the same device as the input
        h0, c0 = self.init_hidden(batch_size, x.device)
        # Passes through the lstm layer with hidden states
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # Make prediction from linear layer using the last time step only
        y_pred = self.fc(out[-1].view(batch_size, -1))

        return y_pred.view(-1)

In [133]:
# Synthetic sine-wave data: N series of L samples each, every series shifted
# by its own random integer offset drawn from [-4T, 4T).
np.random.seed(2)

T, L, N = 20, 1000, 100

# One offset per series (column vector) broadcast against the sample index
x = (np.arange(L) + np.random.randint(-4 * T, 4 * T, N)[:, np.newaxis]).astype('int64')
data = np.sin(x / float(T)).astype('float64')

# Results

In [659]:
# Persistence baseline: the value at t-1 is used as the forecast for t.
df = (
    df_full[["target"]]
    .assign(persistance=lambda frame: frame["target"].shift(1))
    # The first row has no predecessor, so its forecast is NaN and it is dropped
    .dropna()
)
# Calculating metrics for these columns
MSE, MAE, MDE = evaluate(df, "target", "persistance")

In [660]:
# Show the first and the last five rows of the baseline frame
print(df.head(5))
print(df.tail(5))

              target  persistance
date                             
2006-09-14  0.021675    -0.001404
2006-09-15  0.043012     0.021675
2006-09-18  0.013450     0.043012
2006-09-19  0.025225     0.013450
2006-09-20  0.024843     0.025225
              target  persistance
date                             
2019-06-24 -0.000589     0.004104
2019-06-25 -0.026401    -0.000589
2019-06-26 -0.010480    -0.026401
2019-06-27 -0.011358    -0.010480
2019-06-28 -0.015457    -0.011358


In [661]:
# Placing in results dataframe.
# The columns are given as a list, not a set: a set has no defined order, so
# the positional assignment below could attach the metrics to the wrong
# column names (and recent pandas versions reject a set outright).
results = pd.DataFrame(columns=["MSE", "MAE", "MDE"])
results.index.name = 'Name'
# Values are matched to the columns by position: MSE, MAE, MDE
results.loc["persistance"] = [MSE, MAE, MDE]
results.head()

Unnamed: 0_level_0,MSE,MAE,MDE
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
persistance,0.000582,0.017105,0.465585
