# Customized baseline model with LSTM price prediction

#### Table of contents
- LSTM price change prediction
- Conic optimizer to transform price change data to optimal portfolio weights

# 1. LSTM Price Change Prediction

In [0]:
""" mount drive """

import os

GOOGLE_DIR = '/content/drive/My Drive/DL_Project/Kaggle_New'
LOCAL_DIR = '/home/ziyan/Desktop/Deep Learning Project/Kaggle_New'

def colab_mount_google_drive():
    drive.mount('/content/drive', force_remount=True)
    os.chdir(GOOGLE_DIR)
    os.listdir()

def mount_local_drive():
    os.chdir(LOCAL_DIR)
    os.listdir()

try:
    from google.colab import drive
    colab_mount_google_drive()
    DIR = GOOGLE_DIR
    print('Mounted google drive')
except ModuleNotFoundError:
    mount_local_drive
    DIR = LOCAL_DIR
    print('Mounted local drive')
  
print(os.listdir())

Mounted local drive
['Baseline_2_LSTM.ipynb', 'cudlfinance.zip', 'Baseline_2_LSTM (3).ipynb', 'sample_submission.csv', 'Baseline_2_LSTM (2).ipynb', 'Baseline_2_LSTM (1).ipynb', 'prediction', 'data', 'train_updated.csv', '.ipynb_checkpoints']


## Preprocess data
We convert price data to log natural price change data by performing:
ln(P[t] / P[t-1])

In [0]:
import pandas as pd
import numpy as np

CREATE_PER_CHANGE = False

if CREATE_PER_CHANGE or not os.path.isfile('data/train_features_input.csv'):
  # load data
  data_df = pd.read_csv('data/train_features.csv', index_col=0)

  ###
  # TODO: handle nans and bad data
  ###

  # Divide to P[t] and P[t-1]
  data_yesterday = data_df[:-1]
  data_today = data_df[1:]
  data_yesterday.index = data_today.index

  # price to percent change
  perc_chg_df = (data_today/data_yesterday).apply(lambda row: np.log(row))

  # fill 0 for ln(negative) or pt/0 in cash
  perc_chg_df = perc_chg_df.replace([np.inf, -np.inf], np.nan)
  perc_chg_df = perc_chg_df.fillna(0)

  # need to manually add Date column name after to_csv
  perc_chg_df.reset_index(drop=True).to_csv('data/train_features_input.csv')
  print('nans: {}'.format(perc_chg_df.isna().sum().sum()))

else:
  perc_chg_df = pd.read_csv('data/train_features_input.csv')

perc_chg_df.head()

Unnamed: 0.1,Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS,Cash
0,0,-0.009325,0.005207,-0.004531,-0.009558,0.006853,0.003322,-0.019902,-0.002669,-0.005276,...,0.007135,0.004102,-0.025477,-0.007968,0.000263,-0.006887,-0.007345,-0.007747,0.006486,-0.848237
1,1,-0.018915,-0.000649,-0.013269,-0.028576,-0.018999,-0.008548,-0.006453,0.000223,-0.017028,...,-0.018636,-0.027743,-0.006957,-0.02206,-0.064258,-0.020527,0.036625,-0.038189,-0.006021,-0.40876
2,2,-0.0157,-0.01562,-0.000703,9.4e-05,-0.004962,0.005448,-0.025684,-0.011421,-0.007239,...,-0.014475,-0.00533,-0.012488,-0.013472,-0.005898,-0.012351,-0.008513,-0.038942,-0.009804,0.918269
3,3,0.013185,-0.000566,0.021261,0.013925,0.039621,0.019763,0.031333,0.008075,0.020772,...,0.000356,0.010082,0.025588,0.012727,0.007856,0.032601,0.024566,0.009311,0.020433,-0.220184
4,4,0.029534,0.012187,0.008727,0.037703,0.010404,0.002281,0.044078,0.020346,0.015134,...,0.022033,0.016508,0.024389,0.027158,0.007518,0.01727,0.010561,0.014086,0.015281,-0.289325


## Base csv Loader

In [0]:
# used by LSTM Dataloader
class DatasetLoader():
    def __init__(self, data_dir, dataset_name):
        dataset_path = '%s/%s.csv' % (data_dir, dataset_name)
        self.data_df = pd.read_csv(dataset_path, index_col=0)

    # get dataframe or numpy array.
    # can sample number of stocks (columns) and limit number of days (rows).
    # can also return plot figure with stock prices over time
    def get_data(self, limit_days=None, exclude_days=None,
                 test_split_days=0, random_state=1, as_numpy=True, plot=False,
                 dropna=True, drop_test=False):

        data_ret = self.data_df

        if 'Date' in data_ret:
            # we don't need a separate date col (it's ok if it's the index)
            data_ret = self.data_df.drop(['Date'], axis=1)

        if limit_days:
            # optional limit to latest n days
            data_ret = data_ret.tail(limit_days).reset_index(drop=True, inplace=False)

        if exclude_days:
            # optional exclusion of latest n days
            data_ret = data_ret.head(data_ret.shape[0] - exclude_days).reset_index(drop=True, inplace=False)

        if dropna:
            # optional drop of NaN columns
            data_ret = data_ret.dropna(axis=1, how='any') # drop cols/stocks with NA prices in selected day range

        # training data
        train_data = data_ret
        if drop_test:
            # optional drop test data from training data
            train_data = data_ret[:-test_split_days]
            
        # test data
        test_data = data_ret[-test_split_days:]

        if as_numpy:
            # optional return numpy
            train_data = train_data.to_numpy()
            test_data = test_data.to_numpy()

        return train_data, test_data

## LSTM Prices Dataloader

In [0]:
# Adapted from DV360 challenge dataloader class

from datetime import datetime
import pandas as pd
import numpy as np
from random import shuffle
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Sampler


class SubsetSampler(Sampler):
    """Samples elements sequentially from a given list of indices, without replacement.

    Arguments:
        indices (sequence): a sequence of indices
    """

    def __init__(self, indices):
        self.indices = indices

    def __iter__(self):
        return (self.indices[i] for i in range(len(self.indices)))

    def __len__(self):
        return len(self.indices)

    def update_indices(self, indices):
        self.indices = indices


class FuturePricesLoader(DataLoader):

    def __init__(self, phase, batch_size, data_dir, dataset_name, past_prices_lookback_window=30,
                 target_size=None, limit_days=None, exclude_days=None, random_state=1, history_number=2):

        self.futureprices = FuturePrices(phase,
                                         data_dir=data_dir,
                                         dataset_name=dataset_name,
                                         past_prices_lookback_window=past_prices_lookback_window,                 
                                         target_size=target_size,
                                         limit_days=limit_days,
                                         exclude_days=exclude_days,
                                         random_state=random_state,
                                         history_number=history_number)

        sampler = SubsetSampler(self.futureprices.indices)
        
        # TODO: can make this config driven
        num_workers = 1

        self.data_dim = self.futureprices.dataframe.shape

        super().__init__(dataset=self.futureprices,
                         batch_size=batch_size,
                         sampler=sampler,
                         num_workers=num_workers)

    def add_day(self, day_prices):
        self.futureprices.add_day(day_prices)
        self.sampler.update_indices(self.futureprices.indices)


class FuturePrices(object):
    def __init__(self, phase, data_dir, dataset_name, past_prices_lookback_window=30,
                 target_size=None, limit_days=None, exclude_days=None, random_state=1, history_number=2):

        self.past_prices_lookback_window = past_prices_lookback_window
                
        self.shuffle = False # TODO: can make these config driven or phase dependent (i.e. train, test):
        
        self.history_number = history_number
        if limit_days is not None:
            # for testing need to pad days to accommodate for historic number window
            limit_days = limit_days + self.history_number + 1

        #### reading in dataframe from csv #####
        base_dataset_loader = DatasetLoader(data_dir, dataset_name)
        self.dataframe, _ = base_dataset_loader.get_data(limit_days=limit_days,
                                                         exclude_days=exclude_days,
                                                         random_state=random_state,
                                                         as_numpy=False)

        # first target_size columns will be picked as the target if specified
        self.target_size = target_size if target_size else self.dataframe.shape[1]

        self.sequence_length = self.history_number
        max_temporal_history = self.sequence_length

        # we remove window+temporal history from start so we always have a full window
        # we remove one from the end so that target/next_prices doesn't index out of bounds
        self.indices = self.dataframe.iloc[self.past_prices_lookback_window+max_temporal_history:-2].index.tolist()
        self.phase = phase

        #### phase specific manipulation #####
        if phase == 'train':
            pass

        elif phase == 'validation':
            pass

        elif phase == 'test':
            # for test phase we want to append new predicted days
            self.indices = self.dataframe.iloc[self.past_prices_lookback_window+max_temporal_history:].index.tolist()

        if self.shuffle:
            shuffle(self.indices)

        print('Phase:', phase, '# of data:', len(self.indices))
        
        # data transforms - TODO: can compose more data transforms
        self.past_prices_transform = transforms.Compose([
                transforms.ToTensor()
            ])

        self.next_prices_transform = transforms.Compose([
                transforms.ToTensor()
            ])

    def add_day(self, day_prices):
        # update dataframe with new day and new index
        new_row = dict(zip(self.dataframe.columns, day_prices))
        current_max_index = self.dataframe.index.max()
        # new_index = (pd.Timestamp(current_max_index) + pd.DateOffset(days=1)).strftime('%d/%m/%Y')
        new_index = current_max_index + 1
        new_row_series = pd.Series(new_row, name=new_index)

        self.dataframe = self.dataframe.append(new_row_series)

        # update indices
        # remove oldest index and append new one (we don't want to process first day again)
        self.indices = self.indices[1:]
        self.indices.append(new_index)

    def __getitem__(self, index):
        inputs = {}
        labels = {}
        window_start = index - self.past_prices_lookback_window

        for i in range(self.sequence_length):
            inputs[i] = {}
            past_prices_img = self.dataframe.iloc[window_start-i+1:index-i+1].reset_index(drop=True, inplace=False).to_numpy()
            past_prices_img = self.past_prices_transform(past_prices_img)
            inputs[i]['past_prices'] = past_prices_img

        if self.phase in ('train', 'validation'):
            # training/validation labels
            next_prices = self.dataframe.iloc[index+1, :self.target_size].to_numpy().reshape(1, -1)
            next_prices = self.next_prices_transform(next_prices)

            labels['next_prices'] = next_prices
        else:
            # no labels for test
            labels['next_prices'] = np.empty((1, self.target_size))
        
        return inputs, labels

## Models

In [0]:
# Adapted from DV360 LSTM model

from torchvision import models
import torch.nn as nn
import torch


class PricePredictionModel(nn.Module):
    def __init__(self, input_size=506, output_size=506, hidden_size=128,
                 num_layers=3):
        super(PricePredictionModel, self).__init__()
        final_concat_size = 0

        # CNN
        cnn = models.resnet34(pretrained=True)
        feats = list(cnn.children())[:-1]
        feats[0] = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.features = nn.Sequential(*feats)
        self.intermediate = nn.Sequential(nn.Linear(
            cnn.fc.in_features, input_size),
            nn.ReLU())
        final_concat_size += input_size

        # LSTM
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=False)
        final_concat_size += hidden_size

        # Prices Regressor
        self.predict_prices = nn.Sequential(
            nn.Linear(final_concat_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, int(hidden_size / 2)),
            nn.ReLU(),
            nn.Linear(int(hidden_size / 2), output_size)
        )

    def forward(self, data_input):
        module_outputs = []
        lstm_i = []
        # Loop through temporal sequence of price "images" and pass through the cnn.

        for idx, v in data_input.items():
            x = self.features(v['past_prices'])
            x = x.view(x.size(0), -1)
            x = self.intermediate(x)
            lstm_i.append(x)
            # feed the current output directly into the regression network.
            if idx == 0:
                module_outputs.append(x)

        # Feed temporal outputs of CNN into LSTM
        i_lstm, _ = self.lstm(torch.stack(lstm_i))
        module_outputs.append(i_lstm[-1])

        # Concatenate current image CNN output and LSTM output.
        x_cat = torch.cat(module_outputs, dim=-1)

        # Feed concatenated outputs into the regession networks.
        prediction = {'next_prices': torch.squeeze(self.predict_prices(x_cat))}
        return prediction

## Train method

In [0]:
from torch import nn, optim
import time
import torch

def train(model, lr, train_loader, validation_loader, epochs, device, log_interval):

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(0, epochs):
        # epoch training
        
        start_epoch_train = time.time()
        epoch_prices_losses = []        
        model.train()
        running_prices_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):

            # convert data and labels to device [cpu, cuda]
            for k, v in data.items():
                for k2, v2 in v.items():
                    data[k][k2] = v2.float().to(device)
            for k, v in target.items():
                target[k] = v.float().to(device)

            optimizer.zero_grad()
            prediction = model(data)

            # loss, backprop, optimize
            prices_loss = criterion(prediction['next_prices'], target['next_prices'].squeeze())
            combined_loss = prices_loss
            combined_loss.backward()

            optimizer.step()

            running_prices_loss += prices_loss.item()
            if batch_idx > 0 and batch_idx % log_interval == 0:
                # print avg batch statistics
                avg_batch_prices_loss = running_prices_loss / log_interval
                epoch_prices_losses.append(avg_batch_prices_loss)
                print('[epoch: %d, batch:  %5d] prices loss: %.5f' % (epoch + 1, batch_idx + 1, avg_batch_prices_loss))
                running_prices_loss = 0.0

        if len(epoch_prices_losses) > 0 and len(epoch_prices_losses) > 0:
            # print avg epoch statistics
            epoch_prices_loss = sum(epoch_prices_losses) / len(epoch_prices_losses)
            print('[avg train loss epoch %d] prices loss %.5f' % (epoch, epoch_prices_loss))
        else:
            print('0 epoch losses for training')

        end_epoch_train = time.time()
        epoch_elapsed = end_epoch_train - start_epoch_train
        print('epoch %d: %f elapsed' % (epoch+1, epoch_elapsed))

        if validation_loader:
            # epoch validation
            
            model.eval()
            with torch.no_grad():
                epoch_validation_prices_losses = []
                for batch_idx, (data, target) in enumerate(validation_loader):
                    
                    # convert data and labels to device [cpu, cuda]
                    for k, v in data.items():
                        for k2, v2 in v.items():
                            data[k][k2] = v2.float().to(device)
                    for k, v in target.items():
                        target[k] = v.float().to(device)

                    prediction = model(data)

                    prices_loss = criterion(prediction['next_prices'], target['next_prices'].squeeze())

                    epoch_validation_prices_losses.append(prices_loss.item())

                if len(epoch_validation_prices_losses) > 0:
                    epoch_validation_prices_loss = sum(epoch_validation_prices_losses) / len(epoch_validation_prices_losses)
                    print('[avg validation loss epoch %d] prices loss %.5f' % (epoch, epoch_validation_prices_loss))
                else:
                    print('0 epoch losses for validation')

## Test method

In [0]:
import torch
import pandas as pd


# generates a results.csv containing predictions for all existing training days + new test days for predict_days
def test(model, predict_days, train_loader, test_loader, device):

    initial_data_shape = test_loader.futureprices.dataframe.shape
    # init output file
    output_path = 'prediction/results.csv'
    columns = train_loader.futureprices.dataframe.columns
    pd.DataFrame([], columns=columns).to_csv(output_path)

    output_interval = 50
    last_output_index = 0
    num_batches = len(train_loader)

    model.eval()
    with torch.no_grad():
        # generate predictions for existing days and save to results.csv in batches
        output = []
        for batch_idx, (data, _) in enumerate(train_loader):
            
            # convert data to device [cpu, cuda]
            for k, v in data.items():
                for k2, v2 in v.items():
                    data[k][k2] = v2.float().to(device)

            # predict and append to output
            prediction = model(data)
            output.append(prediction['next_prices'].detach().cpu().numpy())

            if (batch_idx+1) % output_interval == 0 or (batch_idx+1) == num_batches:
                # every output_interval iterations append predictions to results.csv file on disk
                print('predicting training day t+1 {}/{}...'.format(
                    batch_idx+1, num_batches))

                pd.DataFrame(output, columns=columns, index=range(last_output_index, batch_idx+1))\
                    .to_csv(output_path,
                            header=False,
                            mode='a')
                last_output_index = batch_idx + 1
                output = []

        # predict new days and save to results.csv
        for day in range(predict_days):
            for batch_idx, (data, _) in enumerate(test_loader):
                
                # convert data to device [cpu, cuda]
                for k, v in data.items():
                    for k2, v2 in v.items():
                        data[k][k2] = v2.float().to(device)

                # predict a new day and append it to the test dataloader
                # so it can later be used to predict the next day
                prediction = model(data)
                test_loader.add_day(prediction['next_prices'].cpu().numpy())

                if (day+1) % 5 == 0 or (day+1) == predict_days:
                    print('predicting new day {}/{}...'.format(day+1, predict_days))

        print('Finished | initial data shape: {} | final data shape: {}'.format(
            initial_data_shape, test_loader.futureprices.dataframe.shape))

    # append new days to results.csv file on disk
    print('Saving results to "{}"...'.format(output_path))
    output_df = test_loader.futureprices.dataframe.iloc[-predict_days:]
    output_df.index = range(last_output_index, last_output_index+len(output_df))
    output_df.to_csv(output_path, header=False, mode='a')



## Args

In [0]:
force_cpu = False
data_dir = 'data' # directory in which the data is
dataset_name = 'train_new_percent_change' # name of csv file in data_dir (excluding .csv extension)
modes = ['train', 'test'] # train, test or train-and-test

batch_size = 1 # training batch size - Note haven't tested with more than 1, might only work with 1 on the 'test' phase
epochs = 5 # number of training epochs
lr = 0.0001 # learning rate

days_lookback_window = 20 # LSTM sliding window size
history_number = 30 # number of sliding temporal windows to consider at each iteration (length of LSTM in time)
num_layers = 1 # number of layers of LSTM model
hidden_size = 512 # int(input_size / 4)

test_predict_days = 30 # number of new days to predict
log_interval = 50 # interval of batches to print statistics for during training

limit_days = None # optional: integer limit to limit_days final days of the dataset
val_days = None # optional: integer validation set size in days (taken from end of dataset)
target_size = None # optional: integer number of columns to have in final prediction. If not specified will be identical to input number of columns

## Main runner

In [0]:
import torch

###
# Run this to generate results.csv with target_size columns and K rows where:
# K = min(training data, limit_days) - val_days - days_lookback_window - history_number + test_predict_days
# The final test_predict_days rows are the new generated prediction for the new days
###

# SETUP DEVICE #
device_type = 'cuda' if not force_cpu and torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)
print('device is: ', device)
# END SETUP DEVICE #

# SETUP DATALOADERS #

val_data_dim = None
test_data_dim = None
validation_loader = None
test_loader = None

train_loader = FuturePricesLoader('train', batch_size, data_dir, dataset_name,
                                      days_lookback_window,
                                      target_size=target_size,
                                      limit_days=limit_days,
                                      exclude_days=val_days,
                                      history_number=history_number)

train_data_dim = train_loader.data_dim

if val_days and val_days > 0:
    validation_loader = FuturePricesLoader('validation', batch_size, data_dir, dataset_name,
                                          days_lookback_window,
                                          target_size=target_size,
                                          limit_days=val_days,
                                          history_number=history_number)
    val_data_dim = validation_loader.data_dim


if 'test' in modes:
    test_loader = FuturePricesLoader('test', batch_size, data_dir, dataset_name,
                                          days_lookback_window,
                                          target_size=target_size,
                                          limit_days=days_lookback_window,
                                          history_number=history_number)

    test_data_dim = test_loader.data_dim


output_size = train_loader.futureprices.target_size if train_loader else test_loader.futureprices.target_size


validation_dataloader = None
# END SETUP DATALOADERS #

params = {
    'lr': lr,
    'batch_size': batch_size,
    'epochs': epochs,
    'log_interval': log_interval,
    'device': device_type,
    'train_data_shape': train_data_dim,
    'validation_data_shape': val_data_dim,
    'test_data_shape': test_data_dim,
    'test_predict_days': test_predict_days,
    'target_size': target_size
}

print('running with params: {}'.format(params))

# SETUP MODEL #
assert (train_data_dim or test_data_dim)
input_size = train_data_dim[1] if train_data_dim else test_data_dim[1]

model = PricePredictionModel(input_size=input_size,
                             output_size=output_size,
                             hidden_size=hidden_size,
                             num_layers=num_layers)

if device_type == 'cuda':
    model.cuda()
# END SETUP MODEL #


if 'train' in modes:
    print('--Started training--')
    train(model, lr, train_loader, validation_loader, epochs, device, log_interval)
    print('--Finished training--')

if 'test' in modes:
    print('--Started testing--')
    test(model, test_predict_days, train_loader, test_loader, device)
    print('--Finished testing--')

device is:  cuda
Phase: train # of data: 838
Phase: test # of data: 1
running with params: {'lr': 0.0001, 'batch_size': 1, 'epochs': 5, 'log_interval': 50, 'device': 'cuda', 'train_data_shape': (890, 506), 'validation_data_shape': None, 'test_data_shape': (51, 506), 'test_predict_days': 30, 'target_size': None}
--Started training--
[epoch: 1, batch:     51] prices loss: 0.00127
[epoch: 1, batch:    101] prices loss: 0.00044
[epoch: 1, batch:    151] prices loss: 0.00107
[epoch: 1, batch:    201] prices loss: 0.00046
[epoch: 1, batch:    251] prices loss: 0.00054
[epoch: 1, batch:    301] prices loss: 0.00031
[epoch: 1, batch:    351] prices loss: 0.00030
[epoch: 1, batch:    401] prices loss: 0.00022
[epoch: 1, batch:    451] prices loss: 0.00032
[epoch: 1, batch:    501] prices loss: 0.00020
[epoch: 1, batch:    551] prices loss: 0.00015
[epoch: 1, batch:    601] prices loss: 0.00019
[epoch: 1, batch:    651] prices loss: 0.00020
[epoch: 1, batch:    701] prices loss: 0.00019
[epoch: 

# 2. Conic weight optimizer

## Helper methods

In [0]:
import numpy as np
from scipy.optimize import minimize
import pandas as pd


# conic optimization of price change covariance to weights using SLSQP method
def optimize(num_stocks, cov, expected_val, bounds):
#     print(num_stocks, cov.shape, expected_val.shape)
    init_guess = np.ones(num_stocks) * (1.0 / num_stocks)
    expected_val = expected_val.values.reshape((num_stocks, 1))
    
#     print('1', np.matmul(init_guess.T,np.matmul(cov,init_guess)).shape)
#     print('2', np.matmul(init_guess,expected_val).shape, init_guess.shape, expected_val.shape)
    
    weights = minimize(
        lambda w: (np.matmul(w.T,np.matmul(cov,w)) - np.matmul(w.T,expected_val)), 
        init_guess,
        method='SLSQP', 
        options={'disp': False}, 
        constraints=({'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)}), 
        bounds=bounds
    )
    return weights.x


# multiprocessing approach to optimization 
# (each row takes 30 seconds to optimize, 
#  so multithreading is crucial)
# modifies w_ret
def handle_optimization(i, bounds, lookback_window, prices, num_stocks, num_days):
    # get lookback of k days from real prices and lookahead of 1
    train = prices.iloc[i-(lookback_window-1):i+1]
    print(train.shape)
    print("optimizing row {}/{}...".format(i-(lookback_window-1), num_days - (lookback_window-1)))
    cov = train.cov()
    expected_val = train.mean()
    print('expected_val', expected_val.shape)
    test = optimize(num_stocks, cov.values, expected_val, bounds)
    return (i-(lookback_window-1), test)

## Args

In [0]:
from datetime import datetime

curr_time = datetime.now().strftime('%H%M')

# input_path = f'prediction/results_{curr_time}.csv' # full path to price change csv
# output_path = f'prediction/weights_{curr_time}.csv' # full output path for weights csv

input_path = 'prediction/results.csv' # full path to price change csv
output_path = 'prediction/weights.csv' # full output path for weights csv
weight_min = -0.5 # minimum weights range bound
weight_max = 2 # maximum weights range bound
lookback_window = 5 # lookback window in days for optimization
num_threads = 4 # number of concurrent threads to use (upper bounded by available cpus)
target_days = test_predict_days # number of desired output rows

## Main runner

In [0]:
import pandas as pd
import numpy as np
from scipy.optimize import Bounds
from functools import partial
import argparse
import multiprocessing as mp

num_threads = min(num_threads, mp.cpu_count())

###
# This conic optimizer is used to optimize a matrix of price changes 
# into a matrix of portfolio weights in a range.
# We only want to optimize weights for the new predicted days in the 
# conic optimizer.
# For each row the optimizer looks at today and a window of k days in the past
###

prices = pd.read_csv(input_path, index_col=0)

# consider only final target_days + the lookback window
prices = prices[-(target_days + (lookback_window - 1)):]

num_days = prices.shape[0]
num_stocks = prices.shape[1]

thread_pool = mp.Pool(num_threads) # concurrency

w_ret = np.zeros((target_days, prices.shape[1]))
print('output shape: {}'.format(w_ret.shape))
# bounds for weights. -.5/2 default
bounds = Bounds([weight_min]*num_stocks, [weight_max]*num_stocks)

# each row can take 30-60 seconds to optimize, so multithreading can be helpful
multithread_partial = partial(
    handle_optimization,
    bounds=bounds,
    lookback_window=lookback_window,
    prices=prices,
    num_stocks=num_stocks,
    num_days=num_days
)

# print(range((lookback_window-1), num_days))
optimal_weights_unsorted = thread_pool.map(
    multithread_partial, 
    range((lookback_window-1), num_days)
)

# realign output into matrix by indices
for i, w in sorted(optimal_weights_unsorted, key=lambda pair: pair[0]):
    w_ret[i, :] = w

print('Saving optimal weights to "{}"'.format(output_path))
weights_df = pd.DataFrame(w_ret, columns=prices.columns)
weights_df.to_csv(output_path, index=False)

(5, 506)
(5, 506)
(5, 506)
optimizing row 2/30...
(5, 506)
optimizing row 0/30...
optimizing row 4/30...
optimizing row 6/30...
expected_val (506,)
expected_val (506,)
expected_val (506,)
expected_val (506,)
output shape: (30, 506)
(5, 506)
optimizing row 1/30...
expected_val (506,)
(5, 506)
optimizing row 7/30...
expected_val (506,)
(5, 506)
optimizing row 5/30...
expected_val (506,)
(5, 506)
optimizing row 3/30...
expected_val (506,)
(5, 506)
optimizing row 8/30...
expected_val (506,)
(5, 506)
optimizing row 10/30...
expected_val (506,)
(5, 506)
optimizing row 12/30...
expected_val (506,)
(5, 506)
optimizing row 14/30...
expected_val (506,)
(5, 506)
optimizing row 9/30...
expected_val (506,)
(5, 506)
optimizing row 13/30...
expected_val (506,)
(5, 506)
optimizing row 11/30...
expected_val (506,)
(5, 506)
optimizing row 15/30...
expected_val (506,)
(5, 506)
optimizing row 16/30...
expected_val (506,)
(5, 506)
optimizing row 18/30...
expected_val (506,)
(5, 506)
optimizing row 20/30...

In [0]:
weights_df = pd.read_csv('prediction/weights.csv')
columns = [col.strip() for col in weights_df.columns]
# print(weights_df.head())

out = ['Id,Predicted']
for ind, row in enumerate(weights_df.itertuples(index=False)):
    for col_ind, col in enumerate(row):
        out.append(f'{ind}_{columns[col_ind]},{col}')

# write to submission file
sub_file = 'prediction/submission.csv'
with open(sub_file, 'w') as f:
    for l in out:
        f.write(l + '\n')
print(f'wrote to {sub_file}')
    

wrote to prediction/submission.csv
