# Training the sentiment price prediction model

## 0. Setup

Run `$ bash setup.sh` to set up environments and download data.

In [None]:
# Set up the environment by running the repository's setup script
# (setup.sh sets up the environments and downloads the data).
!bash setup.sh

In [None]:
# You can also set up the environment manually:
!conda create --name sent_env --file requirements.

# --or--
# !conda conda env create -f environment.yml

!mkdir -p ./data

# then download the sql files:
!gdown https://drive.google.com/file/d/1YG_AQIbcY2Mi-bKMLN1hff66jDBFeh4O/view?usp=sharing -O ./data/spx_news_sentiment_fundamental.db
!gdown https://drive.google.com/file/d/1C49ElctSD0hPsukQTkioneA7PeWBlMfe/view?usp=sharing -O ./data/spx_news_sentiment_price.db

## 1. Loading data

A dataset class has been implemented already. 

This uses the iterable style of the `Dataset` class.

Instantiate the `BaseDataset` class by passing it a `Config` object; the keyword
arguments are wrapped in the `Config` object.

When iterating over the dataset with `for data in dataset` (which calls the `__iter__` method), each item returned is a triple `(x1, x2, y)`. `x1` is the sentiment/price data, `x2` is the past financial data up until the train data time, and `y` is the share price performance during the `look_forward` period (default 5 days, or a trading week).

The `forward()` method needs to take in `(x1, x2)`, because `x2` carries information about the stock's fundamentals and type but has a different dimension from `x1`. An autoencoder model has been implemented and trained to convert the different dimensions of `x2` into the same hidden dimensions (it is also useful for dimensionality reduction).

Note that `x1` and `x2` are scaled to the range -1 to 1.

In [4]:
from datasets.datasets import BaseDataset
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from utils import Config

# Wrap the dataset keyword arguments in a Config object, one setting per line
# so each value is easy to find and tune.
config = Config(
    mode='train',
    look_back=100,
    look_forward=5,  # look-forward period used for the target y (5 days, a trading week)
    num_workers=64,
)

# Build the iterable-style dataset from that configuration.
dataset = BaseDataset(config=config)

## 2. Train autoencoder

In [5]:
from models.autoencoder import AutoEncoder

In [6]:
# Instantiate the autoencoder with its default constructor arguments.
model = AutoEncoder()

In [3]:
# Print the module summary to inspect the layers the autoencoder is built from.
print(model)

AutoEncoder(
  (LSTM_encoder): LSTM(88, 5, batch_first=True)
  (Conv1D_encoder): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
  (MaxPool1D_encoder): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Flatten_encoder): Flatten(start_dim=1, end_dim=-1)
  (Dense_encoder): Linear(in_features=100, out_features=5, bias=True)
)


In [2]:
# Peek at the first sample only: iterate the dataset, inspect one item, then stop.
for data in dataset:
    # Each item unpacks into the triple (x1, x2, y).
    x1, x2, y = data
    # Show the shapes of both inputs and the target value.
    print(x1.shape, x2.shape, y)
    # Stop after the first item; x1, x2 and y stay bound for the cells below.
    break


torch.Size([18, 100]) torch.Size([88, 23]) tensor(0.0567, dtype=torch.float64)


In [7]:
from utils import padding
# Pad x2 by the number of columns it is short of 100 along dimension 1.
# NOTE(review): 100 presumably mirrors look_back=100 from the Config above -- confirm.
# NOTE(review): the printed result suggests zeros are added on the left; verify in utils.padding.
padding(x2, repeat=100 - x2.shape[1])

tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.9956, -0.9955, -0.9956],
        [ 0.0000,  0.0000,  0.0000,  ..., -1.0000, -0.9999, -0.9999],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.9991, -0.9984, -0.9985],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.9835, -0.9835, -0.9835],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.9506,  0.9506,  0.9506]],
       dtype=torch.float64)