# About

Train a predictor that estimates the next day's stock price from the previous N days of stock prices, combined with the sentiment of each day's stock-analysis posts.

In [52]:
from os.path import join
from time import time
from typing import Any
import pickle
import datetime as dt

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch import nn, tensor

# Set device for torch: use the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load data

In [53]:
# Path components of the folder holding the input CSV files.
SOURCE_DATA_FOLDER = ['..', 'datasets']

# Stock data
stock_csv_path = join(*SOURCE_DATA_FOLDER, 'stock_data.csv')
df_stock_data = pd.read_csv(stock_csv_path)

# Analysis posts as features
news_csv_path = join(*SOURCE_DATA_FOLDER, 'news_features.csv')
df_news_features = pd.read_csv(news_csv_path)

# Sentiment classifier model. The pickle holds a 2-item sequence; only the
# first item (the model) is used here.
# CAUTION: pickle.load can execute arbitrary code — only load trusted files.
with open('LogisticRegression.bin', 'rb') as model_fh:
    sentiment_model, _ = pickle.load(model_fh)

### Prepare data

In [54]:
# Create list of stocks where each stock is a map of ISO date string -> value.
stock_sets = []
for label, df_stock in df_stock_data.groupby(by='Label'):
    # Zip the two columns directly; this builds the same mapping as a
    # row-by-row iterrows() loop without creating a Series per row.
    stock = dict(zip(df_stock['Date'], df_stock['Value']))
    stock_sets.append(stock)
# Use the date window shared by ALL stocks (latest first day, earliest last
# day) rather than the range of whichever stock was handled last, so later
# per-date lookups do not fail when stocks cover different ranges.
# ISO-formatted date strings sort chronologically, so min/max on the keys work.
first_date = dt.date.fromisoformat(max(min(stock) for stock in stock_sets))
last_date = dt.date.fromisoformat(min(max(stock) for stock in stock_sets))
# Deltas
one_day = dt.timedelta(1)
print('Date range:', first_date, last_date)

Date range: 2020-01-01 2020-03-31


### Convert news post features to sentiments

In [55]:
# Dates of the posts, in row order.
post_dates = df_news_features['DATE'].tolist()
# Feature matrix: drop the DATE column, then skip the first remaining column.
news_features = df_news_features.drop('DATE', axis=1).to_numpy()[:, 1:]
# One predicted sentiment per post row.
sentiment_predicted = sentiment_model.predict(news_features)
# Map each date to its predicted sentiment (a repeated date keeps the last row's value).
posts = dict(zip(post_dates, sentiment_predicted))


{'2020-01-01': 1.0, '2020-01-02': 1.0, '2020-01-03': 1.0, '2020-01-04': 0.0, '2020-01-05': 1.0, '2020-01-06': -1.0, '2020-01-07': -1.0, '2020-01-08': -1.0, '2020-01-09': 1.0, '2020-01-10': 1.0, '2020-01-11': -1.0, '2020-01-12': 1.0, '2020-01-13': 1.0, '2020-01-14': 1.0, '2020-01-15': 1.0, '2020-01-16': 1.0, '2020-01-17': 1.0, '2020-01-18': 0.0, '2020-01-19': 1.0, '2020-01-20': 1.0, '2020-01-21': 1.0, '2020-01-22': 1.0, '2020-01-23': 1.0, '2020-01-24': 1.0, '2020-01-25': -1.0, '2020-01-26': 1.0, '2020-01-27': -1.0, '2020-01-28': 1.0, '2020-01-29': 1.0, '2020-01-30': 1.0, '2020-01-31': 1.0, '2020-02-01': 1.0, '2020-02-02': -1.0, '2020-02-03': 1.0, '2020-02-04': 1.0, '2020-02-05': 1.0, '2020-02-06': 1.0, '2020-02-07': -1.0, '2020-02-08': 1.0, '2020-02-09': 1.0, '2020-02-10': 1.0, '2020-02-11': 1.0, '2020-02-12': 1.0, '2020-02-13': 1.0, '2020-02-14': 1.0, '2020-02-15': 1.0, '2020-02-16': 1.0, '2020-02-17': -1.0, '2020-02-18': 1.0, '2020-02-19': 1.0, '2020-02-20': 1.0, '2020-02-21': 1.0, '2

In [64]:
# Number of previous days fed to the predictor for each sample.
N = 3
# First date that has N full days of history before it.
start_date = first_date + one_day * N
# Sample count per stock.
# NOTE(review): range(samples) stops one day before last_date, so last_date
# itself is never used as a prediction target — confirm this is intended.
samples = (last_date - start_date).days
print('N:',N)
print('Start:',start_date)
print('Samples per stock:',samples)
X_data=[]
y_data=[]
# Generate samples from all stocks.
for stock in stock_sets:
    # One sample per valid prediction date.
    for offset in range(samples):
        prediction_date = start_date + one_day * offset
        # ISO keys of the N days immediately before the prediction date,
        # oldest first.
        window = [(prediction_date - one_day * (N - i)).isoformat()
                  for i in range(N)]
        # Feature layout: stock values in the first N slots,
        # sentiments in the last N.
        X = np.zeros(N*2)
        X[:N] = [stock[day] for day in window]
        X[N:] = [posts[day] for day in window]
        y_data.append(stock[prediction_date.isoformat()])
        X_data.append(X)
y_data = np.array(y_data, dtype=np.float32)
X_data = np.array(X_data, dtype=np.float32)
print('Sample set count:',len(X_data))
print('Row 0:',X_data[0],y_data[0])

N: 3
Start: 2020-01-04
Samples per stock: 87
Sample set count: 1653
Row 0: [0.42 0.42 0.42 1.   1.   1.  ] 0.42


In [61]:
# Name used to derive the model's file name.
nn_name = 'PredictorNN'

class PredictorNN(nn.Module):
    """Fully connected regressor for the next-day stock value.

    Input is 3 days of stock values followed by 3 days of sentiments
    (``INPUT`` = 6 features per sample). Output is one value per sample.
    """
    INPUT = 6
    # Hidden layer widths are named once so that each layer's input size
    # always matches the previous layer's output size.
    HIDDEN_1 = INPUT * 10
    HIDDEN_2 = INPUT * 4

    def __init__(self) -> None:
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(self.INPUT, self.HIDDEN_1),
            nn.ReLU(),
            nn.Linear(self.HIDDEN_1, self.HIDDEN_2),
            nn.ReLU(),
            nn.Linear(self.HIDDEN_2, 1),
            # Final ReLU clamps predictions to be non-negative.
            nn.ReLU()
        )

    def forward(self, x):
        """Run ``x`` (last dimension of size ``INPUT``) through the network."""
        return self.fc(x)

    def save(self, path: str) -> bool:
        """Write the model's state dict to ``path``.

        Returns:
            True on success, False if saving raised an exception.
        """
        try:
            torch.save(self.state_dict(), path)
            return True
        except Exception:
            # Best-effort: report failure through the return value.
            return False

    def load(self, path: str) -> bool:
        """Load a state dict from ``path`` and switch the model to eval mode.

        Returns:
            True on success, False if the file is missing, unreadable or
            does not match this architecture.
        """
        try:
            # Map stored tensors onto the device this model currently lives
            # on, so a checkpoint saved on GPU can be loaded on CPU.
            target_device = next(self.parameters()).device
            self.load_state_dict(torch.load(path, map_location=target_device))
            self.eval()
            return True
        except Exception:
            # Best-effort: callers use False as the signal to train anew.
            return False

In [73]:
# File the trained weights are cached in; training is skipped when it loads.
model_file = f'{nn_name}.bin'
predictor_model = PredictorNN().to(
    device=device, dtype=torch.float32)

if not predictor_model.load(model_file):
    # create optimizer function
    optimizer = torch.optim.Adam(predictor_model.parameters())
    # create loss function
    criterion = nn.MSELoss()
    # set passes
    passes = 100000
    start_time = time()

    # The whole data set is moved to the device once and used as a single batch.
    X_ = tensor(X_data, dtype=torch.float32, device=device)
    y_ = tensor(y_data, dtype=torch.float32, device=device)

    predictor_model.train()
    # do training iterations
    for epoch in range(passes):
        # reset gradients
        optimizer.zero_grad()
        # prediction, reshaped to match the shape of the targets
        predicted = predictor_model(X_).reshape(y_.shape)
        # calculate cost
        loss = criterion(predicted, y_)
        # calculate gradients
        loss.backward()
        # update nn weights
        optimizer.step()
        if epoch % 1000 == 0:
            print(f'Epoch {epoch}: loss={loss.item()}')
    print(f'Last epoch {epoch}: loss={loss.item()}, time={time()-start_time}')
    # Leave the model in eval mode, the same state load() leaves it in.
    predictor_model.eval()
    # save results to file; report a failure instead of silently ignoring it
    if not predictor_model.save(model_file):
        print(f'WARNING: could not save model to {model_file}')

Epoch 0: loss=2849.645751953125
Epoch 1000: loss=2.930873155593872
Epoch 2000: loss=2.8580358028411865
Epoch 3000: loss=2.8031725883483887
Epoch 4000: loss=2.767005681991577
Epoch 5000: loss=2.7020082473754883
Epoch 6000: loss=2.615553855895996
Epoch 7000: loss=2.57550048828125
Epoch 8000: loss=2.551692247390747
Epoch 9000: loss=2.532294750213623
Epoch 10000: loss=2.5272183418273926
Epoch 11000: loss=2.508028268814087
Epoch 12000: loss=2.491166591644287
Epoch 13000: loss=2.4852395057678223
Epoch 14000: loss=2.4796905517578125
Epoch 15000: loss=2.4695403575897217
Epoch 16000: loss=2.46474289894104
Epoch 17000: loss=2.4591360092163086
Epoch 18000: loss=2.459909677505493
Epoch 19000: loss=2.4529013633728027
Epoch 20000: loss=2.4354171752929688
Epoch 21000: loss=2.409074544906616
Epoch 22000: loss=2.4051096439361572
Epoch 23000: loss=2.396021842956543
Epoch 24000: loss=2.391101121902466
Epoch 25000: loss=2.3813202381134033
Epoch 26000: loss=2.3784120082855225
Epoch 27000: loss=2.3748178482

In [None]:
# TODO: Test accuracy
# NOTE(review): the predictor is trained with MSELoss on a continuous target,
# so the imported accuracy_score (a classification metric) presumably does not
# apply directly; an error metric on held-out samples may be the better fit —
# TODO confirm. Until then this cell only prints a placeholder.
print('Accuracy: ??')