# About

Train a predictor to forecast the next-day stock price from the previous N days of stock price changes, combined with the text sentiment of the stock analysis posts published on each of those days.

In [2]:
from os.path import join
from time import time
from typing import Any
import pickle
import datetime as dt

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import get_scorer

from torch import nn, tensor

# Set device for torch: use the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load data

In [3]:
# Folder holding the input CSV files, expressed as path components.
SOURCE_DATA_FOLDER = ['..', 'datasets']
stock_csv = join(*SOURCE_DATA_FOLDER, 'stock_data.csv')
news_csv = join(*SOURCE_DATA_FOLDER, 'news_features.csv')
# Stock data.
df_stock_data = pd.read_csv(stock_csv)
# Analysis posts as features.
df_news_features = pd.read_csv(news_csv)
# Sentiment classifier model. The pickle holds a two-item sequence; only the
# first item (the model) is used.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('LogisticRegression.bin', 'rb') as sentiment_file:
    sentiment_model, _ = pickle.load(sentiment_file)

### Prepare data

In [4]:
# Create list of stocks where each stock is a map of dates to values.
# Each map is built straight from the two columns, which avoids the slow
# row-by-row iterrows() traversal.
stock_sets = [
    dict(zip(df_stock['Date'], df_stock['Value']))
    for _, df_stock in df_stock_data.groupby(by='Label')
]
if not stock_sets:
    raise ValueError('No stocks found in the stock data')
# Get min and max dates. The range is taken from the last stock in the list;
# presumably every stock covers the same dates - verify against the data.
dates = sorted(stock_sets[-1])
first_date = dt.date.fromisoformat(dates[0])
last_date = dt.date.fromisoformat(dates[-1])
# One-day step used for date arithmetic.
one_day = dt.timedelta(1)
print('Date range:', first_date, last_date)

Date range: 2020-01-01 2020-03-31


### Convert news post features to sentiments

In [5]:
# Predict one sentiment per post and index the predictions by post date.
post_dates = df_news_features['DATE'].tolist()
# Feature matrix: every column except DATE, minus the first remaining column.
news_features = df_news_features.drop(columns='DATE').to_numpy()[:, 1:]
sentiment_predicted = sentiment_model.predict(news_features)
# When a date occurs more than once, the later row wins.
posts = dict(zip(post_dates, sentiment_predicted))


In [6]:
# Number of previous days given to the predictor.
N = 3
# Seed for the train/test shuffle so the split (and every number derived
# from it) is the same on each run.
SPLIT_SEED = 42
# First prediction date: the earliest date with N days of history before it.
start_date = first_date + one_day * N
# Sample count per stock.
# NOTE(review): range(samples) ends one day before last_date, so last_date
# itself is never used as a prediction target - confirm this is intended.
samples = (last_date - start_date).days
print('N:',N)
print('Start:',start_date)
print('Samples per stock:',samples)
X_data = []
y_data = []
# Generate samples from all stocks.
for stock in stock_sets:
    # One sample per valid prediction date.
    for offset in range(samples):
        prediction_date = start_date + one_day * offset
        # Feature vector: relative stock values in the first N slots,
        # sentiments in the last N slots.
        X = np.zeros(N*2)
        train_date_start = prediction_date - one_day*N
        for i in range(N):
            date_ = (train_date_start + one_day*i).isoformat()
            # Stock values are expressed relative to the first day of the
            # window (day 0 == 1.0).
            if i == 0:
                base = stock[date_]
                X[i] = 1.0
            else:
                X[i] = stock[date_] / base
            # Sentiment for the same day.
            X[N+i] = posts[date_]
        # Target: prediction-day value relative to the same base.
        y_data.append(stock[prediction_date.isoformat()]/base)
        X_data.append(X)
y_data = np.array(y_data, dtype=np.float32)
X_data = np.array(X_data, dtype=np.float32)
print('Sample set count:',len(X_data))
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=SPLIT_SEED)
print('Train count:',len(X_train))
print('Test count:',len(X_test))
print(X_train[:5])
print(y_train[:5])

N: 3
Start: 2020-01-04
Samples per stock: 87
Sample set count: 1653
Train count: 1322
Test count: 331
[[ 1.          0.9808467   0.99596775 -1.         -1.          1.        ]
 [ 1.          1.          1.          1.         -1.         -1.        ]
 [ 1.          0.9725628   0.95125514  1.         -1.         -1.        ]
 [ 1.          0.9928058   0.9223022  -1.          1.         -1.        ]
 [ 1.          1.          1.          1.          1.          1.        ]]
[0.96874994 1.         0.9454174  0.88201445 1.        ]


In [9]:
# Name used to derive the file name of the saved model.
nn_name = 'PredictorNN'

class PredictorNN(nn.Module):
    """Fully connected network that predicts the next day stock value.

    Input is 3 days of stock values followed by 3 days of sentiments
    (``INPUT`` features in total). Output is a single value per sample;
    the final ReLU keeps it non-negative.
    """
    # Number of input features: 3 stock values + 3 sentiments.
    INPUT = 6

    def __init__(self) -> None:
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(self.INPUT, self.INPUT*10),
            nn.ReLU(),
            nn.Linear(self.INPUT*10, self.INPUT*4),
            nn.ReLU(),
            nn.Linear(self.INPUT*4, 1),
            nn.ReLU()
        )

    def forward(self, x):
        """Run the network on ``x`` and return one prediction per row."""
        return self.fc(x)

    def save(self, path: str) -> bool:
        """Write the model weights to ``path``.

        Returns:
            True on success, False if the weights could not be written.
        """
        try:
            torch.save(self.state_dict(), path)
            return True
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        except Exception:
            return False

    def load(self, path: str) -> bool:
        """Read model weights from ``path`` and switch to eval mode.

        Returns:
            True on success, False if the file is missing, unreadable or
            does not match this architecture.
        """
        try:
            # Deserialize onto the CPU so weights saved on a GPU can still be
            # read on a machine without CUDA; load_state_dict then copies the
            # values into the parameters on whatever device they live on.
            state = torch.load(path, map_location='cpu')
            self.load_state_dict(state)
            self.eval()
            return True
        except Exception:
            return False

In [10]:
model_file = f'{nn_name}.bin'
predictor_model = PredictorNN().to(
    device=device, dtype=torch.float32)

# Train only when no previously saved weights could be loaded.
if not predictor_model.load(model_file):
    # Make sure the model is in training mode before optimizing.
    predictor_model.train()
    # create optimizer function
    optimizer = torch.optim.Adam(predictor_model.parameters())
    # create loss function
    criterion = nn.MSELoss()
    # set passes
    passes = 100000
    start_time = time()

    X_ = tensor(X_train, dtype=torch.float32, device=device)
    y_ = tensor(y_train, dtype=torch.float32, device=device)

    # do training iterations
    for epoch in range(passes):
        # reset gradients
        optimizer.zero_grad()
        # the whole training set is used in every pass
        # prediction
        predicted = predictor_model(X_).reshape(y_.shape)
        # calculate cost
        loss = criterion(predicted, y_)
        # calculate gradients
        loss.backward()
        # update nn weights
        optimizer.step()
        if epoch % 1000 == 0:
            # .item() prints the plain number instead of the tensor repr
            print(f'Epoch {epoch}: loss={loss.item()}')
    print(f'Last epoch {epoch}: loss={loss.item()}, time={time()-start_time}')
    # Leave the model in eval mode, matching the state after a successful load().
    predictor_model.eval()
    # save results to file
    if not predictor_model.save(model_file):
        print(f'Warning: could not save model to {model_file}')

In [14]:
# Evaluate the predictor on the held-out test samples.
from sklearn.metrics import r2_score
# NOTE(review): neither r2_score nor scorer is used below.
scorer = get_scorer('r2')
X_ = tensor(X_test, dtype=torch.float32, device=device)
raw_output = predictor_model(X_)
y_pred = raw_output.detach().cpu().numpy().reshape(y_test.shape)
# "Accuracy" here is 1 minus the mean absolute error of the predictions.
abs_error_total = np.absolute(y_pred - y_test).sum()
accuracy = 1.0 - abs_error_total/y_pred.shape[0]
print(f'Accuracy: {accuracy}')
# Show the first five inputs, predictions and targets.
for caption, values in (('X test:', X_), ('  Pred:', y_pred), ('  Test:', y_test)):
    print(caption, values[:5])

Accuracy: 0.9775905407447469
X test: tensor([[ 1.0000,  1.0952,  1.0105, -1.0000, -1.0000, -1.0000],
        [ 1.0000,  1.0366,  1.0610,  1.0000, -1.0000, -1.0000],
        [ 1.0000,  0.9764,  0.9941, -1.0000, -1.0000,  1.0000],
        [ 1.0000,  0.9839,  0.9471, -1.0000,  1.0000,  1.0000],
        [ 1.0000,  1.0885,  1.0885, -1.0000, -1.0000,  1.0000]],
       device='cuda:0')
  Pred: [0.9677838  1.0771403  0.98829985 0.9388577  1.0797765 ]
  Test: [1.1162646 1.097561  0.9587195 0.9280191 1.0885341]
