The goal of this code is the building of an SI-RCNN model to forcast closing price directional movements.

The first step is the loading of seven technical indicators from our stock of choice. For the remit of this assignment we used the S&P 500.

In [8]:
import pandas as pd
import numpy as np
import yfinance as yf
import import_ipynb
import layers_nt


start_date = "2008-01-01"
end_date = "2013-12-31"
technical_layer = layers_nt.calculate_technical_indicator(start_date,end_date)
technical_layer.head()

[*********************100%***********************]  1 of 1 completed




Price,Date,Close,High,Low,Open,Volume,Stochastic_%K,Williams_%R,Stochastic_%D,AD_Line,AD_Oscillator,Momentum,Disparity,ROC
Ticker,Unnamed: 1_level_1,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2008-01-02,1447.160034,1471.77002,1442.069946,1467.969971,3452650000,,,,-2269199000.0,,,,
1,2008-01-03,1447.160034,1456.800049,1443.72998,1447.550049,3429500000,,,,-3898652000.0,,,,
2,2008-01-04,1411.630005,1444.01001,1411.189941,1444.01001,4166000000,0.726416,-99.273584,,-7952933000.0,,,98.349724,
3,2008-01-07,1416.180054,1423.869995,1403.449951,1414.069946,4221260000,23.861442,-76.138558,,-6911024000.0,-4641825000.0,-30.97998,99.381752,-2.140743
4,2008-01-08,1390.189941,1430.280029,1388.300049,1415.709961,4705390000,3.392378,-96.607622,9.326745,-11192750000.0,-7294099000.0,-56.970093,98.875529,-3.936682


The next step was to create another input of financial news sentence embeddings, for this we used the FNSPID dataset which hold millions of financial news records covering S&P 500 companies.

https://github.com/Zdong104/FNSPID_Financial_News_Dataset

We manually cleaned the dataset by removing some of the lower rows that had garbage data upon downloaded. Following this we loaded it into python sorted it by date and removed all other columns before saving it again so we may reduce how many times this section is run. We then reduce it to the 5 year date range of 2008-2013 which will be our training window.

In [3]:
def extract_embedding_data(start_date,end_date,out_filename):
    full_csv = pd.read_csv('All_external.csv',usecols=['Article_title', 'Date'])  
    full_csv = full_csv.sort_values('Date').reset_index(drop=True)  
    full_csv = full_csv.set_index('Date')

    filtered_csv = full_csv.loc[start_date:end_date]
    filtered_csv.to_csv(out_filename)

extract_embedding_data('2019-05-31','2020-06-01','test2020.csv')

First we tokenize the titles, we handled quotations as this caused some parsing issues. Next we create a sentence embeddings by averaging the word vectors.

In [4]:
trading_days = pd.to_datetime(technical_layer['Date']).dt.tz_localize(None).dt.date
daily_news_trading_days = layers_nt.calculate_embedding_layer('Sorted_Articles_Reduced.csv',trading_days)
merged = technical_layer
merged['sentence_vector'] = daily_news_trading_days['sentence_vector']

In [None]:
lookback = 3
X_news = []
X_tech = []
y = []

for i in range(lookback, len(merged) - 1):
    news_seq = np.stack(merged['sentence_vector'].iloc[i-lookback:i].values)  # shape (5, 100)
    tech_seq = merged[['Stochastic_%K', 'Williams_%R', 'Stochastic_%D',
                       'AD_Oscillator', 'Momentum', 'Disparity', 'ROC']].iloc[i-lookback:i].values  # shape (5, 7)
    if np.isnan(news_seq).any() or np.isnan(tech_seq).any():
        continue

    today_close = merged['Close'].iloc[int(i)]
    next_close = merged['Close'].iloc[int(i) + 1]
    label = 1 if float(next_close.iloc[0]) > float(today_close.iloc[0]) else 0

    X_news.append(news_seq)
    X_tech.append(tech_seq)
    y.append(label)

X_news = np.array(X_news)      # shape: (num_samples, 5, 100)
X_tech = np.array(X_tech)      # shape: (num_samples, 5, 7)
y = np.array(y)                # shape: (num_samples,)

print("News shape:", X_news.shape)
print("Tech shape:", X_tech.shape)
print("Labels shape:", y.shape)

  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(nex

News shape: (1502, 3, 100)
Tech shape: (1502, 3, 7)
Labels shape: (1502,)


  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(nex

In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
import copy
import model_nt
import torch
import torch.nn as nn
import torch.nn.functional as F

X_news_tensor = torch.tensor(X_news, dtype=torch.float32)
X_tech_tensor = torch.tensor(X_tech, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

tscv = TimeSeriesSplit(n_splits=5)
fold = 1
best_acc = 0.0
best_model_state = None

for train_idx, test_idx in tscv.split(X_news_tensor):
    print(f"\n--- Fold {fold} ---")

    train_dataset = TensorDataset(X_news_tensor[train_idx], X_tech_tensor[train_idx], y_tensor[train_idx])
    test_dataset = TensorDataset(X_news_tensor[test_idx], X_tech_tensor[test_idx], y_tensor[test_idx])

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = model_nt.NewsTechLSTM()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(20):
        total_loss = 0
        for news_batch, tech_batch, y_batch in train_loader:
            optimizer.zero_grad()
            out = model(news_batch, tech_batch)
            loss = loss_fn(out, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for news_batch, tech_batch, y_batch in test_loader:
            out = model(news_batch, tech_batch)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(y_batch.numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"Fold {fold} Accuracy: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_model_state = copy.deepcopy(model.state_dict())

    fold += 1

torch.save(best_model_state, 'best_news_tech_model.pt')
print(f"\nBest model saved with accuracy: {best_acc:.4f}")


--- Fold 1 ---
Epoch 1 Loss: 0.6943
Epoch 2 Loss: 0.6935
Epoch 3 Loss: 0.6926
Epoch 4 Loss: 0.6928
Epoch 5 Loss: 0.6934
Epoch 6 Loss: 0.6910
Epoch 7 Loss: 0.6946
Epoch 8 Loss: 0.6926
Epoch 9 Loss: 0.6915
Epoch 10 Loss: 0.6911
Epoch 11 Loss: 0.6930
Epoch 12 Loss: 0.6912
Epoch 13 Loss: 0.6908
Epoch 14 Loss: 0.6915
Epoch 15 Loss: 0.6927
Epoch 16 Loss: 0.6898
Epoch 17 Loss: 0.6893
Epoch 18 Loss: 0.6901
Epoch 19 Loss: 0.6906
Epoch 20 Loss: 0.6944
Fold 1 Accuracy: 0.4640

--- Fold 2 ---
Epoch 1 Loss: 0.6920
Epoch 2 Loss: 0.6896
Epoch 3 Loss: 0.6909
Epoch 4 Loss: 0.6902
Epoch 5 Loss: 0.6891
Epoch 6 Loss: 0.6900
Epoch 7 Loss: 0.6883
Epoch 8 Loss: 0.6873
Epoch 9 Loss: 0.6900
Epoch 10 Loss: 0.6896
Epoch 11 Loss: 0.6882
Epoch 12 Loss: 0.6870
Epoch 13 Loss: 0.6883
Epoch 14 Loss: 0.6888
Epoch 15 Loss: 0.6871
Epoch 16 Loss: 0.6882
Epoch 17 Loss: 0.6876
Epoch 18 Loss: 0.6888
Epoch 19 Loss: 0.6870
Epoch 20 Loss: 0.6875
Fold 2 Accuracy: 0.5360

--- Fold 3 ---
Epoch 1 Loss: 0.6892
Epoch 2 Loss: 0.6864


We evaluate the model by seeing its performance on fiscal year 2014

In [None]:
model = model_nt.NewsTechLSTM()
model.load_state_dict(torch.load("best_news_tech_model.pt"))
model.eval()
technical_indicators = layers_nt.calculate_technical_indicator(start_date='2014-01-01', end_date='2014-12-31')
trading_days = pd.to_datetime(technical_indicators['Date']).dt.tz_localize(None).dt.date
daily_news_trading_days = layers_nt.calculate_embedding_layer('test2014.csv',trading_days)

merged = technical_indicators
merged['sentence_vector'] = daily_news_trading_days['sentence_vector']
y_true,y_pred = [], []

for i in range(lookback, len(merged) - 1):
    news_seq = np.stack(merged['sentence_vector'].iloc[i-lookback:i].values)
    tech_seq = merged[['Stochastic_%K', 'Williams_%R', 'Stochastic_%D','AD_Oscillator', 'Momentum', 'Disparity', 'ROC']].iloc[i-lookback:i].values

    news_tensor = torch.tensor(news_seq[np.newaxis, ...], dtype=torch.float32)
    tech_tensor = torch.tensor(tech_seq[np.newaxis, ...], dtype=torch.float32)

    with torch.no_grad():
        out = model(news_tensor, tech_tensor)
        pred = torch.argmax(out, dim=1).item()

    today_close = merged['Close'].iloc[int(i)]
    next_close = merged['Close'].iloc[int(i) + 1]
    label = 1 if float(next_close.iloc[0]) > float(today_close.iloc[0]) else 0
    
    y_pred.append(pred)
    y_true.append(label)


[*********************100%***********************]  1 of 1 completed


Model Accuracy on 2014 Test Set: 0.5709


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, balanced_accuracy_score

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

print(f"\n--- Model Performance on 2014 test set ---")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Balanced Accuracy: {bal_acc:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


--- Model Performance on 2014 test set ---
Accuracy: 0.5709
Precision: 0.5869
Recall: 0.8741
F1-Score: 0.7022
Balanced Accuracy: 0.5140
ROC-AUC: 0.5140
