The goal of this code is the building of an SI-RCNN model to forcast closing price directional movements.

The first step is the loading of seven technical indicators from our stock of choice. For the remit of this assignment we used the S&P 500.

In [38]:
import pandas as pd
import numpy as np
import yfinance as yf
import import_ipynb
import layers_nt


start_date = "2007-12-31"
end_date = "2017-12-31"
technical_layer = layers_nt.calculate_technical_indicator(start_date,end_date)
technical_layer.head()

[*********************100%***********************]  1 of 1 completed


Price,Date,Close,High,Low,Open,Volume,Stochastic_%K,Williams_%R,Stochastic_%D,AD_Line,AD_Oscillator,Momentum,Disparity,ROC
Ticker,Unnamed: 1_level_1,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,2008-01-03,1447.160034,1456.800049,1443.72998,1447.550049,3429500000,15.07727,-84.92273,13.078558,-5271027000.0,-4865881000.0,-31.329956,99.514061,-2.119051
3,2008-01-04,1411.630005,1444.01001,1411.189941,1444.01001,4166000000,0.726416,-99.273584,7.901843,-9325308000.0,-7952933000.0,-56.72998,98.349724,-3.863493
4,2008-01-07,1416.180054,1423.869995,1403.449951,1414.069946,4221260000,23.861442,-76.138558,12.293929,-8283399000.0,-4641825000.0,-30.97998,99.381752,-2.140743
5,2008-01-08,1390.189941,1430.280029,1388.300049,1415.709961,4705390000,3.392378,-96.607622,13.62691,-12565130000.0,-7294099000.0,-56.970093,98.875529,-3.936682
6,2008-01-09,1409.130005,1409.189941,1378.699951,1390.25,5351030000,58.99575,-41.00425,31.194064,-7235134000.0,2090174000.0,-2.5,100.282055,-0.1771


The next step was to create another input of financial news sentence embeddings, for this we used the FNSPID dataset which hold millions of financial news records covering S&P 500 companies.

https://github.com/Zdong104/FNSPID_Financial_News_Dataset

We manually cleaned the dataset by removing some of the lower rows that had garbage data upon downloaded. Following this we loaded it into python sorted it by date and removed all other columns before saving it again so we may reduce how many times this section is run. We then reduce it to the 10 year date range of 2007-2017 which will be our training window.

The below function 

In [39]:
def extract_embedding_data(start_date,end_date,out_filename,lookback):
    start_date = pd.to_datetime(start_date) - pd.Timedelta(days=lookback)
    full_csv = pd.read_csv('All_external.csv',usecols=['Article_title', 'Date'])  
    full_csv = full_csv.sort_values('Date').reset_index(drop=True)  
    full_csv = full_csv.set_index('Date')

    filtered_csv = full_csv.loc[str(start_date):end_date]
    filtered_csv.to_csv(out_filename)

# extract_embedding_data('2007-12-31','2017-12-31','train2007_2017.csv',3)

First we tokenize the titles, we handled quotations as this caused some parsing issues. Next we create a sentence embeddings by averaging the word vectors.

In [40]:
trading_days = pd.to_datetime(technical_layer['Date']).dt.tz_localize(None).dt.date
daily_news_trading_days = layers_nt.calculate_embedding_layer('train2007_2017.csv',trading_days)
merged = technical_layer
merged['sentence_vector'] = daily_news_trading_days['sentence_vector']

In [41]:
merged

Price,Date,Close,High,Low,Open,Volume,Stochastic_%K,Williams_%R,Stochastic_%D,AD_Line,AD_Oscillator,Momentum,Disparity,ROC,sentence_vector
Ticker,Unnamed: 1_level_1,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,2008-01-03,1447.160034,1456.800049,1443.729980,1447.550049,3429500000,15.077270,-84.922730,13.078558,-5.271027e+09,-4.865881e+09,-31.329956,99.514061,-2.119051,"[0.11341399877799806, 0.3113132897123606, -0.0..."
3,2008-01-04,1411.630005,1444.010010,1411.189941,1444.010010,4166000000,0.726416,-99.273584,7.901843,-9.325308e+09,-7.952933e+09,-56.729980,98.349724,-3.863493,"[0.11800809200919483, 0.2414086187256526, -0.0..."
4,2008-01-07,1416.180054,1423.869995,1403.449951,1414.069946,4221260000,23.861442,-76.138558,12.293929,-8.283399e+09,-4.641825e+09,-30.979980,99.381752,-2.140743,"[0.09236586959109562, 0.13749066506623778, -0...."
5,2008-01-08,1390.189941,1430.280029,1388.300049,1415.709961,4705390000,3.392378,-96.607622,13.626910,-1.256513e+10,-7.294099e+09,-56.970093,98.875529,-3.936682,"[0.09118898027257308, 0.11904885413070383, -0...."
6,2008-01-09,1409.130005,1409.189941,1378.699951,1390.250000,5351030000,58.995750,-41.004250,31.194064,-7.235134e+09,2.090174e+09,-2.500000,100.282055,-0.177100,"[0.16195091112878784, -0.0007310634570047913, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514,2017-12-22,2683.340088,2685.350098,2678.129883,2684.219971,2401030000,43.739107,-56.260893,44.946579,1.202022e+12,-2.711157e+09,1.870117,100.035542,0.069742,"[-0.17170232655358383, 0.3344518069809692, -0...."
2515,2017-12-26,2680.500000,2682.739990,2677.959961,2679.090088,1970660000,17.302799,-82.697201,30.520953,1.202145e+12,-7.093981e+08,1.250000,99.914143,0.046655,"[-0.17529083112511964, 0.35032462501412204, -0..."
2516,2017-12-27,2682.620117,2685.639893,2678.909912,2682.100098,2202900000,60.679658,-39.320342,38.991228,1.202371e+12,1.413899e+09,-1.949951,100.017401,-0.072636,"[-0.1723699686297925, 0.2845121734706942, -0.7..."
2517,2017-12-28,2687.540039,2687.659912,2682.689941,2686.100098,2174890000,98.764189,-1.235811,79.721924,1.204441e+12,2.419672e+09,4.199951,100.148559,0.156520,


In [42]:
lookback = 3
X_news = []
X_tech = []
y = []

for i in range(lookback, len(merged) - 1):
    news_seq = np.stack(merged['sentence_vector'].iloc[i-lookback:i].values)
    tech_seq = merged[['Stochastic_%K', 'Williams_%R', 'Stochastic_%D',
                       'AD_Oscillator', 'Momentum', 'Disparity', 'ROC']].iloc[i-lookback:i].values
    if np.isnan(news_seq).any() or np.isnan(tech_seq).any():
        continue

    today_close = merged['Close'].iloc[int(i)]
    next_close = merged['Close'].iloc[int(i) + 1]
    label = 1 if float(next_close.iloc[0]) > float(today_close.iloc[0]) else 0

    X_news.append(news_seq)
    X_tech.append(tech_seq)
    y.append(label)

X_news = np.array(X_news)
X_tech = np.array(X_tech) 
y = np.array(y) 

print("News shape:", X_news.shape)
print("Tech shape:", X_tech.shape)
print("Labels shape:", y.shape)

News shape: (2513, 3, 100)
Tech shape: (2513, 3, 7)
Labels shape: (2513,)


We split the dataset into 10 folds which each fold roughly translatingto a year.

In [43]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader
import copy
import model_nt
import torch
import torch.nn as nn
import torch.nn.functional as F

patience = 10
delta = 0.001

X_news_tensor = torch.tensor(X_news, dtype=torch.float32)
X_tech_tensor = torch.tensor(X_tech, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

tscv = TimeSeriesSplit(n_splits=5)
fold = 1
best_acc = 0.0
best_model_state = None

for train_idx, test_idx in tscv.split(X_news_tensor):
    print(f"\n--- Fold {fold} ---")

    train_dataset = TensorDataset(X_news_tensor[train_idx], X_tech_tensor[train_idx], y_tensor[train_idx])
    test_dataset = TensorDataset(X_news_tensor[test_idx], X_tech_tensor[test_idx], y_tensor[test_idx])

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = model_nt.NewsTechLSTM()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    best_val_loss = np.inf
    epochs_no_improve = 0
    early_stop = False

    model.train()
    for epoch in range(50):
        total_loss = 0
        for news_batch, tech_batch, y_batch in train_loader:
            optimizer.zero_grad()
            out = model(news_batch, tech_batch)
            loss = loss_fn(out, y_batch)
            preds = torch.argmax(out, dim=1)
            up_ratio = (preds == 1).float().mean().item()
            if up_ratio >= 0.97 :
                penalty = 0.5
                loss += penalty
            elif up_ratio >= 0.9 :
                penalty = 0.1
                loss += penalty
            loss.backward()
            optimizer.step()
            total_loss += loss.item()      
        avg_loss = total_loss / len(train_loader)
        #print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

    # Validation (test set for this fold)
        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for news_batch, tech_batch, y_batch in test_loader:
                out = model(news_batch, tech_batch)
                loss = loss_fn(out, y_batch)
                val_loss += loss.item()
                preds = torch.argmax(out, dim=1)
                all_preds.extend(preds.numpy())
                all_labels.extend(y_batch.numpy())
        print(all_preds)
        avg_val_loss = val_loss / len(test_loader)
        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val Acc={acc:.4f}")

        # Early stopping logic
        if avg_val_loss < best_val_loss - delta:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            best_fold_model_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                early_stop = True
                model.load_state_dict(best_fold_model_state)
                break

    # Final evaluation for this fold
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for news_batch, tech_batch, y_batch in test_loader:
            out = model(news_batch, tech_batch)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(y_batch.numpy())
            
    acc = accuracy_score(all_labels, all_preds)
    print(f"Fold {fold} Accuracy: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_model_state = copy.deepcopy(model.state_dict())

    fold += 1

torch.save(best_model_state, 'best_news_tech_model.pt')
print(f"\nBest model saved with accuracy: {best_acc:.4f}")


--- Fold 1 ---
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

We evaluate the model by seeing its performance on fiscal year 2014

In [44]:
model = model_nt.NewsTechLSTM()
model.load_state_dict(torch.load("best_news_tech_model.pt"))
model.eval()
technical_indicators = layers_nt.calculate_technical_indicator(start_date='2018-01-01', end_date='2018-12-31')
trading_days = pd.to_datetime(technical_indicators['Date']).dt.tz_localize(None).dt.date
daily_news_trading_days = layers_nt.calculate_embedding_layer('test2018.csv',trading_days)

merged = technical_indicators
merged['sentence_vector'] = daily_news_trading_days['sentence_vector']
y_true,y_pred = [], []

for i in range(lookback, len(merged) - 1):
    news_seq = np.stack(merged['sentence_vector'].iloc[i-lookback:i].values)
    tech_seq = merged[['Stochastic_%K', 'Williams_%R', 'Stochastic_%D','AD_Oscillator', 'Momentum', 'Disparity', 'ROC']].iloc[i-lookback:i].values

    news_tensor = torch.tensor(news_seq[np.newaxis, ...], dtype=torch.float32)
    tech_tensor = torch.tensor(tech_seq[np.newaxis, ...], dtype=torch.float32)

    with torch.no_grad():
        out = model(news_tensor, tech_tensor)
        pred = torch.argmax(out, dim=1).item()

    today_close = merged['Close'].iloc[int(i)]
    next_close = merged['Close'].iloc[int(i) + 1]
    label = 1 if float(next_close.iloc[0]) > float(today_close.iloc[0]) else 0
    
    y_pred.append(pred)
    y_true.append(label)


[*********************100%***********************]  1 of 1 completed


In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, balanced_accuracy_score

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)
roc_auc = roc_auc_score(y_true, y_pred)

print(f"\n--- Model Performance on 2014 test set ---")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Balanced Accuracy: {bal_acc:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


--- Model Performance on 2014 test set ---
Accuracy: 0.5041
Precision: 0.5102
Recall: 0.8000
F1-Score: 0.6231
Balanced Accuracy: 0.4966
ROC-AUC: 0.4966


In [46]:
y_pred

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]