The goal of this code is the building of an SI-RCNN model to forcast intraday directional movements.

The first step is the loading of seven technical indicators from our stock of choice. For the remit of this assignment we used the S&P 500.

We made use of the following 7 indicators:


1. Stochastic %K
2. William’s %R
3. Stochastic %D
4. A/D Oscillator
5. Momentum
6. Disparity
7. Rate of Change

In [1]:
# import yfinance as yf
# import pandas as pd
# import numpy as np
# import ta

# data = yf.download("^GSPC", start="2023-01-01", end="2025-01-01", interval="1d")
# data.dropna(inplace=True)

# # 1. SMA (Simple Moving Average - 20 days)
# data['SMA_20'] = data['Close'].rolling(window=20).mean()

# # 2. EMA (Exponential Moving Average - 20 days)
# data['EMA_20'] = data['Close'].ewm(span=20, adjust=False).mean()

# # 3. RSI (Relative Strength Index - 14 days)
# delta = data['Close'].diff()
# gain = np.where(delta > 0, delta, 0)
# loss = np.where(delta < 0, -delta, 0)
# avg_gain = pd.Series(gain.reshape(-1)).rolling(window=14).mean()
# avg_loss = pd.Series(loss.reshape(-1)).rolling(window=14).mean()
# rs = avg_gain / avg_loss
# data['RSI_14'] = 100 - (100 / (1 + rs))

# # 4. MACD (Moving Average Convergence Divergence)
# ema_12 = data['Close'].ewm(span=12, adjust=False).mean()
# ema_26 = data['Close'].ewm(span=26, adjust=False).mean()
# data['MACD'] = ema_12 - ema_26

# # 5. Stochastic Oscillator %K (14-day)
# low_14 = data['Low'].rolling(window=14).min()
# high_14 = data['High'].rolling(window=14).max()
# data['Stochastic_K'] = 100 * ((data['Close'] - low_14) / (high_14 - low_14))

# # 6. ATR (Average True Range - 14 days)
# high_low = data['High'] - data['Low']
# high_close = np.abs(data['High'] - data['Close'].shift())
# low_close = np.abs(data['Low'] - data['Close'].shift())
# true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
# data['ATR'] = true_range.rolling(window=14).mean()

# # 7. OBV (On-Balance Volume)
# obv = (np.sign(data['Close'].diff()) * data['Volume']).fillna(0).cumsum()
# data['OBV'] = obv

# # Keep only indicator columns
# indicators = data[['SMA_20', 'EMA_20', 'RSI_14', 'MACD', 'Stochastic_K', 'ATR', 'OBV']].dropna()

In [2]:
import pandas as pd
import numpy as np
import yfinance as yf

# Ensure the data is sorted by date
technical_layer = yf.download("^GSPC", start="2008-01-01", end="2009-12-31", interval="1d")
technical_layer.reset_index(inplace=True)
technical_layer.dropna(inplace=True)


# Parameters
lookback = 14  # typical lookback for most of these indicators

# 1. Stochastic %K
low_min = technical_layer['Low'].rolling(window=lookback).min()
high_max = technical_layer['High'].rolling(window=lookback).max()
technical_layer['Stochastic_%K'] = 100 * ((technical_layer['Close'] - low_min) / (high_max - low_min))

# 2. Williams %R
technical_layer["Williams_%R"] = -100 * ((high_max - technical_layer['Close']) / (high_max - low_min))

# 3. Stochastic %D (3-period SMA of %K)
technical_layer['Stochastic_%D'] = technical_layer['Stochastic_%K'].rolling(window=3).mean()

# 4. A/D Oscillator (Accumulation/Distribution Line)
ad = ((technical_layer['Close'] - technical_layer['Low']) - (technical_layer['High'] - technical_layer['Close'])) / (technical_layer['High'] - technical_layer['Low']) * technical_layer['Volume']
technical_layer['AD_Line'] = ad.cumsum()
technical_layer['AD_Oscillator'] = technical_layer['AD_Line'] - technical_layer['AD_Line'].shift(lookback)

# 5. Momentum (Close - Close n periods ago)
technical_layer['Momentum'] = technical_layer['Close'] - technical_layer['Close'].shift(lookback)

# 6. Disparity (Close / Moving Average * 100)
technical_layer['Disparity'] = (technical_layer['Close'] / technical_layer['Close'].rolling(window=lookback).mean()) * 100

# 7. Rate of Change (ROC)
technical_layer['ROC'] = ((technical_layer['Close'] - technical_layer['Close'].shift(lookback)) / technical_layer['Close'].shift(lookback)) * 100

# # Display relevant columns
# technical_indicators = technical_layer[['Date', 'Stochastic_%K', 'Williams_%R', 'Stochastic_%D','AD_Oscillator', 'Momentum', 'Disparity', 'ROC']]
# technical_indicators.dropna(inplace=True)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


The next step was to create another input of financial news sentence embeddings, for this we used the FNSPID dataset which hold millions of financial news records covering S&P 500 companies.

https://github.com/Zdong104/FNSPID_Financial_News_Dataset

We manually cleaned the dataset by removing some of the lower rows that had garbage data upon downloaded. Following this we loaded it into python sorted it by date and removed all other columns before saving it again so we may reduce how many times this section is run. We then reduce it to the 5 year date range of 2008-2013 which will be our training window.

In [3]:
# full_csv = pd.read_csv('All_external.csv',usecols=['Article_title', 'Date'])  
# full_csv = full_csv.sort_values('Date').reset_index(drop=True)  

# full_csv = full_csv.set_index('Date')
# full_csv.to_csv('Sorted_Articles.csv')    

# filtered_layer = embedding_layer.loc['2008-01-01':'2013-12-31']
# filtered_layer.to_csv('Sorted_Articles_Reduced.csv')

First we tokenize the titles, we handled quotations as this caused some parsing issues.

In [4]:
from gensim.models import Word2Vec
import re

embedding_layer = pd.read_csv('Sorted_Articles_Reduced_Further.csv')  
embedding_layer = embedding_layer.sort_values('Date').reset_index(drop=True)  
embedding_layer['Date'] = pd.to_datetime(embedding_layer['Date']).dt.tz_localize(None).dt.date

def preprocess_title(title):
    title = str(title).lower()
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    tokens = re.findall(r"\b[a-zA-Z']+\b", title)
    return tokens

token_list = embedding_layer['Article_title'].apply(preprocess_title)    

In [5]:
print(embedding_layer.head())

         Date                                      Article_title
0  2008-01-01  В гостинице в Гане нашли двух мертвых моряков ...
1  2008-01-01  ЦИК Пакистана назначит выборы в парламент стра...
2  2008-01-01  Новый год на Красной площади встретили 40 тыся...
3  2008-01-01  Жительница Греции объявила эскизы ван Гога вое...
4  2008-01-01              ООН призвала на помощь Человека-Паука


Next we create a sentence embeddings by averaging the word vectors

In [6]:
model = Word2Vec(sentences=token_list, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(tokens):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

embedding_layer['sentence_vector'] = token_list.apply(get_sentence_vector)

daily_news = embedding_layer.groupby('Date')['sentence_vector'].apply(
    lambda vecs: np.mean(list(vecs), axis=0)
).reset_index()
trading_days = pd.to_datetime(technical_layer['Date']).dt.tz_localize(None).dt.date
daily_news_trading_days = daily_news[daily_news['Date'].isin(trading_days)].reset_index(drop=True)

In [7]:
merged = technical_layer
merged['sentence_vector'] = daily_news_trading_days['sentence_vector']

In [8]:

print(daily_news_trading_days)
print(trading_days)

           Date                                    sentence_vector
0    2008-01-02  [0.2440001643770927, -0.557558611012441, -0.11...
1    2008-01-03  [0.12804681347203434, -0.4463019373631936, -0....
2    2008-01-04  [0.2214711876477938, -0.5480950954532419, -0.0...
3    2008-01-07  [0.08661896623509543, -0.3280746973550903, -0....
4    2008-01-08  [0.10874798711850829, -0.3389029068049729, -0....
..          ...                                                ...
499  2009-12-23  [0.07896654499136317, -0.3301854333265109, -0....
500  2009-12-24  [0.17497795561159546, -0.36503062808560793, -0...
501  2009-12-28  [-0.025856152425437456, -0.17675778438299147, ...
502  2009-12-29  [0.12245176102151632, -0.3678337718904045, -0....
503  2009-12-30  [0.13769285454543606, -0.4702379035099631, -0....

[504 rows x 2 columns]
0      2008-01-02
1      2008-01-03
2      2008-01-04
3      2008-01-07
4      2008-01-08
          ...    
499    2009-12-23
500    2009-12-24
501    2009-12-28
502    200

In [13]:
sequence_length = 5
X_news = []
X_tech = []
y = []

for i in range(sequence_length, len(merged) - 1):
    news_seq = np.stack(merged['sentence_vector'].iloc[i-sequence_length:i].values)  # shape (5, 100)
    tech_seq = merged[['Stochastic_%K', 'Williams_%R', 'Stochastic_%D',
                       'AD_Oscillator', 'Momentum', 'Disparity', 'ROC']].iloc[i-sequence_length:i].values  # shape (5, 7)

    today_close = merged['Close'].iloc[int(i)]
    next_close = merged['Close'].iloc[int(i) + 1]
    label = 1 if float(next_close) > float(today_close) else 0

    X_news.append(news_seq)
    X_tech.append(tech_seq)
    y.append(label)

X_news = np.array(X_news)      # shape: (num_samples, 5, 100)
X_tech = np.array(X_tech)      # shape: (num_samples, 5, 7)
y = np.array(y)                # shape: (num_samples,)

print("News shape:", X_news.shape)
print("Tech shape:", X_tech.shape)
print("Labels shape:", y.shape)

  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(nex

News shape: (498, 5, 100)
Tech shape: (498, 5, 7)
Labels shape: (498,)


  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(next_close) > float(today_close) else 0
  label = 1 if float(nex

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NewsTechLSTM(nn.Module):
    def __init__(self, embedding_dim=100, cnn_out_channels=64, news_hidden=128, tech_hidden=64,dropout=0.5, num_classes=2):
        super(NewsTechLSTM, self).__init__()

        # CNN + LSTM for News
        self.conv1 = nn.Conv1d(embedding_dim, cnn_out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout)
        self.news_lstm = nn.LSTM(cnn_out_channels, news_hidden, batch_first=True)

        # LSTM for Technical Indicators
        self.tech_lstm = nn.LSTM(7, tech_hidden, batch_first=True)

        # Classifier
        self.fc = nn.Linear(news_hidden + tech_hidden, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, news_seq, tech_seq):
        # news_seq: (batch, 5, 100)
        x = news_seq.permute(0, 2, 1)  # to (batch, 100, 5)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = x.permute(0, 2, 1)  # to (batch, seq_len, features)
       # LSTM layer
        news_out, _ = self.news_lstm(x) 
        news_last = news_out[:, -1, :]  
        tech_out, _ = self.tech_lstm(tech_seq)
        tech_last = tech_out[:, -1, :] 

        combined = torch.cat((news_last,tech_last),dim=1)
        out=self.fc(combined)
        return self.softmax(out)

In [26]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader

# Convert numpy arrays to torch tensors
X_news_tensor = torch.tensor(X_news, dtype=torch.float32)
X_tech_tensor = torch.tensor(X_tech, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

tscv = TimeSeriesSplit(n_splits=5)
fold = 1

for train_idx, test_idx in tscv.split(X_news_tensor):
    print(f"\n--- Fold {fold} ---")

    train_dataset = TensorDataset(X_news_tensor[train_idx], X_tech_tensor[train_idx], y_tensor[train_idx])
    test_dataset = TensorDataset(X_news_tensor[test_idx], X_tech_tensor[test_idx], y_tensor[test_idx])

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = NewsTechLSTM()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # Training
    model.train()
    for epoch in range(5):  # Adjust epochs
        total_loss = 0
        for news_batch, tech_batch, y_batch in train_loader:
            optimizer.zero_grad()
            out = model(news_batch, tech_batch)
            loss = loss_fn(out, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for news_batch, tech_batch, y_batch in test_loader:
            out = model(news_batch, tech_batch)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(y_batch.numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"Fold {fold} Accuracy: {acc:.4f}")
    fold += 1


--- Fold 1 ---
Epoch 1 Loss: nan
Epoch 2 Loss: nan
Epoch 3 Loss: nan
Epoch 4 Loss: nan
Epoch 5 Loss: nan
Fold 1 Accuracy: 0.4699

--- Fold 2 ---
Epoch 1 Loss: nan
Epoch 2 Loss: nan
Epoch 3 Loss: nan
Epoch 4 Loss: nan
Epoch 5 Loss: nan
Fold 2 Accuracy: 0.5301

--- Fold 3 ---
Epoch 1 Loss: nan
Epoch 2 Loss: nan
Epoch 3 Loss: nan
Epoch 4 Loss: nan
Epoch 5 Loss: nan
Fold 3 Accuracy: 0.4699

--- Fold 4 ---
Epoch 1 Loss: nan
Epoch 2 Loss: nan
Epoch 3 Loss: nan
Epoch 4 Loss: nan
Epoch 5 Loss: nan
Fold 4 Accuracy: 0.4578

--- Fold 5 ---
Epoch 1 Loss: nan
Epoch 2 Loss: nan
Epoch 3 Loss: nan
Epoch 4 Loss: nan
Epoch 5 Loss: nan
Fold 5 Accuracy: 0.3976
