The goal of this code is the building of an SI-RCNN model to forcast intraday directional movements.

The first step is the loading of seven technical indicators from our stock of choice. For the remit of this assignment we used the S&P 500.

We made use of the following 7 indicators:


1. Stochastic %K
2. William’s %R
3. Stochastic %D
4. A/D Oscillator
5. Momentum
6. Disparity
7. Rate of Change

In [22]:
# import yfinance as yf
# import pandas as pd
# import numpy as np
# import ta

# data = yf.download("^GSPC", start="2023-01-01", end="2025-01-01", interval="1d")
# data.dropna(inplace=True)

# # 1. SMA (Simple Moving Average - 20 days)
# data['SMA_20'] = data['Close'].rolling(window=20).mean()

# # 2. EMA (Exponential Moving Average - 20 days)
# data['EMA_20'] = data['Close'].ewm(span=20, adjust=False).mean()

# # 3. RSI (Relative Strength Index - 14 days)
# delta = data['Close'].diff()
# gain = np.where(delta > 0, delta, 0)
# loss = np.where(delta < 0, -delta, 0)
# avg_gain = pd.Series(gain.reshape(-1)).rolling(window=14).mean()
# avg_loss = pd.Series(loss.reshape(-1)).rolling(window=14).mean()
# rs = avg_gain / avg_loss
# data['RSI_14'] = 100 - (100 / (1 + rs))

# # 4. MACD (Moving Average Convergence Divergence)
# ema_12 = data['Close'].ewm(span=12, adjust=False).mean()
# ema_26 = data['Close'].ewm(span=26, adjust=False).mean()
# data['MACD'] = ema_12 - ema_26

# # 5. Stochastic Oscillator %K (14-day)
# low_14 = data['Low'].rolling(window=14).min()
# high_14 = data['High'].rolling(window=14).max()
# data['Stochastic_K'] = 100 * ((data['Close'] - low_14) / (high_14 - low_14))

# # 6. ATR (Average True Range - 14 days)
# high_low = data['High'] - data['Low']
# high_close = np.abs(data['High'] - data['Close'].shift())
# low_close = np.abs(data['Low'] - data['Close'].shift())
# true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
# data['ATR'] = true_range.rolling(window=14).mean()

# # 7. OBV (On-Balance Volume)
# obv = (np.sign(data['Close'].diff()) * data['Volume']).fillna(0).cumsum()
# data['OBV'] = obv

# # Keep only indicator columns
# indicators = data[['SMA_20', 'EMA_20', 'RSI_14', 'MACD', 'Stochastic_K', 'ATR', 'OBV']].dropna()

In [23]:
import pandas as pd
import numpy as np
import yfinance as yf

# Ensure the data is sorted by date
technical_layer = yf.download("^GSPC", start="2023-01-01", end="2025-01-01", interval="1d")
technical_layer.reset_index(inplace=True)
technical_layer.dropna(inplace=True)


# Parameters
lookback = 14  # typical lookback for most of these indicators

# 1. Stochastic %K
low_min = technical_layer['Low'].rolling(window=lookback).min()
high_max = technical_layer['High'].rolling(window=lookback).max()
technical_layer['Stochastic_%K'] = 100 * ((technical_layer['Close'] - low_min) / (high_max - low_min))

# 2. Williams %R
technical_layer["Williams_%R"] = -100 * ((high_max - technical_layer['Close']) / (high_max - low_min))

# 3. Stochastic %D (3-period SMA of %K)
technical_layer['Stochastic_%D'] = technical_layer['Stochastic_%K'].rolling(window=3).mean()

# 4. A/D Oscillator (Accumulation/Distribution Line)
ad = ((technical_layer['Close'] - technical_layer['Low']) - (technical_layer['High'] - technical_layer['Close'])) / (technical_layer['High'] - technical_layer['Low']) * technical_layer['Volume']
technical_layer['AD_Line'] = ad.cumsum()
technical_layer['AD_Oscillator'] = technical_layer['AD_Line'] - technical_layer['AD_Line'].shift(lookback)

# 5. Momentum (Close - Close n periods ago)
technical_layer['Momentum'] = technical_layer['Close'] - technical_layer['Close'].shift(lookback)

# 6. Disparity (Close / Moving Average * 100)
technical_layer['Disparity'] = (technical_layer['Close'] / technical_layer['Close'].rolling(window=lookback).mean()) * 100

# 7. Rate of Change (ROC)
technical_layer['ROC'] = ((technical_layer['Close'] - technical_layer['Close'].shift(lookback)) / technical_layer['Close'].shift(lookback)) * 100

# Display relevant columns
technical_indicators = technical_layer[['Date', 'Stochastic_%K', 'Williams_%R', 'Stochastic_%D','AD_Oscillator', 'Momentum', 'Disparity', 'ROC']]
technical_indicators.dropna(inplace=True)
print(technical_indicators.head())

[*********************100%***********************]  1 of 1 completed

Price        Date Stochastic_%K Williams_%R Stochastic_%D AD_Oscillator  \
Ticker                                                                    
15     2023-01-25     90.252829   -9.747171     90.951325  1.302313e+10   
16     2023-01-26     99.547583   -0.452417     93.453797  1.936106e+10   
17     2023-01-27     89.097404  -10.902596     92.965938  1.618603e+10   
18     2023-01-30     64.761217  -35.238783     84.468735  1.680585e+10   
19     2023-01-31     91.560900   -8.439100     81.806507  1.764679e+10   

Price     Momentum   Disparity       ROC  
Ticker                                    
15      163.250000  101.656858  4.236991  
16      252.329834  102.309142  6.626134  
17      175.479980  102.241487  4.505170  
18      125.679932  100.688509  3.229112  
19      157.350098  101.875888  4.014801  



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  technical_indicators.dropna(inplace=True)


The next step was to create another input of financial news sentence embeddings, for this we used the FNSPID dataset which hold millions of financial news records covering S&P 500 companies.

https://github.com/Zdong104/FNSPID_Financial_News_Dataset

First we tokenize the titles, we handled quotations as this caused some parsing issues.

In [24]:
from gensim.models import Word2Vec
import re

embedding_layer = pd.read_csv('All_external_subset.csv')  
print(embedding_layer.head)
embedding_layer = embedding_layer.sort_values('Date').reset_index(drop=True)  

def preprocess_title(title):
    title = title.lower()
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    tokens = re.findall(r"\b[a-zA-Z']+\b", title)
    return tokens

token_list = embedding_layer['Article_title'].apply(preprocess_title)

<bound method NDFrame.head of                        Date  \
0   2020-06-05 06:30:54 UTC   
1   2020-06-03 06:45:20 UTC   
2   2020-05-26 00:30:07 UTC   
3   2020-05-22 08:45:06 UTC   
4   2020-05-22 07:38:59 UTC   
5   2020-05-22 07:23:25 UTC   
6   2020-05-22 05:36:20 UTC   
7   2020-05-22 05:07:04 UTC   
8   2020-05-22 04:37:59 UTC   
9   2020-05-22 04:06:17 UTC   
10  2020-05-22 00:00:00 UTC   
11  2020-05-22 00:00:00 UTC   
12  2020-05-21 00:00:00 UTC   
13  2020-05-21 00:00:00 UTC   
14  2020-05-21 00:00:00 UTC   
15  2020-05-21 00:00:00 UTC   
16  2020-05-18 00:00:00 UTC   
17  2020-05-16 00:00:00 UTC   
18  2020-05-15 00:00:00 UTC   
19  2020-05-08 00:00:00 UTC   
20  2020-05-05 00:00:00 UTC   
21  2020-05-01 00:00:00 UTC   
22  2020-04-28 00:00:00 UTC   
23  2020-04-23 00:00:00 UTC   
24  2020-04-22 00:00:00 UTC   
25  2020-04-14 00:00:00 UTC   
26  2020-04-08 00:00:00 UTC   
27  2020-04-06 00:00:00 UTC   
28  2020-04-02 00:00:00 UTC   
29  2020-04-01 00:00:00 UTC   
30  2020-

Next we create a sentence embeddings by averaging the word vectors

In [None]:
model = Word2Vec(sentences=token_list, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(tokens):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

embedding_layer['sentence_vector'] = token_list.apply(get_sentence_vector)

In [27]:
print(embedding_layer['sentence_vector'].head)
embedding_layer['sentence_vector'].to_csv("sentence_vector")

<bound method NDFrame.head of 0     [0.00038041366, 0.00038732446, 0.00061241444, ...
1     [-0.0010377998, 0.00042375515, -0.0011617246, ...
2     [-0.0023824708, -0.0012906671, 0.0009930803, 0...
3     [-0.0007787028, 0.0014723241, -0.0006502996, -...
4     [-0.0023002836, 0.001976771, -0.00018580035, -...
5     [-0.0010352622, -0.0015254191, -0.002226289, -...
6     [-0.00014356375, -0.00014395872, -0.0015540291...
7     [0.0007607373, 0.0022731104, -0.0003411403, 0....
8     [-0.0008626608, -0.0023146372, 0.002436483, 0....
9     [-0.0008346233, 0.0016315253, 0.0006025097, -0...
10    [-0.00014000069, 0.002407807, 0.0018011844, -0...
11    [-0.0008903536, 0.0041253697, -0.0019558165, 0...
12    [-0.00420256, 0.001401495, 0.0012031626, 0.002...
13    [0.0015997442, 0.0040907594, 0.0014146743, -0....
14    [-0.0014908734, 0.0027777825, 0.001969549, 0.0...
15    [0.0014033166, 0.0020460426, -0.0011179068, -9...
16    [0.001953044, 0.00066828105, 0.0009675104, 0.0...
17    [0.003297134

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


day_to_vectors = embedding_layer.groupby('Date')['sentence_vector'].apply(list)
day_tensors = []
for vectors in day_to_vectors:
    tensor = torch.tensor(vectors, dtype=torch.float32)
    day_tensors.append(tensor)

# # Example: get shape info
# for i, tensor in enumerate(day_tensors[:3]):
#     print(f"Day {i+1}: shape {tensor.shape}")  # (num_titles, 100)

# Define CNN model
class NewsCNN(nn.Module):
    def __init__(self, embedding_dim=100, out_channels=64, kernel_size=3, dropout=0.5):
        super(NewsCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_channels, kernel_size=kernel_size, padding=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: (batch_size, seq_len, embedding_dim) → need (batch_size, embedding_dim, seq_len)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, kernel_size=x.size(2))  # Temporal max pooling over sequence
        x = x.squeeze(2)  # Remove the pooled dimension
        x = self.dropout(x)
        return x

# Example: Create model and test on a batch of days
model = NewsCNN()

# Create batch: pad sequences to max length (optional, if using batch training)
from torch.nn.utils.rnn import pad_sequence

batch = pad_sequence(day_tensors, batch_first=True)  # Shape: (batch_size, max_seq_len, 100)
print("Batch shape:", batch.shape)

# Forward pass
output = model(batch)
print("CNN output shape:", output.shape)  # (batch_size, out_channels)

Batch shape: torch.Size([27, 4, 100])
CNN output shape: torch.Size([27, 64])
