Read dataframe of 04/2023-04/2024 news headlines related to several stocks (AMD, AAPL, NFLX, AMZN, TSCO, WMT, MSFT, LLY, GOOG, NKE, TSLA, QQQ, NVDA, stock market).
The data was scrapped with my own code and by sending requests to finnhub.io

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
import numpy as np

In [None]:
engine = create_engine('sqlite:///../output/AAPL_2023-04-06_2024-04-06/financial_data.db', echo=False)
df = pd.read_sql('AAPL_', con=engine)

In [None]:
df.head(3)

The next step before adding the labels to the data is to calculate the emotions of the headlines using FinBERT pre-trained textual model.

We are going to use a pipeline that already does for us all the preprocessing and tokenizing of the textual data so we only need to send the headlines as inputs and get the output

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sqlalchemy import MetaData, inspect

def batch(data, batch_size):
    """Batch the data to avoid too high memory consumption"""
    batches = []
    
    for i in range(0, len(data), batch_size):
        batches.append(data[i:i + batch_size])
    return batches
headlines_list = list(df.headline)
headlines_batches = batch(headlines_list, 100)

pipe = pipeline("text-classification", model="ProsusAI/finbert", return_all_scores=True)
outputs = list()

for index, batch in enumerate(headlines_batches):
    print(f"index: {index*100}", end="\r")
    output_polarity = pipe(batch)
    outputs += output_polarity

After adding the processed data the df will look like this

In [21]:
engine = create_engine('sqlite:///../output/output_with_emotions/AAPL_2023-04-06_2024-04-06.db', echo=False)
df = pd.read_sql('AAPL_', con=engine)

In [23]:
df.head(3)

Unnamed: 0,index,category,datetime,headline,id,image,related,source,summary,url,positive,negative,neutral
0,0,company,1681324461,Bitcoin Surges 81% YTD: Is The Rally For Real ...,119733697,https://static.seekingalpha.com/cdn/s3/uploads...,AAPL,SeekingAlpha,"Bitcoin is skyrocketing, now up roughly 81% YT...",https://finnhub.io/api/news?id=02a1949786f4080...,0.091022,0.076031,0.832947
1,1,company,1681318105,UPDATE 1-Cirrus Logic slumps as analyst says A...,119729225,,AAPL,Reuters,Shares of Apple Inc supplier Cirrus Logic Inc ...,https://finnhub.io/api/news?id=c22e367f55d9aa1...,0.010884,0.970709,0.018407
2,2,company,1681316532,Cirrus Logic slumps as analyst says Apple to a...,119729226,,AAPL,Reuters,Shares of Apple Inc supplier Cirrus Logic Inc ...,https://finnhub.io/api/news?id=eab0fcff4d4cb98...,0.011214,0.964231,0.024555


Add labels to the data using yfinance and each time calculating if the price of the relevant stock went up the next hour or went down
  

In [None]:
def get_str_dates(df):
    dates_list = list()

    for index, row in df.iterrows():
        # Convert date of headline to str dates 
        date_obj = datetime.utcfromtimestamp(row.datetime)

        str_date = date_obj.strftime('%Y-%m-%d %H:%M:%S')
        dates_list.append(str_date)
    df["str_date"] = dates_list
    

def add_market_change(data):
    data_change = [-1]

    for num_index, (index, row) in enumerate(data[1:].iterrows()):

        if row.Open > data.iloc[num_index].Close:
            data_change.append(1)
        else:
            data_change.append(0)

    data['data_change'] = data_change
        

def find_next_open_date(data, date, counter=0):
    
    if counter == 10:
        return -1
    
    date_obj = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
    date_obj += timedelta(days=1)
    date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
    
    if date_str[:10] not in data.index:
        return find_next_open_date(data, date_str, counter+1)
    return data.loc[date_str[:10]].data_change
        

def get_finance_data(data, df):
    finance_data = list()

    for index, row in df.iterrows():
        finance_data_obj = find_next_open_date(data, row.str_date, counter=0)
        finance_data.append(int(finance_data_obj))
    df["market_change"] = finance_data
    

news_dir = "../output/output_with_emotions"
news_with_finance_and_emotions = "../output/output_with_emotions_and_finance"

if not os.path.exists(news_with_finance_and_emotions):
    os.mkdir(news_with_finance_and_emotions)


for file in os.listdir(news_dir):
    
    ticker = file.split("_")[0]
    
    engine = create_engine(f"sqlite:///{news_dir}/{file}", echo=False)
    df = pd.read_sql(f"{ticker}_", con=engine, index_col="index")
        
    data = yf.download(ticker, period="1y")
    
    get_str_dates(df)
    add_market_change(data)
    get_finance_data(data, df)
        
    df.drop(columns=['url', 'image', 'id', 'headline', 'datetime', 'summary', 'category'], inplace=True)
    engine_output = create_engine(f"sqlite:///{news_with_finance_and_emotions}/{file}", echo=False)
    df.to_sql(f"{ticker}_", con=engine_output)

Some of the news headlines are just adds so we need to remove them

In [None]:
def remove_adds(df):
    indexes_to_remove = list()

    start_date = datetime.strptime("2023-04-12", "%Y-%m-%d")

    for index, row in df.iterrows():
        date_obj = datetime.strptime(row.str_date, "%Y-%m-%d %H:%M:%S")
        if date_obj < start_date:
            indexes_to_remove.append(index)
        else:
            break
    df.drop(indexes_to_remove, inplace=True)
    
    
# Combining all the data
df = pd.DataFrame(columns=["related", "source", "positive", 
                           "negative", "neutral", "str_date", "market_change"])

for file in os.listdir(news_with_finance_and_emotions):
    
    if file == ".DS_Store":
        continue
    ticker = file.split("_")[0]
    engine = create_engine(f"sqlite:///{news_with_finance_and_emotions}/{file}", echo=False)
    
    df_tmp = pd.read_sql(f"{ticker}_", con=engine, index_col="index")
    df = pd.concat([df, df_tmp])

# Convert text (that represent a category) to numbers
for col in ['source']:
    df[col], _ = pd.factorize(df[col])
df.sort_values("str_date", inplace=True)
df.reset_index(inplace=True, drop=True)
remove_adds(df)
df.reset_index(inplace=True, drop=True)

The final results -

In [64]:
engine = create_engine('sqlite:///../output/all_output_parsed_hourly_market_change.db', echo=False)
df = pd.read_sql('ALL', con=engine)


In [26]:
df.tail(3)

Unnamed: 0,index,related,source,positive,negative,neutral,str_date,market_change
118279,118279,QQQ,0,0.359381,0.612389,0.02823,2024-04-06 03:30:00,1
118280,118280,QQQ,0,0.072482,0.879065,0.048453,2024-04-06 03:50:00,1
118281,118281,AAPL,0,0.026854,0.02776,0.945386,2024-04-06 06:45:00,0


related is the stock, source is the website from which the headline came (transformed to integer categories) and the market_change is the label that we want to predict

In [66]:
df.drop(df[df.market_change == -1].index, inplace=True)
for col in ['related']:
    df[col], _ = pd.factorize(df[col])
df.reset_index(inplace=True, drop=True)

## Known models verification

Prepare the DF

In [27]:
df.drop(df[df.market_change==-1].index, inplace=True)
df.reset_index(drop=True, inplace=True)

label = df.pop("market_change")

# Convert 'related' to an integer represantation of category 
# (in a later stage i will convert it to a one-hot array) 
for col in ['related']:
    df[col], _ = pd.factorize(df[col])

Split the data to test and train datasets by shuffling the data

In [33]:
from sklearn.model_selection import train_test_split

df.drop(columns=["index", "str_date"], inplace=True)
X_train_full, X_test, y_train_full, y_test = train_test_split(df, label)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

## SVM

C - regularization parameter

class-weights - Used when a certain class is missrrepresented in the data so we give this class higher weights so the model can learn from the few examples it has faster (in this case the data is well balanced)

degree - The polinomial degree assigned to that task

gamma -  used to control the influence of distant training points in the rbf kernel (i am using the default value of gamma parameter which scales gamma based on the number of features)

kernel - describes the kernel function used to calculate the similarities between the functions and i used the default param

probability - indicates the model to output the probability of being part of a function

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_clf = SVC(C=200, class_weight=None, degree=2, gamma="scale", kernel="rbf", probability=True)
svm_clf.fit(X_train_full, y_train_full)

In [None]:
y_pred = svm_clf.predict_proba(X_test)
print(accuracy_score(y_test, y_pred))

## Random forest CLF

n_estimators - how many "trees" are in the forest

max_leaf_nodes - how many leafs can each tree have (higher number of leafs is a more complex model which can also overfit)

n_jobs - how many threads to run in order to train the model (-1 means everything available)

In [43]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=64, n_jobs=-1)
rnd_clf.fit(X_train_full, y_train_full)

y_pred_rf = rnd_clf.predict(X_test)

print(accuracy_score(y_test, y_pred_rf))

0.5302054376567903


Note that if i use predict_proba function i can decide to use only prediction with a certain level of certainty which improves the model to get an accuracy of approximately 0.59%

In [51]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[6076 7933]
 [6052 9437]]


In [59]:
y_test.value_counts()

market_change
1    15489
0    14009
Name: count, dtype: int64

## Gradient boosting

max_depth - The max depth for each decision tree

n_estimators - How many trees in the ensamble

learning_rate - Used to tweek the size of each step

In [49]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train_full, y_train_full)

y_pred = gbrt.predict(X_test)
y_pred = np.round(y_pred).astype(int)
print(accuracy_score(y_test, y_pred))

0.5259000610210862


## Voting

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train_full, y_train_full)

In [None]:
y_pred = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred)

## LSTM

Trying to train LSTM model with the data to give it some context of the previous article as they can affect the accuracy of the current news headline.

Note - The data is currently a mix of all the stocks which can affect the model

In [60]:
def reshape_data(df, seq_length_news=32):
    '''
    Reshaping the data to be in batches form (each input value will be now an array of <seq_length_news>
    rows from the dataframe)
    
    return value - 
    validation, training and testing data in the correct shape fot LSTM model
    '''
    # Assuming your DataFrame is already loaded, let's split it into features (X) and labels (y)
    X = df[['related', 'source', 'positive', 'negative', 'neutral']].values  # Assuming 'related' and 'source' are your features
    y = df[["market_change"]].values  # Assuming 'positive', 'negative', 'neutral' are your labels

    # Reshape the data for LSTM input (assuming your data has a time dimension)
    n_features = X.shape[1]  # Number of features

    # Split the data into train and test sets while preserving the sequence
    test_size = 0.2
    val_size = 0.1
    split_index = int((1 - test_size) * len(df))
    split_val = int(split_index*0.9)

    X_train_full, X_test = X[:split_index], X[split_index:]
    y_train_full, y_test = y[:split_index], y[split_index:]
    X_train, X_val =  X_train_full[:split_val], X_train_full[split_val:]
    y_train, y_val =  y_train_full[:split_val], y_train_full[split_val:]

    # Train reshape
    n_samples_train = len(X_train) - seq_length_news + 1
    X_train_reshaped = np.zeros((n_samples_train, seq_length_news, n_features))
    y_train_reshaped = np.zeros(n_samples_train)
    for i in range(n_samples_train):
        X_train_reshaped[i] = X_train[i:i + seq_length_news]
        y_train_reshaped[i] = y_train[i + seq_length_news-1]

    # Val reshape
    n_samples_train = len(X_val) - seq_length_news + 1
    X_val_reshaped = np.zeros((n_samples_train, seq_length_news, n_features))
    y_val_reshaped = np.zeros(n_samples_train)
    for i in range(n_samples_train):
        X_val_reshaped[i] = X_val[i:i + seq_length_news]
        y_val_reshaped[i] = y_val[i + seq_length_news-1]

    n_samples_train = len(X_test) - seq_length_news + 1
    X_test_reshaped = np.zeros((n_samples_train, seq_length_news, n_features))
    y_test_reshaped = np.zeros(n_samples_train)
    for i in range(n_samples_train):
        X_test_reshaped[i] = X_test[i:i + seq_length_news]
        y_test_reshaped[i] = y_test[i + seq_length_news-1]
    return X_train_reshaped, y_train_reshaped, X_val_reshaped, y_val_reshaped, X_test_reshaped, y_test_reshaped

Defining callbacks to avoid overfitting, and to save the best model at each iteration

In [61]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping


# Define the checkpoint filepath
filepath = "/content/drive/MyDrive/model_checkpoints_LSTM/checkpoint-{epoch:02d}.keras"

# Define the callback
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=False,
)

early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

In [73]:
import warnings
warnings.filterwarnings(action='ignore')

X = df[['related', 'source', 'positive', 'negative', 'neutral']].values  
y = df[["market_change"]].values

# Reshape the data for LSTM input
n_features = X.shape[1]  # Number of features
seq_length_news = 32

X_train_reshaped, y_train_reshaped, X_val_reshaped, y_val_reshaped, X_test_reshaped, y_test_reshaped = \
reshape_data(df, seq_length_news=seq_length_news)


In [79]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.initializers import HeNormal

model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(seq_length_news, n_features), kernel_initializer=HeNormal()))
model.add(Dense(32, activation="relu", kernel_initializer=HeNormal()))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal()))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train_reshaped, epochs=50,
                  batch_size=seq_length_news,
                  validation_data=(X_val_reshaped, y_val_reshaped),
                  callbacks=[checkpoint, early_stopping])

# Evaluate the model
y_pred_rf = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))

53.8791460160122


### Search grid

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop


def create_model(neurons, dropout_rate, learning_rate, batch_size, use_regularization):
    model = Sequential()

    if use_regularization:
        # L2 regularization with weight decay of 0.01
        from tensorflow.keras.regularizers import l2

        model.add(LSTM(neurons, activation='relu', kernel_regularizer=l2(0.01), input_shape=(batch_size, n_features), kernel_initializer=HeNormal()))
    else:
        model.add(LSTM(neurons, activation='relu', input_shape=(batch_size, n_features),
                      kernel_initializer=HeNormal()))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation="relu", kernel_initializer=HeNormal()))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal()))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Define the hyperparameter grid
param_grid = {
  'neurons': [32, 64, 128],
  'dropout_rate': [0.2, 0.3, 0.4],
  'learning_rate': [0.01, 0.001],
  'batch_size': [32],
  'use_regularization': [True, False]
}

X = df[['related', 'source', 'positive', 'negative', 'neutral']].values  # Assuming 'related' and 'source' are your features
y = df[["market_change"]].values  # Assuming 'positive', 'negative', 'neutral' are your labels

# Reshape the data for LSTM input (assuming your data has a time dimension)
n_features = X.shape[1]

# Create the grid search object
grid_search = GridSearchCV(estimator=KerasClassifier(build_fn=create_model, epochs=10),
                           param_grid=param_grid, cv=3, verbose=2)

X_train_reshaped, y_train_reshaped, X_val_reshaped, y_val_reshaped, X_test_reshaped, y_test_reshaped = \
reshape_data(df, seq_length_news=32)


grid_search.fit(X_train_reshaped, y_train_reshaped, 
                validation_data=(X_val_reshaped, y_val_reshaped), callbacks=[checkpoint])


The model training during the regular training and the grid search platoed and wouldn't improve the validation accuracy (it did overfit after a while)

There are several options I can do to try to fine tune and improve the results by some quantity (the model won't improve to 90% but can improve by some percents) - 
1. feature engeneering - try to add one hot encoding, remove some of the columns etc...
2. try to make the model more complex - add layers and neurons
3. change batch size
4. change the loss function
5. convert the model to use bert and the BOW
6. add more data to the dataset - for example the previous market_change, or more news headlines that are not necessarily related to the stock

## DNN

Trying more calssical DNN composed of only Dense layers

In [None]:
model = keras.models.Sequential([
        keras.layers.Dense(100, activation="relu", kernel_initializer=glorot_uniform()),
        keras.layers.Dense(100, activation="relu", kernel_initializer=glorot_uniform()),
        keras.layers.Dense(50, activation="relu", kernel_initializer=glorot_uniform()),
        keras.layers.Dense(2, activation="softmax", kernel_initializer=glorot_uniform())
])


model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=["accuracy"])

callback_early_stopping = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
X_train.fillna(0, inplace=True)
X_valid.fillna(0, inplace=True)


history = model.fit(X_train, y_train, epochs=1000, 
                    validation_data=(X_valid, y_valid), callbacks=[callback_early_stopping])