<a href="https://colab.research.google.com/github/lro99/stock_sentiment/blob/main/SentimentTrading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install hf_xet

In [None]:
pip install newsapi-python

In [None]:
pip install finnhub-python

In [None]:
# apikey = os.environ["NEWSAPI_KEY"]  # NewsAPI key: read it from the environment; never commit the literal value

In [None]:
# Finnhub API key, read from the FINNHUB_API_KEY environment variable so the
# secret is never stored in the notebook. Any key that was previously committed
# here should be treated as leaked and revoked.
import os
import warnings

finhubkey = os.environ.get("FINNHUB_API_KEY", "")
if not finhubkey:
  # Warn instead of raising so the cells that do not need Finnhub still run.
  warnings.warn("FINNHUB_API_KEY is not set; Finnhub requests will fail until it is provided.")

In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# finBERT
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
# newsapi
from newsapi import NewsApiClient
import requests
import finnhub
# timeseries
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
# deep learning
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Ask Yahoo Finance search for up to 10 news items matching "S&P 500",
# keep the raw list in `news`, and print each item's title.
news = yf.Search("S&P 500", news_count=10).news
for article in news:
  print(article['title'])

In [None]:
# S&P500 yahoo
# The SPY ticker is queried here (presumably as a tradable proxy for the S&P 500 index).

ticker = yf.Ticker("SPY")
# Request ten years of price history; every later cell derives its data from this frame.
historical = ticker.history(period="10y")
# Bare last expression so the notebook renders the frame.
historical

In [None]:
# Feature engineering for predicting each row's daily return.
#
# The target `y` is the return of the SAME row, so every predictor has to be
# built only from information available before that day's close. The rolling
# statistics below are computed on windows that end at the current row, so they
# are shifted by one row; without the shift they would contain the very
# close/return the model is asked to predict (look-ahead leakage).
#
# NOTE: the cell mutates `historical` in place. Re-running it on the already
# trimmed frame drops further leading rows, so re-run the download cell first.

# Daily percentage return (in percent, hence * 100).
historical['Return'] = historical['Close'].pct_change() * 100

# Returns of the previous one, two and three rows.
historical['Lag1'] = historical['Return'].shift(1)
historical['Lag2'] = historical['Return'].shift(2)
historical['Lag3'] = historical['Return'].shift(3)

# Trend, volatility and momentum as known at the previous close (note the shift(1)).
historical['MA5'] = historical['Close'].rolling(5).mean().shift(1)
historical['MA20'] = historical['Close'].rolling(20).mean().shift(1)
historical['Volatility10'] = historical['Return'].rolling(10).std().shift(1)
# 10-row rate of change, expressed as a fraction (not multiplied by 100).
historical['RoC10'] = historical['Close'].pct_change(periods=10).shift(1)

# Remove the warm-up rows where the lagged / rolling features are undefined.
historical.dropna(inplace=True)


X = historical[['Lag1', 'Lag2', 'Lag3', 'MA5', 'MA20', 'Volatility10', 'RoC10']]
y = historical['Return']

# Exploratory Data Analysis

In [None]:
# Summary statistics for the columns of `historical`, rendered as the cell output.
historical.describe()

# Baseline

Rolling Avg

In [None]:
# Number of prior rows averaged by the rolling-average baseline.
window = 5

# Baseline prediction for a row = mean of the `window` returns before it.
# shift(1) moves the series down one row first, so the row's own return is excluded.
historical['RollingAvg_Pred'] = historical['Return'].shift(1).rolling(window).mean()


In [None]:
# Score the rolling-average baseline on the rows where it has a prediction.
true_vals = historical['Return'].dropna()
rolling_pred = historical['RollingAvg_Pred'].dropna()
# Restrict the actual returns to the dates that have a baseline prediction.
aligned = true_vals.loc[rolling_pred.index]

mse, mae, r2 = (
    metric(aligned, rolling_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Rolling Avg Mean Squared Error: {mse}")
print(f"Rolling Avg Mean Absolute Error: {mae}")
print(f"Rolling Avg R-squared: {r2}")

Naive

In [None]:
# Naive baseline: predict each row's return with the previous row's return.
historical['Naive_Pred'] = historical['Return'].shift(1)

In [None]:
# Score the naive baseline on the rows where it has a prediction.
# `true_vals` (the actual returns) is defined in the rolling-average evaluation cell.
naive_pred = historical['Naive_Pred'].dropna()
aligned = true_vals.loc[naive_pred.index]

mse, mae, r2 = (
    metric(aligned, naive_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Naive Mean Squared Error: {mse}")
print(f"Naive Mean Absolute Error: {mae}")
print(f"Naive R-squared: {r2}")

# Models with cross-validation

RandomForest

In [None]:
# Hyper-parameter search for a random forest using time-ordered CV folds.
tscv = TimeSeriesSplit(n_splits=5)
model = RandomForestRegressor(random_state=42)

# 2 x 2 x 2 = 8 candidate settings, each evaluated on all 5 folds.
param_grid = dict(
    n_estimators=[50, 300],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
)

# greater_is_better=False flips the sign, so the reported scores are negative MSE
# (closer to zero is better).
scorer = make_scorer(mean_squared_error, greater_is_better=False)

grid_search = GridSearchCV(model, param_grid, cv=tscv, scoring=scorer, verbose=1)
grid_search.fit(X, y)

best_model, best_params, best_score = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)

print("Best Parameters:", best_params)
print("Best Score:", best_score)

LSTM

In [None]:
# Convert the feature frame and target series to plain NumPy arrays.
# np.asarray accepts both pandas objects and ndarrays, so re-running this cell is
# harmless (the previous `.values` raised AttributeError on the second run).
X = np.asarray(X)
y = np.asarray(y)

def create_sequences(X, y, time_steps=10):
  """Turn row-aligned features and targets into sliding windows.

  Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired with
  ``y[i + time_steps]``, i.e. the target of the row right after the window.

  Args:
    X: array-like whose first axis indexes rows (time).
    y: array-like with the same number of rows as ``X``.
    time_steps: number of consecutive rows per window; must be >= 1.

  Returns:
    ``(Xs, ys)`` as NumPy arrays. ``Xs`` has shape
    ``(len(X) - time_steps, time_steps, *X.shape[1:])`` and ``ys`` has
    ``len(X) - time_steps`` entries. When the input is too short for a single
    window, both arrays are empty but keep the trailing dimensions.

  Raises:
    ValueError: if ``time_steps`` is less than 1 or ``X`` and ``y`` differ in length.
  """
  X = np.asarray(X)
  y = np.asarray(y)
  if time_steps < 1:
    raise ValueError(f"time_steps must be >= 1, got {time_steps}")
  if len(X) != len(y):
    raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

  n_windows = len(X) - time_steps
  if n_windows <= 0:
    # Keep the window/feature axes so callers can still read .shape[1:].
    empty_X = np.empty((0, time_steps) + X.shape[1:], dtype=X.dtype)
    empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
    return empty_X, empty_y

  Xs = np.stack([X[start:start + time_steps] for start in range(n_windows)])
  # Targets are simply y without its first `time_steps` entries.
  ys = y[time_steps:].copy()
  return Xs, ys

# Build the windowed inputs: each sample is 10 consecutive feature rows, paired with
# the target of the row that follows the window.
X_seq, y_seq = create_sequences(X, y, time_steps=10)

In [None]:
def lstm_model(input_shape):
  """Build and compile a single-layer LSTM regressor.

  Args:
    input_shape: shape of one sample without the batch axis. The caller in this
      notebook passes ``X_train.shape[1:]``, i.e. ``(time_steps, n_features)``.

  Returns:
    A compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss) whose
    final layer is a single-unit ``Dense``.
  """
  model = tf.keras.Sequential([
    # Declare the input explicitly; passing `input_shape=` to a layer is deprecated in current Keras.
    layers.Input(shape=input_shape),
    # NOTE(review): activation=None means no activation (linear) instead of the
    # default tanh -- confirm this is intentional.
    layers.LSTM(64, activation=None, dropout=0.2, return_sequences=False),
    layers.Dense(1),
  ])
  model.compile(optimizer='adam', loss='mse')
  return model


In [None]:
# Time-ordered cross-validation of the LSTM using the same splitter as the forest.
# The per-fold validation MSE is collected so the LSTM can be compared with the
# baselines and the random forest; previously the scores were thrown away.
lstm_fold_mse = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_seq), start=1):
  X_train, X_val = X_seq[train_idx], X_seq[val_idx]
  y_train, y_val = y_seq[train_idx], y_seq[val_idx]

  # Fresh model per fold so no weights carry over between folds.
  model = lstm_model(X_train.shape[1:])
  model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

  # The model is compiled with loss='mse' and no extra metrics, so evaluate() returns the MSE.
  fold_mse = model.evaluate(X_val, y_val, verbose=0)
  lstm_fold_mse.append(fold_mse)
  print(f"Fold {fold} validation MSE: {fold_mse}")

print(f"LSTM mean validation MSE: {np.mean(lstm_fold_mse)}")

# finBERT

In [None]:
# finBERT model
# Load the ProsusAI/finbert tokenizer and classifier and wrap them in a sentiment pipeline.
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Smoke-test the pipeline on the last fetched headline. Indexing with -1 instead
# of a fixed 9 avoids an IndexError when the search returns fewer than 10 items.
if news:
  headline = news[-1]['title']
  res = finbert(headline)
  print(res)
else:
  print("No headlines were returned, so there is nothing to classify.")

In [None]:
# Classify every fetched headline and print it next to the pipeline's result.
for article in news:
  headline = article['title']
  res = finbert(headline)
  print(headline, res)

In [None]:
# graphing % change
# Work on a reset copy so `historical` keeps its date index and the model inputs
# X / y are not overwritten by plotting data.
plot_df = historical.reset_index()

fig, ax = plt.subplots()
# 'Return' is the daily percentage change computed in the feature cell; the frame
# has no 'Change' column, so the previous lookup raised KeyError.
ax.plot(pd.to_datetime(plot_df['Date']), plot_df['Return'])
ax.set_title('S&P 500')
ax.set_xlabel('Date')
ax.set_ylabel('Interday Change (%)')
plt.show()


# Finnhub API

In [None]:
# finnhub news api. allows for historical search

finnhub_client = finnhub.Client(api_key=finhubkey)

# NOTE: this rebinds `news`, replacing the Yahoo Finance list fetched earlier.
news = finnhub_client.company_news('SPY', _from="2025-04-01", to="2025-04-01")

# Print only the articles that finBERT does not label as neutral.
for article in news:
  summary = article.get('summary')
  if not summary:
    # Nothing to classify for articles without a summary.
    continue
  # truncation=True clips over-long summaries to the model's maximum input length
  # instead of letting the pipeline fail on them.
  sentiment = finbert(summary, truncation=True)
  if sentiment[0]['label'] == 'neutral':
    continue
  print(summary, sentiment)