In [6]:
# %% --------------- IMPORTS AND SETUP -----------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

import plotly.graph_objects as go

from statsmodels.tsa.statespace.sarimax import SARIMAX

# Keras / TensorFlow (for LSTM)
#from keras.models import Sequential
#from keras.layers import LSTM, Dense

# Sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


In [7]:
# %% --------------- LOAD PRICE DATA -----------------
PRICE_FILE = "russell1000_preprocessed.csv"   # <-- change if needed

if not os.path.exists(PRICE_FILE):
    raise FileNotFoundError(f"{PRICE_FILE} not found. Put your price CSV in the same folder.")

df_price = pd.read_csv(PRICE_FILE)

# Parse datetime and keep only needed cols
df_price['Datetime'] = pd.to_datetime(df_price['Datetime'])
df_price = df_price[['Datetime', 'Close']].sort_values('Datetime').reset_index(drop=True)

print("Data loaded:", df_price.shape)
print(df_price.head())

# Train / Test split (last 20% as test)
train_size = int(len(df_price) * 0.8)
train_df = df_price.iloc[:train_size]
test_df  = df_price.iloc[train_size:]

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")


Data loaded: (269, 2)
                   Datetime        Close
0 2025-12-01 14:30:00+00:00  3716.798096
1 2025-12-01 15:30:00+00:00  3723.897705
2 2025-12-01 16:30:00+00:00  3728.767578
3 2025-12-01 17:30:00+00:00  3731.485107
4 2025-12-01 18:30:00+00:00  3721.335205
Train size: 215, Test size: 54


In [8]:
# %% --------------- ARIMA MODEL -----------------
order_arima = (2, 1, 2)

arima_model = SARIMAX(train_df['Close'],
                      order=order_arima,
                      enforce_stationarity=False,
                      enforce_invertibility=False)

arima_result = arima_model.fit(disp=False)

# Forecast for the test period
arima_pred_test = arima_result.forecast(steps=len(test_df))

# Put into a DataFrame aligned with test dates
df_price['ARIMA_pred'] = np.nan
df_price.loc[test_df.index, 'ARIMA_pred'] = arima_pred_test.values

# Metrics for ARIMA (on test set)
mae_arima = mean_absolute_error(test_df['Close'], arima_pred_test)
rmse_arima = np.sqrt(mean_squared_error(test_df['Close'], arima_pred_test))
r2_arima   = r2_score(test_df['Close'], arima_pred_test)

print("ARIMA", order_arima, "performance on TEST:")
print(f"  MAE  : {mae_arima:.4f}")
print(f"  RMSE : {rmse_arima:.4f}")
print(f"  R²   : {r2_arima:.4f}")


ARIMA (2, 1, 2) performance on TEST:
  MAE  : 30.0199
  RMSE : 40.1165
  R²   : -1.1924


In [9]:
# %% --------------- ARIMA PLOT -----------------
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=test_df['Datetime'],
    y=test_df['Close'],
    mode='lines',
    name='Actual (test)'
))
fig.add_trace(go.Scatter(
    x=test_df['Datetime'],
    y=arima_pred_test,
    mode='lines',
    name='Predicted (ARIMA)'
))
fig.update_layout(
    title=f"ARIMA{order_arima} – Actual vs Predicted (Test)",
    xaxis_title="Datetime",
    yaxis_title="Close"
)
fig.show()


In [10]:
# %% --------------- LSTM DATA PREPARATION -----------------
look_back = 10   # number of past time steps to use

# Use only the Close price and scale between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
close_scaled = scaler.fit_transform(df_price[['Close']].values)

def create_sequences(data, look_back):
    X, y = [], []
    for i in range(look_back, len(data)):
        X.append(data[i - look_back:i, 0])
        y.append(data[i, 0])
    return np.array(X), np.array(y)

X_all, y_all = create_sequences(close_scaled, look_back)

# Because we cut off the first 'look_back' rows, adjust split index
split_index = train_size - look_back
X_train, X_test = X_all[:split_index], X_all[split_index:]
y_train, y_test = y_all[:split_index], y_all[split_index:]

# Reshape to [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0], look_back, 1))
X_test  = X_test.reshape((X_test.shape[0],  look_back, 1))

print("LSTM shapes:")
print("  X_train:", X_train.shape, "X_test:", X_test.shape)


LSTM shapes:
  X_train: (205, 10, 1) X_test: (54, 10, 1)


In [11]:
# %% --------------- LSTM MODEL -----------------
lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=False, input_shape=(look_back, 1)))
lstm_model.add(Dense(1))

lstm_model.compile(optimizer='adam', loss='mse')

print("Training LSTM model (this may take a bit)...")
history = lstm_model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=16,
    verbose=1
)

print("LSTM training complete.")


Training LSTM model (this may take a bit)...
Epoch 1/30



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3469  
Epoch 2/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0406
Epoch 3/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0238
Epoch 4/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0157
Epoch 5/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0125
Epoch 6/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0126
Epoch 7/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0116 
Epoch 8/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0112 
Epoch 9/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0110 
Epoch 10/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0107 
Epoch 11/30
[

In [12]:
# %% --------------- LSTM EVALUATION -----------------
# Predictions (scaled)
y_pred_scaled = lstm_model.predict(X_test)

# Inverse scale to original price
y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
y_pred_inv = scaler.inverse_transform(y_pred_scaled).flatten()

# Put into df aligned with test_df
df_price['LSTM_pred'] = np.nan
df_price.loc[test_df.index, 'LSTM_pred'] = y_pred_inv

# Metrics
mae_lstm = mean_absolute_error(y_test_inv, y_pred_inv)
rmse_lstm = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
r2_lstm   = r2_score(y_test_inv, y_pred_inv)

print("LSTM performance on TEST:")
print(f"  MAE  : {mae_lstm:.4f}")
print(f"  RMSE : {rmse_lstm:.4f}")
print(f"  R²   : {r2_lstm:.4f}")

# Summary table of model performance
perf_df = pd.DataFrame({
    "Model": ["ARIMA(2,1,2)", "LSTM"],
    "MAE":   [mae_arima, mae_lstm],
    "RMSE":  [rmse_arima, rmse_lstm],
    "R2":    [r2_arima, r2_lstm]
})
print("\nModel performance summary:")
print(perf_df)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
LSTM performance on TEST:
  MAE  : 14.7000
  RMSE : 20.1655
  R²   : 0.4460

Model performance summary:
          Model        MAE       RMSE        R2
0  ARIMA(2,1,2)  30.019870  40.116485 -1.192360
1          LSTM  14.699951  20.165496  0.446033


In [13]:
# %% --------------- LSTM PLOT -----------------
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=test_df['Datetime'],
    y=y_test_inv,
    mode='lines',
    name='Actual (test)'
))
fig.add_trace(go.Scatter(
    x=test_df['Datetime'],
    y=y_pred_inv,
    mode='lines',
    name='Predicted (LSTM)'
))
fig.update_layout(
    title="LSTM – Actual vs Predicted (Test)",
    xaxis_title="Datetime",
    yaxis_title="Close"
)
fig.show()


In [14]:
# %% --------------- SENTIMENT: LOAD NEWS -----------------
NEWS_FILE   = "Combined_News_DJIA.csv"   # Kaggle file
PRICE_DJIA  = "upload_DJIA_table.csv"   # Kaggle DJIA price file

if not os.path.exists(NEWS_FILE):
    raise FileNotFoundError(f"{NEWS_FILE} not found.")

if not os.path.exists(PRICE_DJIA):
    raise FileNotFoundError(f"{PRICE_DJIA} not found.")

news_df = pd.read_csv(NEWS_FILE)
print("News shape:", news_df.shape)
print(news_df.head())


News shape: (1989, 27)
         Date  Label                                               Top1  \
0  2008-08-08      0  b"Georgia 'downs two Russian warplanes' as cou...   
1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   
2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   
3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   
4  2008-08-14      1  b'All the experts admit that we should legalis...   

                                                Top2  \
0            b'BREAKING: Musharraf to be impeached.'   
1        b'Bush puts foot down on Georgian conflict'   
2                 b"Russia 'ends Georgia operation'"   
3  b"When the president ordered to attack Tskhinv...   
4  b'War in South Osetia - 89 pictures made by a ...   

                                                Top3  \
0  b'Russia Today: Columns of troops roll into So...   
1  b"Jewish Georgian minister: Thanks to Israeli ...   
2  b'"If we had no se

In [15]:
# %% --------------- SENTIMENT SCORE PER DAY -----------------
# Make sure Date is a date type
news_df['Date'] = pd.to_datetime(news_df['Date'])

# Collect all headline columns (those starting with 'Top')
headline_cols = [c for c in news_df.columns if c.startswith("Top")]
print("Number of headline columns:", len(headline_cols))

# Combine all headlines into one long string per row (per day)
news_df['combined_news'] = news_df[headline_cols] \
    .fillna('') \
    .apply(lambda row: ' '.join(str(x) for x in row), axis=1)

# VADER sentiment
analyzer = SentimentIntensityAnalyzer()
news_df['sentiment'] = news_df['combined_news'].apply(
    lambda text: analyzer.polarity_scores(str(text))['compound']
)

# Now aggregate sentiment per day (mean)
daily_sentiment = news_df.groupby('Date')['sentiment'].mean().reset_index()
daily_sentiment.rename(columns={'sentiment': 'daily_sentiment'}, inplace=True)

print("Daily sentiment:")
print(daily_sentiment.head())


Number of headline columns: 25
Daily sentiment:
        Date  daily_sentiment
0 2008-08-08          -0.9979
1 2008-08-11          -0.9804
2 2008-08-12          -0.9658
3 2008-08-13          -0.9809
4 2008-08-14          -0.9802


In [16]:
# %% --------------- MERGE SENTIMENT WITH DJIA PRICE -----------------
djia_df = pd.read_csv(PRICE_DJIA)
djia_df['Date'] = pd.to_datetime(djia_df['Date'])

# Keep only needed columns
price_cols = ['Date', 'Close']   # adjust if your column names differ
djia_df = djia_df[price_cols]

# Merge
merged_df = pd.merge(djia_df, daily_sentiment, on='Date', how='left')
print("Merged price + sentiment data:", merged_df.shape)
print(merged_df.head())


Merged price + sentiment data: (1989, 3)
        Date         Close  daily_sentiment
0 2016-07-01  17949.369141          -0.9983
1 2016-06-30  17929.990234          -0.9977
2 2016-06-29  17694.679688          -0.9975
3 2016-06-28  17409.720703          -0.9571
4 2016-06-27  17140.240234          -0.9644


In [17]:
# %% --------------- PLOT PRICE VS SENTIMENT -----------------
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=merged_df['Date'],
    y=merged_df['Close'],
    mode='lines',
    name='DJIA Close'
))

fig.add_trace(go.Scatter(
    x=merged_df['Date'],
    y=merged_df['daily_sentiment'] * 100,   # scale sentiment so it’s visible
    mode='lines',
    name='Sentiment (x100)',
    yaxis='y2'
))

fig.update_layout(
    title="DJIA Close vs Daily News Sentiment",
    xaxis=dict(title='Date'),
    yaxis=dict(title='Close Price'),
    yaxis2=dict(
        title='Sentiment (scaled)',
        overlaying='y',
        side='right',
        showgrid=False
    )
)

fig.show()
