In [1]:
# MOUNT GOOGLE DRIVE
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from textblob import TextBlob

## Set up your tools

In [3]:
np.random.seed(42)
tf.random.set_seed(42)

# Helper Functions


## LSTM NN functions

In [4]:
def build_lstm_model(input_shape, learning_rate=0.001):
  model = Sequential()
  model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
  model.add(Dropout(0.2))
  model.add(LSTM(units=50))
  model.add(Dropout(0.2))
  model.add(Dense(units=1))

  optimizer = Adam(learning_rate=learning_rate)
  model.compile(optimizer=optimizer, loss='mean_squared_error')
  return model

def prepare_data_for_lstm(df, feature_columns, target_column, test_size=0.2, random_state=42):
  X = df[feature_columns].values
  y = df[target_column].values

  scaler = MinMaxScaler()

  X = scaler.fit_transform(X)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

  input_shape = (X_train.shape[1], 1)
  X_train = X_train.reshape(X_train.shape[0], input_shape[0], input_shape[1])
  X_test = X_test.reshape(X_test.shape[0], input_shape[0], input_shape[1])

  return X_train, X_test, y_train, y_test, scaler

def train_lstm_model(X_train, y_train, input_shape1, input_shape2, epochs=10, batch_size=32):
  input_shape = (input_shape1, input_shape2)
  model = build_lstm_model(input_shape)
  model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)
  return model

def evaluate_model(model, X_test, y_test):
  predictions = model.predict(X_test)

  mse = mean_squared_error(y_test, predictions)
  mae = mean_absolute_error(y_test, predictions)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, predictions)

  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100

  return mae, mse, rmse, r2, mape

## Visualizations

In [5]:
def plot_sentiment_vs_price(df, sentiment_col, price_change_col, title, sentiment_label='Sentiment', price_change_label='Percent Price Change'):
  fig, ax1 = plt.subplots(figsize=(10, 6))

  ax1.set_xlabel('Time')
  ax1.set_ylabel(sentiment_label, color='tab:blue')
  ax1.plot(df['Publication Date'], df[sentiment_col], color='tab:blue', label=sentiment_label)
  ax1.tick_params(axis='y', labelcolor='tab:blue')
  ax1.legend(loc='upper left')

  ax2 = ax1.twinx()
  ax2.set_ylabel(price_change_label, color='tab:red')
  ax2.plot(df['Publication Date'], df[price_change_col], color='tab:red', label=price_change_label)
  ax2.tick_params(axis='y', labelcolor='tab:red')
  ax2.legend(loc='upper right')

  plt.title(title)
  fig.tight_layout()
  plt.show()

def plot_scatter_sentiment_vs_price_change(df, sentiment_column, price_change_column, title):
    plt.figure(figsize=(10, 6))

    sentiment = df[sentiment_column]
    price_change = df[price_change_column]

    plt.scatter(sentiment, price_change, alpha=0.5)

    plt.xlabel(sentiment_column)
    plt.ylabel(price_change_column)
    plt.title(title)
    plt.show()

def plot_colored_scatter_sentiment_vs_price_change(df, sentiment_col, price_change_col, color_col, title):
    plt.figure(figsize=(10, 6))

    sentiment = df[sentiment_col]
    price_change = df[price_change_col]
    colors = df[color_col]

    plt.scatter(sentiment, price_change, c=colors, alpha=0.7)

    plt.xlabel(sentiment_col)
    plt.ylabel(price_change_col)
    plt.title(title)

    plt.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.axvline(x=0, color='black', linestyle='--', linewidth=1)

    plt.show()

# Load data into dataframes


## Paths

In [None]:
# store paths to data
path_to_apple_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/apple_final.csv'
path_to_ed_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/ed_final.csv'
path_to_ip_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/ip_final.csv'
path_to_levi_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/levi_final.csv'
path_to_pm_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/pm_final.csv'
path_to_ubs_newsStock_data = '/content/drive/Shareddrives/econ470 group presentation/datasets/final_data/ubs_final.csv'

## DFs

In [None]:
# load DFs
df_apple_newsStock_data = pd.read_csv(path_to_apple_newsStock_data)
df_ed_newsStock_data = pd.read_csv(path_to_ed_newsStock_data)
df_ip_newsStock_data = pd.read_csv(path_to_ip_newsStock_data)
df_levi_newsStock_data = pd.read_csv(path_to_levi_newsStock_data)
df_pm_newsStock_data = pd.read_csv(path_to_pm_newsStock_data)
df_ubs_newsStock_data = pd.read_csv(path_to_ubs_newsStock_data)

df_list = [df_apple_newsStock_data, df_ed_newsStock_data, df_ip_newsStock_data, df_levi_newsStock_data, df_pm_newsStock_data, df_ubs_newsStock_data]
df_all_companies = pd.concat(df_list, ignore_index=True)

In [None]:
new_column_names = {'1_day': 'StockPrice_PercentChangeAfter_1Days',
                    '2_day': 'StockPrice_PercentChangeAfter_2Days',
                    '3_day': 'StockPrice_PercentChangeAfter_3Days',
                    '4_day': 'StockPrice_PercentChangeAfter_4Days',
                    '5_day': 'StockPrice_PercentChangeAfter_5Days',
                    'sentiment': 'Sentiment_onPubDate'}

for df in df_list:
  df.rename(columns=new_column_names, inplace=True)

# Drop rows with NaN values
for df in df_list:
  df.dropna(inplace=True)

Here is the list of columns we can work with now at the end of our Data phase:

In [None]:
df_apple_newsStock_data.columns

Index(['Date', 'Ticker', 'StockPrice_PercentChangeAfter_1Days',
       'StockPrice_PercentChangeAfter_2Days',
       'StockPrice_PercentChangeAfter_3Days',
       'StockPrice_PercentChangeAfter_4Days',
       'StockPrice_PercentChangeAfter_5Days', 'Publication Date',
       'Sentiment_onPubDate'],
      dtype='object')

# Models

## train test split - Regression Models

In [None]:
# Regression Models to predict stock price percent change after 1 day for each company
apple_1day_X_train, apple_1day_X_test, apple_1day_y_train, apple_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_apple_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')
ed_1day_X_train, ed_1day_X_test, ed_1day_y_train, ed_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ed_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')
ip_1day_X_train, ip_1day_X_test, ip_1day_y_train, ip_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ip_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')
levi_1day_X_train, levi_1day_X_test, levi_1day_y_train, levi_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_levi_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')
pm_1day_X_train, pm_1day_X_test, pm_1day_y_train, pm_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_pm_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')
ubs_1day_X_train, ubs_1day_X_test, ubs_1day_y_train, ubs_1day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ubs_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_1Days')

# Regression Models to predict stock price percent change after 2 day for each company
apple_2day_X_train, apple_2day_X_test, apple_2day_y_train, apple_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_apple_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')
ed_2day_X_train, ed_2day_X_test, ed_2day_y_train, ed_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ed_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')
ip_2day_X_train, ip_2day_X_test, ip_2day_y_train, ip_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ip_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')
levi_2day_X_train, levi_2day_X_test, levi_2day_y_train, levi_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_levi_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')
pm_2day_X_train, pm_2day_X_test, pm_2day_y_train, pm_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_pm_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')
ubs_2day_X_train, ubs_2day_X_test, ubs_2day_y_train, ubs_2day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ubs_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_2Days')

# Regression Models to predict stock price percent change after 3 day for each company
apple_3day_X_train, apple_3day_X_test, apple_3day_y_train, apple_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_apple_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')
ed_3day_X_train, ed_3day_X_test, ed_3day_y_train, ed_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ed_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')
ip_3day_X_train, ip_3day_X_test, ip_3day_y_train, ip_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ip_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')
levi_3day_X_train, levi_3day_X_test, levi_3day_y_train, levi_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_levi_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')
pm_3day_X_train, pm_3day_X_test, pm_3day_y_train, pm_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_pm_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')
ubs_3day_X_train, ubs_3day_X_test, ubs_3day_y_train, ubs_3day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ubs_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_3Days')

# Regression Models to predict stock price percent change after 4 day for each company
apple_4day_X_train, apple_4day_X_test, apple_4day_y_train, apple_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_apple_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')
ed_4day_X_train, ed_4day_X_test, ed_4day_y_train, ed_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ed_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')
ip_4day_X_train, ip_4day_X_test, ip_4day_y_train, ip_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ip_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')
levi_4day_X_train, levi_4day_X_test, levi_4day_y_train, levi_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_levi_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')
pm_4day_X_train, pm_4day_X_test, pm_4day_y_train, pm_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_pm_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')
ubs_4day_X_train, ubs_4day_X_test, ubs_4day_y_train, ubs_4day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ubs_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_4Days')

# Regression Models to predict stock price percent change after 5 day for each company
apple_5day_X_train, apple_5day_X_test, apple_5day_y_train, apple_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_apple_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')
ed_5day_X_train, ed_5day_X_test, ed_5day_y_train, ed_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ed_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')
ip_5day_X_train, ip_5day_X_test, ip_5day_y_train, ip_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ip_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')
levi_5day_X_train, levi_5day_X_test, levi_5day_y_train, levi_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_levi_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')
pm_5day_X_train, pm_5day_X_test, pm_5day_y_train, pm_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_pm_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')
ubs_5day_X_train, ubs_5day_X_test, ubs_5day_y_train, ubs_5day_y_test, scaler_daily = prepare_data_for_lstm(
    df_ubs_newsStock_data, ['Sentiment_onPubDate'], 'StockPrice_PercentChangeAfter_5Days')

## Train and Evaluate Regression Models





In [None]:
apple_1day_regression_model = train_lstm_model(apple_1day_X_train, apple_1day_y_train, (apple_1day_X_train.shape[1]), (apple_1day_X_train.shape[2]))
apple_2day_regression_model = train_lstm_model(apple_2day_X_train, apple_2day_y_train, (apple_2day_X_train.shape[1]), (apple_2day_X_train.shape[2]))
apple_3day_regression_model = train_lstm_model(apple_3day_X_train, apple_3day_y_train, (apple_3day_X_train.shape[1]), (apple_3day_X_train.shape[2]))
apple_4day_regression_model = train_lstm_model(apple_4day_X_train, apple_4day_y_train, (apple_4day_X_train.shape[1]), (apple_4day_X_train.shape[2]))
apple_5day_regression_model = train_lstm_model(apple_5day_X_train, apple_5day_y_train, (apple_5day_X_train.shape[1]), (apple_5day_X_train.shape[2]))
apple_1day_mae, apple_1day_mse, apple_1day_rmse, apple_1day_r2, apple_1day_mape = evaluate_model(apple_1day_regression_model, apple_1day_X_test, apple_1day_y_test)
apple_2day_mae, apple_2day_mse, apple_2day_rmse, apple_2day_r2, apple_2day_mape = evaluate_model(apple_2day_regression_model, apple_2day_X_test, apple_2day_y_test)
apple_3day_mae, apple_3day_mse, apple_3day_rmse, apple_3day_r2, apple_3day_mape = evaluate_model(apple_3day_regression_model, apple_3day_X_test, apple_3day_y_test)
apple_4day_mae, apple_4day_mse, apple_4day_rmse, apple_4day_r2, apple_4day_mape = evaluate_model(apple_4day_regression_model, apple_4day_X_test, apple_4day_y_test)
apple_5day_mae, apple_5day_mse, apple_5day_rmse, apple_5day_r2, apple_5day_mape = evaluate_model(apple_5day_regression_model, apple_5day_X_test, apple_5day_y_test)

ed_1day_regression_model = train_lstm_model(ed_1day_X_train, ed_1day_y_train, (ed_1day_X_train.shape[1]), (ed_1day_X_train.shape[2]))
ed_2day_regression_model = train_lstm_model(ed_2day_X_train, ed_2day_y_train, (ed_2day_X_train.shape[1]), (ed_2day_X_train.shape[2]))
ed_3day_regression_model = train_lstm_model(ed_3day_X_train, ed_3day_y_train, (ed_3day_X_train.shape[1]), (ed_3day_X_train.shape[2]))
ed_4day_regression_model = train_lstm_model(ed_4day_X_train, ed_4day_y_train, (ed_4day_X_train.shape[1]), (ed_4day_X_train.shape[2]))
ed_5day_regression_model = train_lstm_model(ed_5day_X_train, ed_5day_y_train, (ed_5day_X_train.shape[1]), (ed_5day_X_train.shape[2]))
ed_1day_mae, ed_1day_mse, ed_1day_rmse, ed_1day_r2, ed_1day_mape = evaluate_model(ed_1day_regression_model, ed_1day_X_test, ed_1day_y_test)
ed_2day_mae, ed_2day_mse, ed_2day_rmse, ed_2day_r2, ed_2day_mape = evaluate_model(ed_2day_regression_model, ed_2day_X_test, ed_2day_y_test)
ed_3day_mae, ed_3day_mse, ed_3day_rmse, ed_3day_r2, ed_3day_mape = evaluate_model(ed_3day_regression_model, ed_3day_X_test, ed_3day_y_test)
ed_4day_mae, ed_4day_mse, ed_4day_rmse, ed_4day_r2, ed_4day_mape = evaluate_model(ed_4day_regression_model, ed_4day_X_test, ed_4day_y_test)
ed_5day_mae, ed_5day_mse, ed_5day_rmse, ed_5day_r2, ed_5day_mape = evaluate_model(ed_5day_regression_model, ed_5day_X_test, ed_5day_y_test)

ip_1day_regression_model = train_lstm_model(ip_1day_X_train, ip_1day_y_train, (ip_1day_X_train.shape[1]), (ip_1day_X_train.shape[2]))
ip_2day_regression_model = train_lstm_model(ip_2day_X_train, ip_2day_y_train, (ip_2day_X_train.shape[1]), (ip_2day_X_train.shape[2]))
ip_3day_regression_model = train_lstm_model(ip_3day_X_train, ip_3day_y_train, (ip_3day_X_train.shape[1]), (ip_3day_X_train.shape[2]))
ip_4day_regression_model = train_lstm_model(ip_4day_X_train, ip_4day_y_train, (ip_4day_X_train.shape[1]), (ip_4day_X_train.shape[2]))
ip_5day_regression_model = train_lstm_model(ip_5day_X_train, ip_5day_y_train, (ip_5day_X_train.shape[1]), (ip_5day_X_train.shape[2]))
ip_1day_mae, ip_1day_mse, ip_1day_rmse, ip_1day_r2, ip_1day_mape = evaluate_model(ip_1day_regression_model, ip_1day_X_test, ip_1day_y_test)
ip_2day_mae, ip_2day_mse, ip_2day_rmse, ip_2day_r2, ip_2day_mape = evaluate_model(ip_2day_regression_model, ip_2day_X_test, ip_2day_y_test)
ip_3day_mae, ip_3day_mse, ip_3day_rmse, ip_3day_r2, ip_3day_mape = evaluate_model(ip_3day_regression_model, ip_3day_X_test, ip_3day_y_test)
ip_4day_mae, ip_4day_mse, ip_4day_rmse, ip_4day_r2, ip_4day_mape = evaluate_model(ip_4day_regression_model, ip_4day_X_test, ip_4day_y_test)
ip_5day_mae, ip_5day_mse, ip_5day_rmse, ip_5day_r2, ip_5day_mape = evaluate_model(ip_5day_regression_model, ip_5day_X_test, ip_5day_y_test)

levi_1day_regression_model = train_lstm_model(levi_1day_X_train, levi_1day_y_train, (levi_1day_X_train.shape[1]), (levi_1day_X_train.shape[2]))
levi_2day_regression_model = train_lstm_model(levi_2day_X_train, levi_2day_y_train, (levi_2day_X_train.shape[1]), (levi_2day_X_train.shape[2]))
levi_3day_regression_model = train_lstm_model(levi_3day_X_train, levi_3day_y_train, (levi_3day_X_train.shape[1]), (levi_3day_X_train.shape[2]))
levi_4day_regression_model = train_lstm_model(levi_4day_X_train, levi_4day_y_train, (levi_4day_X_train.shape[1]), (levi_4day_X_train.shape[2]))
levi_5day_regression_model = train_lstm_model(levi_5day_X_train, levi_5day_y_train, (levi_5day_X_train.shape[1]), (levi_5day_X_train.shape[2]))
levi_1day_mae, levi_1day_mse, levi_1day_rmse, levi_1day_r2, levi_1day_mape = evaluate_model(levi_1day_regression_model, levi_1day_X_test, levi_1day_y_test)
levi_2day_mae, levi_2day_mse, levi_2day_rmse, levi_2day_r2, levi_2day_mape = evaluate_model(levi_2day_regression_model, levi_2day_X_test, levi_2day_y_test)
levi_3day_mae, levi_3day_mse, levi_3day_rmse, levi_3day_r2, levi_3day_mape = evaluate_model(levi_3day_regression_model, levi_3day_X_test, levi_3day_y_test)
levi_4day_mae, levi_4day_mse, levi_4day_rmse, levi_4day_r2, levi_4day_mape = evaluate_model(levi_4day_regression_model, levi_4day_X_test, levi_4day_y_test)
levi_5day_mae, levi_5day_mse, levi_5day_rmse, levi_5day_r2, levi_5day_mape = evaluate_model(levi_5day_regression_model, levi_5day_X_test, levi_5day_y_test)

pm_1day_regression_model = train_lstm_model(pm_1day_X_train, pm_1day_y_train, (pm_1day_X_train.shape[1]), (pm_1day_X_train.shape[2]))
pm_2day_regression_model = train_lstm_model(pm_2day_X_train, pm_2day_y_train, (pm_2day_X_train.shape[1]), (pm_2day_X_train.shape[2]))
pm_3day_regression_model = train_lstm_model(pm_3day_X_train, pm_3day_y_train, (pm_3day_X_train.shape[1]), (pm_3day_X_train.shape[2]))
pm_4day_regression_model = train_lstm_model(pm_4day_X_train, pm_4day_y_train, (pm_4day_X_train.shape[1]), (pm_4day_X_train.shape[2]))
pm_5day_regression_model = train_lstm_model(pm_5day_X_train, pm_5day_y_train, (pm_5day_X_train.shape[1]), (pm_5day_X_train.shape[2]))
pm_1day_mae, pm_1day_mse, pm_1day_rmse, pm_1day_r2, pm_1day_mape = evaluate_model(pm_1day_regression_model, pm_1day_X_test, pm_1day_y_test)
pm_2day_mae, pm_2day_mse, pm_2day_rmse, pm_2day_r2, pm_2day_mape = evaluate_model(pm_2day_regression_model, pm_2day_X_test, pm_2day_y_test)
pm_3day_mae, pm_3day_mse, pm_3day_rmse, pm_3day_r2, pm_3day_mape = evaluate_model(pm_3day_regression_model, pm_3day_X_test, pm_3day_y_test)
pm_4day_mae, pm_4day_mse, pm_4day_rmse, pm_4day_r2, pm_4day_mape = evaluate_model(pm_4day_regression_model, pm_4day_X_test, pm_4day_y_test)
pm_5day_mae, pm_5day_mse, pm_5day_rmse, pm_5day_r2, pm_5day_mape = evaluate_model(pm_5day_regression_model, pm_5day_X_test, pm_5day_y_test)

ubs_1day_regression_model = train_lstm_model(ubs_1day_X_train, ubs_1day_y_train, (ubs_1day_X_train.shape[1]), (ubs_1day_X_train.shape[2]))
ubs_2day_regression_model = train_lstm_model(ubs_2day_X_train, ubs_2day_y_train, (ubs_2day_X_train.shape[1]), (ubs_2day_X_train.shape[2]))
ubs_3day_regression_model = train_lstm_model(ubs_3day_X_train, ubs_3day_y_train, (ubs_3day_X_train.shape[1]), (ubs_3day_X_train.shape[2]))
ubs_4day_regression_model = train_lstm_model(ubs_4day_X_train, ubs_4day_y_train, (ubs_4day_X_train.shape[1]), (ubs_4day_X_train.shape[2]))
ubs_5day_regression_model = train_lstm_model(ubs_5day_X_train, ubs_5day_y_train, (ubs_5day_X_train.shape[1]), (ubs_5day_X_train.shape[2]))
ubs_1day_mae, ubs_1day_mse, ubs_1day_rmse, ubs_1day_r2, ubs_1day_mape = evaluate_model(ubs_1day_regression_model, ubs_1day_X_test, ubs_1day_y_test)
ubs_2day_mae, ubs_2day_mse, ubs_2day_rmse, ubs_2day_r2, ubs_2day_mape = evaluate_model(ubs_2day_regression_model, ubs_2day_X_test, ubs_2day_y_test)
ubs_3day_mae, ubs_3day_mse, ubs_3day_rmse, ubs_3day_r2, ubs_3day_mape = evaluate_model(ubs_3day_regression_model, ubs_3day_X_test, ubs_3day_y_test)
ubs_4day_mae, ubs_4day_mse, ubs_4day_rmse, ubs_4day_r2, ubs_4day_mape = evaluate_model(ubs_4day_regression_model, ubs_4day_X_test, ubs_4day_y_test)
ubs_5day_mae, ubs_5day_mse, ubs_5day_rmse, ubs_5day_r2, ubs_5day_mape = evaluate_model(ubs_5day_regression_model, ubs_5day_X_test, ubs_5day_y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100




  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100




  mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100




# Data Visualizations

In [None]:
for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Comparing Sentiment and Stock Price Percent Change after 1 Days for {company_ticker}'
  plot_sentiment_vs_price(
      df=df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_1Days',
      title=title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Comparing Sentiment and Stock Price Percent Change after 2 Days for {company_ticker}'
  plot_sentiment_vs_price(
      df=df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_2Days',
      title=title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Comparing Sentiment and Stock Price Percent Change after 3 Days for {company_ticker}'
  plot_sentiment_vs_price(
      df=df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_3Days',
      title=title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Comparing Sentiment and Stock Price Percent Change after 4 Days for {company_ticker}'
  plot_sentiment_vs_price(
      df=df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_4Days',
      title=title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Comparing Sentiment and Stock Price Percent Change after 5 Days for {company_ticker}'
  plot_sentiment_vs_price(
      df=df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_5Days',
      title=title
  )

Output hidden; open in https://colab.research.google.com to view.

In [None]:
for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Relationship between Sentiment and Percent Stock Price Change after 1 Days for {company_ticker}'
  plot_scatter_sentiment_vs_price_change(
      df,
      'Sentiment_onPubDate',
      'StockPrice_PercentChangeAfter_1Days',
      title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Relationship between Sentiment and Percent Stock Price Change after 2 Days for {company_ticker}'
  plot_scatter_sentiment_vs_price_change(
      df,
      'Sentiment_onPubDate',
      'StockPrice_PercentChangeAfter_2Days',
      title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Relationship between Sentiment and Percent Stock Price Change after 3 Days for {company_ticker}'
  plot_scatter_sentiment_vs_price_change(
      df,
      'Sentiment_onPubDate',
      'StockPrice_PercentChangeAfter_3Days',
      title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Relationship between Sentiment and Percent Stock Price Change after 4 Days for {company_ticker}'
  plot_scatter_sentiment_vs_price_change(
      df,
      'Sentiment_onPubDate',
      'StockPrice_PercentChangeAfter_4Days',
      title
  )

for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])
  title = f'Relationship between Sentiment and Percent Stock Price Change after 5 Days for {company_ticker}'
  plot_scatter_sentiment_vs_price_change(
      df,
      'Sentiment_onPubDate',
      'StockPrice_PercentChangeAfter_5Days',
      title
  )

Output hidden; open in https://colab.research.google.com to view.

In [None]:
for df in df_list:
  company_ticker = str(df['Ticker'].iloc[0])

  conditions = [
      (df['Sentiment_onPubDate'] > 0) & (df['StockPrice_PercentChangeAfter_1Days'] > 0),
      (df['Sentiment_onPubDate'] < 0) & (df['StockPrice_PercentChangeAfter_1Days'] < 0)
  ]
  choices = ['green', 'red']
  df['color_1day'] = np.select(conditions, choices, default='yellow')

  title = f'{company_ticker} - Colored Scatterplot - Sentiment vs % Price Change after 1 Days'
  plot_colored_scatter_sentiment_vs_price_change(
      df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_1Days',
      color_col='color_1day',
      title=title
  )

  # Count the number of each color
  color_counts = df['color_1day'].value_counts()

  # Print the counts
  print(f"Green:\t{color_counts.get('green', 0)}\t(positive sentiment and positive price change)")
  print(f"Yellow:\t{color_counts.get('yellow')}\t(mismatch between sentiment and price change)")
  print(f"Red:\t{color_counts.get('red', 0)}\t(negative sentiment and negative price change)")

  conditions = [
      (df['Sentiment_onPubDate'] > 0) & (df['StockPrice_PercentChangeAfter_2Days'] > 0),
      (df['Sentiment_onPubDate'] < 0) & (df['StockPrice_PercentChangeAfter_2Days'] < 0)
  ]
  choices = ['green', 'red']
  df['color_2day'] = np.select(conditions, choices, default='yellow')

  title = f'{company_ticker} - Colored Scatterplot - Sentiment vs % Price Change after 2 Days'
  plot_colored_scatter_sentiment_vs_price_change(
      df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_2Days',
      color_col='color_2day',
      title=title
  )

  # Count the number of each color
  color_counts = df['color_2day'].value_counts()

  # Print the counts
  print(f"Green:\t{color_counts.get('green', 0)}\t(positive sentiment and positive price change)")
  print(f"Yellow:\t{color_counts.get('yellow')}\t(mismatch between sentiment and price change)")
  print(f"Red:\t{color_counts.get('red', 0)}\t(negative sentiment and negative price change)")

  conditions = [
      (df['Sentiment_onPubDate'] > 0) & (df['StockPrice_PercentChangeAfter_3Days'] > 0),
      (df['Sentiment_onPubDate'] < 0) & (df['StockPrice_PercentChangeAfter_3Days'] < 0)
  ]
  choices = ['green', 'red']
  df['color_3day'] = np.select(conditions, choices, default='yellow')

  title = f'{company_ticker} - Colored Scatterplot - Sentiment vs % Price Change after 3 Days'
  plot_colored_scatter_sentiment_vs_price_change(
      df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_3Days',
      color_col='color_3day',
      title=title
  )

  # Count the number of each color
  color_counts = df['color_3day'].value_counts()

  # Print the counts
  print(f"Green:\t{color_counts.get('green', 0)}\t(positive sentiment and positive price change)")
  print(f"Yellow:\t{color_counts.get('yellow')}\t(mismatch between sentiment and price change)")
  print(f"Red:\t{color_counts.get('red', 0)}\t(negative sentiment and negative price change)")

  conditions = [
      (df['Sentiment_onPubDate'] > 0) & (df['StockPrice_PercentChangeAfter_4Days'] > 0),
      (df['Sentiment_onPubDate'] < 0) & (df['StockPrice_PercentChangeAfter_4Days'] < 0)
  ]
  choices = ['green', 'red']
  df['color_4day'] = np.select(conditions, choices, default='yellow')

  title = f'{company_ticker} - Colored Scatterplot - Sentiment vs % Price Change after 4 Days'
  plot_colored_scatter_sentiment_vs_price_change(
      df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_4Days',
      color_col='color_4day',
      title=title
  )

  # Count the number of each color
  color_counts = df['color_4day'].value_counts()

  # Print the counts
  print(f"Green:\t{color_counts.get('green', 0)}\t(positive sentiment and positive price change)")
  print(f"Yellow:\t{color_counts.get('yellow')}\t(mismatch between sentiment and price change)")
  print(f"Red:\t{color_counts.get('red', 0)}\t(negative sentiment and negative price change)")

  conditions = [
      (df['Sentiment_onPubDate'] > 0) & (df['StockPrice_PercentChangeAfter_5Days'] > 0),
      (df['Sentiment_onPubDate'] < 0) & (df['StockPrice_PercentChangeAfter_5Days'] < 0)
  ]
  choices = ['green', 'red']
  df['color_5day'] = np.select(conditions, choices, default='yellow')

  title = f'{company_ticker} - Colored Scatterplot - Sentiment vs % Price Change after 5 Days'
  plot_colored_scatter_sentiment_vs_price_change(
      df,
      sentiment_col='Sentiment_onPubDate',
      price_change_col='StockPrice_PercentChangeAfter_5Days',
      color_col='color_5day',
      title=title
  )

  # Count the number of each color
  color_counts = df['color_5day'].value_counts()

  # Print the counts
  print(f"Green:\t{color_counts.get('green', 0)}\t(positive sentiment and positive price change)")
  print(f"Yellow:\t{color_counts.get('yellow')}\t(mismatch between sentiment and price change)")
  print(f"Red:\t{color_counts.get('red', 0)}\t(negative sentiment and negative price change)")

Output hidden; open in https://colab.research.google.com to view.

# Model Evaluations

In [None]:
apple_1_day_metrics = (apple_1day_mae, apple_1day_mse, apple_1day_rmse, apple_1day_r2, apple_1day_mape)
apple_2_day_metrics = (apple_2day_mae, apple_2day_mse, apple_2day_rmse, apple_2day_r2, apple_2day_mape)
apple_3_day_metrics = (apple_3day_mae, apple_3day_mse, apple_3day_rmse, apple_3day_r2, apple_3day_mape)
apple_4_day_metrics = (apple_4day_mae, apple_4day_mse, apple_4day_rmse, apple_4day_r2, apple_4day_mape)
apple_5_day_metrics = (apple_5day_mae, apple_5day_mse, apple_5day_rmse, apple_5day_r2, apple_5day_mape)

ed_1_day_metrics = (ed_1day_mae, ed_1day_mse, ed_1day_rmse, ed_1day_r2, ed_1day_mape)
ed_2_day_metrics = (ed_2day_mae, ed_2day_mse, ed_2day_rmse, ed_2day_r2, ed_2day_mape)
ed_3_day_metrics = (ed_3day_mae, ed_3day_mse, ed_3day_rmse, ed_3day_r2, ed_3day_mape)
ed_4_day_metrics = (ed_4day_mae, ed_4day_mse, ed_4day_rmse, ed_4day_r2, ed_4day_mape)
ed_5_day_metrics = (ed_5day_mae, ed_5day_mse, ed_5day_rmse, ed_5day_r2, ed_5day_mape)

ip_1_day_metrics = (ip_1day_mae, ip_1day_mse, ip_1day_rmse, ip_1day_r2, ip_1day_mape)
ip_2_day_metrics = (ip_2day_mae, ip_2day_mse, ip_2day_rmse, ip_2day_r2, ip_2day_mape)
ip_3_day_metrics = (ip_3day_mae, ip_3day_mse, ip_3day_rmse, ip_3day_r2, ip_3day_mape)
ip_4_day_metrics = (ip_4day_mae, ip_4day_mse, ip_4day_rmse, ip_4day_r2, ip_4day_mape)
ip_5_day_metrics = (ip_5day_mae, ip_5day_mse, ip_5day_rmse, ip_5day_r2, ip_5day_mape)

levi_1_day_metrics = (levi_1day_mae, levi_1day_mse, levi_1day_rmse, levi_1day_r2, levi_1day_mape)
levi_2_day_metrics = (levi_2day_mae, levi_2day_mse, levi_2day_rmse, levi_2day_r2, levi_2day_mape)
levi_3_day_metrics = (levi_3day_mae, levi_3day_mse, levi_3day_rmse, levi_3day_r2, levi_3day_mape)
levi_4_day_metrics = (levi_4day_mae, levi_4day_mse, levi_4day_rmse, levi_4day_r2, levi_4day_mape)
levi_5_day_metrics = (levi_5day_mae, levi_5day_mse, levi_5day_rmse, levi_5day_r2, levi_5day_mape)

pm_1_day_metrics = (pm_1day_mae, pm_1day_mse, pm_1day_rmse, pm_1day_r2, pm_1day_mape)
pm_2_day_metrics = (pm_2day_mae, pm_2day_mse, pm_2day_rmse, pm_2day_r2, pm_2day_mape)
pm_3_day_metrics = (pm_3day_mae, pm_3day_mse, pm_3day_rmse, pm_3day_r2, pm_3day_mape)
pm_4_day_metrics = (pm_4day_mae, pm_4day_mse, pm_4day_rmse, pm_4day_r2, pm_4day_mape)
pm_5_day_metrics = (pm_5day_mae, pm_5day_mse, pm_5day_rmse, pm_5day_r2, pm_5day_mape)

ubs_1_day_metrics = (ubs_1day_mae, ubs_1day_mse, ubs_1day_rmse, ubs_1day_r2, ubs_1day_mape)
ubs_2_day_metrics = (ubs_2day_mae, ubs_2day_mse, ubs_2day_rmse, ubs_2day_r2, ubs_2day_mape)
ubs_3_day_metrics = (ubs_3day_mae, ubs_3day_mse, ubs_3day_rmse, ubs_3day_r2, ubs_3day_mape)
ubs_4_day_metrics = (ubs_4day_mae, ubs_4day_mse, ubs_4day_rmse, ubs_4day_r2, ubs_4day_mape)
ubs_5_day_metrics = (ubs_5day_mae, ubs_5day_mse, ubs_5day_rmse, ubs_5day_r2, ubs_5day_mape)

metrics_dict = {}
metrics_dict['AAPL'] = (apple_1_day_metrics, apple_2_day_metrics, apple_3_day_metrics, apple_4_day_metrics, apple_5_day_metrics)
metrics_dict['ED'] = (ed_1_day_metrics, ed_2_day_metrics, ed_3_day_metrics, ed_4_day_metrics, ed_5_day_metrics)
metrics_dict['IP'] = (ip_1_day_metrics, ip_2_day_metrics, ip_3_day_metrics, ip_4_day_metrics, ip_5_day_metrics)
metrics_dict['LEVI'] = (levi_1_day_metrics, levi_2_day_metrics, levi_3_day_metrics, levi_4_day_metrics, levi_5_day_metrics)
metrics_dict['PM'] = (pm_1_day_metrics, pm_2_day_metrics, pm_3_day_metrics, pm_4_day_metrics, pm_5_day_metrics)
metrics_dict['UBS'] = (ubs_1_day_metrics, ubs_2_day_metrics, ubs_3_day_metrics, ubs_4_day_metrics, ubs_5_day_metrics)


In [None]:
# List of company tickers and the corresponding dataframes
companies = ['AAPL', 'ED', 'IP', 'LEVI', 'PM', 'UBS']

for company_ticker in companies:
    print(f'\n{company_ticker} Regression Metrics:')
    for prediction_day in range(1, 6):
        metrics = metrics_dict[company_ticker][prediction_day - 1]
        mae = metrics[0]
        mse = metrics[1]
        rmse = metrics[2]
        r2 = metrics[3]
        mape = metrics[4]
        print(f'{prediction_day} Days Model Regression Results:')
        print(f'Mean Absolute Error (MAE):\t\t{mae:.4f}')
        print(f'Mean Squared Error (MSE):\t\t{mse:.4f}')
        print(f'Root Mean Squared Error (RMSE)\t\t{rmse:.4f}')
        print(f'Coefficient of Determination (R2):\t{r2:.4f}')
        print(f'Mean Absolute Percentage Error (MAPE):\t{mape:.4f}')
        print("~" * 60)



AAPL Regression Metrics:
1 Days Model Regression Results:
Mean Absolute Error (MAE):		0.9871
Mean Squared Error (MSE):		1.8477
Root Mean Squared Error (RMSE)		1.3593
Coefficient of Determination (R2):	0.0149
Mean Absolute Percentage Error (MAPE):	inf
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 Days Model Regression Results:
Mean Absolute Error (MAE):		1.5795
Mean Squared Error (MSE):		4.0737
Root Mean Squared Error (RMSE)		2.0184
Coefficient of Determination (R2):	0.0105
Mean Absolute Percentage Error (MAPE):	107.3302
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 Days Model Regression Results:
Mean Absolute Error (MAE):		2.0272
Mean Squared Error (MSE):		6.4586
Root Mean Squared Error (RMSE)		2.5414
Coefficient of Determination (R2):	0.0042
Mean Absolute Percentage Error (MAPE):	93.3143
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 Days Model Regression Results:
Mean Absolute Error (MAE):		2.3594
Mean Squared Error (MSE):		9.1054
Ro