<a href="https://colab.research.google.com/github/muziejus/coms-w4995-applied-machine-learning-project/blob/main/notebooks/combine_sentiment_and_financial_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


## Set constants

In [None]:
# Base URL of the project's data directory on GitHub; the path fragments
# below are appended to it to build the full file URLs.
root_data_url = "https://github.com/muziejus/coms-w4995-applied-machine-learning-project/raw/refs/heads/main/data"
# Sub-path (relative to root_data_url) that holds the merged financial CSV.
financial_data_url = "financial_data"
# Number embedded in the sentiment directory and file names.
# NOTE(review): presumably the number of articles in the analyzed sample — confirm.
sentiment_article_number = 75000
# Sub-path (relative to root_data_url) that holds the per-company parquet files.
sentiment_data_url = f"sentiment_data/analyzed_{sentiment_article_number}"
# Suffix appended to a company identifier to form its parquet file name.
sentiment_file_name_tail = f"_x_{sentiment_article_number}.parquet"

# Company identifiers; they prefix the parquet file names and match the
# column suffixes of the financial data (e.g. "Close_lulu").
companies = ["dltr", "lulu", "ulta", "wba", "wmt"]

In [None]:
# Load the merged financial data CSV from the project's GitHub data directory.
fin_df = pd.read_csv(
    filepath_or_buffer=f"{root_data_url}/{financial_data_url}/merged_data.csv",
)

In [35]:
# Display the column labels of the merged financial DataFrame.
fin_df.columns

Index(['Date', 'Open_lulu', 'High_lulu', 'Low_lulu', 'Close_lulu',
       'Volume_lulu', 'Dividends_lulu', 'Stock Splits_lulu', 'Open_wmt',
       'High_wmt', 'Low_wmt', 'Close_wmt', 'Volume_wmt', 'Dividends_wmt',
       'Stock Splits_wmt', 'Open_wba', 'High_wba', 'Low_wba', 'Close_wba',
       'Volume_wba', 'Dividends_wba', 'Stock Splits_wba', 'Open_ulta',
       'High_ulta', 'Low_ulta', 'Close_ulta', 'Volume_ulta', 'Dividends_ulta',
       'Stock Splits_ulta', 'Open_dltr', 'High_dltr', 'Low_dltr', 'Close_dltr',
       'Volume_dltr', 'Dividends_dltr', 'Stock Splits_dltr', 'CPIAUCSL', 'PCE',
       'PPIACO', 'ECIALLCIV', 'GDPDEF', 'UNRATE', 'MCUMFN', 'SP500'],
      dtype='object')

## Create aggregated sentiment data

For each company, we read in the parquet file from GitHub that has the results of our sentiment analysis. The columns in the parquet file are:

Column | Type | Description
---|---|---
`index` | int | A naive index created during aggregation.
`goid` | int | ProQuest’s globally unique identifier for the article in question.
`date`| str | The article’s publication date (`%Y-%m-%d`).
`tokens` | int | A naive (breaking on whitespace) count of tokens in the article.
`corpus` | str | The corpus from which the article comes. Used previously in aggregation.
`daily_article_count` | int | A previously calculated count of all the articles in the corpus from that day.
`daily_token_sum` | int | A sum of all the (naive) tokens from all the articles in the corpus from that day.
`text_sentiment` | float | The overall average sentiment for that article, in the range (-1, 1): negative values indicate negative sentiment and positive values indicate positive sentiment.
`text_error` | float | The weighted inverse error in analysis. Higher is more confident in classification.
`text_input_tokens` | int | The number of tokens as analyzed by RoBERTa’s byte-pair encoding tokenizer.

We group the data by date and create a new DataFrame with the following columns:

Column | Type | Description
---|---|---
`date`| str | The article’s publication date (`%Y-%m-%d`).
`analyzed_bpe_tokens`| int | The number of tokens as analyzed by RoBERTa’s BPE tokenizer for all the articles analyzed for the day.
`weighted_sentiment` | float | The mean sentiment for the day (in the range (-1, 1), as above), weighted by each article’s BPE token count (`text_input_tokens`).
`weighted_error` | float | The mean inverse error for the day, weighted by each article’s BPE token count (`text_input_tokens`). Higher values indicate more confidence.
`analyzed_naive_tokens` | int | The number of naive tokens (words separated by whitespace) analyzed for the day.
`daily_naive_token_sum` | int | The total number of naive tokens available for the day.
`analyzed_article_count` | int | The number of articles analyzed for the day.
`daily_article_count` | int | The number of available articles for the day.


In [33]:
def agg_weighted_avg(row, value_column, weight_column):
    """Return the weighted mean of one column, using another as weights.

    Args:
        row: Object indexable by column name whose columns support
            element-wise multiplication and ``.sum()`` (the notebook passes
            a slice of a pandas DataFrame).
        value_column: Name of the column holding the values to average.
        weight_column: Name of the column holding the weights.

    Returns:
        The sum of value * weight divided by the sum of the weights.
    """
    weights = row[weight_column]
    weighted_total = (row[value_column] * weights).sum()
    total_weight = weights.sum()
    return weighted_total / total_weight

# Aggregate each company's per-article sentiment results into one row per
# publication date.
# NOTE: only "lulu" is processed for now. `agg_sent_df` is rebound on every
# iteration, so looping over all of `companies` would keep only the last
# company's result.
for company in ["lulu"]:  # companies
    sent_df = pd.read_parquet(
        f"{root_data_url}/{sentiment_data_url}/{company}{sentiment_file_name_tail}"
    )
    agg_sent_df = sent_df.groupby("date").agg(
        # Total BPE tokens analyzed that day, cast to an integer.
        analyzed_bpe_tokens=(
            "text_input_tokens",
            lambda grp: grp.sum().astype(int),
        ),
        # Each lambda receives only the group's "text_input_tokens" values,
        # so the group's index is used to pull the full rows back out of
        # sent_df before computing the token-weighted mean.
        # Fixed: the helper is named `agg_weighted_avg`; the previous call
        # to the undefined name `weighted_avg` raised NameError.
        weighted_sentiment=(
            "text_input_tokens",
            lambda grp: agg_weighted_avg(
                sent_df.loc[grp.index], "text_sentiment", "text_input_tokens"
            ),
        ),
        weighted_error=(
            "text_input_tokens",
            lambda grp: agg_weighted_avg(
                sent_df.loc[grp.index], "text_error", "text_input_tokens"
            ),
        ),
        analyzed_naive_tokens=("tokens", "sum"),
        # The daily totals are taken from the first article of the day.
        daily_naive_token_sum=("daily_token_sum", "first"),
        analyzed_article_count=("index", "count"),
        daily_article_count=("daily_article_count", "first"),
    )

In [34]:
# Preview the first rows of the daily aggregated sentiment data.
agg_sent_df.head()

Unnamed: 0_level_0,analyzed_bpe_tokens,weighted_sentiment,weighted_error,analyzed_naive_tokens,daily_naive_token_sum,analyzed_article_count,daily_article_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01,78837,0.195344,41.043721,57841,60358,12,15
2019-01-02,74422,0.180192,158.752229,39626,42472,24,27
2019-01-03,115014,0.126784,156.380061,37031,37530,15,16
2019-01-04,118195,0.074154,126.906331,41872,45767,14,17
2019-01-07,109520,0.069071,114.400445,35020,36574,16,17
