In [1]:
# Tickers dropped from `tickers_to_keep` even when they pass the news-coverage filter.
REMOVE_SYMBOL = ["TWTR"]

In [2]:
import os
import pickle
import polars as pl
from tqdm import tqdm
from datetime import date
from rich import print
from itertools import groupby

In [3]:
# Load the pickled environment data (a mapping keyed by date).
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# on trusted, project-generated artifacts.
env_data_path = os.path.join("..", "data", "05_env_data", "env_data.pkl")
with open(env_data_path, "rb") as env_file:
    env_data = pickle.load(env_file)

# Peek at the first three date keys and at the fields stored for one sample day.
first_dates = list(env_data)[:3]
print(first_dates)
sample_day = env_data[date(2021, 8, 17)]
print(sample_day.keys())

In [4]:
# Read the news table and derive `help_col`: the calendar date of each
# row's `date` timestamp.
news_path = os.path.join("..", "data", "04_input_data", "news_data.parquet")
news_data = pl.read_parquet(news_path).with_columns(
    pl.col("date").dt.date().alias("help_col")
)

# Number of distinct calendar dates that appear in the news table.
unique_news_dates = news_data["help_col"].unique()
print(unique_news_dates.len())
news_data.head()

author,content,datetime,source,summary,title,url,date,equity,text,help_col
str,str,datetime[μs],str,str,str,str,datetime[μs],str,str,date
"""Lisa Levin""","""""",2021-09-07 08:58:08,"""""","""Gainers 	Inn…","""53 Biggest Mov…","""https://www.be…",2021-09-07 09:00:00,"""DOCN""","""53 Biggest Mov…",2021-09-07
"""Benzinga Insig…","""""",2022-10-20 21:56:59,"""benzinga""","""Gainers Ever-…","""10 Consumer Di…","""https://www.be…",2022-10-21 09:00:00,"""EVK""","""10 Consumer Di…",2022-10-21
"""Adam Eckert""","""""",2023-02-07 21:22:03,"""benzinga""","""Microsoft Corp…","""Look Out Googl…","""https://www.be…",2023-02-08 09:00:00,"""GOOG""","""Look Out Googl…",2023-02-08
"""Chris Katje""","""""",2023-02-07 16:34:31,"""benzinga""","""Subversive Cap…","""Want To Copy '…","""https://www.be…",2023-02-08 09:00:00,"""GOOG""","""Want To Copy '…",2023-02-08
"""Adam Eckert""","""""",2023-02-07 14:25:35,"""benzinga""","""Toast Inc (NYS…","""Toast Stock Ju…","""https://www.be…",2023-02-07 09:00:00,"""GOOG""","""Toast Stock Ju…",2023-02-07


In [5]:
# Per-equity count of distinct news dates, highest coverage first.
distinct_dates_per_equity = (
    pl.col("help_col").unique().count().alias("news_coverage")
)
coverage_by_equity = news_data.groupby("equity").agg(distinct_dates_per_equity)
news_coverage = coverage_by_equity.sort("news_coverage", descending=True)
print(news_coverage)

In [6]:
# Summary statistics of the per-equity coverage table.
coverage_stats = news_coverage.describe()
print(coverage_stats)

In [7]:
# 0.93 quantile of the per-equity coverage counts.
coverage_counts = news_coverage.get_column("news_coverage")
coverage_q93 = coverage_counts.quantile(0.93)
print(coverage_q93)

In [8]:
# Preview the equities whose coverage count exceeds 293.
above_threshold = pl.col("news_coverage") > 293
high_coverage = news_coverage.filter(above_threshold)
print(high_coverage)

In [9]:
# Equities with more than 293 distinct news dates, minus the symbols listed
# in REMOVE_SYMBOL.
high_coverage_tickers = (
    news_coverage.filter(pl.col("news_coverage") > 293)
    .get_column("equity")
    .to_list()
)

tickers_to_keep = [
    ticker for ticker in high_coverage_tickers if ticker not in REMOVE_SYMBOL
]

In [10]:
# Show the final ticker list (coverage filter applied, REMOVE_SYMBOL excluded).
print(tickers_to_keep)

# Filter The Current Data

## Keep The Original Structure And Build The New Per-Ticker Structure

In [11]:
# Build two per-date views of `env_data`, restricted to `tickers_to_keep`:
#   * `subset_env_data` keeps the original list-of-records layout per field;
#   * `subset_data_new_structure` re-keys every field by ticker symbol.
subset_env_data = {}
subset_data_new_structure = {}

# Hoisted set: constant-time membership instead of rescanning the list for
# every record on every date.
keep_tickers = set(tickers_to_keep)

for cur_date in tqdm(env_data):
    cur_price = env_data[cur_date]["price"]
    cur_eco = env_data[cur_date]["economic_variable"]
    if len(cur_eco) > 1:
        # Only cur_eco[0] is carried into the new structure, so a day with
        # extra entries is reported and processing stops.
        # NOTE(review): stopping here leaves both dicts without this date and
        # every later one; the cells below would still pickle them.
        print(cur_date)
        print(cur_eco)
        break
    cur_filing_k = env_data[cur_date]["10k_fillings"]
    cur_filing_q = env_data[cur_date]["10q_fillings"]
    cur_news = env_data[cur_date]["news"]
    cur_record = env_data[cur_date]["ark_record"]

    # Original structure: same record lists, filtered to the kept tickers.
    new_price = [i for i in cur_price if i["symbol"] in keep_tickers]
    new_fillings_k = [i for i in cur_filing_k if i["ticker"] in keep_tickers]
    new_fillings_q = [i for i in cur_filing_q if i["ticker"] in keep_tickers]
    new_news = [i for i in cur_news if i["ticker"] in keep_tickers]
    new_record = [i for i in cur_record if i["equity"] in keep_tickers]
    subset_env_data[cur_date] = {
        "price": new_price,
        "economic_variable": cur_eco,
        "10k_fillings": new_fillings_k,
        "10q_fillings": new_fillings_q,
        "news": new_news,
        "ark_record": new_record,
    }

    # New structure: derived from the already-filtered lists. When a ticker
    # has several records on one day, the last one wins (as before).
    cur_new_price = {i["symbol"]: i["Adj Close"] for i in new_price}
    cur_new_eco = cur_eco[0]
    cur_new_filing_k = {i["ticker"]: i["content"] for i in new_fillings_k}
    cur_new_filing_q = {i["ticker"]: i["content"] for i in new_fillings_q}

    # Collect every news text per ticker. itertools.groupby only merges
    # *adjacent* items with equal keys, so with an unsorted news list the
    # later runs overwrote the earlier ones and texts were silently lost.
    cur_new_news = {}
    for i in new_news:
        cur_new_news.setdefault(i["ticker"], []).append(i["text"])

    cur_new_record = {
        i["equity"]: {
            "direction": i["direction"],
            "quantity": i["quantity"],
        }
        for i in new_record
    }
    subset_data_new_structure[cur_date] = {
        "price": cur_new_price,
        "eco": cur_new_eco,
        "filing_k": cur_new_filing_k,
        "filing_q": cur_new_filing_q,
        "news": cur_new_news,
        "ark_record": cur_new_record,
    }

 59%|█████▉    | 296/501 [00:00<00:00, 2955.18it/s]

100%|██████████| 501/501 [00:00<00:00, 2570.53it/s]


In [12]:
# Persist the ticker-filtered data that keeps the original layout.
subset_env_path = os.path.join("..", "data", "05_env_data", "subset_env_data.pkl")
with open(subset_env_path, "wb") as out_file:
    pickle.dump(subset_env_data, out_file)

In [13]:
# Persist the ticker-filtered data in the new per-ticker layout.
subset_new_env_path = os.path.join(
    "..", "data", "05_env_data", "subset_new_env_data.pkl"
)
with open(subset_new_env_path, "wb") as out_file:
    pickle.dump(subset_data_new_structure, out_file)