In [1]:
import os
import polars as pl
import yfinance as yf
from tqdm import tqdm
from rich import print
from datetime import date, timedelta

# Calendar window for the study; the price filter below treats both ends as inclusive.
start_date = date(year=2021, month=8, day=15)
end_date = date(year=2023, month=8, day=15)

# Load Data

#### Load stock price information

In [3]:
# Read the intermediate price table from the project's data folder and preview
# its first five rows.
yf_price_path = os.path.join("..", "data", "02_intermediate", "yf_data.parquet")
price_data = pl.read_parquet(yf_price_path)
price_data.head(5)

Date,Open,High,Low,Close,Adj Close,Volume,symbol
datetime[ns],f64,f64,f64,f64,f64,i64,str
2021-08-16 00:00:00,67.510002,68.699997,67.5,68.68,64.446404,6753600,"""BMY"""
2021-08-17 00:00:00,68.68,69.160004,68.209999,68.959999,64.709145,7560300,"""BMY"""
2021-08-18 00:00:00,68.989998,69.07,67.860001,67.93,63.742638,6863800,"""BMY"""
2021-08-19 00:00:00,67.650002,69.110001,67.559998,68.510002,64.286888,11707400,"""BMY"""
2021-08-20 00:00:00,68.709999,69.349998,68.300003,69.199997,64.934341,9053400,"""BMY"""


#### Load news data

In [4]:
# Read the primary news table; the bare name on the last line renders it in
# the notebook.
news_path = os.path.join("..", "data", "03_primary", "news.parquet")
news_data = pl.read_parquet(news_path)
news_data

author,content,datetime,source,summary,title,url,date,equity
str,str,datetime[μs],str,str,str,str,datetime[μs],str
"""Benzinga Insig…","""""",2023-06-23 21:00:37,"""benzinga""",""" ""","""What 18 Analys…","""https://www.be…",2023-06-24 09:00:00,"""SQ"""
"""Benzinga Newsd…","""""",2023-06-23 15:37:23,"""benzinga""","""""","""Truist Securit…","""https://www.be…",2023-06-23 09:00:00,"""SQ"""
"""Benzinga Insig…","""""",2022-12-27 17:32:33,"""benzinga""",""" ""","""12 Communicati…","""https://www.be…",2022-12-28 09:00:00,"""HUYA"""
"""Anusuya Lahiri…","""""",2021-09-07 17:12:59,"""""",""" ""","""DigitalOcean A…","""https://www.be…",2021-09-08 09:00:00,"""DOCN"""
"""Benzinga Newsd…","""""",2021-09-07 12:02:00,"""""","""""","""DigitalOcean A…","""https://www.be…",2021-09-07 09:00:00,"""DOCN"""
"""Lisa Levin""","""""",2021-09-07 08:58:08,"""""","""Gainers 	Inn…","""53 Biggest Mov…","""https://www.be…",2021-09-07 09:00:00,"""DOCN"""
"""Benzinga Insig…","""""",2022-10-20 21:56:59,"""benzinga""",""" Gainers Ever…","""10 Consumer Di…","""https://www.be…",2022-10-21 09:00:00,"""EVK"""
"""Adam Eckert""","""""",2023-02-07 21:22:03,"""benzinga""","""Microsoft Corp…","""Look Out Googl…","""https://www.be…",2023-02-08 09:00:00,"""GOOG"""
"""Chris Katje""","""""",2023-02-07 16:34:31,"""benzinga""","""Subversive Cap…","""Want To Copy '…","""https://www.be…",2023-02-08 09:00:00,"""GOOG"""
"""Adam Eckert""","""""",2023-02-07 14:25:35,"""benzinga""","""Toast Inc (NYS…","""Toast Stock Ju…","""https://www.be…",2023-02-07 09:00:00,"""GOOG"""


In [5]:
# Read the failed-symbols file (one symbol per line) into a list of strings.
# Lines are stripped and blank lines are skipped, so a trailing newline, an
# empty line, or an empty file never produces an empty-string "symbol"
# (which `f.read().split("\n")` would).
failed_symbols_path = os.path.join(
    "..", "data", "02_intermediate", "failed_symbols.txt"
)
with open(failed_symbols_path, "r") as f:
    failed_symbols = [line.strip() for line in f if line.strip()]

####  Clean Price Data

In [8]:
# Summary statistics (count, null count, mean, std, min, quartiles, max) for
# each column of the loaded price table.
price_data.describe()

describe,Date,Open,High,Low,Close,Adj Close,Volume,symbol
str,str,f64,f64,f64,f64,f64,f64,str
"""count""","""206398""",206398.0,206398.0,206398.0,206398.0,206398.0,206398.0,"""206398"""
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",,100.890656,102.635645,99.068485,100.911064,100.348165,4064700.0,
"""std""",,767.548258,781.253086,752.272076,769.390938,769.317201,11870000.0,
"""min""","""2021-08-16 00:…",0.00049,0.0003,0.0003,0.00049,0.00049,0.0,"""AAPL"""
"""25%""",,11.002,11.318,10.68,10.99,10.94,358300.0,
"""50%""",,38.119999,38.93,37.299999,38.099998,37.799999,946300.0,
"""75%""",,96.910004,98.57,95.019997,96.809998,95.760002,2551500.0,
"""max""","""2023-08-14 00:…",34600.0,36300.0,33700.0,35000.0,35000.0,330993900.0,"""ZS"""


In [136]:
# NOTE(review): this cell expects the columns utc_time / est_time / equity /
# open / high / low / close, whereas the frame previewed after the earlier
# read_parquet cell shows Date / Open / ... / symbol. Confirm which load
# produces `price_data` before relying on Restart & Run All.

# A bar is treated as erroneous when any of its four prices is exactly zero.
price_columns = ["open", "high", "low", "close"]
error_record_condition = pl.col(price_columns[0]) == 0
for price_column in price_columns[1:]:
    error_record_condition = error_record_condition | (pl.col(price_column) == 0)

# Keep only bars whose date lies in [start_date, end_date], both ends inclusive.
bar_date = pl.col("est_time").dt.date()
price_data = price_data.filter((bar_date >= start_date) & (bar_date <= end_date))

# Split the window into rows that need repair and rows that are usable as-is,
# then display the rows that need repair.
error_price_data = price_data.filter(error_record_condition)
clean_price_data = price_data.filter(~error_record_condition)
error_price_data

utc_time,est_time,dataset,schema,equity,open,high,low,close
datetime[μs],datetime[μs],str,str,str,f64,f64,f64,f64
2023-04-28 00:00:00,2023-04-28 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",138.2,140.12,0.0,139.02
2023-07-24 00:00:00,2023-07-24 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",0.0,143.49,0.0,142.24
2023-07-25 00:00:00,2023-07-25 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",141.01,143.02,0.0,142.9
2023-07-26 00:00:00,2023-07-26 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",0.0,143.83,0.0,143.6
2023-07-27 00:00:00,2023-07-27 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",0.0,143.69,0.0,141.7
2023-07-28 00:00:00,2023-07-28 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",141.61,142.82,0.0,140.51
2023-07-31 00:00:00,2023-07-31 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",140.36,140.97,0.0,139.8
2023-08-09 00:00:00,2023-08-09 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",0.0,148.275,0.0,146.57
2023-08-10 00:00:00,2023-08-10 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",146.56,148.76,0.0,147.34
2023-08-11 00:00:00,2023-08-11 00:00:00,"""XNAS.ITCH""","""ohlcv-1d""","""CBOE""",0.0,150.79,0.0,150.55


In [137]:
# Re-download every erroneous daily bar through yfinance and rebuild it as a
# record with the same columns as the original price rows.
#
# `error_price_data` is left as a DataFrame (its rows are copied into a
# separate list), so this cell can be re-run without failing on a second
# `.to_dicts()` call. Bars that cannot be looked up in the download are
# reported and left out of the result.
error_records = error_price_data.to_dicts()

updated_records = []

cur_database = "yfinance"  # written to the "dataset" column of repaired rows
for cur_record in tqdm(error_records):
    cur_est_time = cur_record["est_time"]
    equity = cur_record["equity"]

    try:
        # One request per bar: the window starts on the bar's date and ends one
        # day later (presumably `end` is exclusive; see the yfinance docs).
        cur_download = yf.download(
            equity,
            start=cur_est_time.date(),
            end=cur_est_time.date() + timedelta(days=1),
            progress=False,
        )
        # A KeyError here means the download holds no row for this timestamp.
        cur_open = cur_download.loc[cur_est_time, "Open"]
        cur_high = cur_download.loc[cur_est_time, "High"]
        cur_low = cur_download.loc[cur_est_time, "Low"]
        cur_close = cur_download.loc[cur_est_time, "Close"]
    except KeyError as e:
        print(f"Error: {e} for {equity} at {cur_est_time}")
        continue

    updated_records.append(
        {
            "utc_time": cur_record["utc_time"],
            "est_time": cur_est_time,
            "dataset": cur_database,
            "schema": cur_record["schema"],
            "equity": equity,
            "open": cur_open,
            "high": cur_high,
            "low": cur_low,
            "close": cur_close,
        }
    )

# With no repaired rows, pl.DataFrame([]) would have no columns and could not
# be concatenated with the clean rows, so fall back to an empty frame that
# keeps the price columns and dtypes.
error_data_df = (
    pl.DataFrame(updated_records) if updated_records else error_price_data.head(0)
)
error_data_df

  0%|          | 11/3401 [00:01<06:26,  8.78it/s]
1 Failed download:
['AQUA']: Exception('%ticker%: No timezone found, symbol may be delisted')
  0%|          | 14/3401 [00:01<05:52,  9.60it/s]

Error: datetime.datetime(2023, 4, 28, 0, 0) for AQUA at 2023-04-28 00:00:00


 64%|██████▎   | 2164/3401 [04:33<02:36,  7.89it/s]
1 Failed download:
['SI']: Exception('%ticker%: No timezone found, symbol may be delisted')
 64%|██████▎   | 2165/3401 [04:33<02:29,  8.27it/s]

Error: datetime.datetime(2023, 4, 28, 0, 0) for SI at 2023-04-28 00:00:00


 87%|████████▋ | 2947/3401 [06:22<00:57,  7.90it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2948/3401 [06:22<00:54,  8.37it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2949/3401 [06:22<00:51,  8.72it/s]

Error: datetime.datetime(2023, 4, 28, 0, 0) for SLGCW at 2023-04-28 00:00:00
Error: datetime.datetime(2023, 7, 18, 0, 0) for SLGCW at 2023-07-18 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2950/3401 [06:22<00:50,  8.99it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2952/3401 [06:22<00:47,  9.54it/s]

Error: datetime.datetime(2023, 7, 20, 0, 0) for SLGCW at 2023-07-20 00:00:00
Error: datetime.datetime(2023, 7, 21, 0, 0) for SLGCW at 2023-07-21 00:00:00
Error: datetime.datetime(2023, 7, 24, 0, 0) for SLGCW at 2023-07-24 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2953/3401 [06:22<00:46,  9.58it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2955/3401 [06:22<00:45,  9.84it/s]

Error: datetime.datetime(2023, 7, 25, 0, 0) for SLGCW at 2023-07-25 00:00:00
Error: datetime.datetime(2023, 7, 26, 0, 0) for SLGCW at 2023-07-26 00:00:00
Error: datetime.datetime(2023, 7, 27, 0, 0) for SLGCW at 2023-07-27 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2956/3401 [06:23<00:45,  9.80it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2957/3401 [06:23<00:45,  9.77it/s]

Error: datetime.datetime(2023, 7, 28, 0, 0) for SLGCW at 2023-07-28 00:00:00
Error: datetime.datetime(2023, 7, 31, 0, 0) for SLGCW at 2023-07-31 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2958/3401 [06:23<00:45,  9.75it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2960/3401 [06:23<00:43, 10.17it/s]

Error: datetime.datetime(2023, 8, 1, 0, 0) for SLGCW at 2023-08-01 00:00:00
Error: datetime.datetime(2023, 8, 2, 0, 0) for SLGCW at 2023-08-02 00:00:00
Error: datetime.datetime(2023, 8, 3, 0, 0) for SLGCW at 2023-08-03 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2962/3401 [06:23<00:43, 10.00it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")


Error: datetime.datetime(2023, 8, 4, 0, 0) for SLGCW at 2023-08-04 00:00:00
Error: datetime.datetime(2023, 8, 7, 0, 0) for SLGCW at 2023-08-07 00:00:00
Error: datetime.datetime(2023, 8, 8, 0, 0) for SLGCW at 2023-08-08 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2964/3401 [06:23<00:43, 10.07it/s]
1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2966/3401 [06:24<00:42, 10.13it/s]

Error: datetime.datetime(2023, 8, 9, 0, 0) for SLGCW at 2023-08-09 00:00:00
Error: datetime.datetime(2023, 8, 10, 0, 0) for SLGCW at 2023-08-10 00:00:00
Error: datetime.datetime(2023, 8, 11, 0, 0) for SLGCW at 2023-08-11 00:00:00



1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['SLGCW']: Exception("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
 87%|████████▋ | 2968/3401 [06:24<00:42, 10.15it/s]

Error: datetime.datetime(2023, 8, 14, 0, 0) for SLGCW at 2023-08-14 00:00:00
Error: datetime.datetime(2023, 8, 15, 0, 0) for SLGCW at 2023-08-15 00:00:00


100%|██████████| 3401/3401 [07:20<00:00,  7.73it/s]


utc_time,est_time,dataset,schema,equity,open,high,low,close
datetime[μs],datetime[μs],str,str,str,f64,f64,f64,f64
2023-04-28 00:00:00,2023-04-28 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",138.270004,140.149994,138.119995,139.699997
2023-07-24 00:00:00,2023-07-24 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",142.570007,143.509995,142.050003,142.309998
2023-07-25 00:00:00,2023-07-25 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",141.470001,143.020004,141.470001,142.979996
2023-07-26 00:00:00,2023-07-26 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",141.919998,143.830002,141.919998,143.720001
2023-07-27 00:00:00,2023-07-27 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",143.5,143.699997,141.029999,141.600006
2023-07-28 00:00:00,2023-07-28 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",142.75,142.75,140.350006,140.360001
2023-07-31 00:00:00,2023-07-31 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",139.869995,141.080002,139.600006,139.679993
2023-08-09 00:00:00,2023-08-09 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",147.0,148.279999,145.779999,146.179993
2023-08-10 00:00:00,2023-08-10 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",147.339996,148.759995,146.820007,148.360001
2023-08-11 00:00:00,2023-08-11 00:00:00,"""yfinance""","""ohlcv-1d""","""CBOE""",147.309998,150.789993,147.309998,150.25


In [138]:
# Combine the rows that were already clean with the repaired rows.
# The clean part is recomputed from `price_data` rather than extending
# `clean_price_data` in place, so re-running this cell does not append the
# repaired rows a second time.
clean_price_data = pl.concat(
    [price_data.filter(~error_record_condition), error_data_df]
)

In [139]:
# Sanity check after the repair step: summary statistics of the combined
# price table.
clean_price_data.describe()

describe,utc_time,est_time,dataset,schema,equity,open,high,low,close
str,str,str,str,str,str,f64,f64,f64,f64
"""count""","""208595""","""208595""","""208595""","""208595""","""208595""",208595.0,208595.0,208595.0,208595.0
"""null_count""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",,,,,,91.378948,93.042924,89.541416,91.257097
"""std""",,,,,,203.086744,206.138147,199.618004,202.857934
"""min""","""2021-08-16 00:…","""2021-08-16 00:…","""XNAS.ITCH""","""ohlcv-1d""","""AAPL""",0.0931,0.0931,0.0711,0.0712
"""25%""",,,,,,10.01,10.24,9.81,10.0
"""50%""",,,,,,36.33,37.1,35.44,36.24
"""75%""",,,,,,96.8,98.59,94.62,96.57
"""max""","""2023-08-15 00:…","""2023-08-15 00:…","""yfinance""","""ohlcv-1d""","""ZY""",3711.94,3762.68,3672.0,3700.98


In [140]:
# Persist the cleaned price table into the 04_input_data folder.
price_output_path = os.path.join("..", "data", "04_input_data", "price_data.parquet")
clean_price_data.write_parquet(price_output_path)

####  News Data

In [10]:
# Reload the primary news table, then report its (rows, columns) shape and the
# first five titles.
news_path = os.path.join("..", "data", "03_primary", "news.parquet")
news_data = pl.read_parquet(news_path)
print(news_data.shape)
first_titles = news_data["title"].head(5).to_list()
print(first_titles)

In [11]:
# Trim surrounding whitespace from every summary, then drop the articles whose
# summary is empty after trimming.
stripped_summary = pl.col("summary").str.strip()
news_data = news_data.with_columns(stripped_summary).filter(
    pl.col("summary") != ""
)

# Shape after filtering, followed by a five-row preview.
print(news_data.shape)
news_data.head(5)

author,content,datetime,source,summary,title,url,date,equity
str,str,datetime[μs],str,str,str,str,datetime[μs],str
"""Lisa Levin""","""""",2021-09-07 08:58:08,"""""","""Gainers 	Inn…","""53 Biggest Mov…","""https://www.be…",2021-09-07 09:00:00,"""DOCN"""
"""Benzinga Insig…","""""",2022-10-20 21:56:59,"""benzinga""","""Gainers Ever-…","""10 Consumer Di…","""https://www.be…",2022-10-21 09:00:00,"""EVK"""
"""Adam Eckert""","""""",2023-02-07 21:22:03,"""benzinga""","""Microsoft Corp…","""Look Out Googl…","""https://www.be…",2023-02-08 09:00:00,"""GOOG"""
"""Chris Katje""","""""",2023-02-07 16:34:31,"""benzinga""","""Subversive Cap…","""Want To Copy '…","""https://www.be…",2023-02-08 09:00:00,"""GOOG"""
"""Adam Eckert""","""""",2023-02-07 14:25:35,"""benzinga""","""Toast Inc (NYS…","""Toast Stock Ju…","""https://www.be…",2023-02-07 09:00:00,"""GOOG"""


In [12]:
# Add a "text" column: the title and the summary joined by a single space.
title_and_summary = pl.col("title") + " " + pl.col("summary")
news_data = news_data.with_columns(title_and_summary.alias("text"))

news_data.head(5)

author,content,datetime,source,summary,title,url,date,equity,text
str,str,datetime[μs],str,str,str,str,datetime[μs],str,str
"""Lisa Levin""","""""",2021-09-07 08:58:08,"""""","""Gainers 	Inn…","""53 Biggest Mov…","""https://www.be…",2021-09-07 09:00:00,"""DOCN""","""53 Biggest Mov…"
"""Benzinga Insig…","""""",2022-10-20 21:56:59,"""benzinga""","""Gainers Ever-…","""10 Consumer Di…","""https://www.be…",2022-10-21 09:00:00,"""EVK""","""10 Consumer Di…"
"""Adam Eckert""","""""",2023-02-07 21:22:03,"""benzinga""","""Microsoft Corp…","""Look Out Googl…","""https://www.be…",2023-02-08 09:00:00,"""GOOG""","""Look Out Googl…"
"""Chris Katje""","""""",2023-02-07 16:34:31,"""benzinga""","""Subversive Cap…","""Want To Copy '…","""https://www.be…",2023-02-08 09:00:00,"""GOOG""","""Want To Copy '…"
"""Adam Eckert""","""""",2023-02-07 14:25:35,"""benzinga""","""Toast Inc (NYS…","""Toast Stock Ju…","""https://www.be…",2023-02-07 09:00:00,"""GOOG""","""Toast Stock Ju…"


In [13]:
# Persist the filtered news table (including the "text" column) into the
# 04_input_data folder.
news_output_path = os.path.join("..", "data", "04_input_data", "news_data.parquet")
news_data.write_parquet(news_output_path)

####  Filing Data

In [14]:
# Read the primary filing table and preview its first five rows.
filing_path = os.path.join("..", "data", "03_primary", "filing_data.parquet")
filing_data = pl.read_parquet(filing_path)
filing_data.head(5)

document_url,content,ticker,cik,utc_timestamp,est_timestamp,type
str,str,str,str,datetime[μs],datetime[μs],str
"""https://www.se…","""item 7. manage…","""SGFY""","""1828182""",2023-02-27 08:04:37,2023-02-27 03:04:37,"""10-K"""
"""https://www.se…","""ite m 7. manag…","""DIOD""","""29002""",2023-02-10 17:06:13,2023-02-10 12:06:13,"""10-K"""
"""https://www.se…","""item 7. manage…","""HON""","""773840""",2005-02-25 17:04:43,2005-02-25 12:04:43,"""10-K"""
"""https://www.se…","""item 7: manage…","""ANSS""","""1013462""",2003-03-24 17:15:29,2003-03-24 12:15:29,"""10-K"""
"""https://www.se…","""item 7. manage…","""PCOR""","""1611052""",2023-03-01 08:16:05,2023-03-01 03:16:05,"""10-K"""


In [15]:
# Show the raw text of the first two filings to inspect how the section begins.
filing_data["content"].head(2).to_list()

['item 7. management\'s discussion and analysis of financial condition and results of operations. the following discussion of our financial condition and results of operations should be read in conjunction with the audited consolidated financial statements as of and for the years ended december 31, 2022, 2021 and 2020 and the notes thereto included elsewhere in this annual report on form 10-k. in addition to historical consolidated financial information, the following discussion contains forward-looking statements that reflect our plans, estimates and beliefs and that involve risks and uncertainties. our actual results may differ materially from those discussed in the forward-looking statements as a result of various factors, including those set forth in "forward-looking statements" and "item 1a. risk factors." the following discussion contains references to periods prior to the reorganization transactions which were effective february 12, 2021. therefore, the financial results referen

In [16]:
# Show the raw text of the first three 10-Q filings.
ten_q_filings = filing_data.filter(pl.col("type") == "10-Q")
ten_q_filings["content"].head(3).to_list()

['item 2. management\'s discussion and analysis of financial condition and results of operations the following discussion should be read in conjunction with the condensed consolidated financial statements and notes thereto included in this quarterly report on form 10-q, or this quarterly report, and in conjunction with our annual report on form 10-k for the fiscal year ended december 28, 2013 . this quarterly report contains statements that are not historical in nature, are predictive, or that depend upon or refer to future events or conditions or contain forward-looking statements. statements including, but not limited to, statements regarding the extent and timing of future revenues and expenses and customer demand, statements regarding the deployment of our products, statements regarding our reliance on third parties and other statements using words such as "anticipates," "believes," "could," "estimates," "expects," "forecasts," "intends," "may," "plans," "projects," "should," "will

In [17]:
# Remove the boilerplate MD&A heading from the filing text: the "item 7."
# form first, then the "item 2." form.
#
# The headings are passed to `str.replace` unescaped. polars treats the
# pattern as a regular expression by default, so each "." matches any
# character (the preview shows an "item 7:" heading being removed too), and
# only the first match in each row is replaced. Headings spelled differently
# (e.g. "ite m 7.") are left in place.
section_headings = (
    "item 7. management's discussion and analysis of financial condition and results of operations.",
    "item 2. management's discussion and analysis of financial condition and results of operations",
)

content_without_heading = pl.col("content")
for section_heading in section_headings:
    content_without_heading = content_without_heading.str.replace(
        section_heading, ""
    )

filing_data = filing_data.with_columns(content_without_heading)

filing_data

document_url,content,ticker,cik,utc_timestamp,est_timestamp,type
str,str,str,str,datetime[μs],datetime[μs],str
"""https://www.se…",""" the following…","""SGFY""","""1828182""",2023-02-27 08:04:37,2023-02-27 03:04:37,"""10-K"""
"""https://www.se…","""ite m 7. manag…","""DIOD""","""29002""",2023-02-10 17:06:13,2023-02-10 12:06:13,"""10-K"""
"""https://www.se…","""business overv…","""HON""","""773840""",2005-02-25 17:04:43,2005-02-25 12:04:43,"""10-K"""
"""https://www.se…","""the informatio…","""ANSS""","""1013462""",2003-03-24 17:15:29,2003-03-24 12:15:29,"""10-K"""
"""https://www.se…",""" you should re…","""PCOR""","""1611052""",2023-03-01 08:16:05,2023-03-01 03:16:05,"""10-K"""
"""https://www.se…","""the critical a…","""CRS""","""17843""",2003-09-12 15:18:38,2003-09-12 11:18:38,"""10-K"""
"""https://www.se…","""##table_end ov…","""REGN""","""872589""",2010-02-18 08:30:41,2010-02-18 03:30:41,"""10-K"""
"""https://www.se…",""" ##table_end t…","""SCS""","""1050825""",2016-04-15 12:51:28,2016-04-15 08:51:28,"""10-K"""
"""https://www.se…","""the following …","""ALLE""","""1579241""",2022-02-15 06:53:37,2022-02-15 01:53:37,"""10-K"""
"""https://www.se…","""##table_end th…","""VMW""","""1124610""",2012-02-24 16:02:54,2012-02-24 11:02:54,"""10-K"""


In [None]:
# Persist the filing table with the headings removed into the 04_input_data
# folder.
filing_output_path = os.path.join(
    "..", "data", "04_input_data", "filing_data.parquet"
)
filing_data.write_parquet(filing_output_path)