Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a retrieval function for the tables containing financial data. #17

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions extract_items.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bs4
import click
import cssutils
import json
Expand Down Expand Up @@ -614,6 +615,144 @@ def get_last_item_section(item_index: str, text: str, positions: List[int]) -> s

return item_section

@staticmethod
def find_background_color(tbl: bs4.element.Tag) -> bool:
    """Return True if any row or cell of *tbl* declares a non-neutral background.

    A background counts as "neutral" when it is absent, "none", "transparent",
    or plain white; any other value marks the table as visually highlighted.
    """
    # Background values that do NOT count as a real highlight.
    neutral_values = {"none", "transparent", "#ffffff", "#fff", "white"}

    # Collect every <tr>, <td> and <th> element that carries an inline style.
    styled_elements = []
    for tag_name in ("tr", "td", "th"):
        styled_elements.extend(tbl.find_all(tag_name, attrs={"style": True}))

    for element in styled_elements:
        # Parse the inline HTML "style" attribute as CSS declarations.
        declarations = cssutils.parseStyle(element["style"])
        for prop in ("background", "background-color"):
            value = declarations[prop]
            if value and value.lower() not in neutral_values:
                return True

    return False

def retrieve_html_tables(self, doc: Any, is_html: bool) -> list[pd.DataFrame]:
    """
    Retrieve all HTML tables that contain numerical data as pandas DataFrames.

    Args:
        doc: The document to scan — a BeautifulSoup object when ``is_html``
            is True, otherwise plain text (which yields no tables).
        is_html: Whether the document contains html code or just plain text.

    Returns:
        A list of pandas DataFrames, one per table with a highlighted
        background; empty when ``is_html`` is False or no such table exists.
    """
    dfs: list[pd.DataFrame] = []

    # Plain-text filings contain no parseable <table> markup.
    if not is_html:
        return dfs

    for tbl in doc.find_all("table"):
        # Only tables using a background color are assumed to carry data
        # (plain tables in filings are usually layout helpers).
        if not ExtractItems.find_background_color(tbl):
            continue

        table_data = []
        # Skip the first <tr>; it is treated as a caption/title row.
        for row in tbl.find_all("tr")[1:]:
            cols = [
                re.sub(r"[\$\*]", "", ele.text.strip())
                for ele in row.find_all("td")
            ]
            cols = list(filter(None, cols))

            # Re-attach stray "%" / ")" cells to the preceding value,
            # e.g. ["5", "%"] -> ["5%"] and ["(3", ")"] -> ["(3)"].
            merged = []
            i = 0
            while i < len(cols):
                if i + 1 < len(cols) and cols[i + 1] in ("%", ")"):
                    merged.append(cols[i] + cols[i + 1])
                    i += 2
                else:
                    merged.append(cols[i])
                    i += 1

            # Rows with a single surviving cell are section headers, not data.
            if len(merged) > 1:
                table_data.append(merged)

        if table_data:
            # Left-pad shorter rows so every row has the same width
            # (missing leading cells are presumably row-label columns).
            width = max(len(r) for r in table_data)
            table_data = [
                r if len(r) == width else [""] * (width - len(r)) + r
                for r in table_data
            ]
            # First collected row becomes the header of the DataFrame.
            dfs.append(pd.DataFrame(table_data[1:], columns=table_data[0]))

    return dfs

def extract_tables(self, filing_metadata: Dict[str, Any]) -> list[pd.DataFrame]:
    """
    Extracts all tables for 10-K/10-Q files and returns them as a list of pandas DataFrames.

    Args:
        filing_metadata (Dict[str, Any]): a pandas series containing all filings metadata

    Returns:
        A list of pandas DataFrames holding the numerical tables found in the filing.
    """
    absolute_10k_filename = os.path.join(
        self.raw_files_folder, filing_metadata["filename"]
    )

    # Read the content of the 10-K file
    with open(absolute_10k_filename, "r", errors="backslashreplace") as file:
        content = file.read()

    # Remove all embedded pdfs that might be seen in few old 10-K txt annual reports
    content = re.sub(r"<PDF>.*?</PDF>", "", content, flags=regex_flags)

    # Find all <DOCUMENT> tags within the content
    documents = re.findall("<DOCUMENT>.*?</DOCUMENT>", content, flags=regex_flags)

    # Initialize variables
    doc_10k = None
    found_10k, is_html = False, False

    # Find the 10-K document
    for doc in documents:
        # Find the <TYPE> tag within each <DOCUMENT> tag to identify the type of document
        doc_type = re.search(r"\n[^\S\r\n]*<TYPE>(.*?)\n", doc, flags=regex_flags)
        doc_type = doc_type.group(1) if doc_type else None

        # Check if the document is a 10-K/10-Q. Guard against a missing <TYPE>
        # tag: doc_type is None then, and .startswith() would raise AttributeError.
        if doc_type and doc_type.startswith("10"):
            # Check if the document is HTML or plain text
            doc_10k = BeautifulSoup(doc, "lxml")
            is_html = bool(doc_10k.find("td")) and bool(doc_10k.find("tr"))
            if not is_html:
                # Fall back to the raw text when no table markup is present.
                doc_10k = doc
            found_10k = True
            break

    if not found_10k:
        if documents:
            LOGGER.info(
                f'\nCould not find document type 10K for {filing_metadata["filename"]}'
            )
        # If no 10-K document is found, parse the entire content as HTML or plain text
        doc_10k = BeautifulSoup(content, "lxml")
        is_html = bool(doc_10k.find("td")) and bool(doc_10k.find("tr"))
        if not is_html:
            doc_10k = content

    return self.retrieve_html_tables(doc_10k, is_html)

def extract_items(self, filing_metadata: Dict[str, Any]) -> Any:
"""
Extracts all items/sections for a 10-K file and writes it to a CIK_10K_YEAR.json file (eg. 1384400_10K_2017.json)
Expand Down
Binary file added tests/fixtures/EXTRACTED_TABLES.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/fixtures/FILINGS_METADATA_TABLES_TEST.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
CIK,Company,Type,Date,complete_text_file_link,html_index,Filing Date,Period of Report,SIC,htm_file_link,State of Inc,State location,Fiscal Year End,filename
2488,ADVANCED MICRO DEVICES INC,10-K,2018-02-27,https://www.sec.gov/Archives/edgar/data/2488/0000002488-18-000042.txt,https://www.sec.gov/Archives/edgar/data/2488/0000002488-18-000042-index.html,2018-02-27,2017-12-30,3674,https://www.sec.gov/Archives/edgar/data/2488/000000248818000042/amd-12302017x10k.htm,DE,CA,1227,2488_10K_2017_0000002488-18-000042.htm
2488,ADVANCED MICRO DEVICES INC,10-K,2023-02-27,https://www.sec.gov/Archives/edgar/data/2488/0000002488-23-000047.txt,https://www.sec.gov/Archives/edgar/data/2488/0000002488-23-000047-index.html,2023-02-27,2022-12-31,3674,https://www.sec.gov/Archives/edgar/data/2488/000000248823000047/amd-20221231.htm,DE,CA,1230,2488_10K_2022_0000002488-23-000047.htm
Binary file added tests/fixtures/RAW_FILINGS_TABLE.zip
Binary file not shown.
82 changes: 82 additions & 0 deletions tests/test_extract_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import csv
import zipfile

import numpy as np
import pandas as pd
import unittest
from tqdm import tqdm

from extract_items import ExtractItems


def csv_to_list(csv_file):
    """Read *csv_file* and return its rows as a list of lists of strings."""
    with open(csv_file, newline="") as handle:
        return [row for row in csv.reader(handle)]


def extract_zip(input_zip):
    """Extract *input_zip* into /tmp/edgar_crawler (directories created as needed).

    Args:
        input_zip: Path to the zip archive to unpack.
    """
    # Use a context manager so the archive handle is always closed
    # (the previous version leaked the open ZipFile object).
    with zipfile.ZipFile(input_zip) as zf:
        zf.extractall(path=os.path.join("/tmp", "edgar_crawler"))


class Test(unittest.TestCase):
    def test_extract_tables(self):
        """End-to-end check: extracted tables must match the expected fixtures."""
        # Unpack the raw filings and the expected extraction results.
        for fixture in ("RAW_FILINGS_TABLE.zip", "EXTRACTED_TABLES.zip"):
            extract_zip(os.path.join("tests", "fixtures", fixture))

        metadata_df = pd.read_csv(
            os.path.join("tests", "fixtures", "FILINGS_METADATA_TABLES_TEST.csv"),
            dtype=str,
        )
        metadata_df = metadata_df.replace({np.nan: None})

        extraction = ExtractItems(
            remove_tables=True,
            items_to_extract=[
                "1",
                "1A",
                "1B",
                "2",
                "3",
                "4",
                "5",
                "6",
                "7",
                "7A",
                "8",
                "9",
                "9A",
                "9B",
                "10",
                "11",
                "12",
                "13",
                "14",
                "15",
            ],
            raw_files_folder="/tmp/edgar_crawler/RAW_FILINGS_TABLE",
            extracted_files_folder="",
            skip_extracted_filings=True,
        )

        filings = [row for _, row in metadata_df.iterrows()]
        for filing_metadata in tqdm(filings, unit="filings", ncols=100):
            extracted_dfs = extraction.extract_tables(filing_metadata)

            stem = filing_metadata["filename"].split(".")[0]
            actual_path = os.path.join("/tmp/edgar_crawler", f"{stem}.csv")

            # Dump every extracted table to one CSV, blank-line separated.
            with open(actual_path, "w") as out:
                for df in extracted_dfs:
                    df.to_csv(out, index=False)
                    out.write("\n")

            expected_path = os.path.join(
                "/tmp/edgar_crawler/EXTRACTED_TABLES", f"{stem}.csv"
            )
            actual_rows = csv_to_list(actual_path)
            expected_rows = csv_to_list(expected_path)

            # Compare row by row against the expected fixture output.
            for i, (v1, v2) in enumerate(zip(actual_rows, expected_rows)):
                assert v1 == v2, f"({i}) row in CSV file is different: {v1} != {v2}"