Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a retrieval function for the tables containing financial data. #17

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions extract_items.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bs4
import click
import cssutils
import json
Expand Down Expand Up @@ -614,6 +615,144 @@ def get_last_item_section(item_index: str, text: str, positions: List[int]) -> s

return item_section

@staticmethod
def find_background_color(tbl: bs4.element.Tag) -> bool:
    """Return True if any row or cell of *tbl* declares a non-neutral background.

    A background counts as "neutral" when it is absent, "none", "transparent",
    or plain white; any other value marks the table as visually highlighted.
    """
    # Background values that do NOT count as a real highlight.
    neutral_values = {"none", "transparent", "#ffffff", "#fff", "white"}

    # Collect every <tr>, <td> and <th> element that carries an inline style.
    styled_elements = []
    for tag_name in ("tr", "td", "th"):
        styled_elements.extend(tbl.find_all(tag_name, attrs={"style": True}))

    for element in styled_elements:
        # Parse the inline HTML "style" attribute as CSS declarations.
        declarations = cssutils.parseStyle(element["style"])
        for prop in ("background", "background-color"):
            value = declarations[prop]
            if value and value.lower() not in neutral_values:
                return True

    return False

def retrieve_html_tables(self, doc: Any, is_html: bool) -> list[pd.DataFrame]:
    """
    Retrieve all HTML tables that contain numerical data as pandas DataFrames.

    Args:
        doc: The document to scan — a BeautifulSoup object when ``is_html``
            is True, otherwise plain text (which yields no tables).
        is_html: Whether the document contains html code or just plain text.

    Returns:
        A list of pandas DataFrames, one per table with a highlighted
        background; empty when ``is_html`` is False or no such table exists.
    """
    dfs: list[pd.DataFrame] = []

    # Plain-text filings contain no parseable <table> markup.
    if not is_html:
        return dfs

    for tbl in doc.find_all("table"):
        # Only tables using a background color are assumed to carry data
        # (plain tables in filings are usually layout helpers).
        if not ExtractItems.find_background_color(tbl):
            continue

        table_data = []
        # Skip the first <tr>; it is treated as a caption/title row.
        for row in tbl.find_all("tr")[1:]:
            cols = [
                re.sub(r"[\$\*]", "", ele.text.strip())
                for ele in row.find_all("td")
            ]
            cols = list(filter(None, cols))

            # Re-attach stray "%" / ")" cells to the preceding value,
            # e.g. ["5", "%"] -> ["5%"] and ["(3", ")"] -> ["(3)"].
            merged = []
            i = 0
            while i < len(cols):
                if i + 1 < len(cols) and cols[i + 1] in ("%", ")"):
                    merged.append(cols[i] + cols[i + 1])
                    i += 2
                else:
                    merged.append(cols[i])
                    i += 1

            # Rows with a single surviving cell are section headers, not data.
            if len(merged) > 1:
                table_data.append(merged)

        if table_data:
            # Left-pad shorter rows so every row has the same width
            # (missing leading cells are presumably row-label columns).
            width = max(len(r) for r in table_data)
            table_data = [
                r if len(r) == width else [""] * (width - len(r)) + r
                for r in table_data
            ]
            # First collected row becomes the header of the DataFrame.
            dfs.append(pd.DataFrame(table_data[1:], columns=table_data[0]))

    return dfs

def extract_tables(self, filing_metadata: Dict[str, Any]) -> list[pd.DataFrame]:
    """
    Extracts all tables for 10-K/10-Q files and returns them as a list of pandas DataFrames.

    Args:
        filing_metadata (Dict[str, Any]): a pandas series containing all filings metadata

    Returns:
        A list of pandas DataFrames holding the numerical tables found in the filing.
    """
    absolute_10k_filename = os.path.join(
        self.raw_files_folder, filing_metadata["filename"]
    )

    # Read the content of the 10-K file
    with open(absolute_10k_filename, "r", errors="backslashreplace") as file:
        content = file.read()

    # Remove all embedded pdfs that might be seen in few old 10-K txt annual reports
    content = re.sub(r"<PDF>.*?</PDF>", "", content, flags=regex_flags)

    # Find all <DOCUMENT> tags within the content
    documents = re.findall("<DOCUMENT>.*?</DOCUMENT>", content, flags=regex_flags)

    # Initialize variables
    doc_10k = None
    found_10k, is_html = False, False

    # Find the 10-K document
    for doc in documents:
        # Find the <TYPE> tag within each <DOCUMENT> tag to identify the type of document
        doc_type = re.search(r"\n[^\S\r\n]*<TYPE>(.*?)\n", doc, flags=regex_flags)
        doc_type = doc_type.group(1) if doc_type else None

        # Check if the document is a 10-K/10-Q. Guard against a missing <TYPE>
        # tag: doc_type is None then, and .startswith() would raise AttributeError.
        if doc_type and doc_type.startswith("10"):
            # Check if the document is HTML or plain text
            doc_10k = BeautifulSoup(doc, "lxml")
            is_html = bool(doc_10k.find("td")) and bool(doc_10k.find("tr"))
            if not is_html:
                # Fall back to the raw text when no table markup is present.
                doc_10k = doc
            found_10k = True
            break

    if not found_10k:
        if documents:
            LOGGER.info(
                f'\nCould not find document type 10K for {filing_metadata["filename"]}'
            )
        # If no 10-K document is found, parse the entire content as HTML or plain text
        doc_10k = BeautifulSoup(content, "lxml")
        is_html = bool(doc_10k.find("td")) and bool(doc_10k.find("tr"))
        if not is_html:
            doc_10k = content

    return self.retrieve_html_tables(doc_10k, is_html)

def extract_items(self, filing_metadata: Dict[str, Any]) -> Any:
"""
Extracts all items/sections for a 10-K file and writes it to a CIK_10K_YEAR.json file (eg. 1384400_10K_2017.json)
Expand Down
Binary file added tests/fixtures/EXTRACTED_TABLES.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/fixtures/FILINGS_METADATA_TABLES_TEST.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
CIK,Company,Type,Date,complete_text_file_link,html_index,Filing Date,Period of Report,SIC,htm_file_link,State of Inc,State location,Fiscal Year End,filename
2488,ADVANCED MICRO DEVICES INC,10-K,2018-02-27,https://www.sec.gov/Archives/edgar/data/2488/0000002488-18-000042.txt,https://www.sec.gov/Archives/edgar/data/2488/0000002488-18-000042-index.html,2018-02-27,2017-12-30,3674,https://www.sec.gov/Archives/edgar/data/2488/000000248818000042/amd-12302017x10k.htm,DE,CA,1227,2488_10K_2017_0000002488-18-000042.htm
2488,ADVANCED MICRO DEVICES INC,10-K,2023-02-27,https://www.sec.gov/Archives/edgar/data/2488/0000002488-23-000047.txt,https://www.sec.gov/Archives/edgar/data/2488/0000002488-23-000047-index.html,2023-02-27,2022-12-31,3674,https://www.sec.gov/Archives/edgar/data/2488/000000248823000047/amd-20221231.htm,DE,CA,1230,2488_10K_2022_0000002488-23-000047.htm
Binary file added tests/fixtures/RAW_FILINGS_TABLE.zip
Binary file not shown.
82 changes: 82 additions & 0 deletions tests/test_extract_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import csv
import zipfile

import numpy as np
import pandas as pd
import unittest
from tqdm import tqdm

from extract_items import ExtractItems


def csv_to_list(csv_file):
    """Read *csv_file* and return its rows as a list of lists of strings."""
    with open(csv_file, newline="") as handle:
        return [row for row in csv.reader(handle)]


def extract_zip(input_zip):
    """Extract *input_zip* into /tmp/edgar_crawler (directories created as needed).

    Args:
        input_zip: Path to the zip archive to unpack.
    """
    # Use a context manager so the archive handle is always closed
    # (the previous version leaked the open ZipFile object).
    with zipfile.ZipFile(input_zip) as zf:
        zf.extractall(path=os.path.join("/tmp", "edgar_crawler"))


class Test(unittest.TestCase):
    def test_extract_tables(self):
        """End-to-end check: extracted tables must match the expected fixtures."""
        # Unpack the raw filings and the expected extraction results.
        for fixture in ("RAW_FILINGS_TABLE.zip", "EXTRACTED_TABLES.zip"):
            extract_zip(os.path.join("tests", "fixtures", fixture))

        metadata_df = pd.read_csv(
            os.path.join("tests", "fixtures", "FILINGS_METADATA_TABLES_TEST.csv"),
            dtype=str,
        )
        metadata_df = metadata_df.replace({np.nan: None})

        extraction = ExtractItems(
            remove_tables=True,
            items_to_extract=[
                "1",
                "1A",
                "1B",
                "2",
                "3",
                "4",
                "5",
                "6",
                "7",
                "7A",
                "8",
                "9",
                "9A",
                "9B",
                "10",
                "11",
                "12",
                "13",
                "14",
                "15",
            ],
            raw_files_folder="/tmp/edgar_crawler/RAW_FILINGS_TABLE",
            extracted_files_folder="",
            skip_extracted_filings=True,
        )

        filings = [row for _, row in metadata_df.iterrows()]
        for filing_metadata in tqdm(filings, unit="filings", ncols=100):
            extracted_dfs = extraction.extract_tables(filing_metadata)

            stem = filing_metadata["filename"].split(".")[0]
            actual_path = os.path.join("/tmp/edgar_crawler", f"{stem}.csv")

            # Dump every extracted table to one CSV, blank-line separated.
            with open(actual_path, "w") as out:
                for df in extracted_dfs:
                    df.to_csv(out, index=False)
                    out.write("\n")

            expected_path = os.path.join(
                "/tmp/edgar_crawler/EXTRACTED_TABLES", f"{stem}.csv"
            )
            actual_rows = csv_to_list(actual_path)
            expected_rows = csv_to_list(expected_path)

            # Compare row by row against the expected fixture output.
            for i, (v1, v2) in enumerate(zip(actual_rows, expected_rows)):
                assert v1 == v2, f"({i}) row in CSV file is different: {v1} != {v2}"