/
filings.py
66 lines (52 loc) · 2.21 KB
/
filings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from pathlib import Path
import re
from lxml import html
from pickler import pickler
ticker_symbol = "TSLA"
default_dir = "sec-edgar-filings"
output_file = "source.html"
def download_if_needed(dir: Path):
if not dir.exists():
from sec_edgar_downloader import Downloader
dl = Downloader()
dl.get("10-Q", ticker_symbol, after="2019-01-01")
def clean_sgml_junk(dir: Path) -> list[str]:
"""Clean up an EDGAR filing by brute-force extracting all the text nodes of a reasonable length"""
output: list[str] = []
punctuation: list[str] = []
for filing_dir in sorted(dir.rglob("*")):
filing = filing_dir / "full-submission.txt"
if filing.exists():
report = filing.open().read().replace("\n", "")
p = r"<html>(.*?)</html>"
if blob := re.search(p, report, re.MULTILINE):
html_blob = blob.group(1)
parsed = html.fromstring(html_blob)
el_with_text = parsed.xpath("//*[text()!='']")
for el in el_with_text:
text = el.text
if text and text.strip():
picklered, punct = pickler([text.strip()])
punctuation.append(re.sub(r"\s+", "", "".join(punct)))
el.text = "".join(picklered)
output.append(
html.tostring(parsed, encoding="unicode", pretty_print=True)
)
# Sort the punctuation such that it's symmetric-ish
# punctuation = sorted("".join(punctuation))
final_punct: list[str] = []
# Split every 70 characters
split: list[str] = re.findall(".{1,70}", "".join(punctuation))
for group in split:
g = sorted(group)
left = [l for i, l in enumerate(g) if i % 2]
right = reversed([l for i, l in enumerate(g) if not i % 2])
final_punct.append("".join(left) + "".join(right))
p = "\n".join(final_punct)
output.append(f"<pre>{p}</pre>")
return output
if __name__ == "__main__":
dir = Path(default_dir)
download_if_needed(dir)
Path(output_file).open("w").writelines(clean_sgml_junk(dir))
print("Now run:\n cat source.html | w3m -dump -T text/html -cols 100 > output.txt")