In [1]:
%load_ext lab_black

In [2]:
import selenium
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
import requests
import numpy as np
import time
import re
from tqdm import tqdm
import pickle
import tempfile
from collections import defaultdict

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

import logging

logging.basicConfig(level=logging.INFO, filename="../logs/scrape.log")

In [3]:
companies = pd.read_csv(
    "../data/intermediate/companies_filling_minimal.csv", index_col=0
)
companies["most_recent_filling"] = pd.to_datetime(companies["most_recent_filling"])
companies_more_2008 = companies[companies["most_recent_filling"].dt.year > 2008]

ciks = list(companies_more_2008["CIK"])
len(ciks)

4625

In [4]:
ciks_2021 = companies.loc[companies["most_recent_filling"].dt.year == 2021, "CIK"]
len(ciks_2021)

2075

In [5]:
headers = {"User-Agent": "Anselme F.E. Borgeaud (aborgeaud@gmail.com)"}

In [6]:
def get_date(soup) -> datetime:
    ps = soup.findAll("p")
    bs = soup.findAll("b")
    texts = [
        p.get_text(strip=True).replace(u"\xa0", u" ")
        for p in ps + bs
        if "fiscal year ended" in p.text
    ]
    if len(texts) > 0:
        match = re.findall(r"ended .*[0-9]{4}", texts[0])
        if len(match) == 1:
            date_str = match[0].replace("ended ", "")
            try:
                date = datetime.strptime(date_str, "%B %d, %Y")
                return date
            except:
                return None


def find_value(td_elem) -> float:
    def is_number(s):
        return len(re.findall(r"[0-9]+", s)) > 0

    elem = td_elem.find_next_sibling("td")
    if elem is None:
        return None
    i = 0
    while not is_number(elem.text) and i < 4:
        elem = elem.find_next_sibling("td")
    if is_number(elem.text):
        try:
            return float(elem.text.replace(",", ""))
        except ValueError:
            return None


def find_value_in_table(soup, key: str) -> float:
    key_elem = None
    key_text = key.strip().replace("\n", "_").replace(" ", "_").lower()
    for e in soup.findAll("td"):
        text = e.text.strip().replace("\n", "_").replace(" ", "_").lower()
        if key_text == text:
            key_elem = e
    if key_elem:
        return find_value(key_elem)

In [7]:
def expend_10k_button(driver):
    elem_10k = None
    for elem in driver.find_elements_by_class_name("expandCollapse"):
        parent = elem.find_element_by_xpath("..")
        if "10-K" in parent.text:
            elem_10k = parent
            break
    if elem_10k:
        elem_10k.click()


def click_view_all_10k(driver):
    elem_view10k = driver.find_element_by_xpath(
        '//button[@data-group="annualOrQuarterlyReports"]'
    )
    elem_view10k.click()


def input_search_10k(driver):
    search_elem = driver.find_element_by_xpath('//input[@placeholder="Search table"]')
    # blank space to avoid 10-K/A (amendments)
    search_elem.send_keys("10-K ")

In [8]:
entries = ["total_current_assets", "total_current_liabilities"]


def run(ciks: list):
    driver = webdriver.Chrome()
    time.sleep(8)

    report_urls = []
    data_dict = defaultdict(list)
    data_df = None

    for i, cik in enumerate(tqdm(ciks)):
        if (i > 0) and (i % 30 == 0):
            timestamp = time.time_ns()
            file_report = f"../data/intermediate/report_urls{timestamp}.pickle"
            with open(file_report, "wb") as f:
                pickle.dump(report_urls, f)
            data_df = pd.DataFrame(data_dict)
            data_df.to_csv(f"../data/intermediate/scraped_financials{timestamp}.csv")

        url = f"https://www.sec.gov/edgar/browse/?CIK={cik}"
        try:
            driver.get(url)
        except:
            logging.info(f"{url} Did not get page")
            continue
        time.sleep(4)

        try:
            search_elem = driver.find_element_by_xpath(
                '//input[@placeholder="Search table"]'
            )
            search_elem.send_keys("10-K ")  # blank space to avoid 10-K/A (amendments)
        except:
            logging.info(f"{url} Did not input 10-K")
            try:
                expend_10k_button(driver)
                time.sleep(1.5)
                click_view_all_10k(driver)
                time.sleep(1.5)
                input_search_10k(driver)
            except:
                logging.info(f"{url} Did not expend and input 10-K")
                continue

        link_elems = driver.find_elements_by_class_name("document-link")
        annual_report_elems = [
            e
            for e in link_elems
            if "Annual report" in e.text
            and "right" in e.get_attribute("data-placement")
        ]
        annual_report_pages = []

        for annual_report_elem in annual_report_elems:
            try:
                resp = requests.get(
                    annual_report_elem.get_property("href"), headers=headers, timeout=5
                )
                annual_report_pages.append(resp)
            except:
                href_elem = a.get_property("href")
                logging.info(f"{href_elem} Did not fetch report page")
                pass

        report_urls.extend([a.get_property("href") for a in annual_report_elems])

        for i, page in enumerate(annual_report_pages):
            soup = BeautifulSoup(page.content)

            try:
                date_str = driver.find_elements_by_xpath(
                    '//a[@data-index="reportDate"]'
                )[i].text
                date = datetime.fromisoformat(date_str)
            except:
                logging.info(f"{page.url} Did not read date")
                continue

            tmp_dict = None
            try:
                tmp_dict = dict()
                for entry in entries:
                    tmp_dict[entry] = find_value_in_table(soup, entry)
            except:
                logging.info(f"{page.url} Did not read html")
                continue
            if (tmp_dict.keys() - set(entries)) == set():
                for entry, value in tmp_dict.items():
                    data_dict[entry].append(value)
                data_dict["CIK"].append(cik)
                data_dict["date_filled"].append(date)
                data_dict["url"].append(page.url)

    data_df = pd.DataFrame(data_dict)
    data_df.to_csv("../data/intermediate/scraped_financials.csv")

    with open("../data/intermediate/report_urls.pickle", "wb") as f:
        pickle.dump(report_urls, f)

In [9]:
run(ciks_2021)

 18%|████████████████████                                                                                             | 369/2075 [2:39:08<12:15:46, 25.88s/it]


NameError: name 'a' is not defined

In [None]:
# TEST TEST

url = "https://www.sec.gov/edgar/browse/?CIK=14177"

driver = webdriver.Chrome()
driver.get(url)

In [None]:
expend_10k_button(driver)

In [None]:
click_view_all_10k(driver)

In [None]:
input_search_10k(driver)

In [None]:
link_elems = driver.find_elements_by_class_name("document-link")
annual_report_elems = [e for e in link_elems if 'Annual report' in e.text and 'right' in e.get_attribute('data-placement')]

for a in annual_report_elems:
    print(a.get_property('href'), a.text)

annual_report_elem =  annual_report_elems[0]

In [None]:
page = requests.get(
    annual_report_elem.get_property("href"), headers=headers, timeout=5
)
resp.status_code

In [None]:
soup = BeautifulSoup(page.content)
date_str = driver.find_elements_by_xpath(
    '//a[@data-index="reportDate"]'
)[0].text
date = datetime.fromisoformat(date_str)
date

In [None]:
entries = ["total_current_assets", "total_current_liabilities"]

find_value_in_table(soup, entries[1])

In [None]:
key = entries[0]
key_elem = None
key_text = key.strip().replace("\n", "_").replace(" ", "_").lower()
for e in soup.findAll("td"):
    text = e.text.strip().replace("\n", "_").replace(" ", "_").lower()
    print(text)
    if key_text == text:
        key_elem = e

In [None]:
soup

In [None]:
annual_report_elem.get_property("href")