In [1]:
# Load the lab_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats executed cells with Black — confirm it is installed.
%load_ext lab_black

In [2]:
import logging
import pickle
import re
import time
from datetime import datetime
from io import StringIO
from time import sleep

import numpy as np
import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select
from tqdm import tqdm

# Route INFO-and-above log records to a file; the scraping loop below reports
# per-URL outcomes through logging.info.
# NOTE(review): the ../logs directory presumably has to exist beforehand — confirm.
logging.basicConfig(level=logging.INFO, filename="../logs/scrape.log")

In [3]:
# Load the per-company filing summary; the first CSV column becomes the index.
companies = pd.read_csv(
    "../data/intermediate/companies_filling_minimal.csv", index_col=0
)
# Parse the most-recent-filing column to datetimes so the .dt accessor works below.
companies["most_recent_filling"] = pd.to_datetime(companies["most_recent_filling"])
# Subsets whose most recent filing year is strictly after 2006 / strictly after 2008.
companies_more_2006 = companies[companies["most_recent_filling"].dt.year > 2006]
companies_more_2008 = companies[companies["most_recent_filling"].dt.year > 2008]

# CIK values of the post-2008 subset; the scraping loop iterates over this list.
ciks = list(companies_more_2008["CIK"])
len(ciks)

4625

In [4]:
# Headers passed to every requests.get call in the scraping loop.
# NOTE(review): presumably SEC EDGAR expects a contact-identifying User-Agent — confirm.
headers = {"User-Agent": "Anselme F.E. Borgeaud (aborgeaud@gmail.com)"}

# Start a Chrome session through Selenium; this module-level driver is reused
# by the scraping loop.
driver = webdriver.Chrome()

In [5]:
def is_balance_sheet(df):
    """Heuristically decide whether a parsed table looks like a balance sheet.

    The first column must mention "Assets". In addition, among the first
    (up to) three columns there must be a cash line, a total stockholders'
    equity line, and either a long-term debt line or a total liabilities line.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; every inspected column is cast to the pandas
        "string" dtype before matching.

    Returns
    -------
    bool-like
        Truthy when all the required line items were found.
    """

    def column_has(position, pattern):
        # True when at least one cell of the column matches the regex.
        cells = df.iloc[:, position].astype("string")
        return cells.str.contains(pattern).sum() > 0

    def leading_columns_have(pattern):
        # Eagerly check each of the first three columns (fewer if the table is narrower).
        inspected = range(min(len(df.columns), 3))
        return any([column_has(position, pattern) for position in inspected])

    has_assets = column_has(0, r"Assets")

    cash_equivalents_found = leading_columns_have(r"[cC]ash.*[eE]quivalent.*")
    cash_found = leading_columns_have(r"[cC]ash")
    has_liab = leading_columns_have(r"Total.*liabilities")
    has_debt = leading_columns_have(r"Long-term debt")
    has_equity = leading_columns_have(r"Total stockholder.*equity")

    has_cash = cash_equivalents_found or cash_found

    return has_cash and has_equity and has_assets and (has_debt or has_liab)


def find_index_col(df):
    """Locate the column that holds the balance-sheet row labels.

    Every column is scanned for a cell containing "Assets" (after casting to
    str). When several columns qualify, the right-most one wins.

    Returns
    -------
    tuple
        ``(position, label)`` of the chosen column, or ``(0, "")`` when no
        column mentions "Assets".
    """
    hits = [
        (position, label)
        for position, label in enumerate(df.columns)
        if df.loc[:, label].astype("str").str.contains("Assets").any()
    ]
    # The scan deliberately covers all columns; the last match is the answer.
    return hits[-1] if hits else (0, "")


def is_finite(x):
    """Return True when ``x`` can be converted to a finite float.

    Parameters
    ----------
    x : object
        Any value; typically a table cell (str, number, None, NaN).

    Returns
    -------
    bool
        False for values that ``float()`` rejects, for NaN and for +/-inf.
    """
    try:
        f = float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt during a long scrape) must propagate.
        return False
    return bool(np.isfinite(f))


def process_balance(df):
    """Reduce a raw balance-sheet table to one numeric value per labelled row.

    Columns to the left of the label column (as found by ``find_index_col``)
    are dropped, rows whose label cell is not a ``str`` are discarded, and
    ``get_value`` is applied to each remaining row.

    Returns
    -------
    pandas.DataFrame
        Single column ``"value"`` indexed by the row labels.
    """
    label_position, _ = find_index_col(df)
    cells = df.to_numpy()[:, label_position:]

    # Keep only rows that actually carry a textual label in the first column.
    labelled = [type(cell) is str for cell in cells[:, 0]]
    cells = cells[labelled, :]

    row_labels = cells[:, 0]
    row_values = np.apply_along_axis(get_value, 1, cells)
    return pd.DataFrame({"value": row_values}, index=row_labels)


def is_balance_sheet_2(table):
    """Text-based check for whether an HTML table looks like a balance sheet.

    Only ``table.text`` is read. The lower-cased text must contain
    "current ... assets" or "total ... assets", "total ... liabilities" or
    "current ... liabilities" (the gap may span newlines), and the words
    "assets", "cash" and "equity".

    Returns
    -------
    bool
    """
    lowered = table.text.lower()

    def matches_any(*patterns):
        # re.DOTALL lets ".*" bridge line breaks between the two words.
        return any(
            re.search(pattern, lowered, re.DOTALL) is not None
            for pattern in patterns
        )

    asset_totals_present = matches_any(r"current.*assets", r"total.*assets")
    liability_totals_present = matches_any(
        r"total.*liabilities", r"current.*liabilities"
    )
    keywords_present = all(word in lowered for word in ("assets", "cash", "equity"))

    return asset_totals_present and liability_totals_present and keywords_present


def get_balance_table_2(soup):
    """Extract the most likely balance-sheet table from a parsed filing.

    All ``<table>`` elements accepted by ``is_balance_sheet_2`` are
    candidates; the one with the largest ``len()`` is parsed with pandas.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the annual report.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when no candidate exists or when the chosen
        element does not parse to exactly one DataFrame.
    """
    balance_tables = [t for t in soup.findAll("table") if is_balance_sheet_2(t)]
    if not balance_tables:
        return None

    # max() returns the first maximal element, so ties go to the earliest table.
    # NOTE(review): len() of a bs4 Tag presumably counts its direct children —
    # confirm that this is the intended notion of "longest".
    balance_table = max(balance_tables, key=len)

    # pandas deprecated passing literal HTML strings to read_html; a file-like
    # wrapper is accepted by both old and new versions.
    dfs = pd.read_html(StringIO(balance_table.prettify()), flavor="bs4")
    if len(dfs) == 1:
        return dfs[0]
    # Several frames came back, so the result is ambiguous.
    return None


def get_value(x, scale=1000):
    """Return the first finite numeric entry of a row, multiplied by ``scale``.

    Parameters
    ----------
    x : iterable
        Cells of one table row (label cell included; non-numeric cells are
        skipped via ``is_finite``).
    scale : int or float, optional
        Multiplier applied to the value. Defaults to 1000, the factor that was
        previously hard-coded.
        NOTE(review): presumably figures are reported in thousands — confirm.

    Returns
    -------
    float or None
        None when the row contains no finite number.
    """
    for cell in x:
        if is_finite(cell):
            return float(cell) * scale
    # An explicit "nothing numeric in this row" result replaces the old
    # catch-all exception handler.
    return None


def get_date(soup):
    """Extract the fiscal-year-end date stated in a filing's text.

    The first ``<p>`` or ``<b>`` element (paragraphs first) whose text contains
    "fiscal year ended" is inspected; the text after "ended " must be in the
    form "Month D, YYYY".

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the annual report.

    Returns
    -------
    datetime.datetime or None
        None when no such element exists, when the pattern does not match
        exactly once, or when the matched text is not a valid date.
    """
    candidates = soup.findAll("p") + soup.findAll("b")
    texts = [
        # Non-breaking spaces would otherwise defeat both the regex and strptime.
        element.get_text(strip=True).replace("\xa0", " ")
        for element in candidates
        if "fiscal year ended" in element.text
    ]
    if not texts:
        return None

    match = re.findall(r"ended .*[0-9]{4}", texts[0])
    if len(match) != 1:
        return None

    date_str = match[0].replace("ended ", "")
    try:
        return datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # strptime signals an unparseable date with ValueError only; other
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return None


def get_balance_table(soup):
    """Find the table that follows a "consolidated balance sheet" heading.

    Starting from the ``<div>`` enclosing the first bold element whose text
    contains "consolidated balance sheet" (case-insensitive), up to four
    following sibling ``<div>`` elements are searched for a ``<table>``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the annual report.

    Returns
    -------
    pandas.DataFrame or None
        The first DataFrame parsed from the table, or None when the heading,
        its enclosing div, a following table, or a parseable result is missing.
    """
    headings = [
        b for b in soup.findAll("b") if "consolidated balance sheet" in b.text.lower()
    ]
    if not headings:
        # Previously this raised IndexError instead of reporting "not found".
        return None

    parent = headings[0].find_parent("div")
    table = None
    attempts = 0
    while table is None and parent is not None and attempts < 4:
        parent = parent.find_next_sibling("div")
        if parent is None:
            # Ran out of sibling divs before finding a table.
            break
        table = parent.find("table")
        attempts += 1

    if table is None:
        return None

    try:
        # A file-like wrapper avoids the deprecated literal-HTML input path.
        return pd.read_html(StringIO(table.prettify()), flavor="bs4")[0]
    except Exception:
        # Best effort: any parsing problem means "no usable table".
        return None

In [None]:
# Scrape the annual-report (10-K) balance sheet of every company in `ciks`.
#
# For each CIK: open the EDGAR browse page, filter the filing table to 10-K,
# download every "Annual report" document, extract and normalise its balance
# sheet, and collect one single-row DataFrame per report in `dfs_parsed`.
# URLs of all annual-report links are collected in `financial_report_urls`.
# Failures are logged and skipped so one bad page cannot stop the whole run.
dfs_parsed = []
financial_report_urls = []

for i, cik in enumerate(tqdm(ciks)):
    # Checkpoint every 30 companies (skipping the empty state at i == 0) so a
    # crash late in the multi-hour run does not lose everything.
    if i > 0 and i % 30 == 0:
        file = f"../data/intermediate/df_parsed{time.time_ns()}.pickle"
        file_report = f"../data/intermediate/report_urls{time.time_ns()}.pickle"
        with open(file, "wb") as f:
            pickle.dump(dfs_parsed, f)
        with open(file_report, "wb") as f:
            pickle.dump(financial_report_urls, f)

    url = f"https://www.sec.gov/edgar/browse/?CIK={cik}"
    try:
        driver.get(url)
    except Exception:
        # `except Exception` (not bare) keeps Ctrl-C able to stop the run.
        logging.info(f"{url} Did not load company page")
        continue

    # Give the page time to render and stay polite to the server.
    time.sleep(4)

    try:
        search_elem = driver.find_element(
            By.XPATH, '//input[@placeholder="Search table"]'
        )
        search_elem.send_keys("10-K ")  # blank space to avoid 10-K/A (amendments)

        link_elems = driver.find_elements(By.CLASS_NAME, "document-link")
        annual_report_elems = [e for e in link_elems if "Annual report" in e.text]
    except Exception:
        logging.info(f"{url} Did not find filing table")
        continue

    annual_report_pages = []
    # Position of each fetched page within `annual_report_elems`, so that a
    # failed download does not shift the page/date pairing below.
    page_positions = []
    report_hrefs = []

    for position, annual_report_elem in enumerate(annual_report_elems):
        href_elem = None
        try:
            href_elem = annual_report_elem.get_property("href")
            report_hrefs.append(href_elem)
            resp = requests.get(href_elem, headers=headers, timeout=4)
        except Exception:
            logging.info(f"{href_elem} Did not fetch report page")
            continue
        annual_report_pages.append(resp)
        page_positions.append(position)

    financial_report_urls.extend(report_hrefs)

    # The report-date cells do not change while the pages are processed, so
    # look them up once per company.
    # NOTE(review): this assumes the n-th reportDate cell belongs to the n-th
    # "Annual report" link — confirm against the EDGAR page layout.
    try:
        date_elems = driver.find_elements(By.XPATH, '//a[@data-index="reportDate"]')
    except Exception:
        date_elems = []

    for position, page in zip(page_positions, annual_report_pages):
        try:
            date_str = date_elems[position].text
            date = datetime.fromisoformat(date_str)
        except Exception:
            logging.info(f"{page.url} Did not read date")
            continue

        try:
            soup = BeautifulSoup(page.content)
            balance_sheet = get_balance_table_2(soup)
            if balance_sheet is None:
                logging.info(f"{page.url} Did not read html")
                continue
            balance_sheet = process_balance(balance_sheet)
        except Exception:
            logging.info(f"{page.url} Did not read html")
            continue

        try:
            add_info = pd.DataFrame(
                data={"value": [date, cik, page.url]},
                index=["date_filled", "CIK", "url"],
            )
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            balance_sheet = pd.concat([balance_sheet, add_info])
            balance_sheet = balance_sheet.transpose().reset_index(drop=True)
        except Exception:
            # Skip the report: a frame without date/CIK/url must not be stored.
            logging.info(f"{page.url} Did not make final dataframe")
            continue

        logging.info(f"{page.url} added")
        dfs_parsed.append(balance_sheet)


# Final snapshot of everything collected.
with open("../data/intermediate/df_parsed_2.pickle", "wb") as f:
    pickle.dump(dfs_parsed, f)

with open("../data/intermediate/report_urls.pickle", "wb") as f:
    pickle.dump(financial_report_urls, f)

 50%|████████████████████████████████████████████████████████▍                                                        | 2310/4625 [3:18:46<2:46:41,  4.32s/it]