In [52]:
"""Functions to request from the SEC API.
"""

def get_facts(cik: str, user_agent: str) -> dict:
    """Fetch the XBRL "company facts" JSON document for one company.

    Args:
        cik: Company identifier; must be exactly 10 characters long.
        user_agent: Value sent in the ``User-Agent`` request header.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with an
        HTTP status code of 400 or above.
    """
    assert len(cik) == 10
    endpoint = "".join(
        ("https://data.sec.gov/api/xbrl/companyfacts/", "CIK", cik, ".json")
    )
    response = requests.get(
        endpoint, headers={"User-Agent": user_agent}, timeout=5
    )
    # Error statuses are reported to the caller as None rather than raised.
    if response.status_code >= 400:
        return None
    return response.json()


def get_submissions(cik: str, user_agent: str) -> dict:
    """Fetch the submissions (filing history) JSON document for one company.

    Args:
        cik: Company identifier; must be exactly 10 characters long.
        user_agent: Value sent in the ``User-Agent`` request header.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with an
        HTTP status code of 400 or above.
    """
    assert len(cik) == 10
    endpoint = "".join(
        ("https://data.sec.gov/submissions/", "CIK", cik, ".json")
    )
    response = requests.get(
        endpoint, headers={"User-Agent": user_agent}, timeout=5
    )
    # Error statuses are reported to the caller as None rather than raised.
    if response.status_code >= 400:
        return None
    return response.json()


def get_concepts(cik: str, entry: str, user_agent: str, taxonomy="us-gaap") -> dict:
    """Fetch the history of values for one financial-report entry of a company.

    Args:
        cik: Company identifier; must be exactly 10 characters long.
        entry: Name of the report entry (e.g. ``"Revenues"``).
        user_agent: Value sent in the ``User-Agent`` request header.
        taxonomy: Taxonomy the entry belongs to; defaults to ``"us-gaap"``.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with an
        HTTP status code of 400 or above (same contract as ``get_facts``).
    """
    assert len(cik) == 10
    # The SEC endpoint is named "companyconcept" (singular); the plural form
    # does not exist and would always yield an error status.
    url_concepts = "https://data.sec.gov/api/xbrl/companyconcept/"
    headers = {"User-Agent": user_agent}
    resp = requests.get(
        url_concepts + "CIK" + cik + "/" + taxonomy + "/" + entry + ".json",
        headers=headers,
        timeout=5,
    )
    if resp.status_code >= 400:
        return None
    return resp.json()


def get_frames(
    cik: str,
    entry: str,
    period: str,
    currency: str,
    user_agent: str,
    taxonomy="us-gaap",
) -> dict:
    """Fetch one XBRL "frame": a single entry for a single calendar period.

    Args:
        cik: Company identifier; must be exactly 10 characters long. It is
            validated for backward compatibility but is not part of the
            request URL, because the frames endpoint is not scoped to a
            single company.
        entry: Name of the report entry (e.g. ``"Revenues"``).
        period: Period designator placed after the ``CY`` prefix in the URL.
        currency: Unit-of-measure path segment (e.g. ``"USD"``).
        user_agent: Value sent in the ``User-Agent`` request header.
        taxonomy: Taxonomy the entry belongs to; defaults to ``"us-gaap"``.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with an
        HTTP status code of 400 or above (same contract as ``get_facts``).
    """
    assert len(cik) == 10
    url_frames = "https://data.sec.gov/api/xbrl/frames"
    headers = {"User-Agent": user_agent}
    resp = requests.get(
        # Documented layout: frames/<taxonomy>/<tag>/<unit>/CY<period>.json
        f"{url_frames}/{taxonomy}/{entry}/{currency}/CY{period}.json",
        headers=headers,
        timeout=5,
    )
    if resp.status_code >= 400:
        return None
    return resp.json()


def format_cik(cik: int) -> str:
    """Render an integer CIK as a decimal string zero-padded to 10 characters."""
    return format(cik, "010d")

In [5]:
def is_balance_sheet(df):
    """Heuristically decide whether a parsed table looks like a balance sheet.

    The first column must mention "Assets". Cash, total stockholders' equity
    and either long-term debt or total liabilities must each be mentioned
    somewhere in the first three columns (fewer if the table is narrower).

    Args:
        df: DataFrame holding the parsed table.

    Returns:
        A truthy value when all required labels are found, falsy otherwise.
    """

    def mentions(position, pattern):
        # Cast to the string dtype so that non-text cells can be searched too.
        cells = df.iloc[:, position].astype("string")
        return cells.str.contains(pattern).sum() > 0

    leading = range(min(len(df.columns), 3))
    assets_found = mentions(0, r"Assets")

    patterns = {
        "cash_equivalents": r"[cC]ash.*[eE]quivalent.*",
        "cash": r"[cC]ash",
        "liabilities": r"Total.*liabilities",
        "debt": r"Long-term debt",
        "equity": r"Total stockholder.*equity",
    }
    # Every pattern is checked against every leading column up front.
    found = {
        label: any([mentions(position, pattern) for position in leading])
        for label, pattern in patterns.items()
    }

    cash_found = found["cash_equivalents"] or found["cash"]
    return (
        cash_found
        and found["equity"]
        and assets_found
        and (found["debt"] or found["liabilities"])
    )


def find_index_col(df):
    """Locate the column that holds the row labels of a balance sheet.

    A column qualifies when any of its cells, rendered as text, contains
    "Assets". When several columns qualify, the last one wins.

    Args:
        df: DataFrame holding the parsed table.

    Returns:
        A ``(position, name)`` tuple for the chosen column, or ``(0, "")``
        when no column qualifies.
    """
    hits = [
        (position, name)
        for position, name in enumerate(df.columns)
        if df.loc[:, name].astype("str").str.contains("Assets").any()
    ]
    if not hits:
        return 0, ""
    return hits[-1]


def is_finite(x):
    """Tell whether ``x`` can be read as a finite floating-point number.

    Args:
        x: Any value; typically a table cell (number, string, ``None``...).

    Returns:
        ``True`` when ``float(x)`` succeeds and the result is neither NaN nor
        infinite, ``False`` otherwise.
    """
    try:
        as_float = float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number". A bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        return False
    return bool(np.isfinite(as_float))


def process_balance(df):
    """Reduce a parsed balance-sheet table to one value per labelled row.

    Columns to the left of the label column (as located by
    ``find_index_col``) are dropped, rows whose label cell is not a ``str``
    are discarded, and ``get_value`` is applied to each remaining row.

    Args:
        df: DataFrame holding the parsed table.

    Returns:
        A DataFrame with a single ``"value"`` column indexed by row label.
    """
    start, _ = find_index_col(df)
    cells = df.to_numpy()[:, start:]
    # Only rows whose label is exactly a Python str are kept.
    label_is_text = [type(label) is str for label in cells[:, 0]]
    cells = cells[label_is_text, :]
    values = np.apply_along_axis(get_value, 1, cells)
    return pd.DataFrame({"value": values}, index=cells[:, 0])


def is_balance_sheet_2(table):
    """Heuristically decide from its text whether a table is a balance sheet.

    The lower-cased text must contain "assets", "cash" and "equity", a
    "current ... assets" or "total ... assets" phrase, and a
    "total ... liabilities" or "current ... liabilities" phrase. The phrases
    may span line breaks.

    Args:
        table: Object exposing the table's text through a ``text`` attribute.

    Returns:
        ``True`` when every required term is present, ``False`` otherwise.
    """
    lowered = table.text.lower()

    def mentions(pattern):
        return re.search(pattern, lowered, re.DOTALL) is not None

    assets_phrase = mentions(r"current.*assets") or mentions(r"total.*assets")
    liabilities_phrase = mentions(r"total.*liabilities") or mentions(
        r"current.*liabilities"
    )
    if "assets" not in lowered:
        return False
    if not (assets_phrase and liabilities_phrase):
        return False
    return "cash" in lowered and "equity" in lowered


def get_balance_table_2(soup):
    """Pick the most likely balance-sheet table of a document and parse it.

    All ``<table>`` elements accepted by ``is_balance_sheet_2`` are
    candidates. The one with the greatest ``len()`` is parsed with pandas,
    and the first candidate wins on ties.

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object).

    Returns:
        The parsed table as a DataFrame, or ``None`` when there is no
        candidate table or pandas does not yield exactly one DataFrame.
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    candidates = [t for t in soup.find_all("table") if is_balance_sheet_2(t)]
    if not candidates:
        return None
    # NOTE(review): len() of a BeautifulSoup tag presumably counts its direct
    # children rather than text length; confirm this is the intended measure.
    biggest = max(candidates, key=len)
    # Literal HTML strings passed to read_html are deprecated in recent
    # pandas, so the markup is wrapped in a file-like object.
    frames = pd.read_html(StringIO(biggest.prettify()), flavor="bs4")
    if len(frames) == 1:
        return frames[0]
    return None


def get_value(x, scale=1000):
    """Return the first finite number found in a row, multiplied by ``scale``.

    Args:
        x: Iterable of cells (e.g. one row of a table).
        scale: Multiplier applied to the number found. Defaults to 1000,
            the factor that was previously hard-coded.
            NOTE(review): presumably because figures are reported in
            thousands; confirm against the source tables.

    Returns:
        The scaled value as a float, or ``None`` when no cell of the row
        holds a finite number.
    """
    finite_cells = [cell for cell in x if is_finite(cell)]
    # An empty result is an expected case, so it is handled explicitly
    # instead of through a catch-all exception handler.
    if not finite_cells:
        return None
    return float(finite_cells[0]) * scale


def get_date(soup):
    """Extract the fiscal-year end date mentioned in a document.

    Every ``<p>`` and ``<b>`` element whose text contains
    "fiscal year ended" is collected, but only the first one is examined.
    Its text must contain exactly one "ended ... <4 digits>" match, and that
    match must parse as a date such as "December 31, 2020".

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object).

    Returns:
        The parsed ``datetime``, or ``None`` when no element matches, the
        match is ambiguous, or the text is not a valid date.
    """
    candidates = [
        # Non-breaking spaces are normalised so the date format can match.
        node.get_text(strip=True).replace("\xa0", " ")
        for node in soup.find_all("p") + soup.find_all("b")
        if "fiscal year ended" in node.text
    ]
    if not candidates:
        return None

    found = re.findall(r"ended .*[0-9]{4}", candidates[0])
    if len(found) != 1:
        return None

    date_str = found[0].replace("ended ", "")
    try:
        return datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # strptime signals an unparsable date with ValueError. Anything else
        # is a genuine error and should surface.
        return None


def get_balance_table(soup):
    """Find and parse the table that follows a "consolidated balance sheet" heading.

    The first ``<b>`` element whose text contains "consolidated balance
    sheet" (case-insensitive) is taken as the heading. Starting from its
    enclosing ``<div>``, up to four following sibling ``<div>`` elements are
    searched for a ``<table>``.

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object).

    Returns:
        The first DataFrame pandas parses from the table, or ``None`` when
        the heading, its enclosing ``<div>`` or a table cannot be found, or
        pandas cannot parse the table.
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    headings = [
        b for b in soup.find_all("b") if "consolidated balance sheet" in b.text.lower()
    ]
    if not headings:
        return None

    container = headings[0].find_parent("div")
    table = None
    for _ in range(4):
        if container is None:
            # No enclosing div, or the siblings ran out.
            break
        container = container.find_next_sibling("div")
        if container is None:
            break
        table = container.find("table")
        if table is not None:
            break
    if table is None:
        return None

    try:
        # Literal HTML strings passed to read_html are deprecated in recent
        # pandas, so the markup is wrapped in a file-like object.
        return pd.read_html(StringIO(table.prettify()), flavor="bs4")[0]
    except ValueError:
        # pandas raises ValueError when it finds no parsable table.
        return None


def find_value(row_elem):
    """Read the number stored in the cells that follow a label cell.

    Starting with the ``<td>`` right after ``row_elem``, cells are inspected
    until one contains a digit. At most four cells are skipped, so at most
    five are inspected. Thousands separators (",") are removed before
    conversion.

    Args:
        row_elem: The label cell (BeautifulSoup-style element).

    Returns:
        The value as a float, or ``None`` when no following cell within
        reach contains a digit or the cell's text is not a plain number.
    """

    def is_number(s):
        return re.search(r"[0-9]+", s) is not None

    max_skipped = 4
    elem = row_elem.find_next_sibling("td")
    skipped = 0
    # Stop when the siblings run out (elem is None) or the skip budget is
    # used up. The counter must advance for the cap to take effect.
    while elem is not None and not is_number(elem.text) and skipped < max_skipped:
        elem = elem.find_next_sibling("td")
        skipped += 1

    if elem is None or not is_number(elem.text):
        return None
    try:
        return float(elem.text.replace(",", ""))
    except ValueError:
        return None


def find_value_in_table(soup, key: str) -> float:
    """Look up the number that follows a labelled cell in a document.

    Labels are compared after stripping surrounding whitespace, replacing
    line breaks and spaces with underscores, and lower-casing. When several
    ``<td>`` cells carry the label, the last one is used.

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object).
        key: Label to search for.

    Returns:
        The result of ``find_value`` for the matching cell, or ``None`` when
        no cell carries the label.
    """

    def normalise(label):
        return label.strip().replace("\n", "_").replace(" ", "_").lower()

    wanted = normalise(key)
    matching = [cell for cell in soup.findAll("td") if wanted == normalise(cell.text)]
    target = matching[-1] if matching else None
    if not target:
        return None
    return find_value(target)