In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
from pandas import json_normalize
import requests
import json
from collections import defaultdict
import time

In [12]:
"""Functions to request from the SEC API.
"""


def get_facts(cik: str, user_agent: str) -> dict:
    """A few quarters of financial reports."""
    assert len(cik) == 10
    url_facts = "https://data.sec.gov/api/xbrl/companyfacts/"
    headers = {"User-Agent": user_agent}
    try:
        resp = requests.get(
            url_facts + "CIK" + cik + ".json", headers=headers, timeout=5
        )
    except TimeoutError:
        return None
    resp_json = None
    if resp.status_code < 400:
        resp_json = resp.json()
    return resp_json


def get_submissions(cik: str, user_agent: str) -> dict:
    """History of submissions of financial report fillings."""
    assert len(cik) == 10
    url_submissions = "https://data.sec.gov/submissions/"
    headers = {"User-Agent": user_agent}
    resp = requests.get(
        url_submissions + "CIK" + cik + ".json", headers=headers, timeout=5
    )
    resp_json = None
    if resp.status_code < 400:
        resp_json = resp.json()
    return resp_json


def get_concepts(cik: str, entry: str, user_agent: str, taxonomy="us-gaap") -> dict:
    """hisotry of values for a specific financial report entry (e.g. Revenues)
    for a specific company.
    """
    assert len(cik) == 10
    url_concepts = "https://data.sec.gov/api/xbrl/companyconcept/"
    headers = {"User-Agent": user_agent}
    try:
        resp = requests.get(
            url_concepts + "CIK" + cik + "/" + taxonomy + "/" + entry + ".json",
            headers=headers,
            timeout=5,
        )
    except:
        return None
    #     print(resp.content)
    #     resp_json = None
    #     if resp.status_code < 400:
    #         resp_json = resp.json()
    return resp


def get_frames(
    entry: str,
    period: str,
    currency: str,
    user_agent: str,
    taxonomy="us-gaap",
) -> dict:
    """"""
    assert len(cik) == 10
    url_frames = "https://data.sec.gov/api/xbrl/frames"
    headers = {"User-Agent": user_agent}
    try:
        resp = requests.get(
            f"{url_frames}/{taxonomy}/{entry}/{currency}/CY{period}.json",
            headers=headers,
            timeout=5,
        )
    except:
        return None
    resp_json = None
    if resp.status_code < 400:
        resp_json = resp.json()
    return resp_json


def format_cik(cik: int) -> str:
    return f"{cik:010d}"

In [4]:
companies = pd.read_csv(
    "../data/intermediate/companies_filling_minimal.csv", index_col=0
)
companies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6409 entries, 0 to 3149
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company Name         6409 non-null   object
 1   CIK                  6409 non-null   int64 
 2   most_recent_filling  6409 non-null   object
dtypes: int64(1), object(2)
memory usage: 200.3+ KB


In [5]:
# gaap

keys = [
    "ResearchAndDevelopmentExpense",
    "CommonStockDividendsPerShareDeclared",
    "Dividends",
    "EarningsPerShareBasic",
    "EarningsPerShareDiluted",
    "NumberOfStores",
    "Revenues",
    "Cash",
    "Assets",
    "AssetsCurrent",
    "LiabilitiesCurrent",
    "LongTermDebt",
    "StockholdersEquity",
]

data_dict = defaultdict(list)
for k in ["CIK", "Company Name", "period"] + keys:
    data_dict[k].append(-12345)

# later?
# MarketingExpense = []
# AdvertisingExpense = []

data_dict

defaultdict(list,
            {'CIK': [-12345],
             'Company Name': [-12345],
             'period': [-12345],
             'ResearchAndDevelopmentExpense': [-12345],
             'CommonStockDividendsPerShareDeclared': [-12345],
             'Dividends': [-12345],
             'EarningsPerShareBasic': [-12345],
             'EarningsPerShareDiluted': [-12345],
             'NumberOfStores': [-12345],
             'Revenues': [-12345],
             'Cash': [-12345],
             'Assets': [-12345],
             'AssetsCurrent': [-12345],
             'LiabilitiesCurrent': [-12345],
             'LongTermDebt': [-12345],
             'StockholdersEquity': [-12345]})

In [6]:
user_agent = "Anselme F.E. Borgeaud (aborgeaud@gmail.com)"

In [140]:
companies["most_recent_filling"] = pd.to_datetime(companies["most_recent_filling"])
cik_int = companies[companies["most_recent_filling"].dt.year == 2021].iloc[1]["CIK"]
cik = format_cik(cik_int)
entry = "AccountsPayableCurrent"
concept_json = get_concepts(cik, entry, user_agent, "us-gaap")
facts_json = get_facts(cik, user_agent)


fact_json = facts_json.json()
fact_json.keys()

dict_keys(['cik', 'entityName', 'facts'])

In [13]:
cik = format_cik(10795)
fact_json = get_facts(cik, user_agent)

In [15]:
print(fact_json["entityName"])
for k in fact_json["facts"]["us-gaap"].keys():
    if "Assets" in k:
        print(k)

BECTON, DICKINSON AND COMPANY
AmortizationOfIntangibleAssets
Assets
AssetsCurrent
AssetsFairValueDisclosure
AssetsHeldForSaleAtCarryingValue
AssetsHeldForSaleCurrent
AssetsOfDisposalGroupIncludingDiscontinuedOperation
BusinessAcquisitionPurchasePriceAllocationAssetsAcquired
BusinessAcquisitionPurchasePriceAllocationIntangibleAssetsNotAmortizable
BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedContingentLiability
BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedNet
DeferredTaxAssetsGross
DeferredTaxAssetsNet
DeferredTaxAssetsNetCurrent
DeferredTaxAssetsNetNoncurrent
DeferredTaxAssetsOther
DeferredTaxAssetsOtherTaxCarryforwards
DeferredTaxAssetsTaxDeferredExpenseCompensationAndBenefits
DeferredTaxAssetsTaxDeferredExpenseReservesAndAccrualsReserves
DeferredTaxAssetsValuationAllowance
DeferredTaxLiabilitiesOtherFiniteLivedAssets
DefinedBenefitPlanActualReturnOnPlanAssetsStillHeld
DefinedBenefitPlanFairValueOfPlanAssets
DefinedBenefitPl

In [16]:
entry = "AssetsCurrent"
fact_json["facts"]["us-gaap"][entry]

{'label': 'Assets, Current',
 'description': 'Sum of the carrying amounts as of the balance sheet date of all assets that are expected to be realized in cash, sold, or consumed within one year (or the normal operating cycle, if longer). Assets are probable future economic benefits obtained or controlled by an entity as a result of past transactions or events.',
 'units': {'USD': [{'end': '2008-09-30',
    'val': 3614675000,
    'accn': '0000950123-09-032660',
    'fy': 2009,
    'fp': 'Q3',
    'form': '10-Q',
    'filed': '2009-08-10'},
   {'end': '2008-09-30',
    'val': 3614675000,
    'accn': '0000950123-09-066233',
    'fy': 2009,
    'fp': 'FY',
    'form': '10-K',
    'filed': '2009-11-25'},
   {'end': '2008-09-30',
    'val': 3614675000,
    'accn': '0000950123-10-004150',
    'fy': 2009,
    'fp': 'FY',
    'form': '10-K/A',
    'filed': '2010-01-21',
    'frame': 'CY2008Q3I'},
   {'end': '2009-06-30',
    'val': 4491537000,
    'accn': '0000950123-09-032660',
    'fy': 2009,


In [21]:
fact_json["facts"]["us-gaap"].keys()

dict_keys(['AccountsNotesAndLoansReceivableNetCurrent', 'AccountsPayableAndAccruedLiabilitiesCurrent', 'AccountsPayableCurrent', 'AccountsReceivableNetCurrent', 'AccruedIncomeTaxesCurrent', 'AccruedLiabilitiesCurrent', 'AccumulatedDepreciationDepletionAndAmortizationPropertyPlantAndEquipment', 'AccumulatedOtherComprehensiveIncomeLossAvailableForSaleSecuritiesAdjustmentNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossCumulativeChangesInNetGainLossFromCashFlowHedgesEffectNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossDefinedBenefitPensionAndOtherPostretirementPlansNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossForeignCurrencyTranslationAdjustmentNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossNetOfTax', 'AdditionalPaidInCapitalCommonStock', 'AllocatedShareBasedCompensationExpense', 'AmortizationOfIntangibleAssets', 'AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount', 'AssetImpairmentCharges', 'Assets', 'AssetsCurrent', 'AssetsFairValueDisclosure', 'Assets

In [124]:
entry = keys[8]
print(entry)
cik_int = companies["CIK"].iloc[0]
cik = format_cik(cik_int)
print(cik)
concept_json = get_concepts(cik, entry, user_agent, "us-gaap")
facts_json = get_facts(cik, user_agent)
concept_json.content
facts_json.content

Assets
0001050122


b'<?xml version="1.0" encoding="UTF-8"?>\n<Error><Code>NoSuchKey</Code><Message>The specified key does not exist.</Message><Key>api/xbrl/companyfacts/CIK0001050122.json</Key><RequestId>XCTMCTRRF5S35KQ0</RequestId><HostId>FnOkyLETmRaPft059hidMuJBPeCcxc431D5/W77c4P1nW1Px5xzIh6tLn1Zu7vaV+khZdEuk1wk=</HostId></Error>'

In [87]:
subm_json = get_submissions(cik, user_agent)
facts_json = get_facts(cik, user_agent)
facts_json = get_facts(cik, user_agent)

b'<?xml version="1.0" encoding="UTF-8"?>\n<Error><Code>NoSuchKey</Code><Message>The specified key does not exist.</Message><Key>api/xbrl/companyfacts/CIK0001050122.json</Key><RequestId>4NMJHH5Z3V4J9FBH</RequestId><HostId>NBH51kDDJRyGkBk0NhVupAoTySQYZ+4zIHfSSHVCMq5+BPlGJIO8s+b6vVSMJxhs98U9ZgmEKh8=</HostId></Error>'

In [156]:
# test
entry = "Revenues"
period = periods[0]
# cik_int = companies["CIK"].iloc[0]
# cik = format_cik(cik_int)
frame_json = get_frames(entry, period, "USD", user_agent, "us-gaap")
frame_json.keys()

dict_keys(['taxonomy', 'tag', 'ccp', 'uom', 'label', 'description', 'pts', 'data'])

In [60]:
frame_json["ccp"]

'CY2005'

In [157]:
frame_json["pts"]

2

In [56]:
frame_json["data"][0]["entityName"]

'QUEST PATENT RESEARCH CORP'

In [52]:
[frame_json["data"][0]["val"], frame_json["description"]]

[86300,
 "The aggregate costs incurred (1) in a planned search or critical investigation aimed at discovery of new knowledge with the hope that such knowledge will be useful in developing a new product or service, a new process or technique, or in bringing about a significant improvement to an existing product or process; or (2) to translate research findings or other knowledge into a plan or design for a new product or process or for a significant improvement to an existing product or process whether intended for sale or the entity's use, during the reporting period charged to research and development projects, including the costs of developing computer software up to the point in time of achieving technological feasibility, and costs allocated in accounting for a business combination to in-process projects deemed to have no alternative future use."]

In [47]:
for entry in keys:
    for period in periods:
        for cik_int in companies['CIK']:
            cik = format_cik(cik_int)
            frame_json = get_frames(entry, period, 'USD', user_agent, 'us-gaap')
            try:
                datum = [frame_json["data"][0]["val"], frame_json["description"]]
                company_name = frame_json["data"][0]["entityName"]
            except:
                datum = None
            data_dict['cik'].append(cik)
            data_dict['Company Name'].append(company_name)
            data_dict['period'].append(period)
            data_dict[entry].append(datum)
            print(data_dict)
            time.sleep(10)

TypeError: get_frames() takes from 4 to 5 positional arguments but 6 were given

In [6]:
df = pd.read_json('../data/raw/companyfacts/CIK0000001750.json')

In [4]:
df = df.explode('facts')

In [11]:
df.loc['us-gaap', 'facts']['Assets']

{'label': 'Assets',
 'description': 'Sum of the carrying amounts as of the balance sheet date of all assets that are recognized. Assets are probable future economic benefits obtained or controlled by an entity as a result of past transactions or events.',
 'units': {'USD': [{'end': '2010-05-31',
    'val': 1501042000,
    'accn': '0001104659-10-049632',
    'fy': 2011,
    'fp': 'Q1',
    'form': '10-Q',
    'filed': '2010-09-23'},
   {'end': '2010-05-31',
    'val': 1501042000,
    'accn': '0001104659-10-063683',
    'fy': 2011,
    'fp': 'Q2',
    'form': '10-Q',
    'filed': '2010-12-21'},
   {'end': '2010-05-31',
    'val': 1501042000,
    'accn': '0001104659-11-015691',
    'fy': 2011,
    'fp': 'Q3',
    'form': '10-Q',
    'filed': '2011-03-22'},
   {'end': '2010-05-31',
    'val': 1500181000,
    'accn': '0001047469-11-006302',
    'fy': 2011,
    'fp': 'FY',
    'form': '10-K',
    'filed': '2011-07-13'},
   {'end': '2010-05-31',
    'val': 1500181000,
    'accn': '0001047469-