<a href="https://colab.research.google.com/github/patrickongwong/ColabProjects/blob/master/2020/EdgarParse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Parse XBRL financial data from SEC EDGAR filings with the `edgar` package.*

In [38]:
!pip install edgar
from edgar import Company, TXTML, XBRL, XBRLElement




In [64]:
# Extend the edgar Company class with 10-Q data-file retrieval, and add XBRL helper classes.
from edgar.company import BASE_URL
from typing import Dict, List
import re
from datetime import datetime
from lxml import etree
from edgar import Company, TXTML, XBRL, XBRLElement
class PatrickCompany(Company):
  """``Company`` extended with retrieval of data files attached to 10-Q filings."""

  def get_data_files_from_10Q(comp, document_type, no_of_documents=1, isxml=False):
    """Download the files of one type attached to the company's 10-Q filings.

    Args:
      comp: the company instance (plays the role of ``self``).
      document_type: substring looked for in the text of the fourth cell of
        each row of a filing page's second ``tableFile`` table.
      no_of_documents: how many of the groups returned by
        ``_group_document_type`` are visited.
      isxml: forwarded to ``Company.get_request`` when fetching a match.

    Returns:
      A list with the result of ``Company.get_request`` for every matching
      row, in the order the rows were encountered.
    """
    tree = comp.get_all_filings(filing_type="10-Q")
    url_groups = comp._group_document_type(tree, "10-Q")[:no_of_documents]
    result = []
    for url_group in url_groups:
      for url in url_group:
        url = BASE_URL + url
        comp._document_urls.append(url)
        content_page = Company.get_request(url)
        tables = content_page.find_class("tableFile")
        # Only the second "tableFile" table is inspected; skip pages without it.
        if len(tables) < 2:
          continue
        for row in tables[1]:
          cells = list(row)
          # Rows without a fourth cell cannot carry a document type.
          if len(cells) < 4:
            continue
          # An empty cell has ``text`` set to None; treat it as "no match"
          # instead of failing on ``in None``.
          cell_text = cells[3].text or ""
          if document_type not in cell_text:
            continue
          # The third cell holds the link to the file itself.
          href = BASE_URL + cells[2][0].attrib["href"]
          result.append(Company.get_request(href, isxml=isxml))
    return result
  
class PatrickXBRL(etree.ElementBase):
  """Element wrapper that indexes the contexts and facts of an XBRL document.

  The wrapped document is the first child of this element (see ``child``).
  Construction builds the following attributes:

  * ``definitions``: context id -> ``{"period": {...}}`` for every child of
    the document whose tag contains "context";
  * ``relevant_children``: every non-comment child that is not a context;
  * ``relevant_children_parsed``: the children accepted by ``is_parsable``,
    with the namespace stripped from their tags;
  * ``relevant_children_elements``: those same children, each wrapped in a
    ``PatrickXBRLElement``.

  NOTE(review): lxml's documentation advises against defining ``__init__``
  on custom element classes; this presumably works because instances are
  created directly and kept alive by the caller - confirm.
  """

  def __init__(self, *children, attrib=None, nsmap=None, **_extra):
    """Wrap a parsed XBRL document and index its contexts and facts.

    Args:
      *children: the parsed document is expected as the first child.
      attrib: attributes for the wrapper element itself.
      nsmap: namespace map for the wrapper element itself.
      **_extra: extra keyword arguments forwarded to ``etree.ElementBase``.

    Raises:
      KeyError: if a context has no ``id`` or a fact references a context id
        that is not defined in the document.
    """
    # Forward attrib/nsmap rather than silently replacing them with None.
    super().__init__(*children, attrib=attrib, nsmap=nsmap, **_extra)

    nodes = [node for node in self.child if not isinstance(node, etree._Comment)]

    self.definitions = {
        node.attrib["id"]: self.__parse_context__(node)
        for node in nodes
        if "context" in node.tag
    }
    self.relevant_children = [node for node in nodes if "context" not in node.tag]

    # Select on the namespaced tags first, then strip the namespaces.
    facts = [node for node in nodes if self.is_parsable(node)]
    for fact in facts:
      self.clean_tag(fact)
    self.relevant_children_parsed = facts

    # NOTE(review): passing ``fact`` as a child of the new wrapper presumably
    # re-parents it out of the original document - confirm.
    wrapped = []
    for fact in facts:
      # Facts without a contextRef get no context instead of a KeyError.
      context_id = fact.attrib.get("contextRef")
      wrapped.append(PatrickXBRLElement(
          fact,
          context_ref=self.definitions[context_id] if context_id else None,
          context=context_id))
    self.relevant_children_elements = wrapped

  def __parse_context__(self, context):
    """Return ``{"period": {...}}`` for a context element.

    The tags of the context's direct children are stripped of their
    namespace in place. Raises ``IndexError`` if there is no ``period`` child.
    """
    children = [child for child in context if not isinstance(child, etree._Comment)]
    for child in children:
      self.clean_tag(child)
    period = [child for child in children if child.tag == 'period'][0]
    return {
        "period": self.__parse_base_elem__(period)
        }

  def __parse_base_elem__(self, elem):
    """Map the cleaned tag of each parsable child of ``elem`` to its text."""
    children = [child for child in elem if self.is_parsable(child)]
    for child in children:
      self.clean_tag(child)
    return {child.tag: child.text for child in children}

  @classmethod
  def is_parsable(cls, child):
    """Return True unless ``child`` is a comment or a context/unit/schemaRef node."""
    return not isinstance(child, etree._Comment) and "context" not in child.tag and "unit" not in child.tag and "schemaRef" not in child.tag

  @classmethod
  def clean_tag(cls, elem):
    """
    Parse tag so 
      {http://fasb.org/us-gaap/2018-01-31}Assets
    becomes
      Assets
    """
    elem.tag = elem.tag[elem.tag.find("}")+1:]

  @classmethod
  def parse_context_ref(cls, context_ref):
    """
    Turn a context id into a dict of ISO dates.

    Duration_1_1_2018_To_12_31_2018 becomes
      {"from": "2018-01-01", "to": "2018-12-31"}
    As_Of_12_31_2017 becomes
      {"from": "2017-12-31"}
    Anything after the last date (for example a dimension suffix) is ignored.
    Any other id becomes {"other": <text before the first underscore>}.

    Raises ValueError when an expected date cannot be parsed.
    """
    def to_iso(date_text):
      return datetime.strptime(date_text, "%m_%d_%Y").date().strftime("%Y-%m-%d")

    def leading_date(text):
      # A date is always the first three "_"-separated tokens; whatever
      # follows is a suffix and is dropped.
      return "_".join(text.split("_")[:3])

    if context_ref.startswith("Duration"):
      start_text, _, end_text = context_ref[len("Duration") + 1:].partition("_To_")
      return {"from": to_iso(start_text), "to": to_iso(leading_date(end_text))}

    if context_ref.startswith("As_Of"):
      return {"from": to_iso(leading_date(context_ref[len("As_Of") + 1:]))}

    return {"other": context_ref.split("_")[0]}

  @property
  def child(self):
    """The wrapped document: the first child of this element."""
    return self.getchildren()[0]

  def find_relevant_elements_by_name(self, name):
    """Return the wrapped facts whose name contains ``name`` (case-insensitive)."""
    needle = name.lower()
    return [elem for elem in self.relevant_children_elements if needle in elem.name.lower()]

  def match_relevant_elements_by_name(self, name):
    """Return the wrapped facts whose name equals ``name`` (case-insensitive)."""
    needle = name.lower()
    return [elem for elem in self.relevant_children_elements if needle == elem.name.lower()]


class PatrickXBRLElement(etree.ElementBase):
  """Wrapper around a single XBRL fact element.

  The fact is the first child of this wrapper and is exposed as ``child``.
  ``attrib`` is overridden to return the fact's attributes, not the
  wrapper's.
  """

  def __init__(self, *children, attrib=None, nsmap=None, context_ref=None, context=None, **_extra):
    """Wrap a fact element.

    Args:
      *children: the fact element is expected as the first child.
      attrib: attributes for the wrapper element itself.
      nsmap: namespace map for the wrapper element itself.
      context_ref: parsed context of the fact; ``to_dict`` reads its
        ``"period"`` entry when present.
      context: the raw context id of the fact.
      **_extra: extra keyword arguments forwarded to ``etree.ElementBase``.
    """
    # Forward attrib/nsmap rather than silently replacing them with None.
    super().__init__(*children, attrib=attrib, nsmap=nsmap, **_extra)
    self.child = self.getchildren()[0]
    self.context_ref = context_ref
    self.context = context
    # Split the tag at every capital letter: "NetIncomeLoss" -> "Net Income Loss".
    # Anything before the first capital letter is dropped.
    self.name = ' '.join(re.findall('[A-Z][^A-Z]*', self.child.tag))
    self.unit_ref = self.attrib.get("unitRef") or None

  @property
  def attrib(self) -> Dict:
    """Attributes of the wrapped fact element."""
    return self.child.attrib

  @property
  def value(self) -> str:
    """Text of the fact with newlines removed and surrounding whitespace stripped."""
    return self.child.text.replace("\n", "").strip() if self.child.text else ""

  def to_dict(self) -> Dict:
    """Return name, value, unit and context id, plus the period entries if known."""
    record = {
      "name": self.name,
      "value": self.value,
      "unit_ref": self.unit_ref,
      "context_ref": self.context
    }
    if self.context_ref and self.context_ref.get("period"):
      # Period keys are merged last, so they win on a name clash.
      record.update(self.context_ref["period"])
    return record

  def __repr__(self):
    return f'<{self.name}="{self.value} {self.unit_ref}" context_ref={self.context_ref}>'



In [None]:
### Get the following:
## TTM (2) - Revenue, EBIT, Interest expense
## Latest (2) - BVEquity, BVDebt, Cash, Cross Holdings and NOA, Minority Interest,
## Eff Rate, Marg Rate
## Sales to Capital
## R&D - last 5 years
## Existing NOL / NOLCO
## Others - Current Stock Prices, Shares Outstanding <- yfinance or investpy

In [65]:
import pandas as pd

# Fetch the XBRL instance attached to the company's latest 10-K filing.
company = PatrickCompany("APPLE INC", "0000320193")
#doc = company.get_10K()
results = company.get_data_files_from_10K("XML", isxml=True) # "XML", "EX-101.INS"
xbrl = PatrickXBRL(results[0])
#XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Collect one record per fact and build the frame in a single step; growing
# a DataFrame cell by cell with .loc re-allocates on every new row.
records = []
for element in xbrl.relevant_children_elements:
  row = element.to_dict()
  records.append({
    'Name': row['name'],
    'Value': row['value'],
    'Unit_ref': row['unit_ref'],
    'context_ref': row['context_ref'],
    # Only some contexts provide startDate/endDate; the rest are left empty.
    'StartDate': row.get('startDate'),
    'EndDate': row.get('endDate'),
  })

df = pd.DataFrame(
    records,
    columns=['Name', 'Value', 'Unit_ref', 'context_ref', 'StartDate', 'EndDate'])

df.head(20)

Unnamed: 0,Name,Value,Unit_ref,context_ref,StartDate,EndDate
0,Derivative Liabilities Reductionfor Master Net...,2100000000,usd,FI2018Q4,,
1,Derivative Liabilities Reductionfor Master Net...,2700000000,usd,FI2019Q4,,
2,Long Term Marketable Securities Maturities Term,P1Y,,FD2019Q4YTD_srt_RangeAxis_srt_MinimumMember,2018-09-30,2019-09-28
3,Share Based Compensation Arrangement By Share ...,P6M,,FD2019Q4YTD_us-gaap_AwardTypeAxis_us-gaap_Empl...,2018-09-30,2019-09-28
4,Amendment Flag,false,,FD2019Q4YTD,2018-09-30,2019-09-28
5,Current Fiscal Year End Date,--09-28,,FD2019Q4YTD,2018-09-30,2019-09-28
6,Document Fiscal Period Focus,FY,,FD2019Q4YTD,2018-09-30,2019-09-28
7,Document Fiscal Year Focus,2019,,FD2019Q4YTD,2018-09-30,2019-09-28
8,Entity Central Index Key,0000320193,,FD2019Q4YTD,2018-09-30,2019-09-28
9,Common Stock Par Or Stated Value Per Share,0.00001,usdPerShare,FI2018Q4,,


In [67]:
# Print every context id in full, one per line (the table view truncates them).
context_ids = df['context_ref']
for context_id in context_ids:
  print(context_id)

FI2018Q4
FI2019Q4
FD2019Q4YTD_srt_RangeAxis_srt_MinimumMember
FD2019Q4YTD_us-gaap_AwardTypeAxis_us-gaap_EmployeeStockMember
FD2019Q4YTD
FD2019Q4YTD
FD2019Q4YTD
FD2019Q4YTD
FD2019Q4YTD
FI2018Q4
FI2019Q4
FI2018Q4
FI2019Q4
FI2018Q4
FI2019Q4
FI2018Q4
FI2019Q4
FD2019Q4YTD_srt_MajorCustomersAxis_aapl_CustomerOneMember_us-gaap_ConcentrationRiskByBenchmarkAxis_us-gaap_TradeAccountsReceivableMember_us-gaap_ConcentrationRiskByTypeAxis_us-gaap_CreditConcentrationRiskMember
FI2019Q4_srt_RangeAxis_srt_MaximumMember_us-gaap_DebtInstrumentAxis_aapl_A20132018DebtIssuancesMember_us-gaap_LongtermDebtTypeAxis_aapl_FixedRateNotesMember
FI2019Q4_srt_RangeAxis_srt_MaximumMember_us-gaap_DebtInstrumentAxis_aapl_A2019DebtIssuanceMember_us-gaap_LongtermDebtTypeAxis_aapl_FixedRateNotesMember
FI2019Q4_srt_RangeAxis_srt_MinimumMember_us-gaap_DebtInstrumentAxis_aapl_A20132018DebtIssuancesMember_us-gaap_LongtermDebtTypeAxis_aapl_FixedRateNotesMember
FI2019Q4_srt_RangeAxis_srt_MinimumMember_us-gaap_DebtInstrumentAxis

In [None]:
# Look up a company through the edgar package's Edgar helper.
from edgar import Edgar
edgar = Edgar()
# Presumably returns the known company names matching the query - confirm
# against the edgar package; the result is kept for inspection.
possible_companies = edgar.find_company_name("Cisco System")
# Last expression of the cell, so the notebook displays its result.
edgar.get_cik_by_company_name('CISCO SYSTEMS INC') 