In [None]:
# Import client to connect with arelle web server
import arelle_client

# For dealing with file system
import os
from pathlib import Path

# For data manipulation
import pandas

# For parsing xml
import lxml
from lxml import etree
import html

# This method was used to download the xbrl data from the arelle web server
# NOTE: XBRL DATA HAS ALREADY BEEN DOWNLOADED AND INCLUDED IN THIS REPOSITORY
# USING THE METHOD BELOW IS UNNECESSARY
def download_arelle_data(xbrl_path):
    """Fetch every arelle view of one filing and store each as a local xml file.

    `xbrl_path` is split on "/" and segments 6 and 7 are used as the cik and
    the accession number. Files are written below
    "arelle_xbrl/<cik>/<accession_number>/", one "<view>.xml" per view.
    A view is skipped when `arelle_client.get_buffer` returns a falsy value.
    """
    # Split once; positions 6 and 7 hold the cik and the accession number
    segments = xbrl_path.split("/")
    cik, accession_number = segments[6], segments[7]

    # Folder that receives the xml files of this filing
    filing_path = "arelle_xbrl/{0}/{1}/".format(cik, accession_number)

    # One xml file per view type provided by the arelle webserver
    for view_name in ("concepts", "pre", "dim", "facts", "factTable", "roleTypes"):
        full_path = filing_path + "{0}.xml".format(view_name)

        # Pull the view into memory; nothing is saved when nothing came back
        payload = arelle_client.get_buffer(xbrl_path, view_name, "xml")
        if not payload:
            continue

        print("saving xml file to: {}".format(full_path))
        # Folder is created lazily, only once there is something to store
        Path(filing_path).mkdir(parents=True, exist_ok=True)
        with open(full_path, mode="wb") as localfile:
            localfile.write(payload)


In [None]:
# The following classes model the downloaded xml files and their directories to aid in pulling data

# Path to stored xml files
# Set the ARELLE_DATA_PATH environment variable to point at a different
# location; when it is unset or empty the original default is used
ARELLE_DATA_PATH = os.environ.get("ARELLE_DATA_PATH") or "/home/jovyan/src/arelle_xbrl/"
# CompanyData appends "<cik>/" directly to this string, so it has to end in "/"
if not ARELLE_DATA_PATH.endswith("/"):
    ARELLE_DATA_PATH += "/"

# Different type of xml files produced by arelle web server
# Maps the dataset name to the file name stored in each filing folder
ARELLE_DATA_SETS = {
    "dim": "dim.xml",
    "concepts": "concepts.xml",
    "facts": "facts.xml",
    "pre": "pre.xml",
    "roleTypes": "roleTypes.xml",
    "factTable": "factTable.xml"
}

# Abstracts of interest, keyed by a short statement name
# In XBRL an abstract is a parent label that groups a set of child datapoints
# The three entries below are the abstracts of the key financial statements
ABSTRACTS = dict(
    balance_sheet="us-gaap:StatementOfFinancialPositionAbstract",
    income_statement="us-gaap:IncomeStatementAbstract",
    cash_flow_statement="us-gaap:StatementOfCashFlowsAbstract",
)

# Model of a single company and its data; has many filings
class CompanyData():
    """A single company, identified by its cik, and the filings stored for it.

    Attributes:
        cik: the identifier passed to the constructor.
        base_path: ARELLE_DATA_PATH + "<cik>/", the company's folder.
        accession_numbers: names of the filing sub-folders, sorted by name.
        filings: one Filing per sub-folder, in the same order as
            `accession_numbers`.
    """

    def __init__(self, cik):
        """Scan the company's folder and build one Filing per sub-directory.

        Raises whatever `os.scandir` raises when the folder is missing or is
        not a directory.
        """
        self.cik = cik
        self.base_path = ARELLE_DATA_PATH + "{0}/".format(cik)
        # Scan once and keep directories only, so that stray files in the
        # company folder are never reported as accession numbers and the two
        # lists below stay aligned index-for-index. Sorting makes the order
        # reproducible (directory listing order is otherwise arbitrary).
        filing_dirs = sorted(
            (entry for entry in os.scandir(self.base_path) if entry.is_dir()),
            key=lambda entry: entry.name,
        )
        self.accession_numbers = [entry.name for entry in filing_dirs]
        self.filings = [Filing(cik, entry.path) for entry in filing_dirs]

    def get_all(self, dataset):
        """Return `filing.get_dataset(dataset)` for every filing, in order.

        Relies on each Filing object providing a `get_dataset` method.
        """
        return [f.get_dataset(dataset) for f in self.filings]

# Model of filing, which has several xml files
class Filing():
    """One filing folder, which holds several xml files.

    Attributes:
        cik: identifier of the company the filing belongs to.
        base_path: folder that contains the filing's xml files.
        filenames: names of the entries found in `base_path` at construction.
    """

    def __init__(self, cik, path):
        self.cik = cik
        self.base_path = path
        self.filenames = os.listdir(self.base_path)

    def get_dataset(self, dataset):
        """Return the file model for one dataset of this filing.

        `dataset` is a key of ARELLE_DATA_SETS (KeyError otherwise). "pre" and
        "facts" come back as PresentationFile / FactsFile; every other dataset
        comes back as a plain XMLFile. This is the method that
        `CompanyData.get_all` calls on each filing.
        """
        filename = ARELLE_DATA_SETS[dataset]
        if dataset == "pre":
            return PresentationFile(self.cik, self.base_path, filename)
        if dataset == "facts":
            return FactsFile(self.cik, self.base_path, filename)
        return XMLFile(self.cik, self.base_path, filename)

    # This file provides the presentation structure on each financial statement/report presented in the filing
    def presentation_file(self):
        return PresentationFile(self.cik, self.base_path, "pre.xml")

    # This file provides the numeric data for each line item in a presentation
    def facts_file(self):
        return FactsFile(self.cik, self.base_path, "facts.xml")

    def statement(self, abstract_name):
        """Build the statement for `abstract_name` as a DataFrame.

        The line items found below the abstract in the presentation file are
        LEFT JOINed with the facts (presentation "account" == fact "label"),
        so every line item is kept even when no fact matches it.

        Returns a DataFrame with the columns "account", "endInstant", "value".
        When the facts file holds no items at all, the line items are returned
        with empty "endInstant" / "value" columns instead of raising KeyError.
        """
        columns = ["account", "endInstant", "value"]
        accounts = self.presentation_file().accounts(abstract_name)
        facts = self.facts_file().data_df()
        if facts.empty:
            # Nothing to join against: an empty frame has no "label" column
            return accounts.reindex(columns=columns)
        final_statement = accounts.merge(facts, how="left", left_on="account", right_on="label")
        return final_statement.loc[:, columns]
    
# Model of a single xml file
class XMLFile():
    """Location of one stored xml file, plus a helper that parses it.

    Attributes:
        cik: identifier of the company the file belongs to.
        folder_path: folder that contains the file.
        filename: name of the file inside `folder_path`.
        full_file_path: `folder_path` and `filename` joined with "/".
    """

    def __init__(self, cik, folder_path, filename):
        self.cik, self.folder_path, self.filename = cik, folder_path, filename
        self.full_file_path = "/".join((folder_path, filename))

    def get_xml(self):
        """Parse the file with lxml and return the resulting tree.

        The file is read again on every call; nothing is cached.
        """
        tree = etree.parse(self.full_file_path)
        return tree

# Model of xml file that contains XBRL presentation data
class PresentationFile(XMLFile):
    """An xml file that contains XBRL presentation data as nested <concept> nodes."""

    def get_abstract(self, abstract):
        """Return every <concept> node whose `label` attribute equals `abstract`.

        The result is a list in document order; it is empty when nothing matches.
        """
        xml = self.get_xml()
        # The label is handed over as an XPath variable instead of being
        # formatted into the expression, so a quote character inside
        # `abstract` can neither break nor alter the query
        return xml.xpath("//concept[@label=$label]", label=abstract)

    def line_items(self, abstract):
        """Return the labels of the leaf concepts below the requested abstract.

        Only the first node returned by `get_abstract` is used. Below it, every
        descendant <concept> that has no <concept> child of its own is
        collected; these are the line items of a report that carry the numeric
        data. Returns an empty list when the abstract is not in the file.
        """
        abstract_nodes = self.get_abstract(abstract)
        if not abstract_nodes:
            return []
        leaves = abstract_nodes[0].xpath(".//concept[not(concept)]")
        return [leaf.attrib['label'] for leaf in leaves]

    def accounts(self, abstract):
        """Return the line items as a one-column DataFrame ("account").

        This is the shape needed for merging the line items with the facts.
        """
        return pandas.DataFrame(self.line_items(abstract), columns=["account"])
            
class FactsFile(XMLFile):
    """An xml file that contains the facts of a filing as <item> nodes."""

    def items(self):
        """Return every <item> node that has no <dimension> child.

        Nodes with a dimension child relate to line items usually not
        contained in the main financial statements, so they are left out.
        """
        return self.get_xml().xpath("//item[not(dimension)]")

    def data_df(self):
        """Return the facts as a DataFrame, one row per item.

        Each row holds the item's `name` attribute in the "name" column plus
        one column per child element (column name = child tag, value = child
        text). The frame is prepared for merging with the presentation line
        items. It has no rows and no columns when the file has no matching item.
        """
        records = []
        for item in self.items():
            record = {'name': item.attrib['name']}
            # Iterate the element directly: getchildren() is deprecated in lxml
            for child in item:
                # Comments and processing instructions have a non-string tag
                # and carry no data, so only real elements become columns
                if isinstance(child.tag, str):
                    record[child.tag] = child.text
            records.append(record)
        return pandas.DataFrame(records)
    

In [None]:
# Instantiate company data objects, one per cik folder
# Entries that are not directories are skipped (CompanyData lists the folder
# and would fail on a plain file); sorting by name keeps the order, and with
# it companies[0], the same on every run
companies = [
    CompanyData(c.name)
    for c in sorted(os.scandir(ARELLE_DATA_PATH), key=lambda entry: entry.name)
    if c.is_dir()
]

# List all companies by cik
for c in companies:
    print(c.cik)

# List all income_statements for particular company
if companies:
    for filing in companies[0].filings:
        display(filing.statement(ABSTRACTS["income_statement"]))
else:
    # Nothing was found, so there is no first company to show
    print("no company folders found in: {}".format(ARELLE_DATA_PATH))
    

# List all balance_sheets for all companies

# for c in companies:
#    for f in c.filings:
#        df = f.statement(ABSTRACTS["balance_sheet"])
#        display(df)
            
            
