In [1]:
#import client to connect with arelle web server
import arelle_client

#for dealing with file system
import os
from pathlib import Path

#load sample urls to sample SEC filings' xbrl instance
import pandas

#import lxml for parsing xml
import lxml
from lxml import etree
import html

#this method was used to download xbrl data from the arelle web server
#NOTE: XBRL DATA HAS ALREADY BEEN DOWNLOADED AND INCLUDED IN THIS REPOSITORY
# USING THE METHOD BELOW IS UNNECESSARY
def download_arelle_data(xbrl_path):
    """Download every arelle view of one filing and store each one as a local xml file.

    Parameters:
        xbrl_path: "/"-separated path or url of the xbrl instance; segment 6 is used
            as the cik and segment 7 as the accession number.

    Returns:
        None. Files are written to "arelle_xbrl/<cik>/<accession_number>/<view>.xml",
        relative to the current working directory.

    Raises:
        IndexError: if xbrl_path has fewer than eight "/"-separated segments.
    """
    #cik and accession number are read by position from the xbrl path
    segments = xbrl_path.split("/")
    cik, accession_number = segments[6], segments[7]

    #folder that receives the xml files of this filing
    filing_path = "arelle_xbrl/{0}/{1}/".format(cik, accession_number)
    target_folder = Path(filing_path)

    #one request per type of xml file provided by the arelle web server
    for view in ("concepts", "pre", "dim", "facts", "factTable", "roleTypes"):
        #download xbrl file to memory
        #NOTE(review): get_buffer presumably returns bytes (the file is opened in binary mode) - confirm
        buffer = arelle_client.get_buffer(xbrl_path, view, "xml")
        if not buffer:
            #nothing came back for this view, so no file is written for it
            continue
        full_path = filing_path + "{0}.xml".format(view)
        print("saving xml file to: {}".format(full_path))
        #the folder is only created once there is something to save
        target_folder.mkdir(parents=True, exist_ok=True)
        with open(full_path, mode="wb") as localfile:
            localfile.write(buffer)


In [63]:
#The following classes model the downloaded xml files and their directories to aid in pulling data

#path to stored xml files
#set the ARELLE_DATA_PATH environment variable to read the data from another location;
#when it is unset (or empty) the original notebook location is used
#os.path.join(..., "") guarantees the trailing separator, because the path of a company
#folder is built by plain string concatenation onto this value
ARELLE_DATA_PATH = os.path.join(
    os.environ.get("ARELLE_DATA_PATH") or "/home/jovyan/src/arelle_xbrl/",
    "",
)

#different type of xml files produced by arelle web server (dataset name -> file name)
ARELLE_DATA_SETS = {
    "dim": "dim.xml",
    "concepts": "concepts.xml",
    "facts": "facts.xml",
    "pre": "pre.xml",
    "roleTypes": "roleTypes.xml",
    "factTable": "factTable.xml"
}

#certain abstracts we're interested in; an abstract is a parent label used in XBRL that covers a set of child datapoints
#in this case, we're looking for the abstracts that represent the key financial statements
ABSTRACTS = {
    "balance_sheet": "us-gaap:StatementOfFinancialPositionAbstract",
    "income_statement": "us-gaap:IncomeStatementAbstract",
    "cash_flow_statement": "us-gaap:StatementOfCashFlowsAbstract",
}

#model of a single company and its data; has many filings
class CompanyData():
    """One company (identified by its cik) together with the filings stored for it on disk.

    Attributes (all set in __init__):
        cik: the identifier passed in; also the name of the company folder.
        base_path: ARELLE_DATA_PATH + "<cik>/".
        accession_numbers: every entry name found in base_path (os.listdir, unfiltered).
        filings: one Filing per sub-directory of base_path.
    """

    def __init__(self, cik):
        self.cik = cik
        self.base_path = ARELLE_DATA_PATH + "{0}/".format(cik)
        self.accession_numbers = os.listdir(self.base_path)

        #only sub-directories are treated as filings; plain files are ignored here
        found_filings = []
        for entry in os.scandir(self.base_path):
            if entry.is_dir():
                found_filings.append(Filing(cik, entry.path))
        self.filings = found_filings

    def get_all(self, dataset):
        """Return a list with the result of get_dataset(dataset) for every filing, in order."""
        #NOTE(review): relies on each filing object exposing get_dataset - verify it is defined
        collected = []
        for filing in self.filings:
            collected.append(filing.get_dataset(dataset))
        return collected

#model of filing, which has several xml files
class Filing():
    """One filing of a company: a folder that holds the xml files produced for it.

    Attributes (all set in __init__):
        cik: identifier of the company the filing belongs to.
        base_path: folder that contains the xml files of the filing.
        filenames: entry names found in base_path when the object was created.
    """

    def __init__(self, cik, path):
        self.cik = cik
        self.base_path = path
        self.filenames = os.listdir(self.base_path)

    #this file provides the presentation structure on each financial statement/report presented in the filing
    def presentation_file(self):
        """Return a PresentationFile for "pre.xml" in this filing's folder."""
        return PresentationFile(self.cik, self.base_path, "pre.xml")

    #this file provides the numeric data for each line item in a presentation
    def facts_file(self):
        """Return a FactsFile for "facts.xml" in this filing's folder."""
        return FactsFile(self.cik, self.base_path, "facts.xml")

    def get_dataset(self, dataset):
        """Return the file model for one dataset of this filing.

        CompanyData.get_all calls this method on every filing.

        Parameters:
            dataset: a key of ARELLE_DATA_SETS ("dim", "concepts", "facts", "pre",
                "roleTypes" or "factTable").

        Returns:
            A PresentationFile for "pre", a FactsFile for "facts", and a plain XMLFile
            for every other dataset. Nothing is read from disk until the returned
            object is asked to parse its file.

        Raises:
            ValueError: if dataset is not a key of ARELLE_DATA_SETS.
        """
        if dataset not in ARELLE_DATA_SETS:
            raise ValueError(
                "unknown dataset {!r}; expected one of {}".format(dataset, sorted(ARELLE_DATA_SETS))
            )
        #the two datasets that have a dedicated model reuse the existing accessors
        if dataset == "pre":
            return self.presentation_file()
        if dataset == "facts":
            return self.facts_file()
        return XMLFile(self.cik, self.base_path, ARELLE_DATA_SETS[dataset])

    #this method finds the presentation in the filing that matches "abstract_name" and then merges
    #the numeric data via a LEFT JOIN; result is a DataFrame that contains all the relevant data
    #for the requested abstract_name in the filing
    def statement(self, abstract_name):
        """Return a DataFrame with the columns "account", "endInstant" and "value".

        Parameters:
            abstract_name: label of the abstract whose line items are wanted
                (for example one of the values of ABSTRACTS).

        Because the join is a LEFT JOIN, a line item without a matching fact is kept
        with empty "endInstant"/"value", and a line item that matches several facts
        yields one row per fact.

        NOTE(review): the facts frame must provide "label", "endInstant" and "value"
        columns; pandas raises KeyError when one of them is missing (for instance
        when the facts file contains no items) - confirm whether that case can occur.
        """
        accounts = self.presentation_file().accounts(abstract_name)
        facts = self.facts_file().data_df()
        merged = accounts.merge(facts, how="left", left_on="account", right_on="label")
        return merged.loc[:, ["account", "endInstant", "value"]]
    
#model of a single xml file
class XMLFile():
    """A single xml file inside the folder of one filing.

    Attributes (all set in __init__):
        cik: identifier of the company the file belongs to.
        folder_path: folder that contains the file.
        filename: name of the file inside folder_path.
        full_file_path: folder_path and filename joined with a "/".
    """

    def __init__(self, cik, folder_path, filename):
        self.cik, self.folder_path, self.filename = cik, folder_path, filename
        #always joined with a single "/", whether or not folder_path already ends with one
        self.full_file_path = "/".join((folder_path, filename))

    #parse xml content with lxml
    def get_xml(self):
        """Parse the file at full_file_path and return the parsed tree; parses anew on every call."""
        parsed_tree = etree.parse(self.full_file_path)
        return parsed_tree

#model of xml file that contains XBRL presentation data
class PresentationFile(XMLFile):
    """Xml file that contains the XBRL presentation data of a filing."""

    #pulls all xml nodes that match the requested abstract name
    def get_abstract(self, abstract):
        """Return every <concept> node whose "label" attribute equals abstract.

        Parameters:
            abstract: label to look for, e.g. "us-gaap:IncomeStatementAbstract".

        Returns:
            A list of matching nodes; empty when nothing matches.
        """
        xml = self.get_xml()
        #the label is bound as an XPath variable instead of being formatted into the
        #expression, so a quote character inside it cannot break or alter the query
        return xml.xpath("//concept[@label=$label]", label=abstract)

    #takes the first xml node returned by the get_abstract method above
    #then it finds all child nodes that do not have any child nodes themselves
    #these nodes are treated as the line items of the report
    def line_items(self, abstract):
        """Return the labels of the leaf concepts below the first node matching abstract.

        A leaf is a <concept> descendant that has no <concept> child of its own.
        Returns an empty list when no node matches abstract.

        Raises:
            KeyError: if a leaf concept has no "label" attribute.

        NOTE(review): a leaf is not guaranteed to have numeric data; leaves without a
        matching fact come out of the later left join with empty values.
        """
        abstract_nodes = self.get_abstract(abstract)
        if not abstract_nodes:
            return []
        #only the first match is used, even when several nodes carry the same label
        first_match = abstract_nodes[0]
        return [leaf.attrib['label'] for leaf in first_match.xpath(".//concept[not(concept)]")]

    #prepare the line items found above as a DataFrame for merging with facts
    def accounts(self, abstract):
        """Return the line item labels as a DataFrame with the single column "account"."""
        line_items = self.line_items(abstract)
        return pandas.DataFrame(line_items, columns=["account"])
            
class FactsFile(XMLFile):
    """Xml file that contains the facts (the reported values) of a filing."""

    #selects all fact nodes, other than fact node with a child dimension
    #nodes with a dimension child relate to line items usually not contained in the main financial statements
    def items(self):
        """Return every <item> node of the file that has no <dimension> child."""
        return self.get_xml().xpath("//item[not(dimension)]")

    #prepare a DataFrame of facts for merging with the presentation line items
    def data_df(self):
        """Return one DataFrame row per item returned by items().

        Each row holds the item's "name" attribute in the column "name" plus one
        column per child element (column = child tag, value = child text). When an
        item has several children with the same tag, the last one wins.

        Raises:
            KeyError: if an item has no "name" attribute.
        """
        records = []
        for item in self.items():
            record = {'name': item.attrib['name']}
            #iterate the element directly; lxml deprecated getchildren() in favour of this
            for child in item:
                record[child.tag] = child.text
            records.append(record)
        return pandas.DataFrame(records)
    

In [67]:
#instantiate company data objects, one per cik folder
#os.scandir yields entries in arbitrary order, so the folder names are sorted to make
#companies[0] the same company on every run; entries that are not directories are
#skipped because CompanyData lists the contents of the folder it is given
cik_folders = sorted(entry.name for entry in os.scandir(ARELLE_DATA_PATH) if entry.is_dir())
companies = [CompanyData(cik) for cik in cik_folders]

#list all companies by cik
for c in companies:
    print(c.cik)

#list all income_statements for particular company (the first one)
#slicing instead of indexing keeps the cell from failing when no company folder exists
for company in companies[:1]:
    for filing in company.filings:
        display(filing.statement(ABSTRACTS["income_statement"]))
    

#list all balance_sheets for all companies
#for c in companies:
#    for f in c.filings:
#        df = f.statement(ABSTRACTS["balance_sheet"])
#        display(df)
            
            


1463258
1704720
1512762
884269
910329


Unnamed: 0,account,endInstant,value
0,us-gaap:OtherAlternativeEnergySalesRevenue,2013-12-31,1207618000
1,us-gaap:OtherAlternativeEnergySalesRevenue,2014-12-31,1052772000
2,us-gaap:OtherAlternativeEnergySalesRevenue,2015-12-31,1141281000
3,regi:BiodieselGovernmentIncentives,2013-12-31,290393000
4,regi:BiodieselGovernmentIncentives,2014-12-31,220634000
5,regi:BiodieselGovernmentIncentives,2015-12-31,245868000
6,us-gaap:OilAndGasRevenue,2013-12-31,1498011000
7,us-gaap:OilAndGasRevenue,2014-12-31,1273406000
8,us-gaap:OilAndGasRevenue,2015-12-31,1387149000
9,us-gaap:SalesRevenueServicesGross,2013-12-31,127000


Unnamed: 0,account,endInstant,value
0,us-gaap:AlternativeEnergyMember,,
1,regi:RenewableIdentificationNumbersMember,,
2,regi:AlternativeEnergyGovermentIncentivesMember,,
3,us-gaap:ProductAndServiceOtherMember,,
4,us-gaap:RevenueFromContractWithCustomerIncludi...,2018-12-31,2380701000
5,us-gaap:RevenueFromContractWithCustomerIncludi...,2017-12-31,2153537000
6,us-gaap:RevenueFromContractWithCustomerIncludi...,2016-12-31,2039067000
7,us-gaap:Revenues,2018-12-31,2382987000
8,us-gaap:Revenues,2017-12-31,2154655000
9,us-gaap:Revenues,2016-12-31,2039232000


Unnamed: 0,account,endInstant,value
0,us-gaap:AlternativeEnergyMember,,
1,regi:RenewableIdentificationNumbersMember,,
2,regi:AlternativeEnergyGovermentIncentivesMember,,
3,us-gaap:ProductAndServiceOtherMember,,
4,us-gaap:RevenueFromContractWithCustomerIncludi...,2019-12-31,2623576000
5,us-gaap:RevenueFromContractWithCustomerIncludi...,2018-12-31,2366192000
6,us-gaap:RevenueFromContractWithCustomerIncludi...,2017-12-31,2153537000
7,us-gaap:RevenueFromContractWithCustomerIncludi...,2019-12-31,2623576000
8,us-gaap:RevenueFromContractWithCustomerIncludi...,2018-12-31,2366192000
9,us-gaap:Revenues,2019-12-31,2625216000


Unnamed: 0,account,endInstant,value
0,us-gaap:AlternativeEnergyMember,,
1,regi:AlternativeEnergyGovermentIncentivesMember,,
2,us-gaap:ProductAndServiceOtherMember,,
3,us-gaap:RevenueFromContractWithCustomerIncludi...,2020-12-31,2135741000
4,us-gaap:RevenueFromContractWithCustomerIncludi...,2019-12-31,2623576000
5,us-gaap:RevenueFromContractWithCustomerIncludi...,2018-12-31,2366192000
6,us-gaap:Revenues,2020-12-31,2137148000
7,us-gaap:Revenues,2019-12-31,2625216000
8,us-gaap:Revenues,2018-12-31,2368478000
9,us-gaap:Revenues,2020-12-31,2137148000


Unnamed: 0,account,endInstant,value
0,us-gaap:OtherAlternativeEnergySalesRevenue,2014-12-31,922602000
1,us-gaap:OtherAlternativeEnergySalesRevenue,2015-12-31,954742000
2,us-gaap:OtherAlternativeEnergySalesRevenue,2016-12-31,1417595000
3,regi:RenewableIdentificationNumberRevenue,2014-12-31,130170000
4,regi:RenewableIdentificationNumberRevenue,2015-12-31,186539000
5,regi:RenewableIdentificationNumberRevenue,2016-12-31,274800000
6,regi:BiodieselGovernmentIncentives,2014-12-31,220634000
7,regi:BiodieselGovernmentIncentives,2015-12-31,245868000
8,regi:BiodieselGovernmentIncentives,2016-12-31,346672000
9,us-gaap:OilAndGasRevenue,2014-12-31,1273406000


Unnamed: 0,account,endInstant,value
0,us-gaap:OtherAlternativeEnergySalesRevenue,2015-12-31,954742000
1,us-gaap:OtherAlternativeEnergySalesRevenue,2016-12-31,1417595000
2,us-gaap:OtherAlternativeEnergySalesRevenue,2017-12-31,1787308000
3,regi:RenewableIdentificationNumberRevenue,2015-12-31,186539000
4,regi:RenewableIdentificationNumberRevenue,2016-12-31,274800000
5,regi:RenewableIdentificationNumberRevenue,2017-12-31,337501000
6,regi:BiodieselGovernmentIncentives,2015-12-31,245868000
7,regi:BiodieselGovernmentIncentives,2016-12-31,346672000
8,regi:BiodieselGovernmentIncentives,2017-12-31,28728000
9,us-gaap:OilAndGasRevenue,2015-12-31,1387149000


Unnamed: 0,account,endInstant,value
0,us-gaap:AlternativeEnergyMember,,
1,regi:RenewableIdentificationNumbersMember,,
2,regi:AlternativeEnergyGovermentIncentivesMember,,
3,us-gaap:ProductAndServiceOtherMember,,
4,us-gaap:RevenueFromContractWithCustomerIncludi...,2019-12-31,2639753000
5,us-gaap:RevenueFromContractWithCustomerIncludi...,2018-12-31,2380701000
6,us-gaap:RevenueFromContractWithCustomerIncludi...,2017-12-31,2153537000
7,us-gaap:Revenues,2019-12-31,2641393000
8,us-gaap:Revenues,2018-12-31,2382987000
9,us-gaap:Revenues,2017-12-31,2154655000
