# All imports

In [1]:
from sec_edgar_downloader import Downloader
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os

# Download part

### Downloading reports of the required type

In [2]:
def download_reports(ticker, report_type, num_of_reports):
    dl = Downloader('./')
    dl.get(report_type, ticker, num_of_reports)
    
    path = f'./sec_edgar_filings/{ticker}/{report_type}'
    list_of_files = os.walk(path)
    list_of_files = list(list_of_files)[0][2]
    
    return list_of_files

In [3]:
list_of_files = download_reports('AAPL', '10-Q', 5)
list_of_files

['0000320193-19-000066.txt',
 '0000320193-19-000076.txt',
 '0000320193-20-000010.txt',
 '0000320193-20-000052.txt',
 '0000320193-20-000062.txt']

In [4]:
def make_json_urls(list_of_files):
    base_url = r"https://www.sec.gov/Archives/edgar/data/"
    hrefs = []
    
    for file in list_of_files:
        hrefs.append(base_url + file.split('-')[0] + '/' + file.replace('-','').replace('.txt','/index.json'))
    
    return hrefs

In [5]:
json_urls = make_json_urls(list_of_files)
json_urls

['https://www.sec.gov/Archives/edgar/data/0000320193/000032019319000066/index.json',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019319000076/index.json',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019320000010/index.json',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019320000052/index.json',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019320000062/index.json']

In [6]:
def make_xml_urls(json_urls):
    base_url = r"https://www.sec.gov"
    
    new_urls = []

    for url in json_urls:
        content = requests.get(url).json()

        for file in content['directory']['item']:
            if file['name'] == 'FilingSummary.xml':
                xml_summary = base_url + content['directory']['name'] + "/" + file['name']
                new_urls.append(xml_summary)
    
    return new_urls

In [7]:
xml_urls = make_xml_urls(json_urls)
xml_urls

['https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/FilingSummary.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000076/FilingSummary.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/FilingSummary.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019320000052/FilingSummary.xml',
 'https://www.sec.gov/Archives/edgar/data/320193/000032019320000062/FilingSummary.xml']

In [8]:
def make_table_urls(xml_summary):
    base_url = xml_summary.replace('FilingSummary.xml', '')

    content = requests.get(xml_summary).content
    soup = BeautifulSoup(content, 'lxml')

    reports = soup.find('myreports')

    master_reports = []

    for report in reports.find_all('report')[:-1]:

        report_dict = {}
        report_dict['name_short'] = report.shortname.text
        report_dict['url'] = base_url + report.htmlfilename.text

        master_reports.append(report_dict)
        
    return master_reports

In [9]:
all_master_reports = []

for url in xml_urls:
    all_master_reports.append(make_table_urls(url))

In [10]:
all_master_reports

[[{'name_short': 'Document and Entity Information',
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R1.htm'},
  {'name_short': 'CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)',
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R2.htm'},
  {'name_short': 'CONDENSED CONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME (Unaudited)',
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R3.htm'},
  {'name_short': 'CONDENSED CONSOLIDATED BALANCE SHEETS (Unaudited)',
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R4.htm'},
  {'name_short': 'CONDENSED CONSOLIDATED BALANCE SHEETS (Unaudited) (Parenthetical)',
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R5.htm'},
  {'name_short': "CONDENSED CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY (Unaudited)",
   'url': 'https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R6.htm'},
  {'name_s

In [11]:
def make_statements_data(statements_url):
    
    statements_data = []

    for statement in statements_url:

        statement_data = {}
        statement_data['headers'] = []
        statement_data['sections'] = []
        statement_data['data'] = []

        content = requests.get(statement).content
        report_soup = BeautifulSoup(content, 'html')

        for index, row in enumerate(report_soup.table.find_all('tr')):

            cols = row.find_all('td')

            if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
                reg_row = [ele.text.strip() for ele in cols]
                statement_data['data'].append(reg_row)
            elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
                sec_row = cols[0].text.strip()
                statement_data['sections'].append(sec_row)
            elif (len(row.find_all('th')) != 0):            
                hed_row = [ele.text.strip() for ele in row.find_all('th')]
                statement_data['headers'].append(hed_row)
            else:            
                print('We encountered an error.')

        statements_data.append(statement_data)
        
    return statements_data

In [12]:
def make_final_data(i):
    
    if len(statements_data[i]['headers']) > 1:
        income_header = statements_data[i]['headers'][1]
    else:
        income_header = statements_data[i]['headers'][0][1:]
    
    income_data = statements_data[i]['data']

    income_df = pd.DataFrame(income_data)

    income_df.index = income_df[0]
    income_df.index.name = 'Category'
    income_df = income_df.drop(0, axis = 1)

    income_df = income_df.replace('[\$,)]','', regex=True )\
                         .replace( '[(]','-', regex=True)\
                         .replace( '', 'NaN', regex=True)

    try:
        income_df = income_df.astype(float)
    except:
        print('ohhhhh')

    try:    
        income_df.columns = income_header
    except:
        pass

    return income_df

In [13]:
for i in range(len(all_master_reports)):
    
    datas = []
    statements_urls = []
    for j in range(len(all_master_reports[i])):
        statements_urls.append(all_master_reports[i][j]['url'])
    statements_data = make_statements_data(statements_urls)
    
    data = {}
    for j in range(len(statements_data)):
        print(all_master_reports[i][j]['name_short'])
        print(all_master_reports[i][j]['url'])
        display(make_final_data(j))
        data[key] = make_final_data(j)
            
    datas.append(data)
    print('='*100)
    

Document and Entity Information
https://www.sec.gov/Archives/edgar/data/320193/000032019319000066/R1.htm
ohhhhh


Unnamed: 0_level_0,"Mar. 30, 2019","Apr. 22, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Document Type,10-Q,
Amendment Flag,false,
Document Period End Date,Mar. 30 2019,
Document Fiscal Year Focus,2019,
Document Fiscal Period Focus,Q2,
Trading Symbol,AAPL,
Entity Registrant Name,Apple Inc.,
Entity Central Index Key,0000320193,
Current Fiscal Year End Date,--09-28,
Entity Current Reporting Status,Yes,


ohhhhh


NameError: name 'key' is not defined