In [194]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime
import os

header = {'User-Agent': 'mdemiceli@gmail.com'}

# PULLING CIK NUMBERS AND TICKERS

In [195]:
megacap_tickers = pd.read_csv('nasdaq_megacap_tickers.csv') # tickers file downloaded from nasdaq.com

megacap_tickers = [x for x in megacap_tickers['Symbol']]
megacap_tickers

['AAPL',
 'ABBV',
 'AMZN',
 'AVGO',
 'BAC',
 'COST',
 'CSCO',
 'CVX',
 'DIS',
 'FB',
 'GOOG',
 'GOOGL',
 'HD',
 'JNJ',
 'JPM',
 'KO',
 'LLY',
 'MA',
 'MRK',
 'MSFT',
 'NVDA',
 'PEP',
 'PFE',
 'PG',
 'TMO',
 'TSLA',
 'UNH',
 'V',
 'VFC',
 'WMT',
 'XOM']

In [196]:
cik_tickers_json_url = r'https://www.sec.gov/files/company_tickers.json'

cik_tickers_bulk = requests.get(cik_tickers_json_url, headers=header)
decoded_cik_tickers_bulk = cik_tickers_bulk.json()

len(decoded_cik_tickers_bulk) # this is a mass json file with every sec registered company's cik, ticker, and name

12175

In [197]:
mega_caps = dict()
mega_caps['cik'] = []
mega_caps['ticker'] = []
mega_caps['name'] = []

for bulk in decoded_cik_tickers_bulk:
    
    # formatting cik nums
    diff = 10 - len(str(decoded_cik_tickers_bulk[str(bulk)]['cik_str']))
    cik_full = '0' * diff + str(decoded_cik_tickers_bulk[str(bulk)]['cik_str'])
    
    ticker_str = decoded_cik_tickers_bulk[str(bulk)]['ticker']
    
    if ticker_str in megacap_tickers:
        mega_caps['cik'].append(cik_full)
        mega_caps['ticker'].append(decoded_cik_tickers_bulk[str(bulk)]['ticker'])
        mega_caps['name'].append(decoded_cik_tickers_bulk[str(bulk)]['title'])
        

mega_caps

{'cik': ['0000320193',
  '0000789019',
  '0001652044',
  '0001018724',
  '0001318605',
  '0001326801',
  '0000731766',
  '0000200406',
  '0001045810',
  '0000104169',
  '0000080424',
  '0000034088',
  '0000019617',
  '0001141391',
  '0000354950',
  '0000093410',
  '0000070858',
  '0000059478',
  '0000021344',
  '0000078003',
  '0001551152',
  '0000077476',
  '0000909832',
  '0001730168',
  '0000310158',
  '0000097745',
  '0000858877',
  '0001744489',
  '0000103379',
  '0001403161',
  '0001652044'],
 'ticker': ['AAPL',
  'MSFT',
  'GOOGL',
  'AMZN',
  'TSLA',
  'FB',
  'UNH',
  'JNJ',
  'NVDA',
  'WMT',
  'PG',
  'XOM',
  'JPM',
  'MA',
  'HD',
  'CVX',
  'BAC',
  'LLY',
  'KO',
  'PFE',
  'ABBV',
  'PEP',
  'COST',
  'AVGO',
  'MRK',
  'TMO',
  'CSCO',
  'DIS',
  'VFC',
  'V',
  'GOOG'],
 'name': ['Apple Inc.',
  'MICROSOFT CORP',
  'Alphabet Inc.',
  'AMAZON COM INC',
  'Tesla, Inc.',
  'Meta Platforms, Inc.',
  'UNITEDHEALTH GROUP INC',
  'JOHNSON & JOHNSON',
  'NVIDIA CORP',
  'Walm

In [198]:
# lens are 31 because Google is duplicated

print(len(mega_caps['cik']))
print(len(mega_caps['ticker']))
print(len(mega_caps['name']))

31
31
31


In [199]:
# Using company's cik number to pull its submissions url file and geenrate a master list

master_submission_urls = []

for cik in mega_caps['cik']:
    
    base_submissions_url = r'https://data.sec.gov/submissions/CIK' + cik + r'.json'
    master_submission_urls.append(base_submissions_url)

    
print(master_submission_urls)
print(len(master_submission_urls))

['https://data.sec.gov/submissions/CIK0000320193.json', 'https://data.sec.gov/submissions/CIK0000789019.json', 'https://data.sec.gov/submissions/CIK0001652044.json', 'https://data.sec.gov/submissions/CIK0001018724.json', 'https://data.sec.gov/submissions/CIK0001318605.json', 'https://data.sec.gov/submissions/CIK0001326801.json', 'https://data.sec.gov/submissions/CIK0000731766.json', 'https://data.sec.gov/submissions/CIK0000200406.json', 'https://data.sec.gov/submissions/CIK0001045810.json', 'https://data.sec.gov/submissions/CIK0000104169.json', 'https://data.sec.gov/submissions/CIK0000080424.json', 'https://data.sec.gov/submissions/CIK0000034088.json', 'https://data.sec.gov/submissions/CIK0000019617.json', 'https://data.sec.gov/submissions/CIK0001141391.json', 'https://data.sec.gov/submissions/CIK0000354950.json', 'https://data.sec.gov/submissions/CIK0000093410.json', 'https://data.sec.gov/submissions/CIK0000070858.json', 'https://data.sec.gov/submissions/CIK0000059478.json', 'https://

In [200]:
master_filings_bulk = []

for url in master_submission_urls: # change range to full master_submission_urls list once confident

    # pulling json object
    content = requests.get(url, headers=header)
    decoded_content = content.json()
    
    # pulling relevant features (**additional company level details can be pulled from here)
    js2 = dict()
    js2['tickers'] = decoded_content['tickers']
    js2['cik'] = []
    js2['accessionNumbers'] = decoded_content['filings']['recent']['accessionNumber']
    js2['filingDate'] = decoded_content['filings']['recent']['filingDate']
    js2['forms'] = decoded_content['filings']['recent']['form']
    
    diff = 10 - len(str(decoded_content['cik']))
    full_cik = '0' * diff + decoded_content['cik']
    js2['cik'].append(full_cik)

    
    master_filings_bulk.append(js2)

master_filings_bulk

[{'tickers': ['AAPL'],
  'cik': ['0000320193'],
  'accessionNumbers': ['0000320193-22-000061',
   '0001193125-22-128368',
   '0001193125-22-128361',
   '0001193125-22-128354',
   '0000320193-22-000059',
   '0000320193-22-000058',
   '0000320193-22-000053',
   '0000320193-22-000052',
   '0000320193-22-000049',
   '0000320193-22-000048',
   '0000320193-22-000047',
   '0000320193-22-000046',
   '0000320193-22-000041',
   '0000320193-22-000040',
   '0000320193-22-000039',
   '0000320193-22-000038',
   '0000320193-22-000037',
   '0000320193-22-000036',
   '0000320193-22-000035',
   '0000320193-22-000034',
   '0001193125-22-066169',
   '0001193125-22-032738',
   '0001104659-22-016224',
   '0001214659-22-002092',
   '0000320193-22-000025',
   '0000320193-22-000023',
   '0000320193-22-000022',
   '0000320193-22-000021',
   '0000320193-22-000020',
   '0000320193-22-000019',
   '0000320193-22-000018',
   '0000320193-22-000017',
   '0000320193-22-000016',
   '0001214659-22-001589',
   '0001214659

In [201]:
accession_df = pd.DataFrame(data=master_filings_bulk)
accession_df

Unnamed: 0,tickers,cik,accessionNumbers,filingDate,forms
0,[AAPL],[0000320193],"[0000320193-22-000061, 0001193125-22-128368, 0...","[2022-05-06, 2022-04-29, 2022-04-29, 2022-04-2...","[4, S-8, S-8 POS, S-8 POS, 10-Q, 8-K, 4, 4, 4,..."
1,[MSFT],[0000789019],"[0001564590-22-015675, 0001193125-22-120207, 0...","[2022-04-26, 2022-04-26, 2022-04-19, 2022-04-1...","[10-Q, 8-K, 4, 4, 4, 4, 11-K, 4, 4, 4, 4, 4, 4..."
2,"[GOOGL, GOOG]",[0001652044],"[0001209191-22-027590, 0001209191-22-027584, 0...","[2022-05-06, 2022-05-06, 2022-05-05, 2022-05-0...","[4, 3, PX14A6G, 4, 4, PX14A6G, PX14A6G, PX14A6..."
3,[AMZN],[0001018724],"[0001214659-22-006346, 0000950123-22-004629, 0...","[2022-05-05, 2022-05-05, 2022-04-29, 2022-04-2...","[PX14A6G, 13F-HR, 10-Q, 8-K, PX14A6G, PX14A6G,..."
4,[TSLA],[0001318605],"[0001790565-22-000006, 0000899243-22-016671, 0...","[2022-05-04, 2022-05-04, 2022-05-04, 2022-05-0...","[4, 4, 4, 8-K, 10-K/A, 4, 4, 4, 4, 4, 4, 4, 4,..."
5,[FB],[0001326801],"[0000950103-22-008028, 0000950103-22-007355, 0...","[2022-05-05, 2022-04-28, 2022-04-28, 2022-04-2...","[4, 4, PX14A6G, 10-Q, 8-K, PX14A6G, PX14A6G, 4..."
6,[UNH],[0000731766],"[0000731766-22-000021, 0000731766-22-000019, 0...","[2022-05-06, 2022-05-04, 2022-04-22, 2022-04-2...","[8-K, 10-Q, 3, DEFA14A, DEF 14A, 8-K, 4, 4, 4,..."
7,[JNJ],[0000200406],"[0001193125-22-134336, 0000200406-22-000048, 0...","[2022-04-29, 2022-04-29, 2022-04-29, 2022-04-2...","[S-8, 10-Q, 8-K, 4, 8-K, PX14A6G, PX14A6G, PX1..."
8,[NVDA],[0001045810],"[0001045810-22-000070, 0001045810-22-000068, 0...","[2022-04-20, 2022-04-19, 2022-04-19, 2022-04-0...","[DEFA14A, DEF 14A, DEF 14A, PRE 14A, 4, 4, 4, ..."
9,[WMT],[0000104169],"[0001127602-22-012768, 0001214659-22-005695, 0...","[2022-04-29, 2022-04-25, 2022-04-21, 2022-04-2...","[4, PX14A6G, DEFA14A, DEF 14A, 4, 8-K, 25-NSE,..."


In [202]:
accession_ten_k = dict()
accession_ten_k['cik'] = []
accession_ten_k['accessionNumbers'] = []

for row in range(len(accession_df)):
    
    # pulling all the data for each row in df
    company = accession_df.iloc[row]
    
    # pulling out accession nums and form types
    accessionNumbers = company[2]
    forms = company[4]
    
    company_df = pd.DataFrame(data=[accessionNumbers, forms]).T
    company_df.columns = ['accessionNumbers', 'forms']
    
    # pulling cik num
    accession_ten_k['cik'].append(company[1][0])
    
    # this appends all accession nums when form type == 10-K
    accession_ten_k['accessionNumbers'].append(company_df['accessionNumbers'][company_df['forms'] == '10-K'])

    
accession_ten_k # dict organized by ticker and accessions for 10-K filings only

{'cik': ['0000320193',
  '0000789019',
  '0001652044',
  '0001018724',
  '0001318605',
  '0001326801',
  '0000731766',
  '0000200406',
  '0001045810',
  '0000104169',
  '0000080424',
  '0000034088',
  '0000019617',
  '0001141391',
  '0000354950',
  '0000093410',
  '0000070858',
  '0000059478',
  '0000021344',
  '0000078003',
  '0001551152',
  '0000077476',
  '0000909832',
  '0001730168',
  '0000310158',
  '0000097745',
  '0000858877',
  '0001744489',
  '0000103379',
  '0001403161',
  '0001652044'],
 'accessionNumbers': [49     0000320193-21-000105
  118    0000320193-20-000096
  189    0000320193-19-000119
  259    0000320193-18-000145
  347    0000320193-17-000070
  492    0001628280-16-020309
  609    0001193125-15-356351
  738    0001193125-14-383437
  847    0001193125-13-416534
  946    0001193125-12-444068
  Name: accessionNumbers, dtype: object,
  114    0001564590-21-039151
  268    0001564590-20-034944
  432    0001564590-19-027952
  598    0001564590-18-019062
  758    000156

In [203]:
df_ten_k = pd.DataFrame(accession_ten_k)
df_ten_k

Unnamed: 0,cik,accessionNumbers
0,320193,49 0000320193-21-000105 118 0000320193-...
1,789019,114 0001564590-21-039151 268 0001564590-...
2,1652044,133 0001652044-22-000019 428 0001652044-...
3,1018724,45 0001018724-22-000005 202 0001018724-...
4,1318605,35 0000950170-22-000796 167 0001564590-...
5,1326801,54 0001326801-22-000018 424 0001326801-...
6,731766,62 0000731766-22-000008 214 0000731766-...
7,200406,36 0000200406-22-000022 123 0000200406-...
8,1045810,12 0001045810-22-000036 128 0001045810-...
9,104169,22 0000104169-22-000012 261 0000104169-...


In [204]:
len(df_ten_k['cik'])

31

In [205]:
base_url = r'https://www.sec.gov/Archives/edgar/data/'

filings_summary = {}
filings_summary['cik'] = []
filings_summary['year'] = []
filings_summary['xml_summaries'] = []

accession = []

# for i in range(len(df_ten_k['cik'])):
for i in range(1):
    filings_summary['cik'] = df_ten_k['cik'][i]
    accession_list = df_ten_k['accessionNumbers'][i]
    
    # removing '-' from accession num and adding to the master list
    for num in accession_list:
        formatted_accession = num.replace('-', '')
        filings_summary['year'].append(str(formatted_accession[10:12]))
        accession.append(formatted_accession)

for i in range(len(accession)):
    filing_url = base_url + str(cik) + '/' + str(accession[i]) + '/index.json'
    filing_content = requests.get(filing_url, headers=header).json()

    for file in filing_content['directory']['item']:

        # grab the filing summary and create a new url leading to the file so we can download it
        if file['name'] == 'FilingSummary.xml':

            xml_summary = r'https://www.sec.gov' + filing_content['directory']['name'] + '/' + file['name']
            filings_summary['xml_summaries'].append(xml_summary)

#                 print('-'*100)
#                 print('File Name: ' + file['name'])
#                 print('File Path: ' + xml_summary)

filings_summary

{'cik': '0000320193',
 'year': ['21', '20', '19', '18', '17', '16', '15', '14', '13', '12'],
 'xml_summaries': ['https://www.sec.gov/Archives/edgar/data/1652044/000032019321000105/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000032019320000096/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000032019319000119/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000032019318000145/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000032019317000070/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000162828016020309/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000119312515356351/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000119312514383437/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000119312513416534/FilingSummary.xml',
  'https://www.sec.gov/Archives/edgar/data/1652044/000119312512444068/FilingSumm

In [206]:
filings_df = pd.DataFrame(data=filings_summary)
filings_df

Unnamed: 0,cik,year,xml_summaries
0,320193,21,https://www.sec.gov/Archives/edgar/data/165204...
1,320193,20,https://www.sec.gov/Archives/edgar/data/165204...
2,320193,19,https://www.sec.gov/Archives/edgar/data/165204...
3,320193,18,https://www.sec.gov/Archives/edgar/data/165204...
4,320193,17,https://www.sec.gov/Archives/edgar/data/165204...
5,320193,16,https://www.sec.gov/Archives/edgar/data/165204...
6,320193,15,https://www.sec.gov/Archives/edgar/data/165204...
7,320193,14,https://www.sec.gov/Archives/edgar/data/165204...
8,320193,13,https://www.sec.gov/Archives/edgar/data/165204...
9,320193,12,https://www.sec.gov/Archives/edgar/data/165204...


# THIS IS AS FAR AS I HAVE GOTTEN IN THE NEWEST ROUND OF EDITS

-MD

### SCRAPING FINANCIAL STATEMENT DATA FROM FILING SUMMARY

In [207]:
base_url = r'https://www.sec.gov/Archives/edgar/data/'
cik = r'0001065280'
accession = r'000106528022000036'

filing_url = base_url + cik + '/' + accession + '/index.json'

filing_content = requests.get(filing_url, headers=header).json()

for file in filing_content['directory']['item']:
    
    # grab the filing summary and create a new url leading to the file so we can download it
    if file['name'] == 'FilingSummary.xml':
        
        xml_summary = r'https://www.sec.gov' + filing_content['directory']['name'] + '/' + file['name']
        
        print('-'*100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)

----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/FilingSummary.xml


In [208]:
base_url = xml_summary.replace('FilingSummary.xml', '')
content = requests.get(xml_summary, headers=header).content
soup = BeautifulSoup(content, 'lxml')

reports = soup.find('myreports')

master_reports = []

for report in reports.find_all('report')[:-1]:
    
    report_dict = dict()
    report_dict['name_short'] = report.shortname.text
    report_dict['name_long'] = report.longname.text
    report_dict['position'] = report.position.text
    report_dict['category'] = report.menucategory.text
    report_dict['url'] = base_url + report.htmlfilename.text
    
    master_reports.append(report_dict)

    
statements_url = []

for report_dict in master_reports:
    
    item1 = r'Consolidated Balance Sheets'.upper()
    item2 = r'Consolidated Statements of Operations'.upper()
    item3 = r'Consolidated Statements of Cash Flows'.upper()
    
    report_list = [item1, item2, item3]
    
    if report_dict['name_short'] in report_list:
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])
        
        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF OPERATIONS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R3.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF CASH FLOWS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R5.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED BALANCE SHEETS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R6.htm


In [209]:
statements_data = []

for statement in statements_url:
    
    statement_data = dict()
    statement_data['headers'] = []
    statement_data['sections'] = []
    statement_data['data'] = []
    
    content = requests.get(statement, headers=header).content
    report_soup = BeautifulSoup(content, 'html')
    
    for index, row in enumerate(report_soup.table.find_all('tr')):
        
        cols = row.find_all('td')
        
        if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0):
            
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)
            
        elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
            
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)
            
        elif (len(row.find_all('th')) != 0):
            
            hed_row = [ele.text.strip() for ele in row.find_all('th')]
            statement_data['headers'].append(hed_row)
              
        else:
              print('We encountered an error.')
              
    statements_data.append(statement_data)

In [210]:
income_headers = statements_data[0]['headers'][1]
income_data = statements_data[0]['data']

# put the data into a df
income_df = pd.DataFrame(income_data)

# define an rename index col
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis=1)

# get rid of the $, (), and convert empty cells to NaNs
income_df = income_df.replace('[\$,)]', '', regex=True)\
                     .replace('[(]', '-', regex=True)\
                     .replace('', 'NaN', regex=True)

# everything otiginally comes in as string, so we onvert to float
income_df = income_df.astype(float)

# change column headers
income_df.columns = income_headers

income_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Revenues,29697844.0,24996056.0,20156447.0
Cost of revenues,17332683.0,15276319.0,12440213.0
Marketing,2545146.0,2228362.0,2652462.0
Technology and development,2273885.0,1829600.0,1545149.0
General and administrative,1351621.0,1076486.0,914369.0
Operating income,6194509.0,4585289.0,2604254.0
Interest expense,-765620.0,-767499.0,-626023.0
Interest and other income (expense),411214.0,-618441.0,84000.0
Income before income taxes,5840103.0,3199349.0,2062231.0
Provision for income taxes,-723875.0,-437954.0,-195315.0


In [211]:
socf_headers = statements_data[1]['headers'][1]
socf_data = statements_data[1]['data']

# put the data into a df
socf_df = pd.DataFrame(socf_data)

# define an rename index col
socf_df.index = socf_df[0]
socf_df.index.name = 'Category'
socf_df = socf_df.drop(0, axis=1)


# get rid of the $, (), and convert empty cells to NaNs
socf_df = socf_df.replace('[\$,)]', '', regex=True)\
                     .replace('[(]', '-', regex=True)\
                     .replace('', 'NaN', regex=True)

# everything otiginally comes in as string, so we onvert to float
socf_df = socf_df.astype(float)

# change column headers
socf_df.columns = socf_headers

socf_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net income,5116228.0,2761395.0,1866916.0
Additions to content assets,-17702202.0,-11779284.0,-13916683.0
Change in content liabilities,232898.0,-757433.0,-694011.0
Amortization of content assets,12230367.0,10806912.0,9216247.0
"Depreciation and amortization of property, equipment and intangibles",208412.0,115710.0,103579.0
Stock-based compensation expense,403220.0,415180.0,405376.0
Foreign currency remeasurement loss (gain) on debt,-430661.0,533278.0,-45576.0
Other non-cash items,376777.0,293126.0,228230.0
Deferred income taxes,199548.0,70066.0,-94443.0
Other current assets,-369681.0,-187623.0,-252113.0


In [212]:
bs_headers = statements_data[2]['headers'][0]
main, fy21, fy20 = bs_headers
bs_headers = fy21, fy20

bs_data = statements_data[2]['data']

# put the data into a df
bs_df = pd.DataFrame(bs_data)

# define an rename index col
bs_df.index = bs_df[0]
bs_df.index.name = 'Category'
bs_df = bs_df.drop(0, axis=1)


# get rid of the $, (), and convert empty cells to NaNs
bs_df = bs_df.replace('[\$,)]', '', regex=True)\
             .replace('[(]', '-', regex=True)\
             .replace('', 'NaN', regex=True)

# everything otiginally comes in as string, so we onvert to float
bs_df = bs_df.astype(float)

# change column headers
bs_df.columns = bs_headers

bs_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Cash and cash equivalents,6027804.0,8205550.0
Other current assets,2042021.0,1556030.0
Total current assets,8069825.0,9761580.0
"Content assets, net",30919539.0,25383950.0
"Property and equipment, net",1323453.0,960183.0
Other non-current assets,4271846.0,3174646.0
Total assets,44584663.0,39280359.0
Current content liabilities,4292967.0,4429536.0
Accounts payable,837483.0,656183.0
Accrued expenses and other liabilities,1449351.0,1102196.0
