In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime

# Request headers passed to every `requests.get` call in this notebook;
# the User-Agent value is a contact e-mail address.
header = {
    'User-Agent': 'mdemiceli@gmail.com',
}

# PULLING ACCESSION NUMBERS FOR FILINGS

##### url for the most recent filings (the `filings.recent` block used below) --> https://data.sec.gov/submissions/CIK0001065280.json
##### url for filings from 2011 to 2018 --> https://data.sec.gov/submissions/CIK0001065280-submissions-001.json

In [40]:
# Download the submissions index for one company (identified by its CIK),
# cache it on disk as pretty-printed JSON, then load the cached copy into
# `submissions_mass` for the cells below.
cik = '0001065280'

base_submissions_url = r'https://data.sec.gov/submissions/CIK' + cik + r'.json'

content = requests.get(base_submissions_url, headers=header)
# Stop on an HTTP error instead of trying to JSON-decode an error page.
content.raise_for_status()
decoded_content = content.json()

# Write inside a `with` block so the file is flushed and closed before it is
# re-opened for reading; a bare open() leaves buffered data unwritten and the
# read-back below could then see a truncated file.
with open(r'sec_' + cik + r'.json', 'w') as submissions_json_file:
    json.dump(decoded_content, submissions_json_file, indent=3)

with open(r'sec_' + cik + r'.json') as sec:
    submissions_mass = json.load(sec)
    


In [22]:
# NOTE(review): disabled experiment -- every line in this cell is commented out
# and nothing here runs. It appears to fetch the additional
# '...-submissions-001.json' page and merge it into a copy of `submissions_mass`.
# If revived, be aware that it writes to the same 'sec_<cik>.json' path that the
# primary submissions download uses, so it would overwrite that cached file.

# mass_copy = submissions_mass.copy()

# base_submissions_url_2 = r'https://data.sec.gov/submissions/CIK0001065280-submissions-001.json'

# content_2 = requests.get(base_submissions_url_2, headers=header)
# decoded_content_2 = content_2.json()

# submissions_json_file_2 = open(r'sec_' + cik + r'.json', 'w')
# json.dump(decoded_content_2, submissions_json_file_2, indent=3)

# with open(r'sec_' + cik + r'.json') as sec:
#     submissions_mass_2 = json.load(sec)

# submissions_mass_2

# mass_copy.update(submissions_mass_2)
# mass_copy['filingDate']

In [39]:
# Pull the parallel form / filing-date / accession-number lists out of the
# 'recent' filings block and collect them into one dict of columns.
recent_filings = submissions_mass['filings']['recent']

forms = recent_filings['form']

filing_dates = recent_filings['filingDate']
# Parse the whole list in one vectorized call. The `infer_datetime_format`
# flag is deprecated in pandas 2.x (format inference is the default), so it is
# no longer passed. list(...) keeps the result a plain list of Timestamps.
filing_dates_datetime = list(pd.to_datetime(filing_dates))

accession_numbers = recent_filings['accessionNumber']
# Drop the dashes so the numbers match the dash-free form used in archive URLs.
accession_numbers_clean = [number.replace('-', '') for number in accession_numbers]

submission_data = {
    'FilingDates': filing_dates_datetime,
    'AccessionNumber': accession_numbers_clean,
    'Form': forms,
}

In [24]:
# dates = pd.DatetimeIndex(submissions_df['FilingDates']).year
# dates

## Cleaning Submissions Data & Establishing Dataframe

In [25]:
# Sanity check: print the length of each column list so a mismatch is visible
# before the lists are combined into a DataFrame.
for column_name in ('FilingDates', 'AccessionNumber', 'Form'):
    print(len(submission_data[column_name]))

1009
1009
1009


In [38]:
# One row per filing; columns are FilingDates, AccessionNumber and Form.
submissions_df = pd.DataFrame(submission_data)

In [27]:
# Accession numbers of the filings whose form type is exactly '10-K'.
is_ten_k = submissions_df['Form'] == '10-K'
accession_ten_k = submissions_df.loc[is_ten_k, 'AccessionNumber']
accession_ten_k

64     000106528022000036
316    000106528021000040
594    000106528020000040
842    000106528019000043
Name: AccessionNumber, dtype: object

# SCRAPING FINANCIAL STATEMENT DATA FROM FILING SUMMARY

In [28]:
# Read the directory listing (index.json) of one filing and build the URL of
# its FilingSummary.xml, which is stored in `xml_summary` for the next cell.
base_url = r'https://www.sec.gov/Archives/edgar/data/'
cik = r'0001065280'
accession = r'000106528022000036'

filing_url = base_url + cik + '/' + accession + '/index.json'

filing_response = requests.get(filing_url, headers=header)
# Stop on an HTTP error instead of trying to JSON-decode an error page.
filing_response.raise_for_status()
filing_content = filing_response.json()

# Reset first so a value left over from an earlier run of this cell can never be
# reused silently when the current listing has no FilingSummary.xml.
xml_summary = None

for file in filing_content['directory']['item']:

    # grab the filing summary and create a new url leading to the file so we can download it
    if file['name'] == 'FilingSummary.xml':

        xml_summary = r'https://www.sec.gov' + filing_content['directory']['name'] + '/' + file['name']

        print('-'*100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)
        break

if xml_summary is None:
    raise LookupError('FilingSummary.xml not found in directory listing: ' + filing_url)

----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/FilingSummary.xml


In [29]:
# Parse FilingSummary.xml into a list of report descriptors (`master_reports`)
# and collect the URLs of the three primary financial statements
# (`statements_url`).
base_url = xml_summary.replace('FilingSummary.xml', '')

summary_response = requests.get(xml_summary, headers=header)
# Stop on an HTTP error instead of parsing an error page.
summary_response.raise_for_status()
content = summary_response.content
soup = BeautifulSoup(content, 'lxml')

# The document is parsed with an HTML parser, which lower-cases tag names,
# hence 'myreports' / 'report' / 'shortname' below.
reports = soup.find('myreports')
if reports is None:
    raise ValueError('No <MyReports> element found in ' + xml_summary)

master_reports = []

# NOTE(review): the last <Report> entry is deliberately skipped -- presumably it
# lacks some of the fields read below; confirm against a FilingSummary.xml.
for report in reports.find_all('report')[:-1]:

    report_dict = {
        'name_short': report.shortname.text,
        'name_long': report.longname.text,
        'position': report.position.text,
        'category': report.menucategory.text,
        'url': base_url + report.htmlfilename.text,
    }

    master_reports.append(report_dict)


# Statement names to keep. Built once, outside the loop, since they do not
# change per report. Matching is an exact comparison against the short name.
item1 = r'Consolidated Balance Sheets'.upper()
item2 = r'Consolidated Statements of Operations'.upper()
item3 = r'Consolidated Statements of Cash Flows'.upper()

report_list = [item1, item2, item3]

statements_url = []

for report_dict in master_reports:

    if report_dict['name_short'] in report_list:
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])

        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF OPERATIONS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R3.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF CASH FLOWS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R5.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED BALANCE SHEETS
https://www.sec.gov/Archives/edgar/data/1065280/000106528022000036/R6.htm


In [42]:
# Download each statement page and split the rows of its first table into
# three buckets: header rows, section-title rows, and regular data rows.
statements_data = []

for statement in statements_url:

    statement_data = {'headers': [], 'sections': [], 'data': []}

    statement_response = requests.get(statement, headers=header)
    # Stop on an HTTP error instead of parsing an error page.
    statement_response.raise_for_status()
    content = statement_response.content
    # Name the parser explicitly (same one used for FilingSummary.xml) so the
    # parse does not depend on which HTML parsers happen to be installed.
    report_soup = BeautifulSoup(content, 'lxml')

    for row in report_soup.table.find_all('tr'):

        # Look up each cell type once per row.
        header_cells = row.find_all('th')
        cols = row.find_all('td')

        if header_cells:
            # header row: any row containing <th> cells
            hed_row = [ele.text.strip() for ele in header_cells]
            statement_data['headers'].append(hed_row)

        elif row.find_all('strong'):
            # section row: no <th>, but bold text; keep only the label cell
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)

        else:
            # regular row: no <th> and no <strong>
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)

    statements_data.append(statement_data)

In [36]:
# Build the income-statement DataFrame from the first scraped statement.
# The second header row supplies the column labels.
income_headers = statements_data[0]['headers'][1]
income_data = statements_data[0]['data']

# put the data into a df
income_df = pd.DataFrame(income_data)

# define and rename the index column (column 0 holds the line-item labels)
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis=1)

# get rid of the $, commas and closing parentheses, turn an opening parenthesis
# into a minus sign, and convert empty cells to NaNs.
# Raw strings avoid the invalid '\$' escape; the empty-cell replacement is an
# exact (non-regex) match so it does not rely on how an empty regex is handled.
income_df = income_df.replace(r'[\$,)]', '', regex=True)\
                     .replace(r'[(]', '-', regex=True)\
                     .replace('', 'NaN')

# everything originally comes in as string, so we convert to float
income_df = income_df.astype(float)

# change column headers
income_df.columns = income_headers

income_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Revenues,29697844.0,24996056.0,20156447.0
Cost of revenues,17332683.0,15276319.0,12440213.0
Marketing,2545146.0,2228362.0,2652462.0
Technology and development,2273885.0,1829600.0,1545149.0
General and administrative,1351621.0,1076486.0,914369.0
Operating income,6194509.0,4585289.0,2604254.0
Interest expense,-765620.0,-767499.0,-626023.0
Interest and other income (expense),411214.0,-618441.0,84000.0
Income before income taxes,5840103.0,3199349.0,2062231.0
Provision for income taxes,-723875.0,-437954.0,-195315.0


In [32]:
# Build the cash-flow-statement DataFrame from the second scraped statement.
# The second header row supplies the column labels.
socf_headers = statements_data[1]['headers'][1]
socf_data = statements_data[1]['data']

# put the data into a df
socf_df = pd.DataFrame(socf_data)

# define and rename the index column (column 0 holds the line-item labels)
socf_df.index = socf_df[0]
socf_df.index.name = 'Category'
socf_df = socf_df.drop(0, axis=1)


# get rid of the $, commas and closing parentheses, turn an opening parenthesis
# into a minus sign, and convert empty cells to NaNs.
# Raw strings avoid the invalid '\$' escape; the empty-cell replacement is an
# exact (non-regex) match so it does not rely on how an empty regex is handled.
socf_df = socf_df.replace(r'[\$,)]', '', regex=True)\
                 .replace(r'[(]', '-', regex=True)\
                 .replace('', 'NaN')

# everything originally comes in as string, so we convert to float
socf_df = socf_df.astype(float)

# change column headers
socf_df.columns = socf_headers

socf_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020","Dec. 31, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net income,5116228.0,2761395.0,1866916.0
Additions to content assets,-17702202.0,-11779284.0,-13916683.0
Change in content liabilities,232898.0,-757433.0,-694011.0
Amortization of content assets,12230367.0,10806912.0,9216247.0
"Depreciation and amortization of property, equipment and intangibles",208412.0,115710.0,103579.0
Stock-based compensation expense,403220.0,415180.0,405376.0
Foreign currency remeasurement loss (gain) on debt,-430661.0,533278.0,-45576.0
Other non-cash items,376777.0,293126.0,228230.0
Deferred income taxes,199548.0,70066.0,-94443.0
Other current assets,-369681.0,-187623.0,-252113.0


In [33]:
# Build the balance-sheet DataFrame from the third scraped statement.
# Its first header row is unpacked as (title, period, period); only the two
# period labels are kept as column names.
bs_headers = statements_data[2]['headers'][0]
main, fy21, fy20 = bs_headers
bs_headers = fy21, fy20

bs_data = statements_data[2]['data']

# put the data into a df
bs_df = pd.DataFrame(bs_data)

# define and rename the index column (column 0 holds the line-item labels)
bs_df.index = bs_df[0]
bs_df.index.name = 'Category'
bs_df = bs_df.drop(0, axis=1)


# get rid of the $, commas and closing parentheses, turn an opening parenthesis
# into a minus sign, and convert empty cells to NaNs.
# Raw strings avoid the invalid '\$' escape; the empty-cell replacement is an
# exact (non-regex) match so it does not rely on how an empty regex is handled.
bs_df = bs_df.replace(r'[\$,)]', '', regex=True)\
             .replace(r'[(]', '-', regex=True)\
             .replace('', 'NaN')

# everything originally comes in as string, so we convert to float
bs_df = bs_df.astype(float)

# change column headers
bs_df.columns = bs_headers

bs_df

Unnamed: 0_level_0,"Dec. 31, 2021","Dec. 31, 2020"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Cash and cash equivalents,6027804.0,8205550.0
Other current assets,2042021.0,1556030.0
Total current assets,8069825.0,9761580.0
"Content assets, net",30919539.0,25383950.0
"Property and equipment, net",1323453.0,960183.0
Other non-current assets,4271846.0,3174646.0
Total assets,44584663.0,39280359.0
Current content liabilities,4292967.0,4429536.0
Accounts payable,837483.0,656183.0
Accrued expenses and other liabilities,1449351.0,1102196.0
