In [311]:
import pandas as pd
import os
import requests
import re
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from pandas_datareader import data
import pandas_datareader as web
from bs4 import BeautifulSoup
import json

# Yahoo Finance WEBSCRAPER

In [None]:
stocks = ['GOOGL','AAPL','TSLA','AMZN','META', 'SNAP']
User_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'

In [None]:
# Current Stock prices
base_url = 'https://finance.yahoo.com/quote/'
tickr_url = [f'{x}?p={x}&.tsrc=fin-srch' for x in stocks]

current_price = []
missing_data = []
print('Getting current prices for:')
for tickr,stock in zip(tickr_url,stocks):
    print(f'...{stock}...')
    try:
        content = requests.get(base_url+tickr,headers={'User-Agent':User_agent}).content
        soup = BeautifulSoup(content)
        soup = soup.find('fin-streamer',class_='Fw(b) Fz(36px) Mb(-4px) D(ib)') #Fw(b) Fz(36px) Mb(-4px) D(ib)
        current_price.append(float(soup['value']))
    except:
        print(f'No data for {stock}')
        missing_data.append(stock)
        current_price.append('Na')

pd.DataFrame({'Ticker':stocks, 'Current Price':current_price})

In [None]:
# 20Y Historical Stock Prices
base_url = 'https://query1.finance.yahoo.com/v7/finance/download/{}?' # 'AAPL?period1=1645596185&amp;period2=1677132185&amp;interval=1d&amp;events=history&amp;includeAdjustedClose=true'
params = {'range': '20y',
          'interval': '1d',
          'events':'history'}
print('Getting ({}) historical prices for:'.format(params['range']))
for stock in stocks:
    print(f'...{stock}...')
    try:
        os.mkdir('./Historical/{}/{}'.format(params['range'],stock))
        print('New directory completed')
    except FileExistsError:
        print('A directory already exists, additional data for {} is being saved'.format(stock))
    try:
        response = requests.get(base_url.format(stock), params=params,headers={'User-Agent': User_agent}).content
        soup = BeautifulSoup(response)
        ls = str(soup.find('p')).replace('<p>','').splitlines()
        cols = ls[0]
        data = np.array([x.split(',') for x in ls[1:]])
        pd.DataFrame(data=data,columns=cols.split(',')).to_csv('./Historical/{}/{}/historical_price.csv'.format(params['range'],stock))
    except:
        print(f'No data for {stock}')


# SEC DATA


In [313]:
SEC_user_agent = 'matiasrhuber@gmail.com'
encoding = 'gzip, deflate'
host = 'www.sec.gov'
SEC_headers={'User-Agent': SEC_user_agent,'Accept-Encoding':'gzip','Host':host}
CIK = '320193'
years = range(2000,2023)
params = {'action':'getcompany',
          'CIK': '789019',
          'type': '10-k', #optional
          'dateb': '20190101', #optional
          #'datea': 20220101, #optional
          'owner': 'exclude', # default set to exclude
          'start':'',
          'output': 'atom',
          'count': '100' #number of results I want to see default is 40
          } #### Never worked :(

In [314]:
# Ticker and CIK mapping
tickers_cik = requests.get("https://www.sec.gov/files/company_tickers.json",headers=SEC_headers)
tickr_text = tickers_cik.text
tickr_dict = json.loads(tickr_text)

for num in range(len(tickr_dict)):
    tickr_dict[(tickr_dict[str(num)]["ticker"])] = tickr_dict[str(num)]
    del tickr_dict[str(num)]
# for tickr in tickr_dict:
#     if tickr_dict[tickr]['cik_str'] == '1265107':
#         print(tickr)
# POTENTIAL MISSING DATA

In [315]:
# Index for file locations of 10-k and 10-q filings
base_url = 'https://www.sec.gov/Archives/edgar/full-index/'
quarters = ['QTR1','QTR2','QTR3','QTR4']
years = range(2000,2022) 
for year in years:
    try:
        os.mkdir(f'./SEC/Master_Index/{year}')
    except:
        print(f'Previous data from year {year} already saved')
    for q in quarters:
        print(f'Going through files from {year} {q}...')
        master_index = requests.get(base_url+f'{year}/{q}/master.idx', headers=SEC_headers).text
        master_index = master_index.split('--------------------------------------------------------------------------------')[1]
        master_index = master_index.replace('\n','|').split('|')
        del master_index[0]
        keys = master_index[::5]
        del master_index[::5]

        # Fit data into dictionary and select only 10-k and 10-q file types
        shape = int(len(master_index)/4)
        print(f'File size for {year} {q}: {shape}')
        
        data = np.array(master_index).reshape(shape,4)
        cols = ['Company Name', 'Form Type', 'Date Filled', 'File Name']
        dict_master = {}
        for set,key in zip(data,keys):
            dict_temp = {key:value for (key,value) in zip(cols,set)}
            if dict_temp['Form Type'] == '10-K':
                dict_master[key+'_10-k'] = dict_temp
            if dict_temp['Form Type'] == '10-Q':
                dict_master[key+'_10-q'] = dict_temp
        print(f'10-K or 10-Q Filings saved: {len(dict_master)}')
        
        with open(f'./SEC/Master_Index/{year}/{q}.txt','w') as f:
            f.write(json.dumps(dict_master))
   

Previous data from year 2000 already saved
Going through files from 2000 QTR1...
File size for 2000 QTR1: 116209
10-K or 10-Q Filings saved: 6832
Going through files from 2000 QTR2...
File size for 2000 QTR2: 81129
10-K or 10-Q Filings saved: 9617
Going through files from 2000 QTR3...
File size for 2000 QTR3: 72571
10-K or 10-Q Filings saved: 9022
Going through files from 2000 QTR4...
File size for 2000 QTR4: 72053
10-K or 10-Q Filings saved: 8978
Previous data from year 2001 already saved
Going through files from 2001 QTR1...
File size for 2001 QTR1: 111740
10-K or 10-Q Filings saved: 5311
Going through files from 2001 QTR2...
File size for 2001 QTR2: 90283
10-K or 10-Q Filings saved: 10073
Going through files from 2001 QTR3...
File size for 2001 QTR3: 74313
10-K or 10-Q Filings saved: 8335
Going through files from 2001 QTR4...
File size for 2001 QTR4: 75107
10-K or 10-Q Filings saved: 8248
Previous data from year 2002 already saved
Going through files from 2002 QTR1...
File size for 

KeyboardInterrupt: 

In [316]:
base_url = 'https://www.sec.gov/Archives/'
# stock = stocks[0]
file_types = ['_10-q','_10-k']
quarters = ['QTR1','QTR2','QTR3','QTR4']
years = ['2020','2021','2022']
curr_dir = os.getcwd()
folder = 'SEC'
# ocf = ['Net cash provided by operating activities', 'Cash generated by operating activities']
# title = 'CONSOLIDATED STATEMENTS OF CASH FLOWS'
# oex = ['Purchases of property and equipment', 'Payments for acquisition of property, plant and equipment']
# for year in years:
#     for quarter in quarters:
year = '2020'
quarter = 'QTR1'
file_type = '_10-q'
with open(f'./SEC/Master_Index/{year}/{quarter}.txt','r') as f: # Apply different years and quarters
    text = f.read()
dict_ind = json.loads(text)
stocks = 'GOOGL'
for stock in stocks:
    cik = tickr_dict[stock]['cik_str']
    try:
        file_type = file_types[0] # Quarterly Filings
        url = dict_ind[str(cik)+file_type]['File Name'].replace('-','').replace('.txt','')
    except:
        file_type = file_types[1] # Yearly Filings
        url = dict_ind[str(cik)+file_type]['File Name'].replace('-','').replace('.txt','')
    file = '/FilingSummary.xml'
    data = requests.get(base_url+url+file,headers=SEC_headers).content
    soup = BeautifulSoup(data, 'lxml')
    myreports = soup.find('myreports')

    # List with individual components from myreports
    master_reports = []
    print(f'Gathering data for {stock}{file_type} @ {quarter} {year}')
    for report in myreports.find_all('report')[:-1]:

    # dictionary with all relevant parts
        report_dict = {}
        report_dict['name_short'] = report.shortname.text
        report_dict['name_long'] = report.longname.text
        report_dict['position'] = report.position.text
        report_dict['category'] = report.menucategory.text
        report_dict['url'] = base_url + url + '/' + report.htmlfilename.text
        master_reports.append(report_dict)
        print(report_dict['name_short'])

    try: 
        os.mkdir(os.path.join(curr_dir,folder,stock))
    except:
        print('Saving master report along existing data {}'.format(stock))
    with open(os.path.join(curr_dir,folder,stock,f'master_reports_{file_type}_{year}_{quarter}.json'), 'w') as fout:
        json.dump(master_reports , fout)



Gathering data for G_10-k @ QTR1 2020
Document and Entity Information
Consolidated Balance Sheets
Consolidated Balance Sheets (Parenthetical)
Consolidated Statements of Income
Consolidated Statements of Comprehensive Income (Loss)
Consolidated Statements of Equity and Redeemable Non-controlling Interest
Consolidated Statements of Equity and Redeemable Non-controlling Interest (Parenthetical)
Consolidated Statements of Cash Flows
Organization
Summary of significant accounting policies
Business acquisitions
Cash and cash equivalents
Accounts receivable, net of reserve for doubtful receivables
Fair Value Measurements
Derivative financial instruments
Prepaid expenses and other current assets
Property, plant and equipment, net
Goodwill and intangible assets
Other Assets
Leases
Accrued expenses and other current liabilities
Long-term debt
Short-term borrowings
Other liabilities
Employee benefit plans
Stock-based compensation
Capital stock
Earnings per share
Cost of revenue
Selling, general a

In [317]:
financial_statements_mapping = {
    "Consolidated Balance Sheets": [
        "Consolidated Balance Sheets",
        "CONSOLIDATED BALANCE SHEETS",
        "Condensed Consolidated Balance Sheets",
        "CONDENSED CONSOLIDATED BALANCE SHEETS",
        "Consolidated Balance Sheets (Unaudited)",
        "CONSOLIDATED BALANCE SHEETS (Unaudited)",
        "Condensed Consolidated Balance Sheets (Unaudited)",
        "CONDENSED CONSOLIDATED BALANCE SHEETS (Unaudited)",
        "Balance Sheet",
        "BALANCE SHEET",
        "Statement of Financial Position",
        "STATEMENT OF FINANCIAL POSITION",
        "Consolidated Statements of Financial Position",
        "CONSOLIDATED STATEMENTS OF FINANCIAL POSITION",
        "Consolidated Statements of Financial Condition",
        "CONSOLIDATED STATEMENTS OF FINANCIAL CONDITION",
        "Consolidated Statements of Position",
        "CONSOLIDATED STATEMENTS OF POSITION",
        "Financial Position",
        "FINANCIAL POSITION"
    ],
    "Consolidated Statements of Operations": [
        "Consolidated Statements of Operations",
        "CONSOLIDATED STATEMENTS OF OPERATIONS",
        "Consolidated Statements Of Operations",
        "Consolidated Statements of Operations (Unaudited)",
        "CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)",
        "Condensed Consolidated Statements of Operations (Unaudited)",
        "CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)",
        "Consolidated Statements of Income",
        "CONSOLIDATED STATEMENTS OF INCOME",
        "Condensed Consolidated Statements of Income",
        "CONDENSED CONSOLIDATED STATEMENTS OF INCOME",
        "Income Statement",
        "INCOME STATEMENT",
        "Statement of Earnings",
        "STATEMENT OF EARNINGS",
        "Profit and Loss Statement (P&L)",
        "PROFIT AND LOSS STATEMENT (P&L)",
        "Statement of Comprehensive Income",
        "STATEMENT OF COMPREHENSIVE INCOME",
        "Statement of Operations",
        "STATEMENT OF OPERATIONS",
        "Statement of Income",
        "STATEMENT OF INCOME",
        "Earnings Statement",
        "EARNINGS STATEMENT",
        "Revenue and Expense Statement",
        "REVENUE AND EXPENSE STATEMENT",
        "Operating Statement",
        "OPERATING STATEMENT",
        "Statement of Profit and Loss and Other Comprehensive Income",
        "STATEMENT OF PROFIT AND LOSS AND OTHER COMPREHENSIVE INCOME",
        "Statement of Earnings and Retained Earnings",
        "STATEMENT OF EARNINGS AND RETAINED EARNINGS",
        "Statement of Income and Expenditure",
        "STATEMENT OF INCOME AND EXPENDITURE"
    ],
    "Consolidated Statements of Cash Flows": [
        "Consolidated Statements of Cash Flows",
        "CONSOLIDATED STATEMENTS OF CASH FLOWS",
        "Consolidated Statements of Cash Flows (Unaudited)",
        "CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS (Unaudited)",
        "Condensed Consolidated Statements of Cash Flows",
        "CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS",
        "Cash Flow Statement",
        "CASH FLOW STATEMENT",
        "Statement of Cash Flows",
        "STATEMENT OF CASH FLOWS",
        "Statement of Changes in Cash",
        "STATEMENT OF CHANGES IN CASH",
        "Statement of Cash Flow",
        "STATEMENT OF CASH FLOW"
    ],
    "Consolidated Statements of Shareholders' Equity": [
        "Consolidated Statements of Shareholders' Equity",
        "CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY",
        "Condensed Consolidated Statements of Shareholders' Equity (Unaudited)",
        "CONDENSED CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY (Unaudited)",
        "Consolidated Statements of Redeemable Noncontrolling Interest and Stockholders' Equity (Unaudited)",
        "CONSOLIDATED STATEMENTS OF REDEEMABLE NONCONTROLLING INTEREST AND STOCKHOLDERS' EQUITY (Unaudited)",
        "Consolidated Statements of Redeemable Noncontrolling Interest and Stockholders' Equity",
        "CONSOLIDATED STATEMENTS OF REDEEMABLE NONCONTROLLING INTEREST AND STOCKHOLDERS' EQUITY",
        "Consolidated Statements of Stockholders' Equity",
        "CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY",
        "Condensed Statements of Shareholders' Equity",
        "CONDENSED STATEMENTS OF SHAREHOLDERS' EQUITY",
        "Statement of Shareholders' Equity",
        "STATEMENT OF SHAREHOLDERS' EQUITY",
        "Shareholders' Equity Statement",
        "STOCKHOLDERS' EQUITY",
        "Stockholders' Equity",
        "SHAREHOLDERS' EQUITY STATEMENT",
        "Equity Statement",
        "EQUITY STATEMENT"
        
    ]
}


In [322]:
# Earnings files are collected with desired data
### Figure out all possible formatting for the document names ###
stocks = ['GOOGL','AAPL','TSLA','AMZN','META', 'SNAP']
file_types = ['_10-q','_10-k']
quarters = ['QTR1','QTR2','QTR3','QTR4']
years = ['2020','2021','2022']
curr_dir = os.getcwd()
folder = 'SEC'
statements_url = []
# item1 = [r"Consolidated Balance Sheets", r"CONSOLIDATED BALANCE SHEETS", r"CONDENSED CONSOLIDATED BALANCE SHEETS (Unaudited)"]   #"CONSOLIDATED BALANCE SHEETS"
# item2 = [r"Consolidated Statements of Operations", r"CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)", r"CONSOLIDATED STATEMENTS OF INCOME", r"Consolidated Statements Of Operations (Unaudited)", r"CONSOLIDATED STATEMENTS OF OPERATIONS"]   #"CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)"
# item3 = [r"Consolidated Statements of Cash Flows (Unaudited)", r"CONSOLIDATED STATEMENTS OF CASH FLOWS", r"CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS (Unaudited)", r"Consolidated Statements of Cash Flows"] #"CONSOLIDATED STATEMENTS OF CASH FLOWS" #CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS (Unaudited)"
# item4 = [r"Consolidated Statements of Redeemable Noncontrolling Interest and Stockholders' Equity", r"CONDENSED CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY (Unaudited)", r"CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY", r"Consolidated Statements of Stockholders' Equity", r"CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY"]   #"CONDENSED CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY (Unaudited)"
   
missing_data_ls = [] 
num_missing = 0

for stock in stocks:
    try:
        os.mkdir(os.path.join(curr_dir,folder,stock,'financial_statements_url'))
    except:
        print('...previous financial statements already exist...')
    for year in years:
        for quarter in quarters:
            print('*'*100)
            print(f'{stock} {year} {quarter}')
            try:
                file_type = file_types[0]
                with open(os.path.join(curr_dir,folder,stock,'master_reports',f'master_reports_{file_type}_{year}_{quarter}.json')) as f:
                    master_reports = json.load(f)
            except:
                file_type = file_types[1]
                with open(os.path.join(curr_dir,folder,stock,'master_reports',f'master_reports_{file_type}_{year}_{quarter}.json')) as f:
                    master_reports = json.load(f)

            missing_data = [item for item in financial_statements_mapping]
            # THIS NEEDS TO BE DONE WITH RE ### this text may change slightly per quarter
            statements_url = []
            dict_url = {}
            for report_dict in master_reports:
                
                report_list = financial_statements_mapping 
                for item in report_list:
                    if report_dict['name_short'] in financial_statements_mapping[item]:                        

                        print('-'*100)
                        print(report_dict['name_short'])
                        print(report_dict['url'])
                        
                        
                        # print(item)
                        # print(report_dict['name_short'])
                        # print(missing_data)
                        try:
                            missing_data.remove(item)
                            statements_url.append(report_dict['url'])
                            dict_url[item] = report_dict['url']
                        except ValueError:
                            print(f'MULTIPLE REPORTS FOUND FOR: {item}')
            
            with open(os.path.join(curr_dir,folder,stock,'financial_statements_url',f'url{file_type}_{year}_{quarter}.json'), 'w') as f:
                json.dump(dict_url,f)
                
                        

                    # elif report_dict['name_short'] in ['CONDENSED ' + x for x in financial_statements_mapping[item]]:
                    
                    #     print('-'*100)
                    #     print(report_dict['name_short'])
                    #     print(report_dict['url'])
                        
                    #     statements_url.append(report_dict['url'])
                    #     missing_data.remove(item)
            if len(missing_data) != 0:
                num_missing += len(missing_data)
                missing_data_ls.append(f'Missing Data for {stock}, {year}, {quarter}, {file_type} \n {len(missing_data)}')
                
                
for x in missing_data_ls:
    print(x)
    
print(f'Total: {num_missing}')
print(statements_url)

...previous financial statements already exist...
****************************************************************************************************
GOOGL 2020 QTR1
----------------------------------------------------------------------------------------------------
CONSOLIDATED BALANCE SHEETS
https://www.sec.gov/Archives/edgar/data/1652044/000165204420000008/R2.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF INCOME
https://www.sec.gov/Archives/edgar/data/1652044/000165204420000008/R4.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY
https://www.sec.gov/Archives/edgar/data/1652044/000165204420000008/R7.htm
----------------------------------------------------------------------------------------------------
CONSOLIDATED STATEMENTS OF CASH FLOWS
https://www.sec.gov/Archives/edgar/data/1652044/00016520442

FileNotFoundError: [Errno 2] No such file or directory: './SEC/GOOGL/financial_statements_url/url_10-k_2020_QTR3.json'

In [324]:
with open(f'./SEC/AMZN/financial_statements_url/url_10-q_2020_QTR2.json','r') as f: 
    text = f.read()
    statements_url = json.loads(text)
print(statements_url)

with open(f'./SEC/GOOGL/financial_statements_url/url_10-q_2020_QTR2.json','r') as f: 
    text = f.read()
    statements_url = json.loads(text)
print(statements_url)

with open(f'./SEC/TSLA/financial_statements_url/url_10-q_2020_QTR2.json','r') as f: 
    text = f.read()
    statements_url = json.loads(text)
print(statements_url)

{'Consolidated Statements of Cash Flows': 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/R2.htm', 'Consolidated Statements of Operations': 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/R3.htm', 'Consolidated Balance Sheets': 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/R6.htm', "Consolidated Statements of Shareholders' Equity": 'https://www.sec.gov/Archives/edgar/data/1018724/000101872420000010/R13.htm'}
{'Consolidated Balance Sheets': 'https://www.sec.gov/Archives/edgar/data/1652044/000165204420000021/R2.htm', 'Consolidated Statements of Operations': 'https://www.sec.gov/Archives/edgar/data/1652044/000165204420000021/R4.htm', "Consolidated Statements of Shareholders' Equity": 'https://www.sec.gov/Archives/edgar/data/1652044/000165204420000021/R7.htm', 'Consolidated Statements of Cash Flows': 'https://www.sec.gov/Archives/edgar/data/1652044/000165204420000021/R8.htm'}
{'Consolidated Balance Sheets': 'https://www.sec.gov/

In [325]:
# let's assume we want all the statements in a single data set.
statements_data = []

# loop through each statement url
for statement in statements_url.values():

    # define a dictionary that will store the different parts of the statement.
    statement_data = {}
    statement_data['headers'] = []
    statement_data['sections'] = []
    statement_data['data'] = []
    
    # request the statement file content
    content = requests.get(statement, headers=SEC_headers).content
    report_soup = BeautifulSoup(content, 'lxml')

    # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
    for index, row in enumerate(report_soup.table.find_all('tr')):
        
        # first let's get all the elements.
        cols = row.find_all('td')
        
        # if it's a regular row and not a section or a table header
        if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)
            
        # if it's a regular row and a section but not a table header
        elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)
            
        # finally if it's not any of those it must be a header
        elif (len(row.find_all('th')) != 0):            
            hed_row = [ele.text.strip() for ele in row.find_all('th')]
            statement_data['headers'].append(hed_row)
            
        else:            
            print('We encountered an error.')

    # append it to the master list.
    statements_data.append(statement_data)

In [290]:
statements_data[2]['headers'][0]

['Consolidated Balance Sheets - USD ($) $ in Millions',
 'Dec. 31, 2019',
 'Dec. 31, 2018']

In [326]:
l = statements_data[0]['headers'][0]
[ item for item in l for _ in range(2) ]
n = 2
file_type = '_10-q'
period_mapping = {'_10-q' : '3 Months Ended', '_10-k' : '12 Months Ended'}

if len(statements_data[n]['headers']) != 1:
    subheaders = statements_data[n]['headers'][1]
    overheaders = statements_data[n]['headers'][0][1:]
    mult = int(len(subheaders)/len(overheaders))
    overheaders_table = [item for item in overheaders for _ in range(mult)]
    income_header = [x+' ('+y+')' for x,y in zip(subheaders,overheaders_table)]
else:
    subheaders = statements_data[n]['headers'][0][1:]
    income_header = [x+' ('+period_mapping[file_type]+')' for x in subheaders]

print(income_header)

['Total (3 Months Ended)', 'February 2020 Public Offering [Member] (3 Months Ended)', 'Redeemable Noncontrolling Interests [Member] (3 Months Ended)', 'Common Stock [Member] (3 Months Ended)', 'Common Stock [Member] \nFebruary 2020 Public Offering [Member] (3 Months Ended)', 'Additional Paid-In Capital [Member] (3 Months Ended)', 'Additional Paid-In Capital [Member] \nFebruary 2020 Public Offering [Member] (3 Months Ended)', 'Accumulated Deficit [Member] (3 Months Ended)', 'Accumulated Other Comprehensive Loss [Member] (3 Months Ended)', "Total Stockholder's Equity [Member] (3 Months Ended)", "Total Stockholder's Equity [Member] \nFebruary 2020 Public Offering [Member] (3 Months Ended)", 'Noncontrolling Interests in Subsidiaries [Member] (3 Months Ended)']


In [302]:
curr_dir = os.getcwd()
folder = 'SEC'
data_dir = os.path.join(curr_dir,folder,'AAPL','financial_statements_raw')
for data in os.listdir(data_dir):
    with open(os.path.join(data_dir,data)) as f:
        statements_data = json.load(f)

In [310]:
statements_data[2]['headers'][0][0]

"CONDENSED CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY (Unaudited) - USD ($) $ in Millions"

In [309]:
statements_data[0]['headers'][0][0]

'CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited) - USD ($) shares in Thousands, $ in Millions'

In [329]:
# Grab the proper components
i = 3
### Figure out all possible formatting according to document title, number of headers, and display table accordingly ###

if len(statements_data[i]['headers']) != 1:
    subheaders = statements_data[i]['headers'][1]
    overheaders = statements_data[i]['headers'][0][1:]
    mult = int(len(subheaders)/len(overheaders))
    overheaders_table = [item for item in overheaders for _ in range(mult)]
    income_header = [x+' ('+y+')' for x,y in zip(subheaders,overheaders_table)]
else:
    subheaders = statements_data[i]['headers'][0][1:]
    income_header = [x+' ('+period_mapping[file_type]+')' for x in subheaders]

# income_header =  statements_data[i]['headers'][-1] #threee months ended gives different formatting #[0][1:]
income_data = statements_data[i]['data']

# Put the data in a DataFrame
income_df = pd.DataFrame(income_data)

# Display
print('-'*100)
print('Before Reindexing')
print('-'*100)
display(income_df.head())

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis = 1)

# Display
print('-'*100)
print('Before Regex')
print('-'*100)
display(income_df.head())

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
income_df = income_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# Display
print('-'*100)
print('Before type conversion')
print('-'*100)
display(income_df.head())

# everything is a string, so let's convert all the data to a float.
df_copy = income_df.copy()
income_df = income_df.astype(float)

# Change the column headers
print(income_header)
income_df.columns = income_header

# Display
print('-'*100)
print('Final Product')
print('-'*100)

# show the df
display(income_df)

current_dir = os.getcwd()
try:
    folder_dir = os.path.join(current_dir,'SEC',stock)
    os.mkdir(folder_dir)
except:
    print(f'Saving over data for {stock} in {year} {quarter}...')
income_df.to_csv(os.path.join(folder_dir,f"{year+quarter+file_type}.csv"))


----------------------------------------------------------------------------------------------------
Before Reindexing
----------------------------------------------------------------------------------------------------


Unnamed: 0,0,1,2
0,Net income (loss),$ 68,$ (668)
1,"Depreciation, amortization and impairment",553,468
2,Stock-based compensation,211,208
3,Amortization of debt discounts and issuance costs,46,40
4,Inventory and purchase commitments write-downs,45,81


----------------------------------------------------------------------------------------------------
Before Regex
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Net income (loss),$ 68,$ (668)
"Depreciation, amortization and impairment",553,468
Stock-based compensation,211,208
Amortization of debt discounts and issuance costs,46,40
Inventory and purchase commitments write-downs,45,81


----------------------------------------------------------------------------------------------------
Before type conversion
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Net income (loss),68,-668
"Depreciation, amortization and impairment",553,468
Stock-based compensation,211,208
Amortization of debt discounts and issuance costs,46,40
Inventory and purchase commitments write-downs,45,81


['Mar. 31, 2020 (3 Months Ended)', 'Mar. 31, 2019 (3 Months Ended)']
----------------------------------------------------------------------------------------------------
Final Product
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,"Mar. 31, 2020 (3 Months Ended)","Mar. 31, 2019 (3 Months Ended)"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Net income (loss),68.0,-668.0
"Depreciation, amortization and impairment",553.0,468.0
Stock-based compensation,211.0,208.0
Amortization of debt discounts and issuance costs,46.0,40.0
Inventory and purchase commitments write-downs,45.0,81.0
Loss on disposals of fixed assets,7.0,18.0
Foreign currency transaction net loss (gain),19.0,-39.0
Non-cash interest and other operating activities,58.0,116.0
Operating cash flow related to repayment of discounted convertible notes,,-188.0
Accounts receivable,-14.0,-100.0


Saving over data for SNAP in 2022 QTR4...


In [330]:
df_copy

Unnamed: 0_level_0,1,2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Net income (loss),68.0,-668.0
"Depreciation, amortization and impairment",553.0,468.0
Stock-based compensation,211.0,208.0
Amortization of debt discounts and issuance costs,46.0,40.0
Inventory and purchase commitments write-downs,45.0,81.0
Loss on disposals of fixed assets,7.0,18.0
Foreign currency transaction net loss (gain),19.0,-39.0
Non-cash interest and other operating activities,58.0,116.0
Operating cash flow related to repayment of discounted convertible notes,,-188.0
Accounts receivable,-14.0,-100.0


In [228]:
# Grab the proper components

### Figure out all possible formatting according to document title, number of headers, and display table accordingly ###
income_header =  statements_data[1]['headers'][1] #threee months ended gives different formatting #[0][1:]
income_data = statements_data[1]['data']

# Put the data in a DataFrame
income_df = pd.DataFrame(income_data)

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis = 1)

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
income_df = income_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# everything is a string, so let's convert all the data to a float.
income_df = income_df.astype(float)

# Change the column headers
income_df.columns = income_header

# show the df
display(income_df)

current_dir = os.getcwd()
try:
    folder_dir = os.path.join(current_dir,'SEC',stock)
    os.mkdir(folder_dir)
except:
    print(f'Saving over data for {stock} in {year} {quarter}...')
income_df.to_csv(os.path.join(folder_dir,f"{year+quarter+file_type}.csv"))


Unnamed: 0_level_0,"Dec. 31, 2019","Dec. 31, 2018","Dec. 31, 2017"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Total net sales,280522.0,232887.0,177866.0
Cost of sales,165536.0,139156.0,111934.0
Fulfillment,40232.0,34027.0,25249.0
Technology and content,35931.0,28837.0,22620.0
Marketing,18878.0,13814.0,10069.0
General and administrative,5203.0,4336.0,3674.0
"Other operating expense (income), net",201.0,296.0,214.0
Total operating expenses,265981.0,220466.0,173760.0
Operating income,14541.0,12421.0,4106.0
Interest income,832.0,440.0,202.0


Saving over data for SNAP in 2022 QTR4...


In [None]:
statements_data[3]

In [None]:
income_df.to_csv

In [None]:
# Search for Filings for a stock
base_url = 'https://www.sec.gov/Archives/'
ocf = ['Net cash provided by operating activities', 'Cash generated by operating activities']
title = 'CONSOLIDATED STATEMENTS OF CASH FLOWS'
oex = ['Purchases of property and equipment', 'Payments for acquisition of property, plant and equipment']
with open('./SEC/Master_Index/2020/QTR1.txt','r') as f:
    text = f.read()
dict_ind = json.loads(text)
cik = tickr_dict['META']['cik_str']
url = dict_ind[str(cik)+'_10-k']['File Name'].replace('-','').replace('.txt','')
for x in range(1,90):
    file = f'R{x}.htm'
    data = requests.get(base_url+url+f'/{file}',headers=SEC_headers).text
    if title in data:
        print(x)



In [None]:
data = requests.get(base_url+url+f'/R8.htm',headers=SEC_headers).text
data

# Stock Analysis

In [None]:
## Gather Data
def YfinanceData(Symbols,daymonthyear):
    # Yfinance_list = []
    Yfinance_dict = {}
    current_dir = os.getcwd()


    # Interval required 5 minutes
    start = dt.datetime(daymonthyear[2], daymonthyear[1], daymonthyear[0])

    for Symbol in Symbols:
        Yfinance_DataFrame = yf.download(tickers=Symbol, interval='1d', start= start)
        Yfinance_DataFrame['Ticker'] = Symbol
        # Yfinance_list.append(Yfinance_DataFrame)
        Yfinance_dict[Symbol] = Yfinance_DataFrame
        data_dir = os.path.join(current_dir,f'Stock_Data/{Symbol}_{daymonthyear[2]}{daymonthyear[1]}{daymonthyear[0]}.csv')
        Yfinance_DataFrame.to_csv(data_dir)
        
    return Yfinance_dict

In [None]:
def StockPerformance(tickers,daymonthyear):
    current_dir = os.getcwd()
    analysis_dir = os.path.join(current_dir,'Stock_Analysis')
    data_dir = os.path.join(current_dir,'Stock_Data')
    for tckr in tickers:
        data = pd.read_csv(os.path.join(data_dir,f'{tckr}_{daymonthyear[2]}{daymonthyear[1]}{daymonthyear[0]}.csv'))
        
        ticker_dir = os.path.join(analysis_dir,tckr)
        try:
            os.mkdir(ticker_dir)
        except FileExistsError:
            pass
        
        fig = plt.figure()
        plt.plot(data['Close'])
        plt.title(f'{tckr}_{daymonthyear[2]}{daymonthyear[1]}{daymonthyear[0]}')
        plt.savefig(os.path.join(analysis_dir,tckr,f'{tckr}_{daymonthyear[2]}{daymonthyear[1]}{daymonthyear[0]}_plot.pdf'))

In [None]:
# Specify stocks and Timeperiod
tickers = ['AMZN','GOOG','APPL']
daymonthyear = [30,12,2005]
data = YfinanceData(tickers,daymonthyear)
StockPerformance(tickers,daymonthyear)

In [None]:
market_cap_data = web.get_quote_yahoo(tickers)['marketCap']

In [None]:
tsla = yf.Ticker("TSLA")
tsla.info

In [None]:
for x in range(3):
    print(x)

In [None]:
import pandas as pd

df = pd.DataFrame({
    'Category': ['Fruit','Vegetable','Vegetable','Fruit','Vegetable','Vegetable','Fruit','Vegetable','Fruit','Vegetable'],
    'SubCategories': ['Apple','Brinjal','Brinjal','Apple','Carrot','Potato','Apple','Carrot','Banana','Brinjal'],
    'Count': [2,1,1,1,3,1,1,2,1,1],
})

df.set_index(['Category','SubCategories']).groupby(level=[0,1]).sum()

    

In [297]:
dic ={"geeks": "A","for":"B","geeks":"C"}

value = {i for i in dic if dic[i]=="B"}
print("key by value:",value)


key by value: {'for'}
