### Create Financial Statements with Edgar Index Files

###### Objective: download data using the edgar library and visualize financial statements for any selected company


In [1]:
import pandas as pd
import edgar
import psycopg2
import os
import glob
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [174]:
#Only run if this cell if you have already run the whole script once and the table exists:
conn = psycopg2.connect("host=localhost dbname=postgres user=postgres password=Paolino1")
cur = conn.cursor()
cur.execute("""DROP TABLE SIC""")
cur.close()
conn.close()

In [175]:
!rm -R data_sic

In [176]:
!mkdir data_sic

In [177]:
#Downloading all files:
edgar.download_index('data_sic', 1993)

In [178]:
!ls data_sic

1993-QTR1.tsv 1998-QTR3.tsv 2004-QTR1.tsv 2009-QTR3.tsv 2015-QTR1.tsv
1993-QTR2.tsv 1998-QTR4.tsv 2004-QTR2.tsv 2009-QTR4.tsv 2015-QTR2.tsv
1993-QTR3.tsv 1999-QTR1.tsv 2004-QTR3.tsv 2010-QTR1.tsv 2015-QTR3.tsv
1993-QTR4.tsv 1999-QTR2.tsv 2004-QTR4.tsv 2010-QTR2.tsv 2015-QTR4.tsv
1994-QTR1.tsv 1999-QTR3.tsv 2005-QTR1.tsv 2010-QTR3.tsv 2016-QTR1.tsv
1994-QTR2.tsv 1999-QTR4.tsv 2005-QTR2.tsv 2010-QTR4.tsv 2016-QTR2.tsv
1994-QTR3.tsv 2000-QTR1.tsv 2005-QTR3.tsv 2011-QTR1.tsv 2016-QTR3.tsv
1994-QTR4.tsv 2000-QTR2.tsv 2005-QTR4.tsv 2011-QTR2.tsv 2016-QTR4.tsv
1995-QTR1.tsv 2000-QTR3.tsv 2006-QTR1.tsv 2011-QTR3.tsv 2017-QTR1.tsv
1995-QTR2.tsv 2000-QTR4.tsv 2006-QTR2.tsv 2011-QTR4.tsv 2017-QTR2.tsv
1995-QTR3.tsv 2001-QTR1.tsv 2006-QTR3.tsv 2012-QTR1.tsv 2017-QTR3.tsv
1995-QTR4.tsv 2001-QTR2.tsv 2006-QTR4.tsv 2012-QTR2.tsv 2017-QTR4.tsv
1996-QTR1.tsv 2001-QTR3.tsv 2007-QTR1.tsv 2012-QTR3.tsv 2018-QTR1.tsv
1996-QTR2.tsv 2001-QTR4.tsv 2007-QTR2.tsv 2012-QTR4.tsv 2018-QTR2.tsv
1996-Q

In [179]:
#Exploring a random file:

df = pd.read_csv('data_sic/2019-QTR3.tsv', sep='|', header=0)
print(df.shape)
print(df.head())
del df

(191792, 6)
   1000045  NICHOLAS FINANCIAL INC 10-Q  2019-08-14  \
0  1000045  NICHOLAS FINANCIAL INC    4  2019-08-14   
1  1000045  NICHOLAS FINANCIAL INC    4  2019-08-15   
2  1000045  NICHOLAS FINANCIAL INC    4  2019-08-20   
3  1000045  NICHOLAS FINANCIAL INC    4  2019-08-21   
4  1000045  NICHOLAS FINANCIAL INC    4  2019-08-27   

   edgar/data/1000045/0001564590-19-031992.txt  \
0  edgar/data/1000045/0001357521-19-000014.txt   
1  edgar/data/1000045/0001398344-19-014522.txt   
2  edgar/data/1000045/0001398344-19-014703.txt   
3  edgar/data/1000045/0001000045-19-000004.txt   
4  edgar/data/1000045/0001398344-19-015344.txt   

  edgar/data/1000045/0001564590-19-031992-index.html  
0  edgar/data/1000045/0001357521-19-000014-index....  
1  edgar/data/1000045/0001398344-19-014522-index....  
2  edgar/data/1000045/0001398344-19-014703-index....  
3  edgar/data/1000045/0001000045-19-000004-index....  
4  edgar/data/1000045/0001398344-19-015344-index....  


In [180]:
for i in os.listdir('data_sic'):
    if str(i).endswith('tsv'):
        print('Converting file: '+i)
        name = i[:-4]+'.csv'
        
        df = pd.read_csv('data_sic/'+i, sep='|', header=0)
        df.columns = ['CIK', 'Company_Name', 'Filing_Type', 'Filing_Date', 'URL_txt', 'URL_html']
        df.to_csv('data_sic/'+name)#index=False
        del df
        print('Created file: '+name)

Converting file: 2007-QTR1.tsv
Created file: 2007-QTR1.csv
Converting file: 2006-QTR1.tsv
Created file: 2006-QTR1.csv
Converting file: 2007-QTR3.tsv
Created file: 2007-QTR3.csv
Converting file: 2006-QTR3.tsv
Created file: 2006-QTR3.csv
Converting file: 2006-QTR2.tsv
Created file: 2006-QTR2.csv
Converting file: 2007-QTR2.tsv
Created file: 2007-QTR2.csv
Converting file: 2010-QTR4.tsv
Created file: 2010-QTR4.csv
Converting file: 2011-QTR4.tsv
Created file: 2011-QTR4.csv
Converting file: 2010-QTR1.tsv
Created file: 2010-QTR1.csv
Converting file: 2011-QTR1.tsv
Created file: 2011-QTR1.csv
Converting file: 2010-QTR3.tsv
Created file: 2010-QTR3.csv
Converting file: 2011-QTR3.tsv
Created file: 2011-QTR3.csv
Converting file: 2007-QTR4.tsv
Created file: 2007-QTR4.csv
Converting file: 2006-QTR4.tsv
Created file: 2006-QTR4.csv
Converting file: 2011-QTR2.tsv
Created file: 2011-QTR2.csv
Converting file: 2010-QTR2.tsv
Created file: 2010-QTR2.csv
Converting file: 1997-QTR1.tsv
Created file: 1997-QTR1.c

In [181]:
master_df = pd.concat([pd.read_csv(f'data_sic/{f}', sep=',', header=0, encoding='utf8').drop('Unnamed: 0', axis=1) for f in os.listdir('data_sic') if f.endswith('.csv')])

In [182]:
master_df.head()

Unnamed: 0,CIK,Company_Name,Filing_Type,Filing_Date,URL_txt,URL_html
0,100030,TRW INC,DEF 14A,1995-03-14,edgar/data/100030/0000950132-95-000059.txt,edgar/data/100030/0000950132-95-000059-index.html
1,100030,TRW INC,S-8,1995-03-27,edgar/data/100030/0000950152-95-000436.txt,edgar/data/100030/0000950152-95-000436-index.html
2,100030,TRW INC,S-8,1995-03-27,edgar/data/100030/0000950152-95-000439.txt,edgar/data/100030/0000950152-95-000439-index.html
3,100030,TRW INC,SC 13G/A,1995-02-02,edgar/data/100030/0000950152-95-000083.txt,edgar/data/100030/0000950152-95-000083-index.html
4,100030,TRW INC,SC 13G,1995-02-10,edgar/data/100030/0000315066-95-002097.txt,edgar/data/100030/0000315066-95-002097-index.html


In [183]:
master_df.to_csv('data_sic/master.csv', sep='@', index=False)

### API

In [35]:
#Grab CIK for each CIK in df:
#For each CIK, send request to 10-k
#In 10K, extract SIC
#Write SIC back to df

In [2]:
def get_SIC(url):
    
    download = requests.get(url)
    soup = BeautifulSoup(download.content, 'html')
    
    if soup.find('b') == None:
        sic = 'None'
    else:
        b = [i for i in list(map(str,soup.find_all('b'))) if 'SIC' in i]
        sic = int(re.findall(r'\d+', ''.join(b))[0]) 
    
    return sic

In [3]:
master_df = pd.read_csv('data_sic/master.csv', sep='@')

In [4]:
df_10k = master_df[master_df['Filing_Type'] == '10-K']
df_10q = master_df[master_df['Filing_Type'] == '10-Q']

In [5]:
df_10k.shape

(189488, 6)

In [6]:
df_10q.shape

(587200, 6)

In [7]:
intersect = df_10k[df_10k['CIK'].isin(df_10q['CIK'])]

In [8]:
tenk_notin_tenq = df_10k[~df_10k['CIK'].isin(df_10q['CIK'])]

In [9]:
tenq_notin_tenk = df_10q[~df_10q['CIK'].isin(df_10k['CIK'])]

In [10]:
total_df = intersect.append(tenk_notin_tenq).append(tenq_notin_tenk)

In [11]:
total_df.shape

(202553, 6)

In [12]:
total_df = total_df.sort_values('Filing_Date', ascending=False)

In [13]:
total_df = total_df.drop_duplicates(subset=['CIK'], keep='first')

In [14]:
total_df['URL_html'] = total_df['URL_html'].astype(str)

In [15]:
total_df.shape

(38820, 6)

In [16]:
if len(total_df[total_df['URL_html'] == 'nan']) == 0:
    print('Test Passed: No nulls in URL column')

Test Passed: No nulls in URL column


In [17]:
total_df.head()

Unnamed: 0,CIK,Company_Name,Filing_Type,Filing_Date,URL_txt,URL_html
7973492,1067294,"CRACKER BARREL OLD COUNTRY STORE, INC",10-K,2019-09-27,edgar/data/1067294/0001140361-19-017413.txt,edgar/data/1067294/0001140361-19-017413-index....
8080190,1715768,Peninsula Acquisition Corp,10-K,2019-09-27,edgar/data/1715768/0001213900-19-019134.txt,edgar/data/1715768/0001213900-19-019134-index....
8132896,828530,ADMIRAL FINANCIAL CORP,10-K,2019-09-27,edgar/data/828530/0001553350-19-001013.txt,edgar/data/828530/0001553350-19-001013-index.html
8072104,1682593,"Community Savings Bancorp, Inc.",10-K,2019-09-27,edgar/data/1682593/0001144204-19-046330.txt,edgar/data/1682593/0001144204-19-046330-index....
8047042,1580149,BIOVIE INC.,10-K,2019-09-27,edgar/data/1580149/0001520138-19-000336.txt,edgar/data/1580149/0001520138-19-000336-index....


In [18]:
def CIK_SIC_mapping(df):

    SIC = {}
    for cik, url in zip(df['CIK'], df['URL_html']):
        #print('https://www.sec.gov/Archives/'+url)
        sic = get_SIC('https://www.sec.gov/Archives/'+url)
        #print('https://www.sec.gov/Archives/'+url)
        #print(sic)
        SIC[cik] = sic
    
    return SIC

In [19]:
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = np.array_split(total_df,10)

In [20]:
#Stop here

In [21]:
def make_sic_list(df, filename):
    cik_sic_list = []
    cik_sic_list.append(CIK_SIC_mapping(df))
    out = pd.DataFrame(cik_sic_list)
    out.to_csv(filename, index=False)
    return out

In [54]:
out1 = make_sic_list(df1, 'sic_cik_mapping_1_new.csv')

In [22]:
out2 = make_sic_list(df2, 'sic_cik_mapping_2_new.csv')

In [23]:
out2.head()

Unnamed: 0,1800,1961,2034,2186,2488,2969,3116,3146,3153,3197,...,1743415,1743587,1744179,1744895,1745916,1747009,1747777,1749849,1752773,1754824
0,2834,7372,5122,3663,3674,2810,2834,4924,4911,3564,...,,6189,,6770,6162,4922,,100,,6770


In [24]:
out3 = make_sic_list(df3, 'sic_cik_mapping_3_new.csv')

In [25]:
out3.head()

Unnamed: 0,2491,3952,5133,5768,6314,6732,8411,8855,9235,9263,...,1672985,1673731,1674786,1677761,1679629,1681281,1681292,1685311,1689265,1693691
0,7372,3480,2771,3844,3663,3560,1381,2540,6282,8741,...,6770,7389,,6798,5712,6770,6770,5812,,6770


In [26]:
out4 = make_sic_list(df4, 'sic_cik_mapping_4_new.csv')

In [27]:
out4.head()

Unnamed: 0,20,1853,1923,2135,2601,3202,3673,3906,3982,4611,...,1542708,1542774,1543668,1548223,1548404,1548805,1550957,1551335,1553210,1554864
0,3823,3711,1540,7374,3674,4512,4911,,1389,3089,...,7370,7370,6770,3550,5051,5190,6770,4941,7372,6770


In [28]:
out5 = make_sic_list(df5, 'sic_cik_mapping_5_new.csv')

In [29]:
out5.head()

Unnamed: 0,3370,4458,5187,6383,6571,9128,9779,10081,10427,14073,...,1444820,1444874,1445059,1445222,1445223,1447235,1447879,1448239,1450687,1451821
0,5045,7510,2834,4841,7200,1000,5600,2834,3851,6513,...,6321,6282,6021,8200,3679,6189,,6770,6189,6189


In [30]:
out6 = make_sic_list(df6, 'sic_cik_mapping_6_new.csv')

In [31]:
out6.head()

Unnamed: 0,3133,3327,3333,4164,4285,4317,6720,9534,9626,9801,...,1383053,1383245,1383361,1383362,1383363,1383727,1383944,1383945,1383946,1384062
0,6022,5990,5411,3357,3330,7373,3949,3011,6022,2750,...,6189,6311,6189,6189,6189,6189,6189,6189,6189,6189


In [32]:
out7 = make_sic_list(df7, 'sic_cik_mapping_7_new.csv')

In [33]:
out7.head()

Unnamed: 0,2062,3753,4344,5550,5907,6260,6342,7694,8643,8734,...,1327524,1327541,1328325,1328389,1329037,1329054,1329090,1329589,1329619,1329638
0,6351,3931,7359,9995,4813,3861,,5160,8090,5812,...,6189,6189,6189,6189,6189,6189,6189,6189,6189,6189


In [34]:
out8 = make_sic_list(df8, 'sic_cik_mapping_8_new.csv')

In [35]:
out8.head()

Unnamed: 0,2024,2589,2852,3000,3642,3721,3959,4310,5016,5197,...,1225763,1225764,1227941,1237917,1237918,1237919,1237921,1237922,1239203,1245445
0,5070,8711,2040,4513,6331,3663,2731,2834,6331,8062,...,1311,1311,8062,4841,4841,4841,4841,4841,5500,8062


In [36]:
out9 = make_sic_list(df9, 'sic_cik_mapping_9_new.csv')

In [37]:
out9.head()

Unnamed: 0,1952,1985,2070,2310,2880,3292,3662,3941,4165,4507,...,1116152,1116561,1117535,1117927,1118402,1119744,1121750,1127921,1127922,1127923
0,1700,6311,3620,3555,5040,2250,3634,3523,9995,2030,...,3576,7389,6189,,6189,7372,3728,3089,3089,3089


In [38]:
out10 = make_sic_list(df10, 'sic_cik_mapping_10_new.csv')

In [39]:
out10.head()

Unnamed: 0,1761,1969,1988,2145,2435,2457,2648,3228,3398,3449,...,1058625,1058634,1058635,1058673,1059010,1059067,1059068,1061154,1061156,1092481
0,2670,4955,3577,5190,3672,3564,6311,1040,3661,6411,...,7320,7320,7320,,,,,,,


In [None]:
#Old code here:

In [216]:
cik_sic_list_1 = []
for df in [df1, df2, df3, df4, df5]:
    cik_sic_list_1.append(CIK_SIC_mapping(df))

In [16]:
#len(cik_sic_list_1)

In [17]:
#len(cik_sic_list_2)

In [251]:
#Already done
#final1 = pd.DataFrame(cik_sic_list_1)

In [252]:
#Already done
#final2 = pd.DataFrame(cik_sic_list_2)

In [None]:
#JUST READ MASTER IN CSV FORM NOW!

In [253]:
#Already done
#final1.to_csv('sic_cik_mapping_1.csv', index=False)

In [254]:
#Already done
#final2.to_csv('sic_cik_mapping_2.csv', index=False)

In [None]:
#### stop here

In [20]:
#Already done
#cik_sic_list_more = []

In [25]:
#Already done
#cik_sic_list_more.append(CIK_SIC_mapping(df8))

In [23]:
#Already done
#cik_sic_list_more.append(CIK_SIC_mapping(df9))

In [18]:
df10['URL_html_x'] = df10['URL_html_x'].astype(str)

In [30]:
#Already done
#final3 = pd.DataFrame(cik_sic_list_more)

In [31]:
#Already done
#final3.to_csv('cik_sic_list_more.csv', index=False)

In [19]:
cik_sic_list_more2 = []

In [20]:
#Finish running this:
cik_sic_list_more2.append(CIK_SIC_mapping(df10))

In [22]:
final4 = pd.DataFrame(cik_sic_list_more2)

In [23]:
final4.to_csv('cik_sic_list_more2.csv', index=False)

In [25]:
final4.T

Unnamed: 0,0
1988,
2435,
2880,
3228,
4438,
5177,
5696,
6992,
7119,
9696,


In [33]:
df10['URL_html_x']

6986410    edgar/data/1369017/0001136999-07-000585-index....
6986411    edgar/data/1369031/0001369031-07-000006-index....
6986412    edgar/data/1369057/0001056404-07-001261-index....
6986413    edgar/data/1369060/0000950137-07-004820-index....
6986414    edgar/data/1369100/0001369100-07-000006-index....
6986415    edgar/data/1369101/0001369101-07-000006-index....
6986416    edgar/data/1369105/0001285495-07-000019-index....
6986417    edgar/data/1369106/0001285495-07-000018-index....
6986418    edgar/data/1369107/0000950137-07-004821-index....
6986419    edgar/data/1369137/0001369137-07-000008-index....
6986420    edgar/data/1369168/0001369168-07-000002-index....
6986421    edgar/data/1369175/0001136999-07-000478-index....
6986422    edgar/data/1369197/0001369197-07-000006-index....
6986423    edgar/data/1369221/0001056404-07-001215-index....
6986424    edgar/data/1369285/0001369285-07-000001-index....
6986425    edgar/data/1369367/0001019965-07-000201-index....
6986426    edgar/data/13

In [29]:
df10['URL_html_x'].iloc[0]

'edgar/data/1369017/0001136999-07-000585-index.html'

In [None]:
#Old Code - Not needed

In [None]:
new_master = master_df.merge(df_un, how='left', on='CIK')

In [None]:
new_master.fillna(method='ffil')

In [94]:
download = requests.get('https://www.sec.gov/Archives/edgar/data/1002638/0001002638-19-000014-index.html')
soup1 = BeautifulSoup(download.content, 'html')

In [99]:
soup1.find_all('b')

[<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>]

In [101]:
download = requests.get('https://www.sec.gov/Archives/edgar/data/1000097/0001000097-19-000004-index.html')
soup = BeautifulSoup(download.content, 'html')

In [103]:
soup.find_all('b')

[]

In [85]:
soup.find_all('b')

[<b>[Amend]</b>,
 <b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>]

In [87]:
soup = BeautifulSoup('<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>', 'html')

In [89]:
soup.b

<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>

In [None]:
download = requests.get(co_url)

In [None]:
soup = BeautifulSoup(download.content, 'html')

In [None]:
soup

In [None]:
soup.find('b').text

In [None]:
def download_summary(co_url):

    ## Function to retrieve the 10-K Summary which contains links to individual reports
    ## Expects input from index_url function
    
    download = requests.get(co_url).json()
    
    xml_summary = r'https://www.sec.gov'
    
    for item in download['directory']['item']:
        if item['name'] == 'FilingSummary.xml':
            xml_summary += download['directory']['name']+'/' + item['name']

    new_base_url = xml_summary.replace('FilingSummary.xml', '')
    content = requests.get(xml_summary).content
    
    #Using BS4 to parse the XML content
    soup = BeautifulSoup(content, 'lxml')
    reports = soup.find('myreports')
    
    if reports is None:
        print('---------------------')
        print('Sorry, no reports were found for this company!')
        print('Please try a different public company!')
        print('---------------------')
        pass
    
    return new_base_url, reports

In [None]:
# Function to query database by company:

def select_company(company_names):
    
    ## Function to query the index by company name
    ## Expects list of company names to execute select query
    
    ##Connect to DB:
    conn = psycopg2.connect("host=localhost dbname=postgres user=postgres password=Paolino1")
    cur = conn.cursor()
    
    ##Create Query:
    SQL = 'SELECT * FROM IX WHERE Company_Name IN %(list)s;'
    cur.execute(SQL, {
        'list':tuple(company_names),
    })
    
    #Present query results:
    output = cur.fetchall()
    columns = ['CIK', 'Company_Name', 'Filing_Type', 'Filing_Date', 'URL_txt', 'URL_html']
    result = pd.DataFrame(output, columns=columns)
    
    ##Closing DB connection:
    cur.close()
    conn.close()
    
    return result

In [None]:
#Sample query to retrieve index data for Amazon and American Airlines:
df_co_name = select_company(['AMAZON COM INC', 'AMERICAN AIRLINES INC'])

In [None]:
#Checking that both Amazon and AA were found:
df_co_name.Company_Name.unique()

In [None]:
# Function to query database by CIK:

def select_cik(company_ciks):
    
    ## Function to query the index by CIK
    
    ##Connect to DB:
    conn = psycopg2.connect("host=localhost dbname=postgres user=postgres password=Paolino1")
    cur = conn.cursor()
    
    ##Create Query:
    SQL = 'SELECT * FROM IX WHERE CIK IN %(list)s;'
    cur.execute(SQL, {
        'list':tuple(company_ciks),
    })
    
    #Present query results:
    output = cur.fetchall()
    columns = ['CIK', 'Company_Name', 'Filing_Type', 'Filing_Date', 'URL_txt', 'URL_html']
    result = pd.DataFrame(output, columns=columns)
    
    ##Closing DB connection:
    cur.close()
    conn.close()
    
    return result

In [None]:
df_cik = select_cik([1018724, 100030])

In [None]:
df_cik.CIK.unique()

### ETL  

You will have to enter DB login credentials here:

In [24]:
conn = psycopg2.connect("host=localhost dbname=postgres user=postgres password=Paolino1")

In [25]:
cur = conn.cursor()

In [26]:
#Only run once
#cur.execute("""CREATE TABLE SIC(CIK integer, Company_Name text, Filing_Type text,Filing_Date text,URL_text text, URL_html text)""")

In [27]:
#Only run once
with open('data_sic/master.csv', 'r', encoding='utf-8') as f:
    #cur.copy_from(f, 'new_index', sep='@')
    cur.copy_expert("COPY SIC FROM STDIN WITH CSV HEADER DELIMITER AS '@'", f)
    conn.commit()

In [28]:
#Only run once
#del master_df

In [29]:
#Checking simple query
cur.execute('SELECT Company_Name FROM SIC LIMIT 10')

In [30]:
query = cur.fetchall()

In [31]:
query

[('NICHOLAS FINANCIAL INC',),
 ('NICHOLAS FINANCIAL INC',),
 ('NICHOLAS FINANCIAL INC',),
 ('NICHOLAS FINANCIAL INC',),
 ('NICHOLAS FINANCIAL INC',),
 ('NICHOLAS FINANCIAL INC',),
 ('KINGDON CAPITAL MANAGEMENT, L.L.C.',),
 ('NORDIC AMERICAN TANKERS Ltd',),
 ('NORDIC AMERICAN TANKERS Ltd',),
 ('NORDIC AMERICAN TANKERS Ltd',)]

In [32]:
cur.close()

In [33]:
conn.close()

In [34]:
#Cleaning directory
#The data is now only in the database
#!rm data_sic/*sv

Reference for XML Table Schema: https://www.w3schools.com/html/html_tables.asp

In [None]:
import requests
from bs4 import BeautifulSoup
import lxml

In [None]:
def index_url(df, base_url = r"https://www.sec.gov/Archives/"):
    
    ## Function to retrieve url for 2018 10-K
    ## Expects input from select_company function
    
    #Returning 10-K from 2018. If it does not exist, returning most recent 10-K
    
    df_recent = df[(df['Filing_Type'] == '10-K')&(df['Filing_Date'].str.contains('2018'))]
    
    if len(df_recent) == 0:
        
        df_recent = df[(df['Filing_Type'] == '10-K')].sort_values('Filing_Date', ascending=False)
        
        if len(df_recent) == 0:
            print('No 10-K available for this company')
        else:
            declination = df_recent[['URL_txt']].iloc[0][0]
    
    declination = df_recent[['URL_txt']].iloc[0][0]
    
    declination = declination.replace("-","").replace(".txt","/index.json")
    
    return base_url + declination

In [None]:
def download_summary(co_url):

    ## Function to retrieve the 10-K Summary which contains links to individual reports
    ## Expects input from index_url function
    
    download = requests.get(co_url).json()
    
    xml_summary = r'https://www.sec.gov'
    
    for item in download['directory']['item']:
        if item['name'] == 'FilingSummary.xml':
            xml_summary += download['directory']['name']+'/' + item['name']

    new_base_url = xml_summary.replace('FilingSummary.xml', '')
    content = requests.get(xml_summary).content
    
    #Using BS4 to parse the XML content
    soup = BeautifulSoup(content, 'lxml')
    reports = soup.find('myreports')
    
    if reports is None:
        print('---------------------')
        print('Sorry, no reports were found for this company!')
        print('Please try a different public company!')
        print('---------------------')
        pass
    
    return new_base_url, reports

In [None]:
def show_statements(reports):
    
    #Function to select among available statements contained in the 10-K:
    print('------------------------')
    print('Please select the reports you would like to include in your download')
    print('Copy your selections into the user input provided by the select_reports function')
    print('------------------------')
    
    if reports is None:
        
        print('---------------------')
        print('Sorry, no reports were found for this company!')
        print('Please try a different public company!')
        print('---------------------')
        pass
    
    all_reports = []
    for report in reports.find_all('report'):
        print(report.shortname.text)
        all_reports.append(report.shortname.text)

    return all_reports

In [None]:
def select_reports(all_reports):
    
    all_selections = []
    
    while len(all_selections)<4:
        
        selected = input()
        
        if selected not in all_reports:
            print('Your entry is not valid')
            print('Please copy/paste a report name from the list above.')
            print('Please do not use quotes when entering the report name.')
        else:
            print('-----')
            print('You have entered a valid report!')
            print('Please enter another report from the list above.')
            print('\n')
            all_selections.append(selected)
            
    return all_selections

In [None]:
def statements_urls(new_base_url, reports, short_names):
    
    #Function to create the 4 URL's associated with each Financial Statement Report (see list below)
    #Expectes input from download_summary and pick_statments functions
    
    #Empty dictionary with URL's to downloaded reports
    statements_urls = {}
    
    #Iterating through different reports in the index
    for report in reports.find_all('report'):
        if report.shortname.text in short_names:
            #print('URL for '+report.shortname.text)
            #print(new_base_url+report.htmlfilename.text)
            statements_urls[report.shortname.text] = new_base_url+report.htmlfilename.text

    return statements_urls

In [None]:
def statements_data(statements_urls):
    
    ## Function scrapes financial statements at URL's found and returns a dictionary of unparsed data   
    ## Expects dictionary output by statements_url function
    
    stm_data = {}
    for key, url in statements_urls.items():
        
        statement_data = {}
        statement_data['headers'] = []
        statement_data['sections'] = []
        statement_data['data'] = []

        content = requests.get(url).content
        soup = BeautifulSoup(content, 'html')

        for index, row in enumerate(soup.table.find_all('tr')):

            cols = row.find_all('td')

            if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
                reg_row = [dt.text.strip() for dt in cols]
                statement_data['data'].append(reg_row)

            elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
                sec_row = cols[0].text.strip()
                statement_data['sections'].append(sec_row)

            elif (len(row.find_all('th')) != 0):            
                hed_row = [dt.text.strip() for dt in row.find_all('th')]
                statement_data['headers'].append(hed_row)

            else:            
                print('Error.')

        stm_data[key] = statement_data   
        
    return stm_data

In [None]:
def financial_statements(stm_data):
    
    ## Function to create dictionary containing the four reports that make up a financial statement
    ## Expects inputs from statements_data function

    financial_statements = {}
    for key, statement in stm_data.items():

        header =  statement['headers']
        data = statement['data']
        df = pd.DataFrame(data)

        #Some formatting to make sure every df is presented well regardless of dimension
        df = df.replace('[\$,)]','', regex=True )\
                             .replace( '[(]','-', regex=True)\
                             .replace( '', 'NaN', regex=True)\
                             .replace( '\[.*?]', 'NaN', regex=True)

        df.index = df.iloc[:,0]

        df = df.drop(df.columns[0], axis=1)

        df = df.replace('((?:[ a-z ]+\S*\d+|\d\S*[a-z]+)[a-z\d_-]*)', 'NaN', regex=True)\
                        .replace('[^0-9]','', regex=True)\
                        .replace('',np.nan, regex=True)

        df.fillna(value=pd.np.nan, inplace=True)
        df = df.dropna(axis=1, how='all')
        df = df.dropna(axis=0, how='all')

        var = df.shape[1]
        temp = [i for sublist in header for i in sublist]
        df.columns = temp[-var:]
        df.index.name = temp[0]

        df = df.apply(pd.to_numeric)

        #Appending All 4 Statements to Dictionary:
        financial_statements[key]=df
        
    return financial_statements

In [None]:
def run_all(df_co_name):
    
    #Function executes all functions to retrieve Financial Statements for all listed companies
    #Returns a dictionary whose key is the company CIK and values are the 4 statements
    
    uniques = df_co_name.Company_Name.unique()
    df_list = []
    
    company_statements_dict = {}
    
    for val in uniques:
        df_list.append(df_co_name[df_co_name['Company_Name'] == val])
        
    #Executing all functions in sequence:
    for df in df_list:
        #Retrieving url
        df_url = index_url(df, base_url = r"https://www.sec.gov/Archives/")
        #Downloading index
        new_base_url, reports = download_summary(df_url)
        #Showing all reports for company
        all_reports = show_statements(reports)
        #Selecting 4 reports
        selected_reports = select_reports(all_reports)
        #Selecting URL's for each report
        urls = statements_urls(new_base_url, reports, selected_reports)
        #Downloading statements
        stm_data = statements_data(urls)
        #Converting statements into Pandas Dataframes
        statements_dict = financial_statements(stm_data)
        #Loading dictionary containing all statements for all input companies:
        key = df.Company_Name.unique()[0]
        company_statements_dict[key] = statements_dict
        
        
    return company_statements_dict

#### Run the following Function and pick the statements you want to download

In [None]:
out = run_all(df_co_name)

In [None]:
print('Financial statements for the followign companies have been downloaded:')
print(out.keys())

In [None]:
print('The following statements for each company have been downloaded for AA:')
print(out['AMERICAN AIRLINES INC'].keys())

In [None]:
print('The following statements for each company have been downloaded for Amazon:')
print(out['AMAZON COM INC'].keys())

American Airlines

In [None]:
out['AMERICAN AIRLINES INC']['Consolidated Statements of Comprehensive Income']

In [None]:
out['AMERICAN AIRLINES INC']['Consolidated Balance Sheets']

In [None]:
out['AMERICAN AIRLINES INC']['Consolidated Statements of Cash Flows']

In [None]:
out['AMERICAN AIRLINES INC']["Consolidated Statements of Stockholders' Equity"]

Amazon

In [None]:
out['AMAZON COM INC']['Consolidated Statements of Cash Flows']

In [None]:
out['AMAZON COM INC']['Consolidated Statements of Comprehensive Income']

In [None]:
out['AMAZON COM INC']['Consolidated Balance Sheets']

In [None]:
out['AMAZON COM INC']["Consolidated Statements of Stockholders' Equity"]