### Create Financial Statements with Edgar Index Files

###### Objective: download data using the edgar library and visualize financial statements for any selected company


In [8]:
import pandas as pd
import edgar
import psycopg2
import os
import glob
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

### API

In [35]:
# Plan:
# 1. Take each CIK in the DataFrame.
# 2. For each CIK, request the index page of its 10-K filing.
# 3. Extract the SIC code from that page.
# 4. Write the SIC code back to the DataFrame.

In [9]:
def get_SIC(url, timeout=30):
    """Scrape the SIC code from an EDGAR filing index page.

    The page at ``url`` is downloaded and its ``<b>`` tags are searched for
    markup that mentions ``SIC`` (for example a link containing
    ``...&SIC=6153&...``). The first run of digits in that markup is
    returned as the code.

    Parameters
    ----------
    url : str
        Full URL of the filing index page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so a single unresponsive request cannot hang a long scrape.

    Returns
    -------
    int or str
        The SIC code as an ``int``, or the string ``'None'`` when the page
        has no ``<b>`` tag carrying a SIC code.
    """
    response = requests.get(url, timeout=timeout)
    # 'html.parser' ships with Python, so the result does not depend on
    # which optional third-party parser happens to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep only the bold tags whose markup mentions SIC; a page may also
    # contain unrelated bold tags such as <b>[Amend]</b>.
    sic_markup = ''.join(
        markup for markup in map(str, soup.find_all('b')) if 'SIC' in markup
    )
    digits = re.findall(r'\d+', sic_markup)

    # No <b> tags at all, or none with a SIC link: return the sentinel
    # instead of failing with IndexError on digits[0].
    if not digits:
        return 'None'
    return int(digits[0])

In [10]:
master_df = pd.read_csv('data_sic/master.csv', sep='@')

In [11]:
# Separate the annual (10-K) and quarterly (10-Q) filings of the master index.
filing_type = master_df['Filing_Type']
df_10k = master_df.loc[filing_type.eq('10-K')]
df_10q = master_df.loc[filing_type.eq('10-Q')]

In [12]:
df_10k.shape

(189488, 6)

In [13]:
df_10q.shape

(587200, 6)

In [66]:
df_10k.merge(df_10q, how='outer', on='CIK').shape

(6967324, 11)

In [15]:
# Outer-join the 10-K rows with the 10-Q rows on CIK. Each 10-K row of a
# company is paired with each of that company's 10-Q rows, so the result
# has far more rows than either input.
outer_df = pd.merge(df_10k, df_10q, how='outer', on='CIK')

In [67]:
outer_df.head(10)

Unnamed: 0,CIK,Company_Name_x,Filing_Type_x,Filing_Date_x,URL_txt_x,URL_html_x,Company_Name_y,Filing_Type_y,Filing_Date_y,URL_txt_y,URL_html_y
0,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2008-11-10,edgar/data/100122/0001362310-08-006917.txt,edgar/data/100122/0001362310-08-006917-index.html
1,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2009-10-30,edgar/data/100122/0000950123-09-055048.txt,edgar/data/100122/0000950123-09-055048-index.html
2,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2015-07-31,edgar/data/100122/0000100122-15-000018.txt,edgar/data/100122/0000100122-15-000018-index.html
3,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2014-07-29,edgar/data/100122/0000941138-14-000128.txt,edgar/data/100122/0000941138-14-000128-index.html
4,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,1995-05-15,edgar/data/100122/0000100122-95-000013.txt,edgar/data/100122/0000100122-95-000013-index.html
5,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2002-11-14,edgar/data/100122/0000941138-02-000020.txt,edgar/data/100122/0000941138-02-000020-index.html
6,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2003-11-10,edgar/data/100122/0000941138-03-000123.txt,edgar/data/100122/0000941138-03-000123-index.html
7,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,1995-08-02,edgar/data/100122/0000100122-95-000017.txt,edgar/data/100122/0000100122-95-000017-index.html
8,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2014-04-28,edgar/data/100122/0000941138-14-000099.txt,edgar/data/100122/0000941138-14-000099-index.html
9,100122,TUCSON ELECTRIC POWER CO,10-K,1995-03-09,edgar/data/100122/0000100122-95-000007.txt,edgar/data/100122/0000100122-95-000007-index.html,TUCSON ELECTRIC POWER CO,10-Q,2015-05-05,edgar/data/100122/0000100122-15-000013.txt,edgar/data/100122/0000100122-15-000013-index.html


In [72]:
df_10k[df_10k['CIK'].isin(df_10q['CIK'])].shape

(169925, 6)

In [73]:
df_10k[~df_10k['CIK'].isin(df_10q['CIK'])].shape

(19563, 6)

In [74]:
df_10q[~df_10q['CIK'].isin(df_10k['CIK'])].shape

(13065, 6)

In [89]:
intersect = df_10k[df_10k['CIK'].isin(df_10q['CIK'])]

In [90]:
tenk_notin_tenq = df_10k[~df_10k['CIK'].isin(df_10q['CIK'])]

In [91]:
tenq_notin_tenk = df_10q[~df_10q['CIK'].isin(df_10k['CIK'])]

In [92]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the three
# frames in the same order and keeps their original index labels.
total_df = pd.concat([intersect, tenk_notin_tenq, tenq_notin_tenk])

In [93]:
total_df.shape

(202553, 6)

In [94]:
total_df = total_df.sort_values('Filing_Date', ascending=False)

In [95]:
total_df = total_df.drop_duplicates(subset=['CIK'], keep='first')

In [96]:
total_df['URL_html'] = total_df['URL_html'].astype(str)

In [97]:
total_df.shape

(38820, 6)

In [105]:
total_df[total_df['URL_html'] == 'nan']

Unnamed: 0,CIK,Company_Name,Filing_Type,Filing_Date,URL_txt,URL_html


In [98]:
sum(total_df['URL_html'].isnull())

0

In [16]:
final_df = outer_df.drop_duplicates(subset=['CIK'], keep='first')

In [17]:
final_df = final_df.drop(columns=['Company_Name_y', 'Filing_Type_y', 'Filing_Date_y',
       'URL_txt_y', 'URL_html_y'])

In [18]:
final_df.shape

(38820, 6)

In [19]:
def CIK_SIC_mapping(df, base_url='https://www.sec.gov/Archives/'):
    """Map every CIK in ``df`` to the SIC code scraped from its filing page.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``CIK`` and ``URL_html_x``; the latter holds
        the filing index path relative to ``base_url``.
    base_url : str, optional
        Prefix prepended to each relative path to form the request URL.

    Returns
    -------
    dict
        ``{cik: sic}`` where ``sic`` is whatever ``get_SIC`` returns for the
        filing page, or the string ``'None'`` for rows without a URL.
    """
    SIC = {}
    for cik, url in zip(df['CIK'], df['URL_html_x']):
        # A row may have no filing URL: either a real NaN or the string 'nan'
        # left behind by astype(str). Concatenating NaN would raise
        # TypeError and 'nan' would request a non-existent page, so record
        # the "not found" sentinel without making a request.
        if pd.isna(url) or str(url) == 'nan':
            SIC[cik] = 'None'
            continue
        SIC[cik] = get_SIC(base_url + url)

    return SIC

In [20]:
# Split the row positions rather than the DataFrame itself: np.array_split on
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# The ten chunks contain the same rows, in the same order, as before.
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = (
    final_df.iloc[rows]
    for rows in np.array_split(np.arange(len(final_df)), 10)
)

In [None]:
#Stop here

In [216]:
# Scrape the first five chunks; one {CIK: SIC} dict is collected per chunk.
# Kept as an explicit loop so the dicts of chunks that already finished stay
# in the list if a later request fails.
cik_sic_list_1 = []
for df in [df1, df2, df3, df4, df5]:
    cik_sic_list_1.append(CIK_SIC_mapping(df))

In [16]:
#len(cik_sic_list_1)

In [17]:
#len(cik_sic_list_2)

In [251]:
#Already done
#final1 = pd.DataFrame(cik_sic_list_1)

In [252]:
#Already done
#final2 = pd.DataFrame(cik_sic_list_2)

In [None]:
#JUST READ MASTER IN CSV FORM NOW!

In [253]:
#Already done
#final1.to_csv('sic_cik_mapping_1.csv', index=False)

In [254]:
#Already done
#final2.to_csv('sic_cik_mapping_2.csv', index=False)

In [None]:
#### stop here

In [20]:
#Already done
#cik_sic_list_more = []

In [25]:
#Already done
#cik_sic_list_more.append(CIK_SIC_mapping(df8))

In [23]:
#Already done
#cik_sic_list_more.append(CIK_SIC_mapping(df9))

In [29]:
# Cast the URL column to str; missing URLs (NaN) become the literal string
# 'nan', which is what the `== 'nan'` check further down counts.
df10['URL_html_x'] = df10['URL_html_x'].astype(str)

In [30]:
#Already done
#final3 = pd.DataFrame(cik_sic_list_more)

In [31]:
#Already done
#final3.to_csv('cik_sic_list_more.csv', index=False)

In [32]:
cik_sic_list_more2 = []

In [34]:
df10.shape

(3882, 6)

In [102]:
len(df10[df10['URL_html_x'] == 'nan'])

3412

In [35]:
df_test = df10.iloc[:1000,:]

In [36]:
#Finish running this:
cik_sic_list_more2.append(CIK_SIC_mapping(df_test))

In [40]:
for n, i in enumerate(cik_sic_list_more2):
    print(n,i)

0 {1369017: 6189, 1369031: 6189, 1369057: 6189, 1369060: 6321, 1369100: 6189, 1369101: 6189, 1369105: 6311, 1369106: 6311, 1369107: 6321, 1369137: 6189, 1369168: 6189, 1369175: 6189, 1369197: 6189, 1369221: 6189, 1369285: 6189, 1369367: 6189, 1369413: 6189, 1369462: 6189, 1369463: 6189, 1369465: 6189, 1369467: 6189, 1369504: 6189, 1369665: 6189, 1369689: 6189, 1369690: 6189, 1369736: 6189, 1370175: 6189, 1370218: 6189, 1370219: 6189, 1370220: 6189, 1370221: 6189, 1370357: 6189, 1370358: 6189, 1370359: 6189, 1370360: 6189, 1370361: 6189, 1370362: 6189, 1370363: 6189, 1370409: 6189, 1370434: 6798, 1370518: 6189, 1370647: 6189, 1370770: 6189, 1370862: 6189, 1370873: 6189, 1370972: 6189, 1370985: 6189, 1371096: 6189, 1371097: 6189, 1371110: 6189, 1371111: 6189, 1371112: 6189, 1371114: 6189, 1371124: 6321, 1371127: 6189, 1371131: 6189, 1371132: 6321, 1371152: 6189, 1371153: 6189, 1371291: 6189, 1371292: 6189, 1371294: 6189, 1371385: 6189, 1371606: 6189, 1371608: 6189, 1371666: 6189, 1371670

In [42]:
df10[df10['CIK'] == 105189]

Unnamed: 0,CIK,Company_Name_x,Filing_Type_x,Filing_Date_x,URL_txt_x,URL_html_x
6986887,105189,,,,,


In [53]:
# CIKs of all df10 rows from index label 6986887 onward; that label is the
# row for CIK 105189, whose 10-K columns are all empty.
null_ciks = df10.loc[6986887:, 'CIK']

In [54]:
nulls = master_df[master_df['CIK'].isin(null_ciks)]

In [64]:
nulls['Filing_Type'].unique()

array(['10-Q', 'SC 13G', '15-12G', 'SC 13G/A', '10-K405', '10-Q/A', '8-K',
       'DEF 14A', '15-15D', 'DEFS14A', '11-K', '8-K/A', 'SC 13D/A',
       'PRE 14A', 'S-2/A', 'DEF13E3/A', 'DEFM14C', 'PREM14C', 'SC 13E3/A',
       'SC 14D1/A', 'S-3/A', 'S-8', 'SC 13D', 'SC 13E3', 'SC 14D1',
       'SC 14D9', 'S-8 POS', 'DEFA14A', 'POS AM', 'NT 10-K', 'S-4/A',
       '424B3', '10-K/A', 'S-4', '8-A12G/A', '15-15D/A', 'NSAR-B/A',
       'NSAR-B', '13F-E', '424B2', '8-A12B', 'S-3', '485B24E', '497',
       '10-C', 'SC 13E4/A', 'PRE13E3/A', 'PRE13E3', 'PRER14C', 'N-2',
       'N-30B-2', 'SC 14D9/A', 'PRES14A', 'NT 10-Q', 'DEFM14A', '10-12B',
       'NSAR-A', 'S-1/A', 'S-1', '424B5', 'REGDEX', 'REVOKED', '425',
       'PREM14A', 'RW', 'CORRESP', 'UPLOAD', 'PRER14A', '4', '40-17G',
       'N-PX', 'N-Q', '10KSB', '10QSB/A', '10QSB', '40-F/A', '6-K',
       '20-F', '3', '4/A', 'ARS', '10KSB/A', 'D', 'NTN 10Q', 'CT ORDER',
       'EFFECT', 'PRE 14C', 'SC 14F1', '424B4', '25-NSE', 'DEF 14C',
       '15

In [59]:
sum(nulls['URL_html'].isnull())

0

In [57]:
nulls[nulls['Filing_Type'] == '10-Q']

Unnamed: 0,CIK,Company_Name,Filing_Type,Filing_Date,URL_txt,URL_html
522,105189,WEBB DEL CORP,10-Q,1995-02-06,edgar/data/105189/0000950147-95-000006.txt,edgar/data/105189/0000950147-95-000006-index.html
2195,20975,CLEVETRUST REALTY INVESTORS,10-Q,1995-02-10,edgar/data/20975/0000950152-95-000133.txt,edgar/data/20975/0000950152-95-000133-index.html
2359,216991,SAGE ENERGY CO,10-Q,1995-02-14,edgar/data/216991/0000950129-95-000071.txt,edgar/data/216991/0000950129-95-000071-index.html
3429,28755,DIBRELL BROTHERS INC,10-Q,1995-02-13,edgar/data/28755/0000028755-95-000005.txt,edgar/data/28755/0000028755-95-000005-index.html
3822,312259,WALNUT EQUIPMENT LEASING CO INC,10-Q,1995-03-16,edgar/data/312259/0000312259-95-000004.txt,edgar/data/312259/0000312259-95-000004-index.html
6088,350874,ALLERION INC,10-Q,1995-03-28,edgar/data/350874/0000950110-95-000186.txt,edgar/data/350874/0000950110-95-000186-index.html
8459,48948,HUDSON GENERAL CORP,10-Q,1995-02-09,edgar/data/48948/0000950123-95-000255.txt,edgar/data/48948/0000950123-95-000255-index.html
10264,67217,MHI GROUP INC,10-Q,1995-03-15,edgar/data/67217/0000950144-95-000635.txt,edgar/data/67217/0000950144-95-000635-index.html
10571,68361,MORTON INTERNATIONAL INC,10-Q,1995-02-10,edgar/data/68361/0000068361-95-000002.txt,edgar/data/68361/0000068361-95-000002-index.html
10789,701290,REXON INC,10-Q,1995-02-15,edgar/data/701290/0000950150-95-000083.txt,edgar/data/701290/0000950150-95-000083-index.html


In [37]:
final4 = pd.DataFrame(cik_sic_list_more2)

In [38]:
final4.head()

Unnamed: 0,2435,3228,4438,5177,5696,7119,13839,18620,20975,22366,...,1628739,1632276,1632363,1634052,1634422,1634443,1635607,1639143,1641521,1645316
0,,,,,,,,,,,...,,,,,,,,,,


In [51]:
master_df.head()

Unnamed: 0,CIK,Company_Name,Filing_Type,Filing_Date,URL_txt,URL_html
0,100030,TRW INC,DEF 14A,1995-03-14,edgar/data/100030/0000950132-95-000059.txt,edgar/data/100030/0000950132-95-000059-index.html
1,100030,TRW INC,S-8,1995-03-27,edgar/data/100030/0000950152-95-000436.txt,edgar/data/100030/0000950152-95-000436-index.html
2,100030,TRW INC,S-8,1995-03-27,edgar/data/100030/0000950152-95-000439.txt,edgar/data/100030/0000950152-95-000439-index.html
3,100030,TRW INC,SC 13G/A,1995-02-02,edgar/data/100030/0000950152-95-000083.txt,edgar/data/100030/0000950152-95-000083-index.html
4,100030,TRW INC,SC 13G,1995-02-10,edgar/data/100030/0000315066-95-002097.txt,edgar/data/100030/0000315066-95-002097-index.html


In [23]:
final4.to_csv('cik_sic_list_more2.csv', index=False)

In [29]:
df10['URL_html_x'].iloc[0]

'edgar/data/1369017/0001136999-07-000585-index.html'

In [None]:
# Old code: not needed for the pipeline; kept only for testing purposes.

In [None]:
# NOTE(review): `df_un` is not defined in any visible cell of this notebook;
# confirm where it comes from before running this cell.
new_master = master_df.merge(df_un, how='left', on='CIK')

In [None]:
# Forward-fill missing values. The previous fillna(method='ffil') call used a
# misspelled method name, which pandas rejects as an invalid fill method.
new_master.ffill()

In [94]:
download = requests.get('https://www.sec.gov/Archives/edgar/data/1002638/0001002638-19-000014-index.html')
soup1 = BeautifulSoup(download.content, 'html')

In [99]:
soup1.find_all('b')

[<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>]

In [101]:
download = requests.get('https://www.sec.gov/Archives/edgar/data/1000097/0001000097-19-000004-index.html')
soup = BeautifulSoup(download.content, 'html')

In [103]:
soup.find_all('b')

[]

In [85]:
soup.find_all('b')

[<b>[Amend]</b>,
 <b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>]

In [87]:
soup = BeautifulSoup('<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>', 'html')

In [89]:
soup.b

<b><a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=6153&amp;owner=include">6153</a></b>