## File IO

In [1]:
# Read the tab-separated company list. open() takes the file name, the mode
# ('r' to read, 'w' to write) and the encoding, which is always UTF-8 here.
with open('../company_list', 'r', encoding='utf-8') as f:
    # Skip the first line (presumably a header row -- TODO confirm). The None
    # default keeps an empty file from raising StopIteration.
    next(f, None)
    company = [line.rstrip('\n') for line in f]

# Split every remaining line into its tab-separated fields. The main loop
# further down reads field 1 as the CIK string and field 2 as its integer form.
company_list = [line.split('\t') for line in company]
# Keep only rows whose third field is non-empty. The length check skips blank
# or short lines instead of failing with IndexError.
company_list = [fields for fields in company_list
                if len(fields) > 2 and fields[2] != '']

미리 tmp 폴더와 그 아래 0 ~ 19까지의 하위 폴더를 만들어 둡니다.

In [4]:
import os

# Create the bucket folders ./tmp/0 ... ./tmp/19 up front. exist_ok=True makes
# re-running the cell harmless and removes the check-then-create race of
# testing os.path.exists() first.
# NOTE(review): the scraping loop further down writes under
# /mnt/sdc1/secgov/tmp/ rather than ./tmp/ -- confirm both point at the same
# place (i.e. that the notebook runs from /mnt/sdc1/secgov).
for i in range(20):
    os.makedirs('./tmp/%d/' % i, exist_ok=True)

In [3]:
# Display the first parsed row to check the field layout.
company_list[0]

['1347 CAPITAL CORP', '0001606163', '1606163']

## Scraping용 함수들

In [7]:
from bs4 import BeautifulSoup
import requests

def get_html_source(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the scraper forever.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, timeout, invalid URL) or the server answers with
        an HTTP error status, so that error pages are never mistaken for the
        requested document.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of returning their body.
        response.raise_for_status()
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # programming errors propagate to the caller.
        return None
    return response.text


def get_soup(url, timeout=30):
    """Download ``url`` and parse it into a ``BeautifulSoup`` tree.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or ``None``
        when the request fails or the server answers with an HTTP error
        status.

    Parser set-up problems (for example ``lxml`` not being installed) are not
    swallowed: they would otherwise make every page look like a failed
    download.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # programming errors propagate to the caller.
        return None
    return BeautifulSoup(response.text, 'lxml')


def parse_html_link(soup):
    """Return the ``href`` of every anchor in ``soup`` that contains ``.htm``.

    Each ``href`` is stripped of surrounding whitespace before the check;
    anchors without an ``href`` attribute count as an empty string and are
    therefore dropped. Document order is preserved.
    """
    hrefs = []
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href', '').strip()
        if '.htm' in href:
            hrefs.append(href)
    return hrefs

## Scraping 전략

Apple의 경우를 예로 들자

- Step 1
    - EDGAR Search Results 사이트에서 Document 링크들을 가져온다. 상위 10개만 필요하기 때문에 count=10으로 고정한다
    - Apple 회사 아이디 CIK=0000320193
    - https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=def+14a&dateb=&owner=exclude&count=10

- Step 2
    - Filing Detail에서 DEFINITIVE PROXY STATEMENT 를 가져온다
    - https://www.sec.gov/Archives/edgar/data/320193/000119312517003753/0001193125-17-003753-index.htm

- Step 3
    - DEFINITIVE PROXY STATEMENT 문서를 가져와서 저장한다

- Step 4
    - 저장된 DEFINITIVE PROXY STATEMENT에서 ToC를 이용해서 원하는 부분을 parsing한다

## Step 1: EDGAR Search Scraping

In [8]:
# EDGAR company-browse URL template; %s is filled with the zero-padded CIK
# string. The query asks for "def 14a" filings, 20 rows per page.
# NOTE(review): the strategy notes above say count=10 -- confirm which value
# is intended.
step1_base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=def+14a&dateb=&owner=exclude&count=20'

In [9]:
# Worked example with Apple's CIK. company_id keeps the zero-padded string
# used in the query URL; company_id_int is the same number without padding
# and is read as a global by parse_document_links().
company_id = '0000320193' 
company_id_int = int(company_id)
url = step1_base_url % company_id
print(url)

https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=def+14a&dateb=&owner=exclude&count=20


In [10]:
def parse_document_links(soup, cik=None):
    """Return absolute URLs of the ``.htm`` archive links for one company.

    Args:
        soup: Parsed EDGAR search-result page (anything offering
            ``select(css)`` that yields tags with an ``attrs`` mapping).
        cik: Company CIK as an int or numeric string. When omitted, the
            module-level ``company_id_int`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A list of ``https://www.sec.gov``-prefixed links whose ``href`` starts
        with ``/Archives/edgar/data/<cik>`` and contains ``.htm``.
    """
    if cik is None:
        cik = company_id_int

    # The attribute value must be quoted: a path containing "/" is not a valid
    # unquoted CSS identifier and is rejected by strict selector engines.
    anchors = soup.select('a[href^="/Archives/edgar/data/%d"]' % int(cik))
    hrefs = (anchor.attrs.get('href', '').strip() for anchor in anchors)
    return ['https://www.sec.gov' + href for href in hrefs if '.htm' in href]

def parse_document_links_with_filling_date(soup):
    """Extract ``(document link, filing date)`` pairs from a search-result page.

    Args:
        soup: Parsed EDGAR search-result page (anything offering
            ``select(css)``).

    Returns:
        A list of ``(url, date)`` tuples, one per usable row of the
        ``tableFile2`` table, in page order. ``url`` is the first link of the
        second cell prefixed with ``https://www.sec.gov``; ``date`` is the
        text of the fourth cell with surrounding whitespace removed. An empty
        list is returned when the table is missing or has only its first row.

    Rows that cannot be used -- fewer than four cells, no link in the second
    cell, or an empty ``href`` -- are skipped instead of raising
    ``IndexError`` or yielding a bare site-root URL.
    """
    links_and_date = []

    # The first row is skipped (presumably the table header).
    for row in soup.select('table[class=tableFile2] tr')[1:]:
        cells = row.select('td')
        if len(cells) < 4:
            continue

        anchors = cells[1].select('a')
        if not anchors:
            continue

        href = anchors[0].attrs.get('href', '').strip()
        if not href:
            continue

        # The date ends up in a file name later on, so drop stray whitespace.
        filling_date = cells[3].text.strip()
        links_and_date.append(('https://www.sec.gov' + href, filling_date))

    return links_and_date

In [11]:
# Fetch the search-result page for the example company and list its
# (document link, filing date) pairs.
# NOTE: get_soup() returns None on failure; this demo cell does not guard
# against that.
soup = get_soup(url)
links = parse_document_links_with_filling_date(soup)
links

[('https://www.sec.gov/Archives/edgar/data/320193/000119312517380130/0001193125-17-380130-index.htm',
  '2017-12-27'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312517003753/0001193125-17-003753-index.htm',
  '2017-01-06'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312516422528/0001193125-16-422528-index.htm',
  '2016-01-06'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312515017607/0001193125-15-017607-index.htm',
  '2015-01-22'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312514008074/0001193125-14-008074-index.htm',
  '2014-01-10'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312513005529/0001193125-13-005529-index.htm',
  '2013-01-07'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312512006704/0001193125-12-006704-index.htm',
  '2012-01-09'),
 ('https://www.sec.gov/Archives/edgar/data/320193/000119312511003231/0001193125-11-003231-index.htm',
  '2011-01-07'),
 ('https://www.sec.gov/Archives/edgar/data/32019

## Step 2: DEFINITIVE PROXY STATEMENT 가져오기

In [12]:
# Open the filing-detail page of the first entry in links (index 0 of each
# pair is the link, index 1 the date).
soup = get_soup(links[0][0])

In [13]:
# All rows of the table(s) with class "tableFile" on the filing-detail page.
table_rows = soup.select('table[class=tableFile] tr')

In [14]:
# Display how many rows were matched.
len(table_rows)

42

In [13]:
# Take the first link of the second table row (row 0 is presumably the
# header -- TODO confirm) and turn its relative href into an absolute URL.
# The name is rebound three times: list of anchors -> href -> full URL.
proxy_statement_link = table_rows[1].select('a')
proxy_statement_link = proxy_statement_link[0].attrs.get('href')
proxy_statement_link = 'https://www.sec.gov' + proxy_statement_link 
proxy_statement_link

'https://www.sec.gov/Archives/edgar/data/320193/000119312517003753/d257185ddef14a.htm'

## Step 3: DEFINITIVE PROXY STATEMENT 문서를 가져와서 저장

In [14]:
# Download the proxy statement as raw HTML text (None if the request failed).
html_source = get_html_source(proxy_statement_link)

In [15]:
# Display the URL that was just downloaded.
proxy_statement_link

'https://www.sec.gov/Archives/edgar/data/320193/000119312517003753/d257185ddef14a.htm'

In [20]:
# dps_html_fname = 'tmp/%s_%s.html' % (company_id, links[0][1])

# with open(dps_html_fname, 'w', encoding='utf-8') as f:
#     f.write(html_source)

## Step 1 - 3까지 한번에 다하기

In [16]:
# company_list = [ ['Apple CORP', '0000320193', '320193'], 
#                  ['APOLLO INVESTMENT CORP', '0001278752', '1278752'],
#                  ['BMC STOCK HOLDINGS INC', '0001574815', '1574815']
#                ]

links의 형태: [(link, filling_date), (link, filling_date), ...]

In [None]:
import os
import time

# Run steps 1-3 for every company: look up its DEF 14A filings, follow each
# filing-detail page to the first linked document and save that document as
# <bucket>/<CIK>_<filing date>.html.
for num_company, company in enumerate(company_list):

    # Progress log: every 5th company below 50, every 50th below 500, then
    # every 500th.
    if num_company == 0:
        print('begin scrapper')
    elif (
        (num_company < 50 and num_company % 5 == 0)
        or (num_company < 500 and num_company % 50 == 0)
        or num_company % 500 == 0
    ):
        print('  .. scrapping (%d in %d)' % (num_company+1, len(company_list)))

    # debug code
    # print('begin %s (%s), (%d in %d)' % (company[0], company[2], num_company, len(company_list)) )

    try:
        company_id = company[1]
        company_id_int = int(company[2])
        # Spread the output files over the 20 bucket folders.
        company_folder = num_company % 20

        # Step 1: search-result page listing the company's filings.
        url = step1_base_url % company_id
        step1_soup = get_soup(url)

        if step1_soup is None:
            continue

        links = parse_document_links_with_filling_date(step1_soup)

        for num_dps, (step2link, filling_date) in enumerate(links):

            # Step 2: filing-detail page; the document link is taken from the
            # second table row.
            step2_soup = get_soup(step2link)
            if step2_soup is None:
                continue

            table_rows = step2_soup.select('table[class=tableFile] tr')
            if len(table_rows) <= 1:
                continue

            anchors = table_rows[1].select('a')
            if not anchors:
                continue

            # An anchor without href would otherwise make us download (and
            # save) the site's front page.
            href = anchors[0].attrs.get('href', '')
            if not href:
                continue
            proxy_statement_link = 'https://www.sec.gov' + href

            # Step 3: download the document and store it.
            dps_html_source = get_html_source(proxy_statement_link)
            if dps_html_source is None:
                continue

            dps_html_fname = '/mnt/sdc1/secgov/tmp/%d/%s_%s.html' % (company_folder, company_id, filling_date)
            # NOTE(review): the set-up cell creates the buckets under ./tmp/,
            # not under this absolute path -- confirm which one is intended.
            # Creating the folder on demand keeps a missing bucket from
            # failing every single write.
            os.makedirs(os.path.dirname(dps_html_fname), exist_ok=True)
            with open(dps_html_fname, 'w', encoding='utf-8') as f:
                f.write(dps_html_source)

            # debug code
            # print('  > filling date = %s' % filling_date)

    except Exception as e:
        # Top-level boundary: report the problem and move on to the next
        # company rather than aborting the whole run.
        print('error message = %s (num_company = %d)' % (str(e), num_company))

    finally:
        # Pause between companies on every path, including failed look-ups
        # and errors, so a failing server is not hit in a tight loop.
        time.sleep(1)

    # debug code
    # print('done %s (%s), (%d in %d)' % (company[0], company[2], num_company, len(company_list)))