## Functional WebScraper

In [150]:
import re
import urllib2

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, Comment

def visible(element):
    """Tell whether a parsed text node should count as visible page text.

    Parameters:
        element: a text node produced by ``soup.findAll(text=True)``; it must
            expose ``parent.name`` and ``encode``.

    Returns:
        False when the node sits directly inside a ``style``, ``script``,
        ``[document]``, ``head`` or ``title`` parent, or when it is an HTML
        comment; True otherwise.
    """
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    ## BeautifulSoup 4 hands comments back as Comment nodes whose text does
    ## not include the <!-- --> delimiters, so the regex below cannot catch
    ## them; test the node type explicitly.
    if isinstance(element, Comment):
        return False
    ## Fallback for text that still literally starts with a comment marker.
    if re.match('<!--.*-->', element.encode('utf-8')):
        return False
    return True

In [151]:
## 10-K filing URLs to scrape, all under EDGAR data folder 1065280.
htmls = [
    'https://www.sec.gov/Archives/edgar/data/1065280/000119312508040378/d10k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000119312509037430/d10k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000119312510036181/d10k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000119312511040217/d10k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000119312512053009/d260328d10k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000106528013000008/nflx1231201210kdoc.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000106528014000006/nflx10k2013.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000106528015000006/nflx201410k.htm',
    'https://www.sec.gov/Archives/edgar/data/1065280/000106528016000047/nflx201510k.htm',
]

In [213]:
## Scrape every filing in `htmls` and split its visible text into "Item N."
## sections.
##   errors   -- one row per problem found (document- or section-level).
##   logs     -- one row per extracted section with its size statistics.
##   all_docs -- {(doc_id, year): {section: [text, ...]}}.
errors = pd.DataFrame(columns=['id', 'year', 'section', 'comment'])
logs = pd.DataFrame(columns=['id', 'year', 'section', 'pre_length', 'length', 'pct_length'])
## NOTE: this rebinds `htmls`, replacing any list assigned to it earlier.
htmls = ['https://www.sec.gov/Archives/edgar/data/37996/000003799616000092/f1231201510-k.htm']

all_docs = {}

## All available sections, in order. A section runs from its own header to
## the header of the next entry, so the last entry only serves as an end marker.
sections = ['1', '1A', '1B', '2', '3', '4', '5', '6', '7', '7A',
            '8', '9', '9A', '9B', '10', '11', '12', '13', '14', '15']

## A line holding "<2 digits>, <4 digits>", e.g. the tail of "December 31, 2015".
pattern_year = re.compile(r'\d{2}, \d{4}.*')

for html in htmls:
    all_sections = {}

    ## Download the page and release the connection as soon as it is read.
    response = urllib2.urlopen(html)
    try:
        soup = BeautifulSoup(response.read(), 'html.parser')
    finally:
        response.close()

    ## Extract visible text (a real list, so len() and slicing work).
    texts = soup.findAll(text=True)
    visible_texts = [text for text in texts if visible(text)]
    len_total = len(visible_texts)

    ## Extract year and id. The id is the 7th '/'-separated URL component,
    ## i.e. the number right after '/data/'.
    doc_id = html.split('/')[6]
    year_line = next((line for line in visible_texts if pattern_year.search(line)), None)

    ## Log error and skip document if no date line exists at all.
    if year_line is None:
        print('Year not found! Skipping document.')
        errors.loc[len(errors)] = [doc_id, None, 'all', 'year not found']
        continue
    year = year_line.split(',')[-1].strip()

    ## Log error and skip document if year is wrong.
    if len(year) != 4:
        print('Year incorrectly defined! Skipping document.')
        errors.loc[len(errors)] = [doc_id, year, 'all', 'invalid year format']
        continue

    for section, next_section in zip(sections, sections[1:]):
        ## Header patterns: optional leading whitespace, "Item", an optional
        ## space or non-breaking space, the section number and a dot.
        pattern_start = re.compile(r"\s?Item[\xa0\s]?" + re.escape(section) + r"\.", re.I)
        pattern_end = re.compile(r"\s?Item[\xa0\s]?" + re.escape(next_section) + r"\.", re.I)

        ## Positions (and text) of every line that starts with each header.
        start = [[s, line] for s, line in enumerate(visible_texts) if pattern_start.match(line)]
        end = [[e, line] for e, line in enumerate(visible_texts) if pattern_end.match(line)]

        ## If we extracted more than 1 item for each header, ignore the TOC one.
        ## NOTE(review): when only one of the two headers matches more than
        ## once, the first match of both is used -- confirm this is intended.
        if len(start) > 1 and len(end) > 1:
            start = [start[1]]
            end = [end[1]]

        ## If either header is missing, report an error for this section.
        if len(start) == 0 or len(end) == 0:
            print('Section %s incorrectly defined! Skipping...' % section)
            errors.loc[len(errors)] = [doc_id, year, section, 'headers not defined']
            continue

        ## Extract section content, and remove small text.
        content = visible_texts[start[0][0]:end[0][0]]
        len_pre = len(content)
        content_valid = [item for item in content if len(item) > 50]

        ## Checks for section length, relative to the whole document.
        len_content = len(content_valid)
        len_ratio = len_content / float(len_total)

        if len_ratio == 0:
            errors.loc[len(errors)] = [doc_id, year, section, 'length zero']
            print('Section %s on year %s has zero length!!!' % (section, year))

        elif len_ratio > 0.8:
            errors.loc[len(errors)] = [doc_id, year, section, 'length 80% of document']
            print('Section %s on year %s is more than 80%% of the document.' % (section, year))

        ## Add section to dictionary and log (also done for flagged sections).
        logs.loc[len(logs)] = [doc_id, year, section, len_pre, len_content, len_ratio]
        all_sections[section] = content_valid
    all_docs[(doc_id, year)] = all_sections

Section 1B on year 2015 has zero length!!!
Section 9B on year 2015 has zero length!!!


## Test Zone
Use the cells below to test the extraction of individual sections on a specific document.

In [158]:
## Fetch a single filing and show which lines match two consecutive
## section headers.
html = 'https://www.sec.gov/Archives/edgar/data/51143/000104746910001151/a2195966z10-k.htm'
response = urllib2.urlopen(html)
try:
    soup = BeautifulSoup(response.read(), 'html.parser')
finally:
    response.close()

texts = soup.findAll(text=True)
visible_texts = [text for text in texts if visible(text)]

## Check if year is extracted. The pattern is defined here so this cell does
## not depend on another cell having run first.
pattern_year = re.compile(r'\d{2}, \d{4}.*')
year = [line for line in visible_texts if pattern_year.findall(line)][0].split(',')[-1].strip()
print(year)

sections = ['1A', '1B']
i = 0

## Starting and ending patterns: optional leading whitespace, "Item", an
## optional space or non-breaking space, the section number and a dot
## (case-insensitive).
pattern_start = re.compile(r"\s?Item[\xa0\s]?" + sections[i] + r"\.", re.I)
pattern_end = re.compile(r"\s?Item[\xa0\s]?" + sections[i+1] + r"\.", re.I)

## Try to get headers: [position, text] of every matching line.
start = [[s, line] for s, line in enumerate(visible_texts) if pattern_start.match(line)]
end = [[e, line] for e, line in enumerate(visible_texts) if pattern_end.match(line)]

print(start)
print(end)

2009
[[5083, u'Item 1A. Risk Factors']]
[[5089, u'Item 1B. Unresolved Staff Comments']]


In [211]:
## Sanity check: the case-insensitive header pattern must accept an
## upper-case "ITEM 1A." heading. The separator class holds only the
## non-breaking space and whitespace, so stray "(", ")" or "|" are rejected.
pattern_start = re.compile(r"\s?Item[\xa0\s]?" + '1A' + r"\.", re.I)
print(pattern_start.match('ITEM 1A.'))

<_sre.SRE_Match object at 0x7f9c3cf03300>
