In [8]:
import datetime
import json
import os
import pickle
import re
import unicodedata
from functools import partial
from multiprocessing import Manager
from multiprocessing.pool import Pool
from string import printable

import bs4 as bs
from secedgar.filings import Filing,FilingType,CIK
from tqdm import tqdm_notebook

In [9]:
def build_ticker_list(ticker_file):
    """Read ticker symbols from a tab-separated text file.

    Only the first tab-delimited field of each line is used.

    Args:
        ticker_file: Path of the text file to read, one record per line.

    Returns:
        list[str]: First field of every non-blank line, in file order, with
        surrounding whitespace (including the line terminator) removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    tickers = []
    with open(ticker_file, 'r') as f:
        for line in f:
            # strip(): a line without a tab would otherwise keep its '\n'
            symbol = line.split('\t')[0].strip()
            # skip blank lines instead of emitting an empty ticker
            if symbol:
                tickers.append(symbol)
    return tickers


In [16]:
def download_10k(ticker):
    """Request a 10-K filing for ``ticker`` and save it under ../crawl/orig/.

    Args:
        ticker: Ticker symbol; used for the CIK lookup and as the name of the
            output sub-directory.

    Returns:
        Whatever ``Filing.save_simple`` returns for the target directory
        (dl_extract passes it on to extract as a file name).
    """
    target_dir = '../crawl/' + 'orig/' + ticker
    # count=1 with a 2019-01-01 start date: presumably limits the request to a
    # single filing from 2019 onwards — confirm against the secedgar docs
    filing = Filing(
        cik=CIK([ticker]),
        filing_type=FilingType.FILING_10K,
        start_date=datetime.datetime(2019, 1, 1),
        count=1,
    )
    return filing.save_simple(target_dir)

In [14]:
def _write_match_text(path, text):
    """Write ``text`` to ``path``, creating the parent directory if needed."""
    # exist_ok=True: processing the same ticker twice must not raise
    # FileExistsError on the directory created by the first pass
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write(text)


def process_match(match, ticker, write_invalid=True, min_length=2500):
    """Classify a regex match by length and write it to the crawl folders.

    A match of at least ``min_length`` characters is "valid" and is written to
    ``../crawl/valid/<ticker>/matchedText.txt``; if it still contains the
    substring "font" (case-insensitive) the ticker is also appended to
    ``../dirty_writes.txt``. A shorter match is written to
    ``../crawl/invalid/<ticker>/matchedText.txt`` and the ticker appended to
    ``../invalid.txt``, but only when ``write_invalid`` is true.

    Args:
        match: Match object (anything where ``match[0]`` is the matched text).
        ticker: Ticker symbol used in the output paths and log files.
        write_invalid: Whether a too-short match should still be written out.
        min_length: Minimum length of the matched text to count as valid.

    Returns:
        bool: True if something was written, False if the match was too short
        and ``write_invalid`` is false.
    """
    match_str = match[0]

    if len(match_str) >= min_length:
        _write_match_text('../crawl/valid/' + ticker + '/matchedText.txt',
                          match_str)
        # left-over "font" text suggests markup survived the cleaning step
        if 'font' in match_str.lower():
            with open('../dirty_writes.txt', 'a') as f:
                f.write(ticker + '\n')
        return True

    if write_invalid:
        print('invalid write for %s' % ticker)
        _write_match_text('../crawl/invalid/' + ticker + '/matchedText.txt',
                          match_str)
        with open('../invalid.txt', 'a') as f:
            f.write(ticker + '\n')
        return True

    return False

In [15]:
def extract(file_name,ticker):
    """Cut the text between "Item" headings out of a filing and write it out.

    The file is read as UTF-8 and flattened: newlines, tabs, carriage returns,
    ``&nbsp``, ``&#160;`` and every ``;`` become spaces, and runs of spaces are
    collapsed to one. Markup is then removed with BeautifulSoup (lxml parser)
    and every character outside ``string.printable`` is replaced by a space.

    Two case-insensitive, non-greedy patterns are searched in that text:

    * outer: ``Item 15`` up to the next ``Item 4``
    * inner: ``Item 1`` whose next character is neither ``A``/``a`` nor a
      digit, up to the next ``Item 4``

    If the outer pattern matches, the inner pattern is tried inside it and is
    kept only when process_match accepts it with ``write_invalid=False``;
    otherwise the whole outer match goes to process_match. If the outer
    pattern does not match, the inner pattern is tried on the full text, and
    if that fails too the cleaned text is written to
    ``../crawl/no_match/<ticker>/soup.txt``.

    Args:
        file_name: Path of the downloaded filing to read.
        ticker: Ticker symbol, used for output paths and messages.

    Returns:
        None. Results are written to disk.
    """
    with open(file_name,'r',encoding='utf-8') as f:
        sauce = f.read()
    sauce = sauce.strip()
    # '&nbsp' is replaced without its ';' because the last replace turns every
    # remaining ';' into a space anyway
    sauce = sauce.replace('\n', ' ')\
                 .replace('\t', ' ')\
                 .replace('\r', ' ')\
                 .replace('&nbsp', ' ')\
                 .replace('&#160;', ' ')\
                 .replace(';',' ')
    # one pass instead of repeatedly replacing double spaces until none remain
    sauce = re.sub(' {2,}', ' ', sauce)

    soup = bs.BeautifulSoup(sauce,'lxml').text
    # set lookup instead of scanning the printable string for every character
    printable_chars = set(printable)
    soup = ''.join([ch if ch in printable_chars else ' ' for ch in soup])

    regex15_4 = r"Item 15.+?Item 4"
    # the two lookaheads test the same character: not 'A' and not a digit,
    # which rules out headings such as "Item 1A" and "Item 10".."Item 15"
    regex1_4 = r"Item 1(?=[^A])(?=[\D]).+?Item 4"

    outer_match = re.search(regex15_4,soup,flags=re.IGNORECASE)
    if outer_match:
        outer_str = outer_match[0]
        inner_match = re.search(regex1_4,outer_str,flags=re.IGNORECASE)

        # prefer the tighter inner match, but only if it is long enough to be
        # written as valid (write_invalid=False writes nothing otherwise)
        if inner_match and \
            process_match(inner_match,ticker,write_invalid=False):
                return

        # at this point either inner match is empty or too small
        process_match(outer_match,ticker)
    else:
        inner_match = re.search(regex1_4,soup,flags=re.IGNORECASE)
        if inner_match:
            process_match(inner_match,ticker)
        else:
            fn = '../crawl/no_match/' + ticker + '/soup.txt'
            print('no match for %s' %ticker)
            # exist_ok=True: re-running a ticker must not fail because its
            # no_match directory already exists
            os.makedirs(os.path.dirname(fn), exist_ok=True)
            with open(fn,'w') as f:
                f.write(soup)

    return

In [13]:
def dl_extract(ticker,found_list=None):
    """Download the 10-K for ``ticker`` and run the extraction on it.

    If the download raises, the ticker is appended to ``not_found.txt`` and
    the function returns without extracting anything.

    Args:
        ticker: Ticker symbol to download and process.
        found_list: Optional list-like object; the ticker is appended to it
            after a successful download. May be omitted (callers that map
            this function over a list of tickers pass only the ticker).

    Returns:
        None.
    """
    try:
        fn = download_10k(ticker)
    except Exception:
        # Deliberately broad: any download failure is logged and skipped so a
        # long batch run keeps going.
        print('ticker %s not found' % ticker)
        # NOTE(review): this log lives in the current directory, while the
        # other log files and the cell that re-reads not-found tickers use
        # '../' — confirm which location is intended.
        with open('not_found.txt','a') as f:
            f.write(ticker + '\n')
        return
    print('found %s' % ticker)
    if found_list is not None:
        found_list.append(ticker)
    extract(fn,ticker)

In [None]:
## code execution starts here
# Build the work list `tl` from the first column of the local ticker file.
tl = build_ticker_list(ticker_file='/home/mmr/Downloads/ticker.txt')

In [2]:
# Alternative work list: the tickers recorded in ../not_found.txt.
with open('../not_found.txt','r') as f:
    text = f.read()
# The log is written one ticker per line with a trailing '\n', so a plain
# split('\n') would leave an empty-string ticker at the end; drop blanks.
tl = [entry.strip() for entry in text.splitlines() if entry.strip()]

In [6]:
# Alternative work list: tickers pickled to next_round.pkl.
# NOTE: unpickling can execute arbitrary code — only load files you created.
with open('next_round.pkl','rb') as f:  # context manager closes the handle
    tl = pickle.load(f)

In [17]:
# for multi-process run here
# Each worker process gets its own copy of a plain list, so appends made
# inside dl_extract would never reach this process and found_list would stay
# empty. A Manager list proxy is shared between processes instead.
with Manager() as manager:
    shared_found = manager.list()
    # dl_extract takes (ticker, found_list); bind the second argument so the
    # pool only has to supply the ticker
    worker = partial(dl_extract, found_list=shared_found)
    # the context manager shuts the worker processes down when the run ends
    with Pool(2) as pool:
        # total= lets the progress bar show a percentage, not just a counter
        for _ in tqdm_notebook(pool.imap_unordered(worker, tl), total=len(tl)):
            pass
    # copy out of the proxy before the manager process is stopped
    found_list = list(shared_found)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [18]:
# Bare expression: shows the current contents of found_list as the cell output.
found_list

[]

In [None]:
# Hand-picked tickers; a later cell re-runs extract() on a slice of this list.
tickers = [
    'amgn', 'ccl', 'celg',
    'expe', 'fdx', 'hpq',
    'mat', 'swks', 'tss',
]

In [None]:
import os

In [None]:
# Re-run extract() on filings already saved under ../crawl/orig/<ticker>,
# starting from the seventh entry of `tickers`.
for symbol in tickers[6:]:
    ticker_dir = '../crawl/orig/' + symbol
    # uses whichever entry os.listdir happens to return first
    first_entry = os.listdir(ticker_dir)[0]
    extract(ticker_dir + '/' + first_entry, symbol)

In [None]:
# Bare expression: shows the slice of tickers handled by the loop above.
tickers[6:]

In [None]:
## for single process run here 

# dl_extract appends every successfully downloaded ticker to the list it is
# given; calling it with the ticker alone fails when found_list is a required
# argument, so start a fresh list for this run and pass it explicitly.
found_list = []
for tckr in tl:
    dl_extract(tckr, found_list)