In [None]:
import numpy as np
import pandas as pd

In [None]:
import warnings
warnings.filterwarnings('ignore')

## CIKs

In [None]:
# Load the ticker/CIK table, discard the 'Unnamed: 0' column
# (presumably a saved index - TODO confirm), and parse the three date columns.
df = pd.read_csv('sp500_w_addl_id_with_cik.csv').drop(columns=['Unnamed: 0'])
for date_col in ('date', 'start', 'ending'):
    df[date_col] = pd.to_datetime(df[date_col].to_list())

In [None]:
# Sorted array of the distinct tickers present in df.
tickers = df['ticker'].unique()
tickers.sort()

# One row per ticker (its first occurrence in df), ordered by ticker.
# drop_duplicates does this in a single pass and keeps df's column dtypes,
# instead of re-filtering the whole frame once per ticker and concatenating
# one-row frames rebuilt from object-dtype rows.
idmap = (
    df.drop_duplicates(subset='ticker', keep='first')
    .sort_values('ticker')
    .drop(columns=['date', 'ret'])
    .reset_index(drop=True)
)
idmap.head()

In [None]:
# Map each ticker in idmap to its CIK converted to a plain int.
cik_lookup = {
    ticker: int(cik)
    for ticker, cik in zip(idmap['ticker'], idmap['cik'])
}

## Get 10-Ks and 10-Qs

In [None]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
import re

In [None]:
from ratelimit import limits, sleep_and_retry

### Get list of 10-Ks and 10-Qs

In [None]:
class SecAPI(object):
    """Small rate-limited HTTP GET client.

    Requests go through `_call_sec`, which is wrapped by ratelimit's
    `sleep_and_retry` / `limits` decorators so that at most half of
    ``SEC_CALL_LIMIT['calls']`` requests are issued per
    ``SEC_CALL_LIMIT['seconds']`` seconds.
    """
    # Request budget; the decorator below deliberately uses half of 'calls'.
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
    # Seconds to wait on connect / between received bytes before giving up.
    # Without a timeout a single stalled connection blocks the crawl forever.
    REQUEST_TIMEOUT = 30

    @staticmethod
    @sleep_and_retry
    @limits(calls=SEC_CALL_LIMIT['calls'] // 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        """Issue one throttled GET for `url` and return the `requests` response.

        Raises whatever `requests.get` raises, including `requests.Timeout`
        when the server does not respond within `REQUEST_TIMEOUT` seconds.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'}
        return requests.get(url, headers=headers, timeout=SecAPI.REQUEST_TIMEOUT)

    def get(self, url):
        """Fetch `url` and return the response body decoded as text."""
        return self._call_sec(url).text

In [None]:
# Shared client instance; the download helpers in this notebook call sec_api.get(url).
sec_api = SecAPI()

In [None]:
def get_sec_entries(cik, doc_type, start=0, count=100, datea=20160101, dateb=20220101):
    """List the filings EDGAR's company browser reports for one company.

    Parameters
    ----------
    cik : value formatted into the ``CIK`` query parameter.
    doc_type : value formatted into the ``type`` query parameter (e.g. '10-K').
    start, count : paging values formatted into the query string.
    datea, dateb : date bounds formatted into the query string.

    Returns
    -------
    list of (filing-href, filing-type, filing-date) string tuples, one per
    top-level ``<entry>`` of the Atom feed. An empty list is returned when
    the response contains no ``<feed>`` element.
    """
    url = (
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'
        '&CIK={}&type={}&start={}&count={}&owner=exclude&datea={}&dateb={}&output=atom'
    ).format(cik, doc_type, start, count, datea, dateb)
    sec_data = sec_api.get(url)
    # The parser is fed bytes. A strict ASCII encode raised UnicodeEncodeError
    # on any non-ASCII character in the feed; emitting XML character
    # references keeps the payload pure ASCII without losing those characters.
    markup = sec_data.encode('ascii', errors='xmlcharrefreplace')
    feed = BeautifulSoup(markup, 'xml').feed
    if feed is None:
        # Not an Atom document (e.g. an HTML error page): nothing to list.
        return []
    entries = [(entry.content.find('filing-href').getText(),
                entry.content.find('filing-type').getText(),
                entry.content.find('filing-date').getText())
               for entry in feed.find_all('entry', recursive=False)]
    return entries

In [None]:
# For every ticker, store its 10-Q entries followed by its 10-K entries.
sec_entries = {}
progress = tqdm(cik_lookup.items(), desc='Getting 10-K/Q entries per ticker', unit='ticker')
for ticker, cik in progress:
    sec_entries[ticker] = get_sec_entries(cik, '10-Q')
    sec_entries[ticker].extend(get_sec_entries(cik, '10-K'))

### Download 10-Ks and 10-Qs

In [None]:
def download_sec_data(sec_entries):
    """Download the text of every 10-K / 10-Q entry.

    `sec_entries` maps ticker -> iterable of (index_url, file_type, file_date).
    Returns a dict mapping ticker -> {file_date: downloaded text}; entries
    whose type is neither '10-K' nor '10-Q' are skipped.
    """
    wanted_types = ('10-K', '10-Q')
    downloads = {}
    for ticker, entries in sec_entries.items():
        by_date = downloads[ticker] = {}
        progress = tqdm(entries, desc='Downloading {} Fillings'.format(ticker), unit='filling')
        for index_url, file_type, file_date in progress:
            if file_type not in wanted_types:
                continue
            # '-index.htm' -> '.txt'; an '-index.html' URL becomes '.txtl'
            # after the first replace, which the second replace repairs.
            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
            by_date[file_date] = sec_api.get(file_url)
    return downloads

In [None]:
def get_documents(text):
    """Return the text found between each <DOCUMENT> / </DOCUMENT> pair.

    The i-th opening tag is paired with the i-th closing tag; surplus tags
    of either kind are ignored.
    """
    start_offsets = [match.end() for match in re.finditer(r'<DOCUMENT>', text)]
    end_offsets = [match.start() for match in re.finditer(r'</DOCUMENT>', text)]
    return [text[begin:stop] for begin, stop in zip(start_offsets, end_offsets)]

In [None]:
def get_document_type(doc):
    """Return the value of the first ``<TYPE>`` line in `doc`, upper-cased.

    Surrounding whitespace (including a trailing carriage return from CRLF
    line endings) is stripped, matching `get_document_sequence`, so the
    result can be compared directly against values such as '10-K'.

    Raises IndexError if `doc` contains no ``<TYPE>`` tag followed by text.
    """
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_type = type_pattern.findall(doc)[0][len('<TYPE>'):]
    return doc_type.strip().upper()

In [None]:
def get_document_sequence(doc):
    """Return the stripped value of the first ``<SEQUENCE>`` line in `doc`.

    Raises IndexError if `doc` contains no ``<SEQUENCE>`` tag followed by text.
    """
    # With a single capture group, findall yields just the text after the tag.
    values = re.findall(r'<SEQUENCE>([^\n]+)', doc)
    return values[0].strip()

In [None]:
def get_data(entries):
    """Download each ticker's 10-K / 10-Q filings and save the main documents.

    `entries` maps ticker -> iterable of (index_url, file_type, file_date).
    For every ticker the listed 10-K / 10-Q filings are downloaded, split
    into their <DOCUMENT> sections, and the sections whose type is '10-K' or
    '10-Q' and whose sequence is '1' are written to ``raw/<ticker>.csv`` with
    columns ``type``, ``date`` and ``file``.

    Returns None. The ``raw`` directory is created if it does not exist.

    NOTE(review): downloads are keyed by filing date, so two filings of one
    ticker sharing a date keep only the later-listed one - confirm intended.
    """
    import os

    # to_csv does not create missing directories; create it up front so the
    # failure cannot happen only after a whole ticker has been downloaded.
    os.makedirs('raw', exist_ok=True)

    for ticker, data in entries.items():
        raw_filings = {}
        for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
            if file_type == '10-K' or file_type == '10-Q':
                # '-index.htm' -> '.txt'; '-index.html' yields '.txtl', fixed by the second replace.
                file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
                raw_filings[file_date] = sec_api.get(file_url)

        rows = []
        for file_date, filing in tqdm(raw_filings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
            for document in get_documents(filing):
                document_type = get_document_type(document)
                document_sequence = get_document_sequence(document)
                if (document_type == '10-K' or document_type == '10-Q') and document_sequence == '1':
                    rows.append([document_type, file_date, document])

        main_documents = pd.DataFrame(data=rows, columns=['type', 'date', 'file'])
        main_documents.to_csv(f'raw/{ticker}.csv', index=False)

In [None]:
# Download the listed filings and write one raw/<ticker>.csv per ticker.
get_data(sec_entries)

## Preprocess data

### Clean data

In [None]:
# Tickers to preprocess: every key of the ticker -> CIK mapping.
tickers = list(cik_lookup)

In [None]:
import nltk
from nltk.corpus import stopwords
# English stop words from NLTK; clean_file removes tokens found in this list.
stopwords_list = stopwords.words('english')

In [None]:
def _find_token_run(tokens, run):
    """Return the first index where `run` occurs contiguously in `tokens`, or -1."""
    width = len(run)
    for pos in range(len(tokens) - width + 1):
        if tokens[pos:pos + width] == run:
            return pos
    return -1


def clean_file(file):
    """Turn one raw filing document into a space-joined string of kept words.

    Steps: lower-case the text, strip markup with BeautifulSoup, tokenize
    with NLTK, skip everything before the first occurrence of
    'united states securities and exchange commission' (falling back to
    'securities and exchange commission'), keep only tokens that are
    alphabetic or alphabetic parts joined by apostrophes, and drop stop words.

    If neither phrase is present the whole document is kept. Previously the
    scan index was left at the last scanned position, so all but the final
    few tokens were discarded, and inputs shorter than the phrase raised
    UnboundLocalError.
    """
    body = BeautifulSoup(file.lower(), 'html.parser').get_text(separator=' ', strip=True)
    tok = nltk.word_tokenize(body)

    begin_doc = ['united', 'states', 'securities', 'and', 'exchange', 'commission']
    start = _find_token_run(tok, begin_doc)
    if start < 0:
        start = _find_token_run(tok, begin_doc[2:])
    if start < 0:
        start = 0

    # Set membership instead of scanning the stop-word list for every token.
    stop_words = set(stopwords_list)
    # A purely alphabetic word splits into itself, so one check covers both
    # plain words and apostrophe-joined words such as "o'clock".
    res = [
        word for word in tok[start:]
        if all(part.isalpha() for part in word.split("'")) and word not in stop_words
    ]
    return ' '.join(res)

In [None]:
def clean_data(tickers):
    """Clean each ticker's raw filings and save them under ``clean/``.

    For every ticker, ``raw/<ticker>.csv`` is read, rows whose date falls
    within that ticker's [start, ending] window from `idmap` (inclusive) are
    passed through `clean_file`, and the result is written to
    ``clean/<ticker>.csv`` with the same columns. No file is written for a
    ticker with no rows inside its window.

    Returns None. The ``clean`` directory is created if it does not exist.
    """
    import os

    # to_csv does not create missing directories.
    os.makedirs('clean', exist_ok=True)

    for ticker in tickers:
        raw = pd.read_csv(f'raw/{ticker}.csv')
        # Look the ticker's row up once rather than once per bound.
        id_rows = idmap[idmap['ticker'] == ticker]
        start = pd.to_datetime(id_rows['start'].to_list()[0])
        end = pd.to_datetime(id_rows['ending'].to_list()[0])
        files = []
        for i in tqdm(range(raw.shape[0]), desc=f'Cleaning {ticker} 10-K/Qs', unit='file'):
            if start <= pd.to_datetime(raw.loc[i, 'date']) <= end:
                files.append([raw.loc[i, 'type'], raw.loc[i, 'date'], clean_file(raw.loc[i, 'file'])])
        if len(files) > 0:
            cleaned = pd.DataFrame(data=files, columns=raw.columns)
            cleaned.to_csv(f'clean/{ticker}.csv', index=False)

In [None]:
# Clean every ticker's raw filings; results are written to clean/<ticker>.csv.
clean_data(tickers)