# Pull 10k Filings
Based off Sample mappings


In [1]:
%%capture
%pip install python-dotenv pandas tqdm lxml

In [2]:
import pandas as pd
from dotenv import load_dotenv
import os

pd.set_option('display.width', 0)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)


## Get and Save 10k URLs

In [3]:
map_df = pd.read_csv('cik-cusip-sample-map.csv')
map_df.cik = map_df.cik.astype(int)
map_df.sort_values(by='cusip6')
map_df

Unnamed: 0,cusip6,cik,name,cusip
0,001055,4977,{'AFLAC INC'},"{'001055102', '001055952', '000105510', '001055902'}"
1,001084,1100441,{'AGCO CORP'},"{'001084952', '001084902', '001084102'}"
2,001228,1514281,{'AG MORTGAGE INVESTMENT TRUST'},{'001228501'}
3,00123Q,1423689,"{'AGNC INVT CORP', 'AGNC Investment Corp.'}","{'00123q104', '00123Q104', '00123Q954', '00123Q904'}"
4,00130H,874761,{'AES CORP'},"{'00130H905', '00130H955', '00130H105', '00130h105'}"
...,...,...,...,...
4643,Y81669,1328919,{'STEALTHGAS INC'},{'Y81669106'}
4644,Y8564W,911971,"{'TEEKAY CORPORATION', 'Teekay Inc'}","{'Y8564W103', 'y8564w103', 'Y8564W903', 'Y8564W953'}"
4645,Y8565N,1419945,"{'TEEKAY TANKERS LTD', 'Teekay Tankers', 'TEEKAY TANKERS LTD-CLASS A'}","{'Y8565N900', 'y8565n102', 'Y8565N300', 'y8565n300', 'Y8565N950'}"
4646,Y8897Y,1296484,{'TOP SHIPS INC'},{'y8897y198'}


In [4]:
import json
from typing import Dict, Tuple, List
from datetime import datetime
from tqdm import tqdm
import http.client

BASE_URL = 'https://www.sec.gov'

def create_urls_df(cusip_df: pd.DataFrame,
                   dates_from_to:Tuple[str,str] = ('2023-01-01', '2024-01-01'),
                   user_email:str='sales@neo4j.com',
                   user_name='Neo4j') -> pd.DataFrame:
    
    print(f'Found {cusip_df.shape[0]:,} companies to pull filings for')
    start_date = datetime.strptime(dates_from_to[0], '%Y-%m-%d').date()
    end_date = datetime.strptime(dates_from_to[1], '%Y-%m-%d').date()

    urls_list = []

    counter = 0
    for ind, row in tqdm(cusip_df.iterrows()):
        #counter += 1
        #print(f'pulling 10k urls for cusip6: {row.cusip6}, {counter} of {map_df.shape[0]} cusips')
        urls = get_urls(row.cik, start_date, end_date, f'{user_name} {user_email}')
        #print(f'{row.cik}: {urls}')
        urls_list.append(urls)

    cusip_df['form10KUrls'] = urls_list
    return cusip_df[cusip_df.form10KUrls.map(len) > 0].explode(column='form10KUrls')


def get_urls(cik: str, start_date: datetime.date, end_date: datetime.date, user_agent: str):
    filing_accessors = get_filing_accessors(cik, start_date, end_date, user_agent)
    return [format_url(cik, f) for f in filing_accessors]


def get_filing_accessors(cik: str, start_date: datetime.date, end_date: datetime.date, user_agent: str) -> List[str]:
    history = get_filing_history(cik, user_agent)
    history_df = pd.DataFrame.from_dict(history['filings']['recent'])
    history_df.filingDate = pd.to_datetime(history_df.filingDate).dt.date
    filtered_df = history_df[(history_df.filingDate <= end_date) &
                             (history_df.filingDate >= start_date) &
                             (history_df.form == '10-K')]
    return filtered_df.accessionNumber.tolist()


def get_filing_history(cik: str, user_agent: str) -> Dict:
    url = f'https://data.sec.gov//submissions/CIK{int(cik):010d}.json'
    #print(f'Downloading filing history for cik: {cik}')
    conn = http.client.HTTPSConnection('www.sec.gov')
    conn.request('GET', url, headers={'User-Agent': user_agent})
    response = conn.getresponse()
    #print(response.status, response.reason)
    data = response.read()
    conn.close()

    if response.status == 200 and response.reason == 'OK':
        res = data.decode('utf-8')
        return json.loads(res)
    else:
        print(f'Download failed for cik: {cik} filings.')
    return dict()


def format_url(cik: str, filing_accessor: str):
    return BASE_URL + f'/Archives/edgar/data/{int(cik)}/{filing_accessor.replace("-", "")}/{filing_accessor}.txt'


In [22]:
ddf = map_df[:3].reset_index(drop=True)
create_urls_df(ddf)

Found 3 companies to pull filings for


3it [00:00,  3.29it/s]


Unnamed: 0,cusip6,cik,name,cusip,form10KUrls
0,1055,4977,{'AFLAC INC'},"{'001055102', '001055952', '000105510', '001055902'}",https://www.sec.gov/Archives/edgar/data/4977/000000497723000055/0000004977-23-000055.txt
2,1228,1514281,{'AG MORTGAGE INVESTMENT TRUST'},{'001228501'},https://www.sec.gov/Archives/edgar/data/1514281/000151428123000020/0001514281-23-000020.txt


In [23]:
urls_df = create_urls_df(map_df)
urls_df

Found 4,648 companies to pull filings for


4648it [23:40,  3.27it/s]


Unnamed: 0,cusip6,cik,name,cusip,form10KUrls
0,001055,4977,{'AFLAC INC'},"{'001055102', '001055952', '000105510', '001055902'}",https://www.sec.gov/Archives/edgar/data/4977/000000497723000055/0000004977-23-000055.txt
2,001228,1514281,{'AG MORTGAGE INVESTMENT TRUST'},{'001228501'},https://www.sec.gov/Archives/edgar/data/1514281/000151428123000020/0001514281-23-000020.txt
3,00123Q,1423689,"{'AGNC INVT CORP', 'AGNC Investment Corp.'}","{'00123q104', '00123Q104', '00123Q954', '00123Q904'}",https://www.sec.gov/Archives/edgar/data/1423689/000142368923000017/0001423689-23-000017.txt
4,00130H,874761,{'AES CORP'},"{'00130H905', '00130H955', '00130H105', '00130h105'}",https://www.sec.gov/Archives/edgar/data/874761/000087476123000010/0000874761-23-000010.txt
6,00164V,1514991,"{'AMC NETWORKS INC', 'AMC NETWORKS INC A'}","{'00164V103', '00164V903', '00164V953'}",https://www.sec.gov/Archives/edgar/data/1514991/000151499123000009/0001514991-23-000009.txt
...,...,...,...,...,...
4624,Y2187A,1322439,{'EAGLE BULK SHIPPING INC'},"{'y2187a150', 'Y2187A150', 'Y2187A900', 'Y2187A950'}",https://www.sec.gov/Archives/edgar/data/1322439/000162828023007526/0001628280-23-007526.txt
4626,Y2573F,866374,"{'Flex Ltd', 'FLEX LTD'}","{'y2573f102', 'Y2573F102', 'Y2573F902', 'Y2573F952'}",https://www.sec.gov/Archives/edgar/data/866374/000086637423000028/0000866374-23-000028.txt
4627,Y2685T,1326200,"{'GENCO SHIPPING AND TRADING LIM', 'GENCO SHIPPING & TRADING LTD', 'GENCO SHIPPING TRADING LTD'}","{'Y2685T131', 'Y2685T901', 'Y2685T951', 'y2685t131'}",https://www.sec.gov/Archives/edgar/data/1326200/000155837023001781/0001558370-23-001781.txt
4632,Y41053,1679049,{'INTERNATIONAL SEAWAYS INC'},"{'Y41053952', 'Y41053102', 'Y41053902'}",https://www.sec.gov/Archives/edgar/data/1679049/000155837023002247/0001558370-23-002247.txt


In [5]:
urls_df.to_csv('f10k-urls-map.csv', index=False)

NameError: name 'urls_df' is not defined

## Download, Parse, and Format 10k Forms

In [7]:
urls_df = pd.read_csv('f10k-urls-map.csv')
urls_df

Unnamed: 0,cusip6,cik,name,cusip,form10KUrls
0,001055,4977,{'AFLAC INC'},"{'001055102', '001055952', '000105510', '001055902'}",https://www.sec.gov/Archives/edgar/data/4977/000000497723000055/0000004977-23-000055.txt
1,001228,1514281,{'AG MORTGAGE INVESTMENT TRUST'},{'001228501'},https://www.sec.gov/Archives/edgar/data/1514281/000151428123000020/0001514281-23-000020.txt
2,00123Q,1423689,"{'AGNC INVT CORP', 'AGNC Investment Corp.'}","{'00123q104', '00123Q104', '00123Q954', '00123Q904'}",https://www.sec.gov/Archives/edgar/data/1423689/000142368923000017/0001423689-23-000017.txt
3,00130H,874761,{'AES CORP'},"{'00130H905', '00130H955', '00130H105', '00130h105'}",https://www.sec.gov/Archives/edgar/data/874761/000087476123000010/0000874761-23-000010.txt
4,00164V,1514991,"{'AMC NETWORKS INC', 'AMC NETWORKS INC A'}","{'00164V103', '00164V903', '00164V953'}",https://www.sec.gov/Archives/edgar/data/1514991/000151499123000009/0001514991-23-000009.txt
...,...,...,...,...,...
3360,Y2187A,1322439,{'EAGLE BULK SHIPPING INC'},"{'y2187a150', 'Y2187A150', 'Y2187A900', 'Y2187A950'}",https://www.sec.gov/Archives/edgar/data/1322439/000162828023007526/0001628280-23-007526.txt
3361,Y2573F,866374,"{'Flex Ltd', 'FLEX LTD'}","{'y2573f102', 'Y2573F102', 'Y2573F902', 'Y2573F952'}",https://www.sec.gov/Archives/edgar/data/866374/000086637423000028/0000866374-23-000028.txt
3362,Y2685T,1326200,"{'GENCO SHIPPING AND TRADING LIM', 'GENCO SHIPPING & TRADING LTD', 'GENCO SHIPPING TRADING LTD'}","{'Y2685T131', 'Y2685T901', 'Y2685T951', 'y2685t131'}",https://www.sec.gov/Archives/edgar/data/1326200/000155837023001781/0001558370-23-001781.txt
3363,Y41053,1679049,{'INTERNATIONAL SEAWAYS INC'},"{'Y41053952', 'Y41053102', 'Y41053902'}",https://www.sec.gov/Archives/edgar/data/1679049/000155837023002247/0001558370-23-002247.txt


In [8]:
import re
import io
from bs4 import BeautifulSoup

def make_10k_jsons(url_df:pd.DataFrame,
                   temp_dir:str='data/form10k',
                   output_dir:str='data/form10k-clean',
                   user_email:str='sales@neo4j.com',
                   user_name='Neo4j') -> int:


    print(f'Found {url_df.shape[0]:,} companies to pull filings for')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    count = 0
    total = url_df.shape[0]
    print(f'=== Downloading {total:,} 10K filings ===')
    for ind, row in url_df.iterrows():
        count += 1
        print(f'--- Downloading {count:,} of {total:,} 10K filings for {row.name}')
        raw_file_path, file_id = download_filing(row.form10KUrls, f'{user_name} {user_email}', temp_dir)
        if len(raw_file_path) > 0:
            output_file_path = os.path.join(output_dir, file_id + '.json')
            try:
                load_parse_save(raw_file_path, output_file_path, row.cik, row.cusip6, row.form10KUrls, row['name'])
                os.remove(raw_file_path)
            except Exception as e:
                print(e)
    return 0


def download_filing(url: str, user_agent: str, temp_dir: str) -> tuple:
    conn = http.client.HTTPSConnection('www.sec.gov')
    conn.request('GET', url, headers={'User-Agent': user_agent})
    response = conn.getresponse()
    data = response.read()
    conn.close()

    if response.status == 200 and response.reason == 'OK':
        text = data.decode('utf-8')
        file = io.StringIO(text)
        contents = file.read()
        file.close()
        file_id = url[url.rindex('/') + 1:url.rindex('.')]
        file_path = os.path.join(temp_dir, 'raw_' + file_id + '.txt')
        with open(file_path, 'w') as file:
            file.write(contents)
        return file_path, file_id
    else:
        print('Download failed for form13 file.')
        print(response.status, response.reason)
        return '', ''


def extract_10_k(txt: str) -> str:
    # Regex to find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    # Regex to find <TYPE> tag proceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    # Create 3 lists with the span idices for each regex

    # There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
    # First filter will give us document tag start <end> and document tag end's <start>
    # We will use this to later grab content in between these tags
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(txt)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(txt)]

    # Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
    # to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
    # Once we have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K'
    # as section names
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(txt)]
    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    # there is just one 10-K section
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            return txt[doc_start:doc_end]


# Extract text using position dataframe and beautiful soup
def beautify_text(txt: str) -> str:
    stg_txt = BeautifulSoup(txt, 'lxml')
    return stg_txt.get_text('\n')


def extract_text(row: pd.Series, txt: str):
    section_txt = txt[row.start:row.sectionEnd].replace('Error! Bookmark not defined.', '')
    return beautify_text(section_txt)


def extract_section_text(doc: str) -> Dict[str, str]:
    # Write the regex
    regex = re.compile(r'(>(Item|ITEM)(\s|&#160;|&nbsp;)(1A|1B|1\.|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|1\.|7A|7|8))')
    # Use finditer to math the regex
    matches = regex.finditer(doc)
    # Write a for loop to print the matches
    # Create the dataframe
    item_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])
    item_df.columns = ['item', 'start', 'end']
    item_df['item'] = item_df.item.str.lower()

    item_df.replace('&#160;', ' ', regex=True, inplace=True)
    item_df.replace('&nbsp;', ' ', regex=True, inplace=True)
    item_df.replace(' ', '', regex=True, inplace=True)
    item_df.replace('\.', '', regex=True, inplace=True)
    item_df.replace('>', '', regex=True, inplace=True)

    all_pos_df = item_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last').set_index(
        'item')
    # Add section end using start of next section
    all_pos_df['sectionEnd'] = all_pos_df.start.iloc[1:].tolist() + [len(doc)]
    # filter to just the sections we care about
    pos_df = all_pos_df.loc[['item1', 'item1a', 'item7', 'item7a'], :]
    res = dict()
    for i, row in pos_df.iterrows():
        res[i] = extract_text(row, doc)
    return res

def extract_first_name(x):
    return x[2:x.find("'", x.find("'") + 1)].upper()

def load_parse_save(input_file_path: str, output_file_path: str, cik: str, cusip6: str, url: str, name:str) -> None:
    with open(input_file_path, 'r') as file:
        raw_txt = file.read()
    print('Extracting 10-K')
    doc = extract_10_k(raw_txt)
    print('Parsing relevant sections')
    cleaned_json_txt = extract_section_text(doc)
    cleaned_json_txt['cik'] = cik
    cleaned_json_txt['cusip6'] = cusip6
    cleaned_json_txt['source'] = url[:url.rindex('.')] + '-index.htm'
    cleaned_json_txt['name'] = extract_first_name(name)
    print('Writing clean text to json')
    with open(output_file_path, 'w') as json_file:
        json.dump(cleaned_json_txt, json_file, indent=4)

In [10]:
url_samp_df = urls_df
make_10k_jsons(url_samp_df)

Found 3,365 companies to pull filings for
=== Downloading 3,365 10K filings ===
--- Downloading 1 of 3,365 10K filings for 0
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 2 of 3,365 10K filings for 1
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 3 of 3,365 10K filings for 2
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 4 of 3,365 10K filings for 3
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 5 of 3,365 10K filings for 4
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 6 of 3,365 10K filings for 5
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 7 of 3,365 10K filings for 6
Extracting 10-K
Parsing relevant sections
Writing clean text to json
--- Downloading 8 of 3,365 10K filings for 7
Extracting 10-K
Parsing relevant sections
"['item7a'] not in index"
--- Downl

0