# Pull Form 13s


In [1]:
%%capture
%pip install pandas requests-ratelimiter xmltodict

## Download Raw Form 13 Files

In [6]:
import io
import math
from typing import Tuple
import datetime
import os
import csv
import xmltodict

from requests_ratelimiter import LimiterSession

# Shared HTTP session, rate-limited to at most 10 requests per second.
# Both the master-index download and the per-filing downloads go through it.
# NOTE(review): 10/s is presumably chosen to respect SEC EDGAR's request-rate policy — confirm.
session = LimiterSession(per_second=10)

def pull_form13s(output_dir:str='data/form13',
         dates_from_to:Tuple[str,str] = ('2023-01-01', '2024-01-01'),
         user_email:str='sales@neo4j.com',
         user_name='Neo4j') -> int:
    """Download the raw Form 13 filings for every day in a date range.

    Days are processed one at a time, walking backwards from the end date to
    the start date (both inclusive). Each day is delegated to `download_date`.

    Args:
        output_dir: Directory the raw filings are written to. It is created
            (including missing parents) when it does not exist yet.
        dates_from_to: `(start, end)` dates formatted as `YYYY-MM-DD`. When
            `start` is later than `end` nothing is downloaded.
        user_email: Contact e-mail sent as part of the User-Agent header.
        user_name: Name sent as part of the User-Agent header.

    Returns:
        Always 0.

    Raises:
        ValueError: If either date does not match `YYYY-MM-DD`.
        FileExistsError: If `output_dir` exists but is not a directory.
    """
    first_day = datetime.datetime.strptime(dates_from_to[0], '%Y-%m-%d').date()
    current_day = datetime.datetime.strptime(dates_from_to[1], '%Y-%m-%d').date()

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # The header value is the same for every day, so build it once.
    user_agent = f'{user_name} {user_email}'
    one_day = datetime.timedelta(days=1)
    while current_day >= first_day:
        print(current_day)
        download_date(current_day, output_dir, user_agent)
        current_day = current_day - one_day
    return 0

def download_date(date, output_dir, user_agent):
    """Download every Form 13 filing listed in the daily index of one date.

    Each filing is written to `output_dir` under a file name derived from its
    archive path, with every '/' replaced by '_'. A failure on one filing is
    printed and does not stop the remaining downloads.

    Args:
        date: The `datetime.date` whose daily index is read.
        output_dir: Existing directory the filings are written to.
        user_agent: Value of the User-Agent header sent with every request.
    """
    form13_paths = get_form13_urls(date, user_agent)
    # Download each Form 13
    print('We have ' + str(len(form13_paths)) + ' Form 13 URLs for the date ' + str(date))
    for path in form13_paths:
        try:
            filings = download_form13(path, user_agent)
            if not filings:
                # A failed download yields an empty result. Skip it before the
                # file is opened so no empty .txt file is left in output_dir.
                continue
            with open(os.path.join(output_dir, path.replace('/', '_')), 'w') as file:
                file.write(filings)
        except Exception as e:
            print(e)


def get_form13_urls(date, user_agent='Neo4j andreas.kollegger@neo4j.com'):
    """Return the archive paths of the Form 13 filings indexed for `date`.

    Downloads the daily master index for the date and extracts the matching
    paths with `parse_master_file`.

    Args:
        date: The `datetime.date` whose daily master index is fetched.
        user_agent: Value of the User-Agent header sent with the request. The
            default keeps the header that was previously hard-coded here.

    Returns:
        A list of archive paths, or an empty list when the index could not be
        downloaded (any HTTP status other than 200).
    """
    print('Composing the URL of the master file...')
    year = str(date.year)
    quarter = 'QTR' + str(math.ceil(date.month / 3))
    date_stamp = date.strftime('%Y%m%d')
    path = '/Archives/edgar/daily-index/' + year + '/' + quarter + '/master.' + date_stamp + '.idx'
    url = 'https://www.sec.gov' + path
    print('The URL of the master file is ' + url)

    print('Downloading the master file...')
    # Request exactly the URL that was printed. `path` already starts with '/',
    # so appending it to a host ending in '/' would produce a double slash.
    response = session.get(url, headers={'User-Agent': user_agent})
    print(response.status_code)

    if response.status_code == 200:
        return parse_master_file(response.text)

    print('Download failed for master file.', response.status_code)
    return []


def parse_master_file(text):
    """Extract the archive paths of all '13F-HR' filings from a master index.

    The index is read as pipe-delimited text. Only rows with exactly five
    fields are considered, which skips the header and separator lines. A row
    is kept when its third field equals '13F-HR' exactly.

    Args:
        text: Full text of a master index file.

    Returns:
        A list of paths, each '/Archives/' followed by the row's fifth field.
    """
    print('Parsing the master file...')
    form13_paths = []
    # QUOTE_NONE: the index is plain delimited text, not quoted CSV. With the
    # default quoting, a '"' at the start of a field would swallow the
    # following delimiters and lines and silently drop filings.
    reader = csv.reader(io.StringIO(text), delimiter='|', quoting=csv.QUOTE_NONE)
    for row in reader:
        if len(row) == 5 and row[2] == '13F-HR':
            form13_paths.append('/Archives/' + row[4])

    return form13_paths


def download_form13(path, user_agent):
    """Download one filing from www.sec.gov and return its text.

    Args:
        path: Archive path of the filing, e.g. '/Archives/edgar/data/...'.
        user_agent: Value of the User-Agent header sent with the request.

    Returns:
        The response body as a string on HTTP 200. On any other status a
        message is printed and an empty list is returned, which callers can
        treat as a falsy "nothing downloaded" marker.
    """
    # Strip the leading '/' so the URL never contains a double slash after the host.
    url = 'https://www.sec.gov/' + path.lstrip('/')
    response = session.get(url, headers={'User-Agent': user_agent})

    if response.status_code == 200:
        # Log the URL that was actually requested.
        print(url)
        return response.text

    print('Download failed for form13 file: HTTP ', response.status_code)
    return []


In [7]:
# Download the filings indexed from 2023-02-02 back to 2023-01-01 into the default output directory.
pull_form13s(dates_from_to=('2023-01-01', '2023-02-02'))

2023-02-02
Composing the URL of the master file...
The URL of the master file is https://www.sec.gov/Archives/edgar/daily-index/2023/QTR1/master.20230202.idx
Downloading the master file...
200
Parsing the master file...
We have 176 Form 13 URLs for the date 2023-02-02
http://sec.gov/Archives/edgar/data/1042063/0001042063-23-000002.txt
http://sec.gov/Archives/edgar/data/1053292/0001053292-23-000002.txt
http://sec.gov/Archives/edgar/data/105495/0001085146-23-000575.txt
http://sec.gov/Archives/edgar/data/1058231/0001567619-23-001698.txt
http://sec.gov/Archives/edgar/data/1071483/0000909012-23-000017.txt
http://sec.gov/Archives/edgar/data/1080576/0001080576-23-000001.txt
http://sec.gov/Archives/edgar/data/1088950/0001088950-23-000001.txt
http://sec.gov/Archives/edgar/data/1121908/0001104659-23-009854.txt
http://sec.gov/Archives/edgar/data/1128213/0001128213-23-000001.txt
http://sec.gov/Archives/edgar/data/1129919/0001085146-23-000583.txt
http://sec.gov/Archives/edgar/data/1140771/000114077

KeyboardInterrupt: 

## Format Form 13 Info to CSV and Save

In [12]:
from typing import List, Dict
import pandas as pd
import xmltodict

# Column names used for the parsed holdings DataFrame and the CSV written from it.
# Filing manager (the institution that submitted the form)
FILING_MANAGER_NAME_COL = 'managerName'
FILING_MANAGER_CIK_COL = 'managerCik'
# Reporting period taken from the filing's cover page
REPORT_PERIOD_COL = 'reportCalendarOrQuarter'
# Held security: full CUSIP, estimated 6-character prefix, and issuer name
COMPANY_CUSIP_COL = 'cusip'
COMPANY_CUSIP6_COL = 'cusip6'
COMPANY_NAME_COL = 'companyName'
# URL of the filing a row was parsed from
SOURCE_ID_COL = 'source'
# Reported value and share count of the holding
VALUE_COL = 'value'
SHARES_COL = 'shares'


def format_form13s(input_dir:str='data/form13',
                   user_email:str='sales@neo4j.com',
                   user_name='Neo4j', top_periods:int=4) -> pd.DataFrame:
    """Parse downloaded filings into one aggregated holdings DataFrame.

    Reads every filing in `input_dir`, aggregates duplicate holdings and,
    unless `top_periods` is None, keeps only the most recent reporting
    periods. The names of files that failed to parse are printed at the end.

    Args:
        input_dir: Directory containing the downloaded filing `.txt` files.
        user_email: Not used by this function.
        user_name: Not used by this function.
        top_periods: Number of most recent reporting periods to keep, or
            None to keep all of them.

    Returns:
        The aggregated (and optionally filtered) holdings.
    """
    parsed_df, failed_files = parse_from_dir(input_dir)
    aggregated_df = aggregate_data(parsed_df)
    result_df = aggregated_df if top_periods is None else filter_data(aggregated_df, top_periods)

    print(f'===== Had {len(failed_files)} failed file parsings ====')
    for failed_file in failed_files:
        print(failed_file)
    return result_df


# function to strip namespaces post xmltodict transformation
def strip_ns(x):
    """Recursively drop namespace prefixes from the keys of a parsed document.

    Every dictionary key is reduced to the part after its last ':'. Lists are
    processed element by element; any other value is returned unchanged.

    Args:
        x: A dict, a list, or a leaf value.

    Returns:
        A new structure of plain dicts and lists with the shortened keys.
    """
    if isinstance(x, dict):
        return {key.split(':')[-1]: strip_ns(value) for key, value in x.items()}
    if isinstance(x, list):
        return [strip_ns(item) for item in x]
    return x


def extract_submission_info(contents: List[str]) -> Dict:
    """Parse the submission XML of a filing into a namespace-free mapping.

    Args:
        contents: The filing text split on '<XML>'. The element at index 1 is
            parsed, up to its closing '</XML>' tag.
            NOTE(review): assumes the submission document is always the first
            XML section of the filing — confirm.

    Returns:
        The content of the 'edgarSubmission' root element.

    Raises:
        IndexError: If the filing has no '<XML>' section.
        KeyError: If the root element is not 'edgarSubmission'.
    """
    xml = contents[1].partition('</XML>')[0].strip()
    return strip_ns(xmltodict.parse(xml))['edgarSubmission']


def extract_investment_info(contents: List[str]) -> List[Dict]:
    """Parse the holdings XML of a filing into a list of holding rows.

    Args:
        contents: The filing text split on '<XML>'. The element at index 2 is
            parsed, up to its closing '</XML>' tag.
            NOTE(review): assumes the information table is always the second
            XML section of the filing — confirm.

    Returns:
        The 'infoTable' entries of the 'informationTable' root element, always
        as a list.

    Raises:
        IndexError: If the filing has fewer than two '<XML>' sections.
        KeyError: If 'informationTable' or 'infoTable' is missing.
    """
    xml = contents[2].partition('</XML>')[0].strip()
    info_tables = strip_ns(xmltodict.parse(xml))['informationTable']['infoTable']
    # A filing with a single holding is parsed as one mapping instead of a
    # list, so wrap it to give callers a uniform shape.
    if isinstance(info_tables, dict):
        info_tables = [info_tables]
    return info_tables


def estimate_cusip6(cusip: str) -> str:
    """Estimate the 6-character issuer prefix of a CUSIP, upper-cased.

    A CUSIP starting with three zeros is treated as carrying one extra
    padding zero, which filers apply inconsistently. For those the prefix is
    taken from characters 2-7 instead of 1-6.

    Args:
        cusip: The CUSIP as reported in the filing.

    Returns:
        Up to six upper-case characters.
    """
    start = 1 if cusip.startswith('000') else 0
    normalized = cusip.upper()
    return normalized[start:start + 6]


def filter_and_format(info_tables: List[Dict], manager_cik: str, manager_name: str,
                      report_period: str) -> List[Dict]:
    """Select the holdings of interest from a filing and flatten them to rows.

    A holding is kept only when all of the following are true:
      * its CUSIP is not the all-zero "none to report" placeholder,
      * its amount type ('sshPrnamtType') is 'SH',
      * its reported value times 1000 is at least 10,000,000,
      * its class title is exactly 'COM'.

    Args:
        info_tables: Holding entries of one filing. A single mapping is also
            accepted and treated as a one-element list.
        manager_cik: CIK of the filing manager, copied into every row.
        manager_name: Name of the filing manager, copied into every row.
        report_period: Reporting period of the filing, copied into every row.

    Returns:
        One dict per kept holding, keyed by the module's column-name constants.
    """
    res = []
    if isinstance(info_tables, dict):
        info_tables = [info_tables]
    for info_table in info_tables:
        # Skip "none to report" placeholder rows. This check used to fall
        # through to the next `if`, so placeholder rows were never skipped.
        if info_table['cusip'] == '000000000':
            continue
        # Keep only positions reported as a number of shares
        if info_table['shrsOrPrnAmt']['sshPrnamtType'] != 'SH':
            continue
        # Only want holdings over $10m. The reported value is treated as
        # thousands of dollars.
        # NOTE(review): confirm the unit for the filings in scope. If values
        # are reported in whole dollars, this threshold and the '000' suffix
        # below both overstate the value by a factor of 1000.
        if (float(info_table['value']) * 1000) < 10000000:
            continue
        # Only want common stock
        if info_table['titleOfClass'] != 'COM':
            continue
        res.append({FILING_MANAGER_CIK_COL: manager_cik,
                    FILING_MANAGER_NAME_COL: manager_name,
                    REPORT_PERIOD_COL: report_period,
                    COMPANY_CUSIP_COL: info_table['cusip'].upper(),
                    COMPANY_CUSIP6_COL: estimate_cusip6(info_table['cusip']),
                    COMPANY_NAME_COL: info_table['nameOfIssuer'],
                    VALUE_COL: info_table['value'].replace(' ', '') + '000',
                    SHARES_COL: info_table['shrsOrPrnAmt']['sshPrnamt']})
    return res


def extract_dicts(txt: str) -> List[Dict]:
    """Turn the raw text of one filing into formatted holding rows.

    The text is split on '<XML>'. The submission section supplies the
    manager's CIK and name and the reporting period; the holdings section
    supplies the positions, which are then filtered and flattened.

    Args:
        txt: Full text of a downloaded filing.

    Returns:
        The rows produced by `filter_and_format` for this filing.
    """
    sections = txt.split('<XML>')
    submission = extract_submission_info(sections)

    manager_cik = submission['headerData']['filerInfo']['filer']['credentials']['cik']
    cover_page = submission['formData']['coverPage']
    manager_name = cover_page['filingManager']['name']
    period = cover_page['reportCalendarOrQuarter']

    holdings = extract_investment_info(sections)
    return filter_and_format(holdings, manager_cik, manager_name, period)


def parse_from_dir(directory_path: str):
    """Parse every '.txt' filing in a directory into one holdings DataFrame.

    Files that raise during reading or parsing are reported and collected
    instead of aborting the run.

    Args:
        directory_path: Directory containing the downloaded filings.

    Returns:
        A tuple `(filing_df, failures)`:
          * `filing_df` has one row per kept holding, with the report period
            converted to dates, the value to float and the shares to int. It
            is empty, but still has all columns, when nothing could be parsed.
          * `failures` lists the names of the files that failed to parse.
    """
    # Go through all files and collect their holdings as plain records
    print(f'=== Begin Parsing from {directory_path} ===')
    columns = [FILING_MANAGER_CIK_COL, FILING_MANAGER_NAME_COL, REPORT_PERIOD_COL,
               COMPANY_CUSIP_COL, COMPANY_CUSIP6_COL, COMPANY_NAME_COL,
               VALUE_COL, SHARES_COL, SOURCE_ID_COL]
    records = []
    failures = []
    # Sorted so the log and the failure list come out in a stable order.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.txt'):
            continue
        print(f'parsing {file_name}')
        file_path = os.path.join(directory_path, file_name)
        try:
            with open(file_path, 'r') as file:
                holdings = extract_dicts(file.read())
        except Exception as e:
            print(e)
            failures.append(file_name)
            continue
        # The file name is the archive path with '/' replaced by '_'; undo
        # that to rebuild the source URL.
        source = 'https://sec.gov' + file_name.replace('_', '/')
        records.extend({**holding, SOURCE_ID_COL: source} for holding in holdings)

    # Build the frame once with explicit columns. This also works when no
    # file yielded any row, where concatenating per-file frames would raise.
    filing_df = pd.DataFrame(records, columns=columns)
    filing_df[REPORT_PERIOD_COL] = pd.to_datetime(filing_df[REPORT_PERIOD_COL]).dt.date
    filing_df[VALUE_COL] = filing_df[VALUE_COL].astype(float)
    filing_df[SHARES_COL] = filing_df[SHARES_COL].astype(int)
    return filing_df, failures


# This data contains duplicates where an asset is reported more than once for the same filing manager within the same
# report calendar/quarter.
# See for example https://www.sec.gov/Archives/edgar/data/1962636/000139834423009400/0001398344-23-009400.txt
# for our intents and purposes we will sum over values and shares to aggregate the duplicates out
def aggregate_data(filings_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse duplicate holdings by summing their value and shares.

    Rows are grouped by source, manager CIK, manager name, report period,
    CUSIP6 and CUSIP. Within each group the value and shares are summed and
    the first company name is kept.

    Args:
        filings_df: Parsed holdings, one row per reported position.

    Returns:
        One row per group, with the grouping keys restored as columns.
    """
    print('=== Aggregating Parsed Data ===')
    group_keys = [SOURCE_ID_COL, FILING_MANAGER_CIK_COL, FILING_MANAGER_NAME_COL,
                  REPORT_PERIOD_COL, COMPANY_CUSIP6_COL, COMPANY_CUSIP_COL]
    aggregations = {COMPANY_NAME_COL: 'first', VALUE_COL: 'sum', SHARES_COL: 'sum'}
    aggregated = filings_df.groupby(group_keys).agg(aggregations)
    return aggregated.reset_index()


def filter_data(filings_df: pd.DataFrame, top_n_periods: int) -> pd.DataFrame:
    """Keep only the rows of the most recent reporting periods.

    Args:
        filings_df: Holdings with a report-period column.
        top_n_periods: Number of latest distinct periods to keep. A value
            larger than the number of available periods keeps everything;
            zero or a negative value keeps nothing.

    Returns:
        The rows of `filings_df` whose report period is among the latest
        `top_n_periods` distinct periods.
    """
    print('=== Filtering Data ===')
    periods = filings_df[REPORT_PERIOD_COL].dropna().drop_duplicates().sort_values().tolist()
    # Guard the slice explicitly: periods[-0:] would be the whole list, so a
    # request for zero periods used to keep every period.
    top_periods = periods[-top_n_periods:] if top_n_periods > 0 else []
    return filings_df[filings_df[REPORT_PERIOD_COL].isin(top_periods)]

In [13]:
# Parse the downloaded filings with the default settings (directory 'data/form13', latest 4 report periods)
form13_df = format_form13s()
# Display the resulting DataFrame as the cell output
form13_df

=== Begin Parsing from data/form13 ===
parsing _Archives_edgar_data_1721168_0001721168-23-000002.txt
parsing _Archives_edgar_data_1698006_0001420506-23-000200.txt
parsing _Archives_edgar_data_1127761_0000909012-23-000016.txt
parsing _Archives_edgar_data_1753219_0001941040-23-000038.txt
parsing _Archives_edgar_data_1609674_0001609674-23-000001.txt
parsing _Archives_edgar_data_1843275_0001843275-23-000001.txt
parsing _Archives_edgar_data_1961290_0001085146-23-000505.txt
parsing _Archives_edgar_data_1729300_0001729300-23-000001.txt
parsing _Archives_edgar_data_934866_0000934866-23-000008.txt
parsing _Archives_edgar_data_1522877_0001104659-23-008665.txt
parsing _Archives_edgar_data_1852808_0001951757-23-000092.txt
parsing _Archives_edgar_data_1903859_0001903859-23-000001.txt
parsing _Archives_edgar_data_1780365_0001085146-23-000462.txt
parsing _Archives_edgar_data_1109228_0001109228-23-000002.txt
parsing _Archives_edgar_data_1908976_0001085146-23-000587.txt
parsing _Archives_edgar_data_153

Unnamed: 0,source,managerCik,managerName,reportCalendarOrQuarter,cusip6,cusip,companyName,value,shares
0,https://sec.gov/Archives/edgar/data/1002152/00...,0001002152,"COMPASS CAPITAL MANAGEMENT, INC",2022-12-31,037833,037833100,APPLE INC,2.483222e+09,19112
1,https://sec.gov/Archives/edgar/data/1002152/00...,0001002152,"COMPASS CAPITAL MANAGEMENT, INC",2022-12-31,126650,126650100,CVS HEALTH CORP,4.818667e+10,517080
2,https://sec.gov/Archives/edgar/data/1002152/00...,0001002152,"COMPASS CAPITAL MANAGEMENT, INC",2022-12-31,171340,171340102,CHURCH & DWIGHT CO INC,4.752927e+10,589620
3,https://sec.gov/Archives/edgar/data/1002152/00...,0001002152,"COMPASS CAPITAL MANAGEMENT, INC",2022-12-31,235851,235851102,DANAHER CORPORATION,5.153830e+10,194176
4,https://sec.gov/Archives/edgar/data/1002152/00...,0001002152,"COMPASS CAPITAL MANAGEMENT, INC",2022-12-31,254687,254687106,DISNEY WALT CO,3.941057e+10,453621
...,...,...,...,...,...,...,...,...,...
52234,https://sec.gov/Archives/edgar/data/948669/000...,0000948669,"PARNASSUS INVESTMENTS, LLC",2022-12-31,92345Y,92345Y106,VERISK ANALYTICS INC,9.742218e+11,5522173
52235,https://sec.gov/Archives/edgar/data/948669/000...,0000948669,"PARNASSUS INVESTMENTS, LLC",2022-12-31,92532F,92532F100,VERTEX PHARMACEUTICALS INC,6.938835e+10,240281
52236,https://sec.gov/Archives/edgar/data/948669/000...,0000948669,"PARNASSUS INVESTMENTS, LLC",2022-12-31,94106L,94106L109,WASTE MGMT INC DEL,7.621199e+11,4857980
52237,https://sec.gov/Archives/edgar/data/948669/000...,0000948669,"PARNASSUS INVESTMENTS, LLC",2022-12-31,958102,958102105,WESTERN DIGITAL CORP.,1.448312e+11,4590528


In [14]:
# Save the holdings to CSV; index=False leaves the DataFrame's row index out of the file.
form13_df.to_csv('data/form13.csv', index=False)