In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

from datetime import date

In [162]:
# Read the pipe-delimited master index (master.tsv) written by the Go program.
# Column names are supplied explicitly via `names`.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(
    "C:\\Users\\miche\\Desktop\\Code\\1.Projects\\parser\\data\\master.tsv",
    sep='|',
    encoding="ISO-8859-1",
    names=['CIK', 'CompanyName', 'FormType', 'Date', 'FileName'],
)

In [163]:
# preview the first rows of the loaded index
df.head()

Unnamed: 0,CIK,CompanyName,FormType,Date,FileName
0,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt
1,880794,MERRILL LYNCH LIFE VARIABLE ANNUITY SEPARATE A...,NSAR-B,1993-02-26,edgar/data/880794/9999999997-05-050433.txt
2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt
3,94673,STORAGE TECHNOLOGY CORP,CERTNYS,1993-02-24,edgar/data/94673/9999999997-05-037760.txt
4,860585,RBS PARTNERS L P /CT,13FCONP,1993-05-12,edgar/data/860585/9999999997-04-042068.txt


In [164]:
# df.shape is (number of rows, number of columns); unpack both at once
rows, cols = df.shape

print(f'Dataset contains {rows} rows and {cols} columns')

Dataset contains 18208662 rows and 5 columns


In [165]:
# Convert the Date column from text to datetime64.
# The raw values are dash-separated (e.g. "1993-02-11"), so the format must use
# "-" as well: a "%Y/%m/%d" format does not describe this data and is rejected by
# pandas versions that enforce the given format strictly.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

In [166]:
def filter_df(filters, dateFilter=None, data=None):
    """
    Filter a dataframe by exact column values and, optionally, by a date range.

    Args:
        filters (dict): Maps a column name to the value that column must equal.
            A row is kept only if it matches every entry.
        dateFilter (dict, optional): Must contain "start_date" and "end_date".
            When given, only rows whose 'Date' lies between the two values
            (both ends inclusive) are kept.
        data (dataframe, optional): Dataframe to filter. Defaults to the
            notebook-level ``df`` so existing calls keep working.

    Returns:
        df_filtered (dataframe): Filtered dataframe

    Raises:
        KeyError: If a column named in ``filters`` is missing, or if
            ``dateFilter`` lacks "start_date" or "end_date".
    """
    if data is None:
        # fall back to the master index loaded earlier in the notebook
        data = df

    # compare each requested column with its wanted value and keep rows where all match
    matches_all = (data[list(filters)] == pd.Series(filters)).all(axis=1)
    df_filtered = data.loc[matches_all]

    if dateFilter is not None:
        dates = df_filtered['Date']
        in_range = (dates >= dateFilter["start_date"]) & (dates <= dateFilter["end_date"])
        df_filtered = df_filtered[in_range]
    return df_filtered

In [167]:
# column -> required value, and the inclusive date window, passed to filter_df
f = {"FormType": "SC 13D/A"}
d = {"start_date": "2013-06-05", "end_date": "2013-06-05"}

In [168]:
# apply the column filter in f and the date window in d to the index
df_filtered = filter_df(f, d)
df_filtered

Unnamed: 0,CIK,CompanyName,FormType,Date,FileName
12774458,1001288,LEXMARK INTERNATIONAL INC /KY/,SC 13D/A,2013-06-05,edgar/data/1001288/0001193125-13-248444.txt
12778726,1015899,BURKLE RONALD W,SC 13D/A,2013-06-05,edgar/data/1015899/0001104659-13-047033.txt
12789263,1048462,WEXFORD CAPITAL LP,SC 13D/A,2013-06-05,edgar/data/1048462/0001048462-13-000031.txt
12792044,1055951,ORBIMED ADVISORS LLC,SC 13D/A,2013-06-05,edgar/data/1055951/0000947871-13-000393.txt
12794265,1064015,AMERITRANS CAPITAL CORP,SC 13D/A,2013-06-05,edgar/data/1064015/0001072613-13-000274.txt
12800733,1087939,BAKER JULIAN,SC 13D/A,2013-06-05,edgar/data/1087939/0001144204-13-033445.txt
12812205,1129137,AMERICA MOVIL SAB DE CV/,SC 13D/A,2013-06-05,edgar/data/1129137/0001193125-13-248544.txt
12812782,1131324,GENOMIC HEALTH INC,SC 13D/A,2013-06-05,edgar/data/1131324/0001144204-13-033445.txt
12829993,1196298,NEPHROS INC,SC 13D/A,2013-06-05,edgar/data/1196298/0001048462-13-000031.txt
12862687,1320441,Ammerman Robert C,SC 13D/A,2013-06-05,edgar/data/1320441/0001072613-13-000274.txt


In [169]:
# base URL that the index's FileName paths are appended to
EDGAR_PREFIX = "https://www.sec.gov/Archives/"

In [170]:
# Build the URL of each filing's "...-index.html" page from its FileName
# (this could also be done in the Go application that downloads all indexes).
# Only the FileName's own extension is replaced. Splitting the full URL on "."
# instead would rewrite part of the host name whenever a FileName had no extension.
filling_links = [
    EDGAR_PREFIX + file_name.rsplit(".", 1)[0] + "-index.html"
    for file_name in df_filtered['FileName']
]

filling_links

['https://www.sec.gov/Archives/edgar/data/1001288/0001193125-13-248444-index.html',
 'https://www.sec.gov/Archives/edgar/data/1015899/0001104659-13-047033-index.html',
 'https://www.sec.gov/Archives/edgar/data/1048462/0001048462-13-000031-index.html',
 'https://www.sec.gov/Archives/edgar/data/1055951/0000947871-13-000393-index.html',
 'https://www.sec.gov/Archives/edgar/data/1064015/0001072613-13-000274-index.html',
 'https://www.sec.gov/Archives/edgar/data/1087939/0001144204-13-033445-index.html',
 'https://www.sec.gov/Archives/edgar/data/1129137/0001193125-13-248544-index.html',
 'https://www.sec.gov/Archives/edgar/data/1131324/0001144204-13-033445-index.html',
 'https://www.sec.gov/Archives/edgar/data/1196298/0001048462-13-000031-index.html',
 'https://www.sec.gov/Archives/edgar/data/1320441/0001072613-13-000274-index.html',
 'https://www.sec.gov/Archives/edgar/data/1342126/0001104659-13-047033-index.html',
 'https://www.sec.gov/Archives/edgar/data/1356576/0000947871-13-000393-index

## Parsing of Agreement and Plan of Merger

In [8]:
# download file and use soup HTML parser
filling_to_parse = "https://www.sec.gov/Archives/edgar/data/355766/000119312513248778/d550051dex992.htm"

# a timeout keeps the cell from hanging forever on an unresponsive server
result = requests.get(filling_to_parse, timeout=30)
# fail loudly on an HTTP error instead of parsing an error page as if it were the filing
result.raise_for_status()
soup = BeautifulSoup(result.content, 'html.parser')

In [150]:
def _advance_paragraphs(tag, steps):
    """Follow find_next('p') `steps` times; return None if the document ends first."""
    for _ in range(steps):
        if tag is None:
            return None
        tag = tag.find_next('p')
    return tag


# MEGA function that tries to find clause and text
def find_clause(clause, document=None):
    """
    Find an underlined clause heading in a parsed filing and collect its text.

    The heading is located as a <u> element whose text equals ``clause``. Its
    parent element must start with a list-style identifier followed by the
    clause name. Text is then gathered from that element and from the following
    <p> elements until the next underlined, list-style heading is reached or the
    document ends.

    Args:
        clause (str): Exact heading text to look for. It is treated as literal
            text, not as a regular expression.
        document (BeautifulSoup, optional): Parsed document to search. Defaults
            to the notebook-level ``soup``.

    Returns:
        dict: ``{clause: text}``. The value is "No clause found" when the heading
        is missing or not in the expected position, and "Clause found but no
        text" when the heading is immediately followed by another heading.
    """
    if document is None:
        # fall back to the filing parsed earlier in the notebook
        document = soup

    dict_to_return = {clause: ""}
    clause_text = ""

    # escape the clause so characters such as "(" or "." are matched literally
    clause_pattern = re.escape(clause)

    # list-like start of a paragraph, e.g. "Section 4.2 ", "(a) ", "3. "
    # NOTE(review): this also matches a plain leading word followed by whitespace,
    # so the first word of an ordinary paragraph may be dropped - confirm intended.
    list_identifier = r'^(?:Section\s)?\(?(?:\b[A-Za-z]+|\d+\.?\d*)\.?\)?\s'

    # regex to find if clause is near the start of the line
    regex_start = re.compile(r'^.{2,13}' + f'(?:{clause_pattern})')
    # regex to find if clause is part of predefined list like structure
    regex_list_clause = re.compile(list_identifier + f'({clause_pattern})')
    # regex to check next <p> after found clause (compiled once, reused in the loop)
    regex_list_identifier = re.compile(list_identifier)

    # check if <u>clause</u> exists
    clause_tag = document.find("u", text=clause)
    if clause_tag is None:
        dict_to_return[clause] = "No clause found"
        return dict_to_return

    # get parent tag. In this type of document this should be a <p> tag
    parent = clause_tag.parent

    # clause must be among the first words of the parent element
    if regex_start.match(parent.text) is None:
        dict_to_return[clause] = "No clause found"
        return dict_to_return

    # parent must look like a list item with the clause as the next word/sentence
    p_clause_match = regex_list_clause.match(parent.text)
    if p_clause_match is None:
        dict_to_return[clause] = "No clause found"
        return dict_to_return

    # if we get here we have found a clause
    # all clauses ends with a . in this document, hence the extra character skipped
    offset = len(p_clause_match.group(0)) + 1
    text_p1 = parent.text[offset:].strip()

    if text_p1 == "":
        # found clause but empty text
        # TODO check if this clause has an empty text but continues
        next_p_empty_clause = parent.find_next('p')
        next_p_u_empty_clause = None
        if next_p_empty_clause is not None:
            next_p_u_empty_clause = next_p_empty_clause.find('u')
        if next_p_u_empty_clause:
            start_of_line_match = regex_list_identifier.match(next_p_empty_clause.text)
            # without a list identifier the next paragraph is not a new heading
            if start_of_line_match is not None:
                possible_clause = start_of_line_match.group(0) + next_p_u_empty_clause.text
                if next_p_empty_clause.text.startswith(possible_clause):
                    # empty clause
                    dict_to_return[clause] = "Clause found but no text"
                    return dict_to_return

    # add the text inside this <p> tag to clause
    clause_text += text_p1

    # walk the following <p> tags; stop cleanly if the document runs out
    next_p = parent.find_next('p')
    while next_p is not None:
        next_u = next_p.find('u')
        next_p_match = regex_list_identifier.match(next_p.text)

        if next_u:
            # underlined text right after a list identifier marks a new clause
            possible_clause = next_p_match.group(0) if next_p_match else ""
            if next_p.text.startswith(possible_clause + next_u.text):
                break
            skip = len(next_p_match.group(0)) if next_p_match else 0
            clause_text += " " + next_p.text[skip:].strip() + "\n\n"
            next_p = next_p.find_next('p')
        elif next_p_match:
            # part of the clause: drop the list identifier and keep the rest
            skip = len(next_p_match.group(0))
            clause_text += next_p.text[skip:].strip() + "\n\n"
            next_p = next_p.find_next('p')
        else:
            # page break does start with empty p followed by p with digit and then a strange p tag
            following_p = next_p.find_next('p')
            is_page_break = (
                next_p.text.strip() == ""
                and following_p is not None
                and following_p.text.strip().isdigit()
            )
            if is_page_break:
                # skip the empty p, the page-number p and the p after it
                next_p = _advance_paragraphs(next_p, 3)
            elif not clause_text.strip().endswith('.'):
                # text so far does not end with "." so this is a continuing paragraph
                clause_text += next_p.text.strip() + "\n\n"
                # NOTE(review): this advances three <p> tags after consuming one,
                # the same step as the page-break branch - confirm this is intended.
                next_p = _advance_paragraphs(next_p, 3)
            else:
                break

    dict_to_return[clause] = clause_text
    return dict_to_return

In [160]:
# look up this clause heading in the parsed agreement and show the collected text
find_clause("Corporate Authority Relative to This Agreement; No Violation")

{'Corporate Authority Relative to This Agreement; No Violation': 'Each of Parent and Merger Sub has all requisite corporate power and authority to enter into this Agreement and, subject to the adoption of this Agreement by Parent as the sole stockholder of Merger\nSub, which adoption Parent will provide immediately following execution and delivery of this Agreement, to consummate the transactions contemplated hereby. The execution and delivery of this Agreement and the consummation of the transactions\ncontemplated hereby have been duly and validly authorized by the Boards of Directors of Parent and Merger Sub, and, except for the filing of the Certificate of Merger with the Secretary of State of the State of Delaware and the adoption of this\nAgreement by Parent as the sole stockholder of Merger Sub, no other corporate proceedings on the part of Parent or Merger Sub are necessary to authorize the consummation of the transactions contemplated hereby. This Agreement has been duly and\nv

In [157]:
# look up this clause heading in the parsed agreement and show the collected text
find_clause("Financial Capability; Source of Funds")

{'Financial Capability; Source of Funds': 'Parent and Merger Sub (i)\xa0collectively have, and will have at the\nClosing, sufficient internal funds readily available to pay in cash the aggregate Merger Consideration, to consummate the Merger upon the terms contemplated by this Agreement and to pay any all related fees and expenses associated therewith;\n(ii)\xa0have, and will have at the Closing, the resources and capabilities (financial or otherwise) to perform their obligations hereunder; (iii)\xa0have not incurred any obligation, commitment, restriction or liability of any kind, which would\nimpair or adversely affect suchresources and capabilities and (iv)\xa0are not relying, and will not need to rely, on any of the capital, assets or other resources of the Company as they exist as of the date of this Agreement\nin order to consummate the transactions contemplated hereby. The source of such internal funds referenced in (i)\xa0above will be a capital contribution from the stockholde

## Download HTML files for all filings or an individual filing

In [171]:
# Collect the links to every .htm document listed on each filing's index page.
# NOTE: this loop rebinds the notebook-level `result` and `soup` on every pass;
# find_clause reads `soup` by default, so re-parse the agreement before calling it again.
html_links = []

for filling in filling_links:
    # a timeout keeps one unresponsive request from stalling the whole loop
    result = requests.get(filling, timeout=30)
    if result.status_code != 200:
        # skip pages that failed to download instead of parsing an error page
        print(f'Error downloading file: {filling}')
        continue
    soup = BeautifulSoup(result.content, 'html.parser')

    for table in soup.find_all("table", {"class": "tableFile"}):
        for link in table.select("a"):
            # anchors without an href are ignored rather than raising KeyError
            href = link.get('href', '')
            if href.endswith(".htm"):
                html_links.append(href)
html_links

['/Archives/edgar/data/1001288/000119312513248444/d548007dsc13da.htm',
 '/Archives/edgar/data/1001288/000119312513248444/d548007dex2.htm',
 '/Archives/edgar/data/1015899/000110465913047033/a13-12763_1sc13da.htm',
 '/Archives/edgar/data/1048462/000104846213000031/form13da4.htm',
 '/Archives/edgar/data/1055951/000094787113000393/ss177961_sc13da.htm',
 '/Archives/edgar/data/1055951/000094787113000393/ss177961_ex99a.htm',
 '/Archives/edgar/data/1064015/000107261313000274/ammerman-sch13d_17523.htm',
 '/Archives/edgar/data/1087939/000114420413033445/v347105_sc13da.htm',
 '/Archives/edgar/data/1087939/000114420413033445/v347105_ex99-1.htm',
 '/Archives/edgar/data/732717/000119312513248544/d548225dsc13da.htm',
 '/Archives/edgar/data/732717/000119312513248544/d548225dex99i.htm',
 '/Archives/edgar/data/1087939/000114420413033445/v347105_sc13da.htm',
 '/Archives/edgar/data/1087939/000114420413033445/v347105_ex99-1.htm',
 '/Archives/edgar/data/1048462/000104846213000031/form13da4.htm',
 '/Archives

In [173]:
# get html document for specific filling
result = requests.get(filling_links[0], timeout=30)

# find all html links for a specific filling
html_links = []

if result.status_code == 200:
    # create parser
    soup = BeautifulSoup(result.content, 'html.parser')

    # Links are only extracted when the download succeeded. Otherwise `soup`
    # would be undefined or still hold a previously parsed document.
    for table in soup.find_all("table", {"class": "tableFile"}):
        for link in table.select("a"):
            # anchors without an href are ignored rather than raising KeyError
            href = link.get('href', '')
            if href.endswith(".htm"):
                html_links.append(href)
else:
    print('Error downloading file')

html_links

['/Archives/edgar/data/1001288/000119312513248444/d548007dsc13da.htm',
 '/Archives/edgar/data/1001288/000119312513248444/d548007dex2.htm']