In [1]:
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
import re
import pandas as pd
import glob
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
import spacy
from nltk import sent_tokenize


In [2]:
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
nlp=spacy.load('en_core_web_sm')

In [3]:
#defining function to extract tables in html
def tables(filepath):
    with open(filepath, encoding = 'utf-8') as fi:
        soup= BS(fi, 'html.parser')
        [table.decompose() for table in soup.find_all("table")]
        ten_k=soup.text
        
 
    return ten_k
    #return soup

In [4]:
#testing our function on one file
filepath= '../data/earn-20211231.htm'

no_table=tables(filepath)
no_table

'\nearn-20211231000156067212/312021FYfalse0.010.01500,000,000500,000,00013,109,92612,343,54213,109,92612,343,5420.010.01100,000,000100,000,000————one year9.502.2450.209.533.85—30.775.432.82191780821917832.85479452054794522.69041095890410942.854794520547945232.69041095890410942.690410958904109401.015The Manager receives an annual management fee in an amount equal to 1.50%\xa0per annum of shareholders\' equity (as defined in the Management Agreement) as of the end of each fiscal quarter (before deductions for any management fee with respect to such fiscal period). The management fee is payable quarterly in arrears.The Management Agreement requires the Company to pay a termination fee to the Manager in the event of (1) the Company\'s termination or non-renewal of the Management Agreement without cause or (2) the Manager\'s termination of the Management Agreement upon a default by the Company in the performance of any material term of the Management Agreement. Such termination fee will be 

In [5]:
#type(ten_k)

In [6]:
type(no_table)

str

In [7]:
#this helps us to remove unicode characters in html

#string_unicode = ten_k
string_encode =no_table.encode("ascii", "ignore")
string_decode = string_encode.decode()
#(string_decode)

In [8]:
string_decode

'\nearn-20211231000156067212/312021FYfalse0.010.01500,000,000500,000,00013,109,92612,343,54213,109,92612,343,5420.010.01100,000,000100,000,000one year9.502.2450.209.533.8530.775.432.82191780821917832.85479452054794522.69041095890410942.854794520547945232.69041095890410942.690410958904109401.015The Manager receives an annual management fee in an amount equal to 1.50%per annum of shareholders\' equity (as defined in the Management Agreement) as of the end of each fiscal quarter (before deductions for any management fee with respect to such fiscal period). The management fee is payable quarterly in arrears.The Management Agreement requires the Company to pay a termination fee to the Manager in the event of (1) the Company\'s termination or non-renewal of the Management Agreement without cause or (2) the Manager\'s termination of the Management Agreement upon a default by the Company in the performance of any material term of the Management Agreement. Such termination fee will be equal to 

In [9]:
#defining a function to read html file, decompose tables, remove unicode characters, clean up using regex, tokenzie sentences 

def import_10k(filepath):
    with open(filepath, encoding = 'utf-8') as fi:
        soup= BS(fi, 'html.parser')
        [table.decompose() for table in soup.find_all("table")]
        ten_k=soup.text
        ten_k =ten_k.encode("ascii", "ignore").decode()
        #string_decode = string_encode.decode()
        ten_k=re.sub(r'(\.)([A-Z])', r'\1 \2', ten_k)
    
        sentences_tokenized=sent_tokenize(ten_k)
        
 
    return sentences_tokenized

            
    #return soup
    #return ten_k
    # return sentences_tokenized

In [10]:
#testing our function on a file
filepath= '../data/earn-20211231.htm'

imported_file=import_10k(filepath)
imported_file

["\nearn-20211231000156067212/312021FYfalse0.010.01500,000,000500,000,00013,109,92612,343,54213,109,92612,343,5420.010.01100,000,000100,000,000one year9.502.2450.209.533.8530.775.432.82191780821917832.85479452054794522.69041095890410942.854794520547945232.69041095890410942.690410958904109401.015The Manager receives an annual management fee in an amount equal to 1.50%per annum of shareholders' equity (as defined in the Management Agreement) as of the end of each fiscal quarter (before deductions for any management fee with respect to such fiscal period).",
 'The management fee is payable quarterly in arrears.',
 "The Management Agreement requires the Company to pay a termination fee to the Manager in the event of (1) the Company's termination or non-renewal of the Management Agreement without cause or (2) the Manager's termination of the Management Agreement upon a default by the Company in the performance of any material term of the Management Agreement.",
 "Such termination fee will b

In [11]:
#defining a function to extarct the paragraphs we want and cleaning it up using regex 
def regex_10k(sentences_tokenized):
    
    sentences=[]
    for sentence in sentences_tokenized:
        match=re.search(r"(share|stock) repurchase (program)?", sentence)
        if match: 
            match = re.search(r'authorized|approved', sentence)
            if match:
                sentences.append(sentence)  
    paragraph = [re.sub(r"\\[a-z]+\d?", " ",repr(sentence).strip('"').strip("'")) for sentence in sentences]
    
    return sentences
#paragraph

In [12]:
#testing our function 
paragraph=regex_10k(imported_file)
paragraph

['On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
 "On June 13, 2018, the Company's Board of Trustees approved the adoption of a share repurchase program under which the Company is authorized to repurchase up to 1.2 million common shares."]

In [13]:
#defining a function to extract entities/labels 
def ner_10k(paragraph):
    labels=[]
    for p in paragraph:
        doc=nlp(p)
        for ent in doc.ents:
            if ent.label_ in ['DATE', 'CARDINAL','MONEY']:
                labels.append((p,ent.text, ent.label_))
                
    return labels
  

In [14]:
#testing our function 
labels=ner_10k(paragraph)
labels

[('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  'June 13, 2018',
  'DATE'),
 ('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  '1.2 million',
  'CARDINAL'),
 ("On June 13, 2018, the Company's Board of Trustees approved the adoption of a share repurchase program under which the Company is authorized to repurchase up to 1.2 million common shares.",
  'June 13, 2018',
  'DATE'),
 ("On June 13, 2018, the Company's Board of Trustees approved the adoption of a share repurchase program under which the Company is authorized to repurchase up to 1.2 million common shares.",
  '1.2 million',
  'CARDINAL')]

In [15]:
#combining all functions together
def search_10k(filepath):
    imported_file = import_10k(filepath)
    paragraph = regex_10k(imported_file)
    labels = ner_10k(paragraph)
    return labels
    

In [16]:
#testing the function 
search_10k(filepath)

[('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  'June 13, 2018',
  'DATE'),
 ('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  '1.2 million',
  'CARDINAL'),
 ("On June 13, 2018, the Company's Board of Trustees approved the adoption of a share repurchase program under which the Company is authorized to repurchase up to 1.2 million common shares.",
  'June 13, 2018',
  'DATE'),
 ("On June 13, 2018, the Company's Board of Trustees approved the adoption of a share repurchase program under which the Company is authorized to repurchase up to 1.2 million common shares.",
  '1.2 million',
  'CARDINAL')]

In [17]:
#creating a dictionary to include filename as keys and paragraphs as values, we also extract entities 
ten_k_dict ={}

for f in tqdm (glob.glob('../data/*')):
    title = f
    ten_k = search_10k(f)
    
    ten_k_dict[title] = ten_k

100%|██████████| 31/31 [00:22<00:00,  1.38it/s]


In [18]:
ten_k_dict

{'../data/lake_10k.html': [('On February 17, 2021, the Companys board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'February 17, 2021',
   'DATE'),
  ('On February 17, 2021, the Companys board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'up to $5 million',
   'MONEY'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Companys current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the Existing Share Repurchase Program).',
   'July 6, 2021',
   'DATE'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Companys current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the Existing Share Rep

In [19]:
#trying to extarct span of entities from this sample text 
text= "On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicablefederal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so."


In [20]:
#this gives us the index of start and end for the word 'November' in the text
re.search('November', text)

<re.Match object; span=(3, 11), match='November'>

In [21]:
#to make sure the result in the previous cell is accurate
text[3:11]

'November'

In [22]:
#we can also use .end() or .start()
re.search('November', text).end()#.start()

11

In [23]:
#we can also use .span()
re.search('November 10, 2020', text).span() 

(3, 20)

In [24]:
#this gives us the start index for 'November'
result = text.index('November')
print(result)


3
