In [1]:
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
import re
import pandas as pd
import glob
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
import spacy
from nltk import sent_tokenize


In [2]:
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
nlp=spacy.load('en_core_web_sm')

In [3]:
#defining a function to read files, extract text out of html, and tokenize sentences 

def import_10K(filepath):
    with open(filepath, encoding = 'utf-8') as fi:
        soup= BS(fi, 'html.parser')
        ten_k=soup.text
        sentences_tokenized=sent_tokenize(ten_k)
            
    return sentences_tokenized

In [4]:
filepath= '../data/earn-20211231.htm'

imported_file=import_10K(filepath)
imported_file

["\nearn-20211231000156067212/312021FYfalse0.010.01500,000,000500,000,00013,109,92612,343,54213,109,92612,343,5420.010.01100,000,000100,000,000————one year9.502.2450.209.533.85—30.775.432.82191780821917832.85479452054794522.69041095890410942.854794520547945232.69041095890410942.690410958904109401.015The Manager receives an annual management fee in an amount equal to 1.50%\xa0per annum of shareholders' equity (as defined in the Management Agreement) as of the end of each fiscal quarter (before deductions for any management fee with respect to such fiscal period).",
 "The management fee is payable quarterly in arrears.The Management Agreement requires the Company to pay a termination fee to the Manager in the event of (1) the Company's termination or non-renewal of the Management Agreement without cause or (2) the Manager's termination of the Management Agreement upon a default by the Company in the performance of any material term of the Management Agreement.",
 "Such termination fee wi

In [5]:
#defining a function to apply regex 
def regex_10k(sentences_tokenized):
    
    sentences=[]
    for sentence in sentences_tokenized:
        match=re.search(r"(share|stock) repurchase (program)?", sentence)
        if match: 
            match = re.search(r'authorized|approved', sentence)
            if match:
                sentences.append(sentence)  
    paragraph = [re.sub(r"\\[a-z]+\d?", " ", repr(sentence).strip("'")) for sentence in sentences]
    
    return paragraph

In [6]:
paragraph=regex_10k(imported_file)
paragraph

['On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
 '"The Company did not receive any proceeds from the common shares sold by the Blackstone Funds.Detailed below is a roll forward of the Company\'s common shares outstanding for the years ended December 31, 2021 and 2020:Year EndedDecember 31, 2021December 31, 2020Common Shares Outstanding (12/31/2020 and 12/31/2019, respectively)12,343,542 12,455,758 Share Activity:Common shares issued738,269 — Restricted common shares issued28,115 23,926 Common shares repurchased— (136,142)Common Shares Outstanding (12/31/2021 and 12/31/2020, respectively)13,109,926 12,343,542 Unvested restricted shares outstanding (12/31/2021 and 12/31/2020, respectively)32,567 27,594 102The below table provides details on the Company\'s restricted shares granted pursuant to share award agreements which are unvested at December 31, 2021:Grant Reci

In [11]:
def ner_10k(paragraph):
    labels=[]
    for p in paragraph:
        doc=nlp(p)
        for ent in doc.ents:
            if ent.label_ in ['DATE', 'CARDINAL','MONEY']:
                labels.append((p,ent.text, ent.label_))
                
    return labels
  

In [12]:
labels=ner_10k(paragraph)
labels

[('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  'June 13, 2018',
  'DATE'),
 ('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  '1.2 million',
  'CARDINAL'),
 ('"The Company did not receive any proceeds from the common shares sold by the Blackstone Funds.Detailed below is a roll forward of the Company\'s common shares outstanding for the years ended December 31, 2021 and 2020:Year EndedDecember 31, 2021December 31, 2020Common Shares Outstanding (12/31/2020 and 12/31/2019, respectively)12,343,542 12,455,758 Share Activity:Common shares issued738,269 — Restricted common shares issued28,115 23,926 Common shares repurchased— (136,142)Common Shares Outstanding (12/31/2021 and 12/31/2020, respectively)13,109,926 12,343,542 Unvested restricted s

In [13]:
#combining all functions together
def search_10k(filepath):
    imported_file = import_10K(filepath)
    paragraph = regex_10k(imported_file)
    labels = ner_10k(paragraph)
    return labels
    

In [14]:
search_10k(filepath)

[('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  'June 13, 2018',
  'DATE'),
 ('On June 13, 2018, our Board of Trustees approved the adoption of a share repurchase program under which we are authorized to repurchase up to 1.2 million common shares.',
  '1.2 million',
  'CARDINAL'),
 ('"The Company did not receive any proceeds from the common shares sold by the Blackstone Funds.Detailed below is a roll forward of the Company\'s common shares outstanding for the years ended December 31, 2021 and 2020:Year EndedDecember 31, 2021December 31, 2020Common Shares Outstanding (12/31/2020 and 12/31/2019, respectively)12,343,542 12,455,758 Share Activity:Common shares issued738,269 — Restricted common shares issued28,115 23,926 Common shares repurchased— (136,142)Common Shares Outstanding (12/31/2021 and 12/31/2020, respectively)13,109,926 12,343,542 Unvested restricted s

In [15]:
ten_k_dict ={}

for f in tqdm (glob.glob('../data/*')):
    title = f
    ten_k = search_10k(f)
    
    ten_k_dict[title] = ten_k

 23%|██▎       | 7/31 [00:05<00:18,  1.28it/s]


KeyboardInterrupt: 

In [16]:
ten_k_dict

{'../data/lake_10k.html': [('On February 17, 2021, the Company’s board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'February 17, 2021',
   'DATE'),
  ('On February 17, 2021, the Company’s board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'up to $5 million',
   'MONEY'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Company’s current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the “Existing Share Repurchase Program”).',
   'July 6, 2021',
   'DATE'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Company’s current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the “Existing Sh