In [34]:
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
import re
import pandas as pd
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
import spacy
import glob
from nltk import sent_tokenize
nlp = spacy.load('en_core_web_sm')

In [44]:
filepath = './data/amrk-10k_20210630.htm'

In [45]:
regex_10k(import_10k(filepath))

['Fiscal 2021        Fiscal 2020       Quarter     High        Low        High        Low       First     $  4.51        $  8.99        $  4.47        $  1.40       Second     $  7.40        $  5.08        $  3.26        $  .00       Third     $  9.74        $  6.67        $  2.28        $  .85       Fourth     $  6.35        $  5.11        $  9.56        $  2.13        Issuer Purchases of Equity Securities On April 26, 2018, the Company’s Board of Directors authorized a stock repurchase program for up to 500,000 shares of the Company’s stock.',
 '"Share Repurchase Program In April 2018, the Company\'s Board of Directors approved a share repurchase program which authorizes the Company to purchase up to 500,000 shares of its common stock from time to time, either in the open market or in block purchase transactions."']

In [2]:
#function #1 import the html file and tokenize the sentences
def import_10k(filepath):
    """ This function takes the filepath of the 10-K filing as input and returns the tokenized
    sentences of the 10-K file"""
    with open(filepath, encoding='utf-8') as fp:
        soup = BS(fp, 'html.parser')
        ten_k = soup.text
        sentences_tokenized=sent_tokenize(ten_k)
    return sentences_tokenized 

In [38]:
#function 2 regex the tokenized sentences to find the 
def regex_10k(sentences_tokenized):
    """This function takes the tokenized sentences from the 10-K file as input and returns
    the paragraphs referring to share repurchase programs"""
    sentences=[]
    for sentence in sentences_tokenized:
        match=re.search(r"(share|stock) repurchase (program)?", sentence)
        if match: 
            match = re.search(r'authorized|approved', sentence)
            if match:
                sentences.append(sentence)  
    paragraph = [re.sub(r"\\[a-z]+\d?", " ", repr(sentence).strip("'")) for sentence in sentences]
    
    return paragraph

In [43]:
regex_10k(par)

[]

In [35]:
#function 3 NER on paragraphs using spaCy
def ner_10k(paragraph):
    """This function takes the tokenized and regexed sentences from the 10-K file and uses named entity
    recognition from spaCy to identify dates, number of shares and the amount authorized for repurchasing"""
    
    labels = []
    for p in paragraph:
        doc=nlp(p)
        for ent in doc.ents:
            if ent.label_ in ["DATE", "CARDINAL", "MONEY"]:
                labels.append((p, ent.text, ent.label_))
    
    return labels

In [5]:
#combine it all together
def search_10k(filepath):
    """this function combines the previous functions into one to create the values we will input into our
    dictionary to mimic the json file."""
    imported_file = import_10k(filepath)
    paragraph = regex_10k(imported_file)
    labels = ner_10k(paragraph)
    return labels

In [42]:
par = ["We repurchased 1,177,704 shares under the program, for a total cost of $32.9 million, in fiscal 2021, and 686,997 shares under the program, for a total cost of $25.1 million, in fiscal 2020. The Company has repurchased a total of 9,425,462 shares, at a total cost of $307.3 million, since the inception of this program. We have remaining authority to repurchase 1,824,538 shares under this program, which has no expiration date."]

In [39]:
labels = ner_10k(paragraph)

In [40]:
labels

[('We repurchased 1,177,704 shares under the program, for a total cost of $32.9 million, in fiscal 2021, and 686,997 shares under the program, for a total cost of $25.1 million, in fiscal 2020. The Company has repurchased a total of 9,425,462 shares, at a total cost of $307.3 million, since the inception of this program. We have remaining authority to repurchase 1,824,538 shares under this program, which has no expiration date.',
  '1,177,704',
  'CARDINAL'),
 ('We repurchased 1,177,704 shares under the program, for a total cost of $32.9 million, in fiscal 2021, and 686,997 shares under the program, for a total cost of $25.1 million, in fiscal 2020. The Company has repurchased a total of 9,425,462 shares, at a total cost of $307.3 million, since the inception of this program. We have remaining authority to repurchase 1,824,538 shares under this program, which has no expiration date.',
  '$32.9 million',
  'MONEY'),
 ('We repurchased 1,177,704 shares under the program, for a total cost 

In [12]:
filepath= './data/earn-20211231.htm'

In [13]:
search_10k(filepath)

June 13, 2018 DATE
1.2 million CARDINAL
the years ended December 31, 2021 DATE
2020 DATE
31 DATE
2021December 31 DATE
2020Common CARDINAL
12/31/2020 DATE
12/31/2019 DATE
12,455,758 CARDINAL
23,926 CARDINAL
136,142)Common CARDINAL
12/31/2021 DATE
12/31/2020 DATE
12,343,542 CARDINAL
12/31/2021 DATE
12/31/2020 DATE
27,594 CARDINAL
December 31, 2021: DATE
September 14 DATE
2021September 13 DATE
2020December 17, 20225,650 DATE
December 16, 2021December 16, DATE
20225,649 December 16, 2021December 16 DATE
December 31, 2021 DATE
2020 DATE
268,831 CARDINAL
274,798 CARDINAL
June 13, 2018 DATE
1.2 million CARDINAL


In [46]:
tenkdict = {}

for f in tqdm(glob.glob('./data/*')):
    title = f
    ten_k = search_10k(f)
    
    tenkdict[title] = ten_k

 10%|▉         | 3/31 [00:28<04:22,  9.38s/it]


KeyboardInterrupt: 

In [47]:
tenkdict

{'./data/lake_10k.html': [('On February 17, 2021, the Company’s board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'February 17, 2021',
   'DATE'),
  ('On February 17, 2021, the Company’s board of directors approved a stock repurchase program under which the Company may repurchase up to $5 million of its outstanding common stock.',
   'up to $5 million',
   'MONEY'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Company’s current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the “Existing Share Repurchase Program”).',
   'July 6, 2021',
   'DATE'),
  ('On July 6, 2021, the Board of Directors authorized an increase in the Company’s current stock repurchase program under which the Company may repurchase up to an additional $5 million of its outstanding common stock (the “Existing Sha