In [92]:
import glob
import os

import pandas as pd
import spacy
from bs4 import BeautifulSoup
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spacy.lang.en import English

In [3]:
# Collect every HTML filing under data/ (the pattern matches .htm and .html).
# glob yields matches in arbitrary, filesystem-dependent order, so sort them
# to make the processing order identical on every run and platform.
filepaths = sorted(glob.glob('data/*.htm*'))

In [4]:
# Maps each filing's path to its visible text, one non-empty fragment per line.
ten_ks = {}

for filepath in filepaths:
    # Read raw bytes instead of text: text mode decodes with the platform's
    # default encoding, which can raise UnicodeDecodeError or garble
    # characters. Given bytes, BeautifulSoup detects the document's encoding.
    with open(filepath, 'rb') as f:
        content = f.read()

    # Parse after the file is closed; only the bytes in memory are needed.
    soup = BeautifulSoup(content, 'html5lib')

    # Remove script and style elements so their contents do not end up in the text
    for element in soup(['script', 'style']):
        element.extract()

    # Get text
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())

    # Split each line on double spaces so fragments that share a line get a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank fragments and re-join, one fragment per line
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # Store the cleaned text under the file's path
    ten_ks[filepath] = text

In [48]:
# Select one filing to work with. The dictionary keys are the paths returned
# by glob, which use the OS-specific separator; building the key with
# os.path.join keeps the lookup working on Windows and POSIX alike
# (a hard-coded '\\' separator only matches on Windows).
doc = ten_ks[os.path.join('data', 'azz-20220228.html')]

In [66]:
# Split the filing into sentences and lower-case each one, so the substring
# checks that follow are case-insensitive.
sent_doc = list(map(str.lower, sent_tokenize(doc)))

In [87]:
# Display the full list of lower-cased sentences for inspection.
sent_doc

['azz-202202280000008947--02-28falsefy2022p2y1112http://www.azz.com/20220228#financeleaseandoperatingleaseliabilitycurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitycurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitynoncurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitynoncurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitycurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitycurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitynoncurrenthttp://www.azz.com/20220228#financeleaseandoperatingleaseliabilitynoncurrent00000089472021-03-012022-02-2800000089472021-08-31iso4217:usd00000089472022-04-18xbrli:shares0000008947us-gaap:revolvingcreditfacilitymemberus-gaap:lineofcreditmember2022-02-2800000089472022-02-2800000089472021-02-28iso4217:usdxbrli:shares00000089472020-03-012021-02-2800000089472019-03-012020-02-2900000089472020-02-2900000089472019-02-280000008947us-gaa

In [88]:
# Collect a window of sentences around every mention of "repurchase program",
# keeping the window only if it also mentions stock or shares.
sentences = []
stemmer = PorterStemmer()
stock_stem = stemmer.stem('stock')
shares_stem = stemmer.stem('shares')

# Number of sentences of context kept on each side of a matching sentence.
context = 5

for idx, sentence in enumerate(sent_doc):
    if 'repurchase program' not in sentence:
        continue

    # Clamp the start at 0. A negative start would be interpreted by the slice
    # as an offset from the END of the list, producing an empty or unrelated
    # window for matches within the first few sentences.
    chunk = sent_doc[max(0, idx - context):idx + context + 1]

    # Substring match against the stems, so inflected forms that contain the
    # stem also count.
    if any(stock_stem in x or shares_stem in x for x in chunk):
        sentences.append(chunk)

In [89]:
# Display the sentence windows collected around "repurchase program" mentions.
sentences

[['dividend policythe payment of dividends is within the discretion of our board and is dependent on our earnings, capital requirements, operating and financial condition and other factors.',
  'the company has a history of paying dividends on a quarterly basis.',
  'dividends paid totaled $16.9 million, $17.6 million, and $17.8 million during fiscal 2022, 2021, and 2020, respectively.',
  'dividend payments may be restricted to total payments of $20.0 million per fiscal year based on covenants with the company\'s lenders in the event that the company\'s leverage ratio (defined as net debt to earnings before interest, taxes, depreciation and amortization, or "ebitda") exceeds 3.0 to 1.0.',
  'currently, there are no restrictions on dividend payments.',
  'any future dividends payments will be reviewed each quarter and declared by the board of directors at its discretion.purchases of equity securitieson november 10, 2020, our board of directors authorized a $100\xa0million share repurch

In [93]:
# Sentences, taken from the collected windows, that contain a number-like token.
values = []

# Small English spaCy pipeline; used here for its per-token `like_num` flag.
nlp = spacy.load('en_core_web_sm')

for chunk in sentences:
    for sentence in chunk:
        parsed = nlp(sentence)
        for token in parsed:
            if not token.like_num:
                continue
            # One number-like token is enough: record the sentence once per
            # appearance in a window and move on to the next sentence.
            values.append(sentence)
            break

# Show the result as the cell output.
values

['dividends paid totaled $16.9 million, $17.6 million, and $17.8 million during fiscal 2022, 2021, and 2020, respectively.',
 'dividend payments may be restricted to total payments of $20.0 million per fiscal year based on covenants with the company\'s lenders in the event that the company\'s leverage ratio (defined as net debt to earnings before interest, taxes, depreciation and amortization, or "ebitda") exceeds 3.0 to 1.0.',
 'any future dividends payments will be reviewed each quarter and declared by the board of directors at its discretion.purchases of equity securitieson november 10, 2020, our board of directors authorized a $100\xa0million share repurchase program pursuant to which the company may repurchase our common stock (the “2020 authorization”).',
 'repurchases under the 2020 authorization will be made through open market or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to rule 10b5-1 trading plans, whi

In [None]:
# Scratch inspection of `token`, the loop variable left over from the
# number-detection loop in the previous cell.
# NOTE(review): spaCy tokens appear to expose entity information through
# attributes such as `ent_type_` / `ent_iob_`; a plain `ent` attribute may not
# exist — confirm the intended attribute before running this cell.
token.ent