In [1]:
import json 
from bs4 import BeautifulSoup as BS
import re
import pandas as pd
import requests
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy 
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
import glob
from tqdm.notebook import tqdm

In [2]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [3]:
#reading in the json file
with open('../data/annotations.json', 'r') as f:
      annotations = json.load(f)

In [4]:
#gives us all filings, texts, entities
for index in range(len(annotations)):
    for key in annotations[index]:
        print(annotations[index][key])

azz-20220228.html
On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.
November 10, 2020
$100 million
azz-20220228.html
The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.
fiscal 2022
601,822
$30.8 million
apog-20220226.html
During fiscal 2004, the Board of Directors authorized a share repurchase program, with subsequent increases in authorization. We repurchased 2,292,846 shares under the program during fiscal 2022,

In [5]:
#extracting the first annotations from the json and calling it first_filing
first_filing=annotations[0]
first_filing

{'filing': 'azz-20220228.html',
 'text': "On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
 'authorization_date': 'November 10, 2020',
 'authorization_amount': '$100 million'}

In [6]:
#extracting keys and values from the first annotation
for key, value in first_filing.items():
    print (key)
    print (value)

filing
azz-20220228.html
text
On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.
authorization_date
November 10, 2020
authorization_amount
$100 million


In [7]:
#extracting just the text
text=first_filing['text']
text

"On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so."

## Creating training data for the first data point in annotations 

In [8]:
#creating training_data for the index[0] in annotations 
training_data=[]

#all annotations have file name and text, so let's put these seperate
filing= first_filing['filing']
text=first_filing['text']


#creating a dictionary which includes entities and relevant paragraphs 
dictionary={}
dictionary['entities']=[]

#extracting in a tuple format, the text, entities and their start/end index  
for key, value in first_filing.items():
    if key not in ('filing', 'text'):
        entity_start = text.index(value)
        entity_end = text.index(value) + len(value)
        #entity_end=text.rindex(value)#, start= -1)
        entity=(entity_start, entity_end, key)
        dictionary['entities'].append(entity)


training_data.append((text,dictionary))
       

#dictionary
training_data   
 

[("On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
  {'entities': [(3, 20, 'authorization_date'),
    (68, 80, 'authorization_amount')]})]

In [9]:
dictionary

{'entities': [(3, 20, 'authorization_date'), (68, 80, 'authorization_amount')]}

In [10]:
training_data

[("On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
  {'entities': [(3, 20, 'authorization_date'),
    (68, 80, 'authorization_amount')]})]

In [11]:
#extracting entities 
for key,value in first_filing.items():
    if key == 'authorization_date':
        print(first_filing['authorization_date'])
    if key=='authorization_amount':
        print(first_filing['authorization_amount'])

November 10, 2020
$100 million


In [12]:
#another way to extract entities
index = 0
while index < len(annotations):
    for key in annotations[index]:
        if key == 'authorization_date':
            print(annotations[index]['filing'], annotations[index]['authorization_date'])
        if key=='authorization_amount':
            print(annotations[index]['authorization_amount'])
    index+=1

azz-20220228.html November 10, 2020
$100 million
apog-20220226.html fiscal 2004
fnhc-20211231.html December 2018
$10.0 million
fnhc-20211231.html December 2019
$10 million
fnhc-20211231.html March 2020
$10 million
vrrm-20211231.html August 9, 2021
$100 million
bbw20220129_10k.html November 2021
$25 million
bbw20220129_10k.html November 2021
$25 million
bbw20220129_10k.html November 2021
$25 million
bbw20220129_10k.html November 30, 2021
$25.0 million
cnxc-20211130.htm September 2021
$500 million
ctrn-20220129x10k.html November 30, 2021
$30 million
ctrn-20220129x10k.html March 15, 2022
$30 million
earn-20211231.htm June 13, 2018
$2.5 billion
info-20211130.htm October 2019
$2.5 billion
kmx-20220228.html October 23, 2018
$2 billion
kmx-20220228.html April 2022
$2 billion
lake_10k.html February 17, 2021
$5 million
lake_10k.html July 6, 2021
$5 million
lake_10k.html April 7, 2022
$5 million
mdc-20211231.htm December 31, 2021
noc-20211231.htm September 16, 2015
$4.0 billion
noc-20211231.htm 

In [13]:
#extrating filing and text
index = 0
while index < len(annotations):
    for key in annotations[index]:
        if key == 'filing':
            print(annotations[index]['filing'], annotations[index]['filing'])
        if key=='text':
            print(annotations[index]['text'])
    index+=1

azz-20220228.html azz-20220228.html
On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.
azz-20220228.html azz-20220228.html
The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.
apog-20220226.html apog-20220226.html
During fiscal 2004, the Board of Directors authorized a share repurchase program, with subsequent increases in authorization. We repurchased 2,292,846 shares under the program during fiscal 2022, for a tot

In [14]:
#there are 96 files in the json annotations
all_filing=annotations[0:96]
all_filing

[{'filing': 'azz-20220228.html',
  'text': "On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
  'authorization_date': 'November 10, 2020',
  'authorization_amount': '$100 million'},
 {'filing': 'azz-20220228.html',
  'text': 'The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.',
  'repurchase_date': 'fiscal 2022',
  'repurchase_number': '601,822',
  'repurchase_amount': '$30.8 million'},
 {'filing': 'apog-202202

## Creating training data for all data points in annotations 

In [15]:
#creating traning data containing all data in annotations 

#creating list of training data
training_data=[]

#oops is a list that will include all data points that our foor loops fails to return 
#this is mainly for data that have more than one value for each entity 
oops=[]


#creating a dictionary which includes entities, filing and text 
for annotation in annotations:
    try:
        dictionary={}
        dictionary['entities']=[]
        filing= annotation['filing']
        text=annotation['text']


#extracting in a tuple format, the text, entities and their start/end index  

        for key, value in annotation.items():
       
            if key not in ('filing', 'text'):
                entity_start = text.index(value)
                entity_end = text.index(value) + len(value)
                entity=(entity_start, entity_end, key)
                dictionary['entities'].append(entity)
            

        training_data.append((text,dictionary))
        
    except ValueError: #this is where we might get an error message when there is more than one value for entities
        oops.append(annotation)
       
      
       
#dictionary
training_data   
 

[("On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
  {'entities': [(3, 20, 'authorization_date'),
    (68, 80, 'authorization_amount')]}),
 ('The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.',
  {'entities': [(157, 168, 'repurchase_date'),
    (22, 29, 'repurchase_number'),
    (68, 81, 'repurchase_amount')]}),
 ('During fiscal 2004, the Board of Directors authorized a share repurchase program, with subsequ

In [16]:
#checking to see which filing failed to be included in the training data 
oops

[{'filing': 'ctrn-20220129x10k.html',
  'text': 'During fiscal 2021 and 2020, we returned $115.3 million and $32.9 million, respectively, to shareholders through share repurchases. See Part II, Item 5 of this Report and Note 6 to the Financial Statements for more information.',
  'repurchase_date': 'fiscal 2020',
  'repurchase_amount': '$32.9 million'},
 {'filing': 'info-20211130.htm',
  'text': 'During the years ended November 30, 2020 and 2019, we entered into various accelerated share repurchase (“ASR”) agreements, repurchasing 13.0 million, and 7.6 million shares, respectively, for $950.0 million and $500.0 million, respectively.',
  'repurchase_date': 'year ended November 30, 2020',
  'repurchase_number': '13.0 million',
  'repurchase_amount': '$950.0 million'},
 {'filing': 'info-20211130.htm',
  'text': 'During the years ended November 30, 2020 and 2019, we entered into various accelerated share repurchase (“ASR”) agreements, repurchasing 13.0 million, and 7.6 million shares, res

In [17]:
#checking our training dat
training_data

[("On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
  {'entities': [(3, 20, 'authorization_date'),
    (68, 80, 'authorization_amount')]}),
 ('The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.',
  {'entities': [(157, 168, 'repurchase_date'),
    (22, 29, 'repurchase_number'),
    (68, 81, 'repurchase_amount')]}),
 ('During fiscal 2004, the Board of Directors authorized a share repurchase program, with subsequ

In [18]:
#we have 85 traing data
len(training_data)

85

In [19]:
#checking just the tect 
text

'In October 2020, our board of directors authorized and approved a stock repurchase program of up to $200.0 million of our outstanding common stock. During the fiscal year ended July 31, 2021, we repurchased 1,488,991 shares of common stock at an average price of $109.17 per share for an aggregate purchase price of $162.5 million. As of July 31, 2021, $37.5 million remained available for future share repurchases.'

In [20]:
#checking the value
value

'$162.5 million'

## Modeling

In [29]:
#creating a blank model
nlp = spacy.blank('en')

In [30]:
#pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe('ner')

<spacy.pipeline.ner.EntityRecognizer at 0x7fb3700536d0>

In [31]:
#adding labels
ner.add_label('authorization_date')
ner.add_label('authorization_amount')
ner.add_label('repurchase_date')
ner.add_label('repurchase_number')
ner.add_label('repurchase_amount')
ner.add_label('authorization_number')

1

In [32]:
#importing the libary we need
import random
from spacy.training import Example

In [33]:
#training the model

nlp.begin_training()

for itn in range(10):
    random.shuffle(training_data)
    losses = {}
    for batch in spacy.util.minibatch(training_data, size=2):
        for text, annotations in batch:
            # create Example
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], losses=losses, drop=0.3)
                print(losses)
            except ValueError:
                print(text)

{'ner': 78.29999274015427}
{'ner': 188.93674194812775}
{'ner': 235.7759119272232}
{'ner': 310.25804603099823}
{'ner': 379.0779595375061}
{'ner': 455.4169545173645}
{'ner': 473.4965234398842}
{'ner': 532.4986289143562}
{'ner': 574.1986733824015}
{'ner': 603.9411513041705}
{'ner': 618.0139535712078}
{'ner': 629.3079774254584}
{'ner': 633.1433034371221}
{'ner': 639.3619793045946}
{'ner': 643.2694449585734}
{'ner': 647.239272659501}
{'ner': 651.3095225260404}
{'ner': 653.3239497261125}
{'ner': 657.2700827345265}
{'ner': 661.0337878029945}
{'ner': 664.9634041838407}
{'ner': 672.8242926795564}
{'ner': 678.7856730715673}
{'ner': 684.6318995782241}
{'ner': 688.4165372063668}
{'ner': 694.2531647336415}
{'ner': 701.6279594875647}
{'ner': 704.2111112989493}
{'ner': 711.8191635958771}
{'ner': 714.9954639339318}
{'ner': 718.0999839778721}
{'ner': 725.0801773686159}
{'ner': 731.3750862395005}
{'ner': 733.8083607640198}
{'ner': 736.4631553985288}
{'ner': 739.4549697501403}
{'ner': 743.4506056228335}


{'ner': 149.0180850717029}
{'ner': 156.711870933001}
{'ner': 167.625226210056}
{'ner': 172.77950967201326}
{'ner': 178.46165979406908}
{'ner': 183.69949138838896}
{'ner': 233.45384537054738}
{'ner': 237.59880501583393}
{'ner': 239.20840393774142}
{'ner': 241.66041634160283}
{'ner': 247.73403137430745}
{'ner': 250.36589430322624}
{'ner': 254.89187298451444}
{'ner': 284.8209886806461}
{'ner': 289.0441776139411}
{'ner': 293.0117781071291}
{'ner': 304.1998278153993}
{'ner': 308.46961092889507}
{'ner': 316.0591780239812}
{'ner': 317.56254038356263}
{'ner': 318.210595582413}
{'ner': 321.9088424066978}
{'ner': 326.7621289926475}
{'ner': 331.3507085644347}
{'ner': 337.1721529023949}
{'ner': 338.3774238379828}
{'ner': 343.65112479141646}
{'ner': 349.30304165191217}
{'ner': 354.6609177032039}
{'ner': 364.2587408704754}
{'ner': 365.1089219866978}
{'ner': 372.84200657850033}
{'ner': 375.4302798776408}
{'ner': 380.63061173372205}
{'ner': 382.53270138423727}
{'ner': 386.81968049070633}
{'ner': 389.6

{'ner': 63.16022307147162}
{'ner': 69.90236666188807}
{'ner': 69.90388418035191}
{'ner': 76.58493169946152}
On August 9, 2021, the Company announced that its Board authorized a share repurchase program for up to an aggregate amount of $100 million of its outstanding shares of Class A Common Stock. On August 20, 2021, the Company repurchased and retired 6,849,315 shares of its Class A Common Stock from the Platinum Stockholder at a price per share of $14.60, which was equal to the price at which the underwriter exercised the overallotment option for the secondary offering discussed above. The Company paid $100 million to fund the share repurchase using existing cash on hand. The Company accounted for the share repurchase and retirement under the cost method by deducting its par value from the common stock account, reducing $72.0 million in the additional paid-in-capital account using the share price when the stock was originally issued, and the remaining excess cost of $28.0 million by 

{'ner': 94.94562691400208}
{'ner': 94.94921575588046}
{'ner': 95.78435003150751}
{'ner': 101.04178454401931}
{'ner': 101.0719594255054}
{'ner': 102.9137757900645}
{'ner': 103.2800772699025}
{'ner': 104.17973650735483}
{'ner': 109.37601621971979}
{'ner': 115.38940683080813}
{'ner': 115.45268019081632}
{'ner': 117.93498696831702}
{'ner': 121.2632698712057}
{'ner': 121.26337354966633}
{'ner': 126.96289785258918}
{'ner': 128.9728133065239}
{'ner': 129.3749259136553}
{'ner': 136.6918924324267}
{'ner': 136.74746041531682}
{'ner': 136.7474712191811}
{'ner': 143.16069351396968}
{'ner': 143.16075034501918}
{'ner': 143.16075146033066}
{'ner': 143.35555750823127}
{'ner': 145.62000284908538}
{'ner': 151.20186773327026}
{'ner': 156.88916052403835}
{'ner': 156.889171707316}
{'ner': 156.88957203774945}
{'ner': 162.68238138504555}
{'ner': 0.000515306850133238}
{'ner': 4.428215871028306}
{'ner': 8.380204767794872}
{'ner': 8.413848128167942}
{'ner': 8.416031115731167}
{'ner': 8.443828476149038}
{'ner': 

In [36]:
#testing the model on a sentence

sentence = 'On August 9, 2021, the Company announced that its Board authorized a share repurchase program for up\
to an aggregate amount of $100 million of its outstanding shares of Class A Common Stock. On August 20, 2021,\
the Company repurchased and retired 6,849,315 shares of its Class A Common Stock from the Platinum Stockholder\
at a price per share of $14.60, which was equal to the price at which the underwriter exercised the overallotment\
option for the secondary offering discussed above. The Company paid $100 million to fund the share repurchase\
using existing cash on hand. The Company accounted for the share repurchase and retirement under the cost method\
by deducting its par value from the common stock account, reducing $72.0 million in the additional paid-in-capital\
account using the share price when the stock was originally issued, and the remaining excess cost of $28.0 million\
by increasing the accumulated deficit account.'


In [52]:
doc=nlp(sentence)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('August 9, 2021', 'authorization_date'), ('$100 million', 'authorization_amount'), ('August 20, 2021', 'authorization_date'), ('6,849,315', 'repurchase_number'), ('$100 million', 'authorization_amount')]


In [39]:
#changed $100 to $100.0
#the model seems to dislike decimals 

sentence1 = 'On August 9, 2021, the Company announced that its Board authorized a share repurchase program for up\
to an aggregate amount of $100.0 million of its outstanding shares of Class A Common Stock. On August 20, 2021,\
the Company repurchased and retired 6,849,315 shares of its Class A Common Stock from the Platinum Stockholder\
at a price per share of $14.60, which was equal to the price at which the underwriter exercised the overallotment\
option for the secondary offering discussed above. The Company paid $100 million to fund the share repurchase\
using existing cash on hand. The Company accounted for the share repurchase and retirement under the cost method\
by deducting its par value from the common stock account, reducing $72.0 million in the additional paid-in-capital\
account using the share price when the stock was originally issued, and the remaining excess cost of $28.0 million\
by increasing the accumulated deficit account.'

In [40]:
doc=nlp(sentence1)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('August 9, 2021', 'authorization_date'), ('August 20, 2021,the', 'authorization_date'), ('6,849,315', 'repurchase_number'), ('$100 million', 'authorization_amount')]
