# Read in HTMLs of SEC Filings

## Imports

In [255]:
import glob
import json

import itertools
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
from tqdm.auto import tqdm

import re

import spacy

## Create function to extract stock from filepath

In [127]:
def extract_stock(string, filepath = True):
    if filepath:
        return re.search(r'data\\([a-z]+)[-\d_]',string).group(1)
    else:
        return re.search(r'^([a-z]+)[-\d_]',string).group(1)

## Make soups dictionary

In [96]:
soups = {extract_stock(filepath): BS(open(filepath), 'html.parser') for filepath in glob.glob('../data/*.htm*')}

## Create text-creation function and texts dictionary

In [108]:
 def make_text(stock):
    text_list = [tag.get_text().strip() for tag in soups[stock].find_all(['div', 'p'])]
    text = '\n'.join(text_list).replace('\xa0', ' ').replace('  ', ' ')
    return text

In [111]:
texts_dict = {stock: make_text(stock) for stock in soups.keys()}

In [114]:
texts = texts_dict.values()
stocks = texts_dict.keys()

## Read in annotations

In [126]:
texts_dict.keys()

dict_keys(['amrk', 'apog', 'azz', 'bbw', 'cnxc', 'ctrn', 'earn', 'fnhc', 'ful', 'gwre', 'info', 'intc', 'kbh', 'kmx', 'lake', 'lvis', 'mcft', 'mdc', 'mtn', 'nflx', 'noc', 'panw', 'prgs', 'pstg', 'siri', 'slb', 'tlys', 'tmb', 'uri', 'vrrm'])

In [119]:
annotations = json.load(open('../data/annotations.json'))

In [125]:
annotations[0]['filing']

'azz-20220228.html'

In [156]:
extract_dict = {}
for entry in annotations:
    stock = extract_stock(entry['filing'], filepath=False)
    text = entry['text']
    if stock in extract_dict.keys():
        extract_dict[stock][text] = {key:value for key, value in entry.items() if key not in ['filing', 'text']}
    else:
        extract_dict[stock] = {}
        extract_dict[stock][text] = {key:value for key, value in entry.items() if key not in ['filing', 'text']}

In [157]:
list(extract_dict['intc'].keys())[0]

'In the first quarter of 2021, we repurchased the remaining $2.4 billion in shares of our planned $20.0 billion share repurchases announced in October 2019. We expect our future stock repurchases to be significantly below our levels from the last few years.'

## Generate spacy model, documentize texts, and add metadata

### Let's create training tuples for each 10k

In [219]:
# check the keys
extract_dict.keys()

dict_keys(['azz', 'apog', 'fnhc', 'vrrm', 'bbw', 'cnxc', 'ctrn', 'earn', 'info', 'kmx', 'lake', 'mdc', 'noc', 'prgs', 'kbh', 'pstg', 'siri', 'tlys', 'tmb', 'uri', 'lvis', 'nflx', 'slb', 'ful', 'intc', 'mcft', 'mtn', 'panw', 'amrk', 'gwre'])

### Start with a test case (azz)

In [220]:
# use azz as a test case for labeling
list(extract_dict.get('azz').keys())

["On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
 'The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.']

In [379]:
# tokenize by text bounded by line breaks
azz_tok = regexp_tokenize(texts_dict['azz'], r'[^\n]+')

# find which tokens are equal to our annotations
azz_annotation_text = 1
azz_annot_index = [tok == list(extract_dict.get('azz').keys())[azz_annotation_text] for tok in azz_tok].index(True)
azz_tok[azz_annot_index]

'The Company purchased 601,822 of its common shares in the amount of $30.8 million at an average purchase price of $51.20 under the 2020 Authorization during fiscal 2022.'

In [234]:
# verify that exactly two tokens match our annotations for azz
[tok in list(extract_dict.get('azz').keys()) for tok in azz_tok]

2

### Build dictionaries containing the labels (True or False) for each "paragraph" of each 10k
- True corresponds to a repurchase-related paragraph and False corresponds to an irrelevant paragraph.
- labels_dict has the stock as keys and the texts and labels as tuples
- labels_dict_for_df has the same information but prepared for dataframe conversion

In [246]:
# {stock: (token, label)}
labels_dict = {key: [(tok, tok in list(extract_dict.get(key).keys())) for tok in regexp_tokenize(texts_dict[key], r'[^\n]+')] for key in texts_dict.keys()}

# {stock: {'token': tokens_list, 'label': labels_list}}
labels_dict_for_df = {key: {'token': [tok for tok in regexp_tokenize(texts_dict[key], r'[^\n]+')], 
                            'label': [tok in list(extract_dict.get(key).keys()) for tok in regexp_tokenize(texts_dict[key], r'[^\n]+')]} 
                      for key in texts_dict.keys()
                      }

### Make the labels dataframe and use sklearn to train_test_split it

In [252]:
labels_df = pd.concat({key: pd.DataFrame.from_dict(value) for key, value in labels_dict_for_df.items()})

In [257]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
import classy_classification

In [258]:
X = labels_df['token']
y = labels_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [312]:
# verifying that the df works as expected
X_test[y_test]

fnhc  1578     In December 2019, the Company's Board of Direc...
vrrm  11371    On August 9, 2021, the Company announced that ...
mdc   674      At December 31, 2021, we were authorized to re...
bbw   828      We ended fiscal 2021 with no borrowings under ...
cnxc  546      In September 2021, our board of directors auth...
info  1227     During the years ended November 30, 2020 and 2...
ctrn  1381     During fiscal 2021 and 2020, we returned $115....
Name: token, dtype: object

### Build first model off of base spacy small (with help from classy)
This model never predicts repurchase (True)

In [269]:
data = {'repurchase': X_train[y_train == True].values.tolist(),
        'irrelevant': X_train[y_train == False].values.tolist()}

In [270]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("text_categorizer",
              config = {"model": 'spacy',
                        "data": data
                      }
        )

Fitting 5 folds for each of 6 candidates, totalling 30 fits


<classy_classification.classifiers.spacy_internal.classySpacyInternal at 0x1435a4f9520>

In [275]:
preds = [nlp(test_tok)._.cats for test_tok in X_test.values]

In [281]:
preds_summary = [pred['repurchase'] > 0.5 for pred in preds]
preds_repur = [pred['repurchase'] for pred in preds]

In [381]:
sorted(enumerate(preds_repur), key = lambda x: x[1], reverse=True)[:10]

[(16103, 0.19159251223724055),
 (1294, 0.06239400916238595),
 (20462, 0.06129394065852936),
 (19452, 0.05920115414171221),
 (30384, 0.05449088135414002),
 (19580, 0.0520140554850948),
 (26721, 0.05004572590881312),
 (4894, 0.04695912011592231),
 (14323, 0.04020289978825083),
 (20793, 0.03301462391344541)]

### Here's a failed attempt at implementing this with BERT

In [327]:
nlp = spacy.blank("en")
nlp.add_pipe("text_categorizer",
              config = {"model": ' sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
                        "data": data,
                        "device": "cpu",
                        
                      }
        )

HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/api/models/%20sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

### Here's a successful attempt at implementing this with BERT's Q&A model
Note, there's no training here, so this is essentially a baseline

In [339]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

query_embedding = model.encode('Are there repurchases')
passage_embedding = model.encode(X_test.values.tolist())

bert_test_scores = util.dot_score(query_embedding, passage_embedding)

In [378]:
sorted(enumerate(bert_test_scores.numpy()[0]), key = lambda x: x[1], reverse=True)[0:20]

[(12930, 0.7698134),
 (28849, 0.74283355),
 (2765, 0.7428335),
 (4720, 0.7428335),
 (10357, 0.7428335),
 (25149, 0.70291966),
 (17106, 0.7006395),
 (3128, 0.69797015),
 (13357, 0.68717366),
 (1843, 0.6508532),
 (20597, 0.6508532),
 (21475, 0.6508532),
 (22824, 0.6506263),
 (17033, 0.6450168),
 (11819, 0.64463985),
 (2389, 0.6433725),
 (3268, 0.6433725),
 (10127, 0.6433725),
 (14323, 0.6414058),
 (7140, 0.6253795)]