# Read in HTML Files of SEC Filings

## Imports

In [117]:
import glob
import json

import requests
from bs4 import BeautifulSoup as BS
import pandas as pd

import re

import spacy

## Create function to extract stock from filepath

In [93]:
def extract_stock(filepath):
    """Return the lowercase stock prefix of a filing's filename.

    Searches for ``data`` followed by a path separator, then a run of
    lowercase letters that ends at a hyphen, underscore, or digit, e.g.
    ``../data/azz-20220228.html`` -> ``'azz'``.

    Args:
        filepath: Path to a filing; forward and backward slashes are both
            accepted as the separator after ``data``.

    Returns:
        The run of lowercase letters directly after the ``data`` separator.

    Raises:
        ValueError: If ``filepath`` does not contain that pattern.
    """
    # Accept both separators so the same pattern works on POSIX and Windows paths.
    match = re.search(r'data[\\/]([a-z]+)[-\d_]', filepath)
    if match is None:
        raise ValueError(f'cannot extract stock from filepath {filepath!r}')
    return match.group(1)

## Make soups dictionary

In [96]:
# Parse every filing matched by the glob into a BeautifulSoup tree, keyed by
# the stock prefix taken from its filename.
soups = {}
for filepath in glob.glob('../data/*.htm*'):
    # The context manager closes each file once it has been parsed, rather
    # than leaving one open handle per filing.
    with open(filepath) as filing:
        soups[extract_stock(filepath)] = BS(filing, 'html.parser')

## Create text-extraction function and texts dictionary

In [108]:
 def make_text(stock):
     """Return the text of one parsed filing as a newline-joined string.

     Reads the parsed filing from the module-level ``soups`` mapping, takes
     the stripped text of every ``<div>`` and ``<p>`` tag, joins the pieces
     with newlines, and normalises spaces.

     Args:
         stock: Key into ``soups`` identifying the filing.

     Returns:
         The joined text with non-breaking spaces replaced by plain spaces
         and every run of spaces collapsed to a single space.

     Raises:
         KeyError: If ``stock`` is not a key of ``soups``.
     """
     pieces = [tag.get_text().strip() for tag in soups[stock].find_all(['div', 'p'])]
     # NOTE(review): find_all also returns nested tags, so text in a <p> inside
     # a <div> is likely emitted more than once -- confirm this is intended.
     text = '\n'.join(pieces).replace('\xa0', ' ')
     # A single .replace('  ', ' ') only halves longer runs of spaces;
     # collapse each run completely instead.
     return re.sub(r' {2,}', ' ', text)

In [111]:
# Map every key of soups to the plain text extracted from its filing.
texts_dict = {symbol: make_text(symbol) for symbol in soups}

In [114]:
# Views over the extracted texts and over their stock keys; both iterate in
# texts_dict's order, so the i-th text belongs to the i-th stock.
texts, stocks = texts_dict.values(), texts_dict.keys()

## Read in annotations

In [119]:
# Load the annotation records. The context manager closes the file, and UTF-8
# is requested explicitly because the records contain non-ASCII characters
# (e.g. curly quotes) that a platform-default encoding may mis-decode.
with open('../data/annotations.json', encoding='utf-8') as annotations_file:
    annotations = json.load(annotations_file)

In [122]:
# Display the first annotation record to inspect its fields.
annotations[0]

{'filing': 'azz-20220228.html',
 'text': "On November 10, 2020, the Company's Board of Directors authorized a $100 million share repurchase program pursuant to which the Company may repurchase its common stock (the “2020 Authorization”). Repurchases under the 2020 Authorization will be made through open market and/or private transactions, in accordance with applicable federal securities laws, and could include repurchases pursuant to Rule 10b5-1 trading plans, which allows stock repurchases when the Company might otherwise be precluded from doing so.",
 'authorization_date': 'November 10, 2020',
 'authorization_amount': '$100 million'}

## Load spaCy model, convert texts to Docs, and add metadata

In [116]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
# With as_tuples=True, nlp.pipe expects (text, context) pairs and yields
# (Doc, context) pairs; passing bare strings fails on tuple unpacking.
# Pair each text with its stock key so the key travels with its Doc.
# The result is a lazy generator and can be consumed only once.
Docs = nlp.pipe(zip(texts, stocks), as_tuples=True)