In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re

In [3]:
# Fetch the page. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() makes an HTTP error fail here
# instead of letting an error page be parsed as if it were the wills text.
html_page = requests.get(
    'https://www.british-history.ac.uk/court-husting-wills/vol2/pp1-6',
    timeout=30,
)
html_page.raise_for_status()

# Hand the raw response bytes to Beautiful Soup, using the stdlib HTML parser.
soup = BeautifulSoup(html_page.content, 'html.parser')

In [4]:
# Disabled alternative kept for reference: unwrap <em> tags before extraction.
# for em in soup("em"):
#     em.unwrap()

# Remove every <a> element (tag and contents) from the parsed tree, so link
# text does not end up in the extracted paragraphs. This mutates `soup`.
for anchor in soup.find_all("a"):
    anchor.decompose()

# Collect the text of every element whose id starts with "p"; pieces of text
# inside one element are stripped and joined with single spaces.
paragraphs = []
for para in soup.find_all(id=re.compile("^p")):
    paragraphs.append(para.get_text(" ", strip=True))
paragraphs

['Monday next after the Feast of S. Mathias, Apostle [24 Feb.].',
 'Wandelesworth (William de), corder.—To be buried in the churchyard of S. Paul of the greater Pardon ( maioris venie ). Bequests to the church of All Hallows at the Hay and ministers thereof. Forty pence to be distributed around his corpse on the day of his burial. To Alianora his sister he leaves twenty shillings, to Henry and John his apprentices and Thomas his kinsman divers gowns, and to Johanna his servant twenty shillings. The residue of his goods and all his tenements he leaves to Alice his wife and his children in equal portions. Dated London, Saturday next before the Feast of Purification of V. Mary [2 Feb.], A.D. 1357.',
 'Roll 86 (14).',
 'Monday next after the Feast of Ascension [10 May].',
 'Aubrey (Andrew), pepperer.—To Johanna his wife certain tenements in the parishes of S. Mary de Aldermariecherche and S. Thomas the Apostle, and certain others which he had by sale and grant of John Chaucer, brother and 

In [5]:
# Text of the first element carrying the "active" class; used as the label
# for where these paragraphs came from.
active_elements = soup.find_all(attrs={"class": "active"})
citation = active_elements[0].text
citation

'Wills: 32 Edward III (1358-9)'

In [103]:
# One row per extracted paragraph.
df = pd.DataFrame({"paragraphs": paragraphs})

# Prepend a constant "source" column holding the text of the first element
# with class "active" (the same lookup that produced `citation`).
source_label = soup.find_all(attrs={"class": "active"})[0].text
df.insert(loc=0, column="source", value=source_label)
df

Unnamed: 0,source,paragraphs
0,Wills: 32 Edward III (1358-9),"Monday next after the Feast of S. Mathias, Apo..."
1,Wills: 32 Edward III (1358-9),"Wandelesworth (William de), corder.—To be buri..."
2,Wills: 32 Edward III (1358-9),Roll 86 (14).
3,Wills: 32 Edward III (1358-9),Monday next after the Feast of Ascension [10 M...
4,Wills: 32 Edward III (1358-9),"Aubrey (Andrew), pepperer.—To Johanna his wife..."
5,Wills: 32 Edward III (1358-9),Monday the Morrow of H. Trinity [27 May].
6,Wills: 32 Edward III (1358-9),"Useflete (Thomas de), Dean of S. Martin le Gra..."
7,Wills: 32 Edward III (1358-9),Roll 86 (49).
8,Wills: 32 Edward III (1358-9),"Monday next after the Feast of S. Petronilla, ..."
9,Wills: 32 Edward III (1358-9),Nichol (William).—His goods and chattels movab...


### Cleaning up the entries a little bit

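This process is specific to the corpus in question, and will have to be altered for every new source examined. In this source each will opens with the testator written as "Surname (Forename)"; `fix_names` below rewrites that as "Forename Surname".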

In [104]:
# Both patterns are raw strings (so "\s", "\(" and "\D" are not treated as
# string escapes) and are compiled once rather than on every call.
_FORENAME_IN_PARENS = re.compile(
    r"""
    (?<=\()     # start just inside an opening parenthesis
    \D.*?       # a non-digit, then as little text as possible...
    (?=[,)])    # ...up to (not including) the first comma or ")"
    """,
    re.X,
)
_EMPTY_PARENS = re.compile(r'\s\(\)')


def fix_names(text):
    """Move a parenthesised forename in front of the surname.

    The first parenthetical whose first character is not a digit is located;
    its content, up to the first comma or closing parenthesis, is moved to the
    start of the string, and any " ()" left behind is removed.

    Example: "Aubrey (Andrew), pepperer." -> "Andrew Aubrey, pepperer."

    Parameters
    ----------
    text : str
        One paragraph of the source.

    Returns
    -------
    str
        The rewritten paragraph, or ``text`` unchanged when no qualifying
        parenthetical is found (e.g. "Roll 86 (14).").

    Notes
    -----
    The search is not anchored to the start of ``text``. A paragraph whose
    first non-numeric parenthetical occurs later in the body is rewritten too,
    so applying this function twice to the same text is not safe.
    """
    results = _FORENAME_IN_PARENS.search(text)
    if results is None:
        return text

    # forename + everything before it (still ending in "(") + everything
    # after it (starting with "," or ")").
    text = results[0] + ' ' + text[:results.start()] + text[results.end():]
    # Drop the now-empty " ()" that the move leaves behind.
    return _EMPTY_PARENS.sub('', text)

In [105]:
# Rewrite each entry so a leading "Surname (Forename)" reads "Forename Surname".
# NOTE: run this once on a freshly built df. fix_names also acts on a later
# parenthetical when no leading one remains, so re-running this cell can
# alter entries that were already fixed.
df["paragraphs"] = df["paragraphs"].map(fix_names)
df

Unnamed: 0,source,paragraphs
0,Wills: 32 Edward III (1358-9),"Monday next after the Feast of S. Mathias, Apo..."
1,Wills: 32 Edward III (1358-9),"William de Wandelesworth, corder.—To be buried..."
2,Wills: 32 Edward III (1358-9),Roll 86 (14).
3,Wills: 32 Edward III (1358-9),Monday next after the Feast of Ascension [10 M...
4,Wills: 32 Edward III (1358-9),"Andrew Aubrey, pepperer.—To Johanna his wife c..."
5,Wills: 32 Edward III (1358-9),Monday the Morrow of H. Trinity [27 May].
6,Wills: 32 Edward III (1358-9),"Thomas de Useflete, Dean of S. Martin le Grand..."
7,Wills: 32 Edward III (1358-9),Roll 86 (49).
8,Wills: 32 Edward III (1358-9),"Monday next after the Feast of S. Petronilla, ..."
9,Wills: 32 Edward III (1358-9),William Nichol.—His goods and chattels movable...


## Named Entity Recognition (NER)

Code sourced from https://medium.com/explore-artificial-intelligence/introduction-to-named-entity-recognition-eda8c97c2db1

In [106]:
# from: https://medium.com/explore-artificial-intelligence/introduction-to-named-entity-recognition-eda8c97c2db1

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Fetch the NLTK data packages used below: 'punkt' for tokenising,
# 'averaged_perceptron_tagger' for POS tagging, and 'maxent_ne_chunker'
# plus 'words' for named-entity chunking.
for resource in ('words', 'averaged_perceptron_tagger', 'punkt', 'maxent_ne_chunker'):
    nltk.download(resource)

# Record the library version alongside the results.
print('NLTK version: %s' % (nltk.__version__))

def fn_preprocess(art):
    """Tokenise the string `art` and tag each token with its part of speech.

    Returns the result of `nltk.pos_tag` applied to the word tokens of `art`.
    """
    return nltk.pos_tag(nltk.word_tokenize(art))

# Tokenise and POS-tag one sample entry (the row labelled 4), then run NLTK's
# named-entity chunker over the tagged tokens.
art_processed = fn_preprocess(df.loc[4, "paragraphs"])
results = ne_chunk(art_processed)

# Print only those lines of the chunk tree's text form that contain a tag
# beginning with NN (the substring test also catches NNS, NNP, ...).
noun_lines = [line for line in str(results).split('\n') if '/NN' in line]
for line in noun_lines:
    print(line)

# Chunk noun phrases with a regular-expression grammar: an optional
# determiner, any number of adjectives, then a token tagged NN.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(art_processed)
print(cs)

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

NTLK version: 3.4
  (PERSON Andrew/NNP)
  (ORGANIZATION Aubrey/NNP)
  pepperer.—To/NN
  (PERSON Johanna/NNP)
  wife/NN
  tenements/NNS
  parishes/NNS
  S./NNP
  Mary/NNP
  (PERSON Aldermariecherche/NNP)
  S./NNP
  Thomas/NNP
  (GPE Apostle/NNP)
  others/NNS
  sale/NN
  grant/NN
  (PERSON John/NNP Chaucer/NNP)
  brother/NN
  executor/NN
  (PERSON Thomas/NNP Heyroun/NNP)
  vintner/NN
  parish/NN
  S./NNP
  Mary/NNP
  tenements/NNS
  rents/NNS
  (GPE Milkestrete/NNP)
  parish/NN
  S./NNP
  Laurence/NNP
  Jewry/NNP
  parish/NN
  (ORGANIZATION All/NNP Hallows/NNP)
  de/NNP
  Bredstrete/NNP
  life/NN
  remainder/NN
  (PERSON John/NNP)
  son/NN
  tail/NN
  part/NN
  trust/NN
  sale/NN
  good/NN
  soul/NN
  souls/NN
  Roger/NNP
  father/NN
  (GPE Dionisia/NNP)
  mother/NN
  (PERSON Thomas/NNP)
  (PERSON Enefelde/NNP)
  others/NNS
  part/NN
  (PERSON Thomas/NNP Aubrey/NNP)
  kinsman/NN
  tail/NN
  chaplains/NNS
  goods/NNS
  chapel/NN
  church/NN
  S./NNP
  Antonin/NNP
  mansion/NN
  licence/NN

[nltk_data] Downloading package words to C:\Users\Matthew
[nltk_data]     Parker\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Matthew Parker\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Matthew
[nltk_data]     Parker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Matthew
[nltk_data]     Parker\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [107]:
# Full text of the sample entry (row label 4) that was chunked above.
df.loc[4, "paragraphs"]

'Andrew Aubrey, pepperer.—To Johanna his wife certain tenements in the parishes of S. Mary de Aldermariecherche and S. Thomas the Apostle, and certain others which he had by sale and grant of John Chaucer, brother and executor of Thomas Heyroun, late vintner, in the aforesaid parish of S. Mary; also tenements and rents in Milkestrete, parish of S. Laurence\n\nJewry, and the parish of All Hallows de Bredstrete for life, with remainder to John his son in tail; remainder as to part in trust for sale for the good of his soul and the souls of Roger his father, Dionisia his mother, Thomas de Enefelde, and others; remainder as to another part to Thomas Aubrey his kinsman in tail. To the chaplains whom he has appointed, in his will of movable goods, to pray for him in the chapel which he recently built adjoining the church of S. Antonin he leaves the mansion which he was permitted to build by licence from Sir John de Hicchen, late rector of the said church, and certain parishioners. Also to hi