# Extract Data from .txt

## Import packages

In [1]:
import string
import sys
from import_txts import *
import os
from glob import glob
import regex as re
import pandas as pd

## Parameters

In [2]:
## Parameters
year = '1858'

## Import files
### Make a list of images that are to be converted into text

In [3]:
image_path = glob('tesseract/' + year + '/*')
image_path.sort()

### Import .txt files

In [4]:
pages = []

file = image_path
for f in file:
    s = open(f, "r").read()
    pages.append(s)

## Clean unusual punctuation

Probate records contain a small subset of punctuation marks. Weird characters are likely Tesseract trying to interpret squiggles of dust as characters, so should be removed.

### Symbols that I want to keep

In [5]:
punctuation = '.' + '-' + '—' + ',' + '(' + ')' + "'" + '‘' + '’' + '"' + '“' + '”'
whitelist = string.ascii_letters + string.digits + punctuation + ' ' + '\n' + '£'

### Set comparison

Empty lists and sets to store unique characters pre- and post-clean:

In [6]:
clean_pages = []
old_set = set()
new_set = set()

### Remove characters that aren't on the whitelist

In [7]:
for page in pages:
    clean_string = ''
    for char in page:
        if char in whitelist:
            clean_string += char
        else:
            clean_string += ''
    clean_string = clean_string.replace('\‘|\’', "'")
    clean_string = clean_string.replace('\“|\”', '"')
    clean_string = clean_string.replace('\—', '-')
    clean_string = clean_string.replace('. .', '.')
    clean_pages.append(clean_string)
    old = set(page)
    new = set(clean_string)
    old_set = old_set | old
    new_set = new_set | new

### Validation

Quick check of removed characters. Do I want to keep any of these?

In [33]:
old_set - new_set

{'\x0c',
 '!',
 '#',
 '$',
 '%',
 '&',
 '*',
 '+',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '_',
 '{',
 '|',
 '}',
 '~',
 '¢',
 '¥',
 '§',
 '©',
 '«',
 '®',
 '°',
 '»',
 'é',
 '€',
 '™'}

## Remove superfluous pages

### Canterbury court records

*Note.* Only relevant for records in 1858. Prior to mid-January of that year, records were proved locally. Most of these cases were proved in Canterbury. As such, the records contain a few instances where we have lists of non-relevant data.

We can remove these files by:
1. checking whether the file year is 1858, then
2. checking whether the page header indicates that it is in the legacy format

In [8]:
if year == '1858':
    clean_pages = [page for page in clean_pages if 'COURT OF CANTERBURY' not in page]
    clean_pages = [page for page in clean_pages if 'WILLS PROVED' not in page]
    clean_pages = [page for page in clean_pages if 'PROVED IN THE' not in page]
    clean_pages = [page for page in clean_pages if 'ADMINISTRATIONS GRANTED BY THE' not in page]

### Scottish and Irish probate

For simplicity, we will look at the wills proved in England and Wales. We can remove this line if we'd like to expand the analysis.

In [9]:
clean_pages = [page for page in clean_pages if 'SCOTCH CONFIRMATIONS' not in page]
clean_pages = [page for page in clean_pages if 'RISH PROBATES' not in page]

### Appendices

These are handwritten notes.

In [10]:
clean_pages = [page for page in clean_pages if 'APPENDIX' not in page]
clean_pages = [page for page in clean_pages if 'PBOLTIVTLD' not in page]

In [14]:
print(clean_pages[4])

92

ASHCROFT Robert.
Effects under £600.

ASHDOWN Sarah.

Effects under £1,500.

ASHDOWN William.
Effects under £200.

ASHER Hannah.
Effects under £200.

ASHER Rachel.
Effects under £600.

WILLS. 1858.

“

at Derby by the oaths of Henry Kerry of
Spondon aforesaid Farmer and Isaac Potter
of the same place Farmer the Executors.

9 November. The Will

of Robert Ashcroft late of Bootle-cum-Linacre
in the County of Lancaster Gardener
deceased who died 22 October 1858 at
Bootle-cum-Linacre aforesaid was proved at
Liverpool by the oath of Robert Ashcroft
of Orrell near Seaforth in the same County
Labourer the Son and one of the Executors,

28 August. The Will

with a Codicil of Sarah Ashdown formerly
of 4 Regent-place West in the County of
Middlesex and late of Lingfield in the County
of Surrey Spinster deceased who died
17 May 1858 at Lingfield aforesaid was
proved at the Principal Registry by the
oaths of Faringdon Lane of 103 Southwark-
bridge-road in the said County of Surrey
Civil Engine

## Regex
### Page metadata
Split page metadata from body of page.

#### Search strings

In [56]:
## page, probate, year
ppy = re.compile('(\d+\s+(WILLS|ADMINISTRATIONS).\s+%s.)' % year)
## probate, year, page
pyp = re.compile('((WILLS|ADMINISTRATIONS).\s+%s.\s+\d+)' % year)
# fuzzy page, probate, year
fuzz_ppy = re.compile('(\d+\s+(WILLS|ADMINISTRATIONS).\s+%s.){e<=2}' % year)
# fuzzy probate, year, page
fuzz_pyp = re.compile('((WILLS|ADMINISTRATIONS).\s+%s.\s+\d+){e<=2}' % year)
# page, gap, probate, year
pgpy = re.compile('(^\d+)(.*)((WILLS|ADMINISTRATIONS).)(\s+%s.)' % year)
# page in top right corner of page
page_no = re.compile('^\d+')
# page and year string
py = re.compile('(WILLS|ADMINISTRATIONS). %s.' % year)

#### Extraction

In [194]:
pages = []
probate_type = []
bodies = []

for p in clean_pages:
    
    try:
        page = re.search(ppy, p).group(0)
        
    except AttributeError:
        try:
            page = re.search(pyp, p).group(0)
            
        except AttributeError:
            try:
                pn = re.search(page_no, p).group(0)
                pys = re.search(py, p).group(0)
                page = pn + ' ' + pys
                
            except AttributeError:
                try:
                    page = re.search(fuzz_ppy, p).group(0)
                    
                except AttributeError:
                    try:
                        page = re.search(fuzz_pyp, p).group(0)
                        
                    except AttributeError:
                        page = None
                        body = page
    finally:
        if page != None:
            page = re.sub('\n', '', page)
            page = re.sub('\s+', ' ', page)
            try:
                body = re.sub(page, '', p)
            except:
                pass
            try:
                page_number = int(re.findall('(^\d+|\d+$)', page)[0])
            
            except:
                page_number = None
            
            will_or_admin = re.findall('[A-Z]+', page)[0]

        else:
            page_number = None
            body = p
            
        pages.append(page_number)
        probate_type.append(will_or_admin)
        bodies.append(body)        

### Names and effects
#### Search strings

In [276]:
## regex pattern for names
surname_firstname = re.compile('([A-Z]+([\n\r\s]+[A-Z][a-z]+)+)')
compiled = re.compile('(Effects under £\d+){e<=2}')

#### Extraction

In [None]:
name_effs = []

for body in bodies:
    

In [281]:
name_effs = {}
name_effs['name'] = []
name_effs['effects'] = []

for body in bodies[:2]: ## For each page in a year
    bodies_split = re.split('\n\n', body)

    page_names = []
    page_effects = []

    for bod in bodies_split: # for each line in a page
        try:
            n = re.search(surname_firstname, bod).group(0)
            page_names.append(n)
            
        except AttributeError:
            try:
                e = re.search(compiled, bod).group(0)
                page_effects.append(e)
                
            except AttributeError:
                pass
    
    if len(page_names) == len(page_effects):
        print('yay')
    else:
        print('oops')
    name_effs['name'].append(page_names)
    name_effs['effects'].append(page_effects)


oops
oops


In [275]:
re.search('^[A-Z]+([\n\r\s]+[A-Z][a-z]+)+', 'otherwise\n\nASBRIDGE\nEffects under £1,000,\n\nASHBRIDGE\n Jane.\n')
bodies_split
# b = re.split('\n', body)
# names_and_effects = list(filter(surname_firstname.match, b))

In [298]:
months = '(January|February|March|April|May|June|July|August|September|October|November|December)'

calendar = re.compile('(\d+\s+%s\.){e<=1}' % months)
re.search(calendar, body).group()

print(bodies[0])



ARTHUR James.
Effects under £450.

ARTISS James Benjamin.
Effects under £50.

otherwise

ASHBRIDGE
is ane.
ASBRIDGE

ASBURY William.
Effects under £300.

ASH Elizabeth.
Effects under £20.

ASH Isaac.
Effects under £100.

29 January.

12 June.

1 June.

14 December.

15 September.

The Will
of James Arthur late of the Parish of
Egloshayle in the County of Cornwall
Yeoman deceased wha died 21 December
1857 at Egloshayle aforesaid was proved at
Bodmin by the oath of Martha Williams
(Wife of Edward Williams of the Parish of
Luxullion in the said County) the sole
Executrix.

The Will
of James Benjamin Artiss late of the Parish of
Kenton in the County of Suffolk Draper
deceased who died 17 March 1858 at St.
Bartholomew’s Hospital London ‘was proved
at the Principal Registry by the oath of
Alfred Artiss of 1 Arlington-square Islington
in the County of Middlesex Clerk to the
Electric Telegraph Company Lothbury in the
City of London the Brother and one of the
Executors. -




See ASHBRIDGE.



In [109]:
#re.compile('(\d+\s+(WILLS|ADMINISTRATIONS).\s+%s.){e<=2}')



for bod in b:
    try:
        abc = re.search(compiled, bod).group(0)
        print(abc)
    except:
        pass

Effects under £100.
Effects under £200.
Effects under £1,000,
Effects under £100.
Effects under £450. 
Effects under £3,000.


In [106]:
b

['',
 '',
 'ASH Thomas.',
 'Effects under £100.',
 '',
 'ASHBEE John.',
 '',
 'Effects under £200.',
 '',
 'Aart l8b0 . Under £300',
 '',
 'otherwise',
 '',
 'ASBRIDGE',
 'Effects under £1,000,',
 '',
 'ASHBRIDGE',
 ' Jane.',
 '',
 'ASHBURN William',
 'Effects under £100.',
 '',
 'ASHBURNER Ann.',
 'Effects under £450. -',
 '',
 'ASHBURNER John.',
 'Effects under £3,000.',
 '',
 'and Charles Prince the younger of The Bents',
 'near Tean in the same County Farmer the',
 'Executors,',
 '',
 '2 October. The Will',
 '',
 'of Thomas Ash late of The Limpits in the',
 'Parish of Whitmore in the County of Stafford',
 'Farmer deceased who died 14 April 1847 at',
 'The Limpits aforesaid was proved at Lich-',
 'field by the oath of Isaac Watts, of Stonnall',
 'in the Parish of Shenstone in the said County',
 'Schoolmaster one of the Executors.',
 '',
 '27 February. The Will',
 '',
 'of John Ashbee late of Hawkesbury Upton',
 'in the County of Gloucester Stonemason',
 'deceased who died 2 January 

In [17]:
for p in clean_pages:
    # 1) PAGE METADATA
    ## - Split page metadata from body of page
    page, body = re.split(r'(?<= %s.)' % year, p, 1)
    
    # 2) NAME AND EFFECTS
    ## - Split by pound sign, which denotes split between N&E and biography
    ### If any pound values are in the thousands, remove the comma
    body = body.replace('?<=£\d+),(?=\d+)', '')
    ### Split names and effects from biographical information
    biog_str, name_str = re.split(r'(?r)(?<=£\d+\.)', body, 1)
    ### Split by person
    name_row = re.split('\n\n', name_str)
    ### Split name and effects by compiled string
    r = re.compile('^[A-Z]+(\s[A-Z][a-z]+)+\.')
    names_only = list(filter(r.match, name_row)) # Read Note
    names = [re.split('\n', n)[0] for n in names_only]
    effects = [re.split('\n', n)[1] for n in names_only]

    
    # 3) PROVED DATE
    ## Split proved date from biographical information
    dates_str, biog = re.split(r'\n(?=The Will)', biog_str, 1)
    dates_w_spaces = re.split('\n', dates_str)
    dates = list(filter(None, dates_w_spaces))
    
    # 4) BIOGRAPHY
    biog_split = re.split('\n\n', biog_str)
    biog_filter = re.compile('(^The Will)|Execut')
    biog_subset = list(filter(biog_filter.match, biog_split))
    biog_subset = [b.replace('-\n', '') for b in biog_subset]
    biography = [b.replace('\n', ' ') for b in biog_subset]
    
    if (biography[0].startswith('The Will')) == False:
        bio_from_prev_page = biography[0]
        biography = biography[1:]
    else:
        bio_from_prev_page = None
        
    check = len(biography) ==len(names_final) == len(effects) == len(dates) == len(biography)

    if check == True:
        print('okay')
    else:
        print('oops')
    
    fin = {'page': page, 'names': names, 'date': dates, 'effects': effects, 'biography': biography}
    res = pd.DataFrame(data = fin)
    df.append(res, ignore_index = True)


NameError: name 'names_final' is not defined

In [18]:
# 1) PAGE METADATA
## - Split page metadata from body of page

pages = []
bodies = []
for p in clean_pages[0]:
    page = re.search(r'(\d+ )?(WILLS. %s.)( \d+)?' % year, p).group(0)
    pages.append(page)
    body = re.sub(r'(\d+ )?(WILLS. %s.)( \d+)?' % year, '', p)
    bodies.append(body)



AttributeError: 'NoneType' object has no attribute 'group'

In [19]:
pages = []
bodies = []

for p in clean_pages:
    try:
        page = re.search(r'(\d+ )?((WILLS|ADMINISTRATIONS). %s.)( \d+)?' % year, p).group()
    except:
        try: 
            page = re.search(r'(\d+ )?([A-Z]+. %s.)( \d+)?' % year, p).group()
        except:    
            try:
                page = re.search(r'(\d+ )?( %s.)( \d+)?' % year, p).group()
            except:
                pass
    pages.append(page)

    
# body = re.sub(r'(\d+ )?([A-Z]+. %s.)( \d+)?' % year, '', p).group()    
# bodies.append(body)

pages

['88 WILLS. 1858.',
 'WILLS. 1858. 89',
 '90 WILLS. 1858.',
 'WILLS. 1858. 91',
 'WILLS. 1858.',
 'WILLS. 1858. 93',
 '94 WILLS. 1858.',
 'WILLS. 1858. 95',
 'WILLS. 1858.',
 'WILLS. 1858.',
 '98 WILLS. 1858.',
 'WILLS. 1858. 99',
 '100 WILLS. 1858.',
 'WILLS. 1858. 101',
 'WILLS. 1858. 7',
 'WILLS. 1858. 103',
 '104 WILLS. 1858.',
 'WILLS. 1858. 105',
 '106 WILLS. 1858.',
 'WILLS. 1858. 107',
 'WILLS. 1858.',
 'WILLS. 1858. 109',
 '110 WILLS. 1858.',
 'WILLS. 1858. 111',
 '112 WILLS. 1858.',
 'WILLS. 1858. 113',
 '114 WILLIS. 1858.',
 'WILLS. 1858. 115',
 'WILLS. 1858.',
 'WILLS. 1858. 117',
 '118 WILLS. 1858.',
 'WILLS. 1858. 119',
 'WILLS. 1858.',
 ' 1858 ',
 '122 WILLS 1858.',
 'WILLS. 1858. 123',
 'ADMINISTRATIONS. 1858.',
 'ADMINISTRATIONS. 1858. 125',
 '126 ADMINISTRATIONS. 1858.',
 'ADMINISTRATIONS. 1858. 127',
 '128 ADMINISTRATIONS. 1858.',
 'ADMINISTRATIONS. 1858. 129',
 '130 ADMINISTRATIONS. 1858.',
 'ADMINISTRATIONS. 1858. 131',
 'ADMINISTRATIONS. 1858,',
 'ADMINISTRATIONS.

In [76]:
[i for i in range(len(pages)) if pages[i] == None]
clean_pages[33]

'WILLS.\n\nAYLMER Thomas Brabazon. Eset 4 Aug\nEffects under £40,000.\n\n1858. 121\n\nthe oaths of George Lake Russell of Lincoln’s\nInn in the said County Esquire Barrister-at-\nlaw and Charles Brune Graves Sawle of\n\nRestormel in the County of Cornwall Esquire \n\nthe Executors.\n\nust, The Will\n\nof Thomas Brabazon Aylmer late of Worthing\nin the County of Sussex Esquire a General\nin H. M. Army deceased who died 19 July\n1858 at Worthing aforesaid was proved at the\nPrincipal Registry. by the oaths of\nThomas Brabazon Aylmer of Congresbury in\nthe County of Somerset and George Edward\nAylmer of the United Service Club Pall Mall\nin the County of Middlesex Esquires the Sons\nand the Executors.\n\nAYNSLEY Lionel. 5 November. Letters of Administration\n\nEffects under £20.\n\n(with the Will annexed) of the Personal estate\nand effects of Lionel Aynsley late of Churm\nin the Parish of Longhorsley in the County\nof Northumberland deceased who died\n3 May 1829 at Churm aforesaid left u

In [None]:
# 2) NAME AND EFFECTS
## - Split by pound sign, which denotes split between N&E and biography
### If any pound values are in the thousands, remove the comma
body = body.replace('?<=£\d+),(?=\d+)', '')
### Split names and effects from biographical information
biog_str, name_str = re.split(r'(?r)(?<=£\d+\.)', body, 1)
### Split by person
name_row = re.split('\n\n', name_str)
### Split name and effects by compiled string
r = re.compile('^[A-Z]+(\s[A-Z][a-z]+)+\.')
names_only = list(filter(r.match, name_row)) # Read Note
names = [re.split('\n', n)[0] for n in names_only]
effects = [re.split('\n', n)[1] for n in names_only]

In [None]:
# 3) PROVED DATE
## Split proved date from biographical information
dates_str, biog = re.split(r'\n(?=The Will)', biog_str, 1)
dates_w_spaces = re.split('\n', dates_str)
dates = list(filter(None, dates_w_spaces))

In [None]:
# 4) BIOGRAPHY
biog_split = re.split('\n\n', biog_str)
biog_filter = re.compile('(^The Will)|Execut')
biog_subset = list(filter(biog_filter.match, biog_split))
biog_subset = [b.replace('-\n', '') for b in biog_subset]
biography = [b.replace('\n', ' ') for b in biog_subset]

In [None]:
if (biography[0].startswith('The Will')) == False:
    bio_from_prev_page = biography[0]
    biography = biography[1:]
else:
    bio_from_prev_page = None

check = len(biography) ==len(names_final) == len(effects) == len(dates) == len(biography)

if check == True:
    print('okay')
else:
    print('oops')

In [None]:
#Create empty data frame so that I can store the output from regex
df = pd.DataFrame(columns=['page', 'names', 'date', 'effects', 'biography'])
fin = {'page': page, 'names': names, 'date': dates, 'effects': effects, 'biography': biography}
res = pd.DataFrame(data = fin)
df.append(res, ignore_index = True)