Eligibility requirements are mentioned in page text: some in dedicated `Eligibility` sections, others within the context of `smart_answers` or `answer` document_types. This notebook extracts those requirements, with a focus on age requirements.

In [1]:
import os
import pandas as pd
import numpy as np

import re
from ast import literal_eval
from pprint import pprint

import spacy
from textblob import TextBlob

from lxml import etree
from bs4 import BeautifulSoup

from dateutil.parser import parse
import datefinder

## Set up data directories 

In [2]:
# Root directory for data files, read from the environment so that no local
# path is hardcoded in the notebook.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # os.getenv returns None for an unset variable; fail here with a clear
    # message rather than with an opaque TypeError from os.path.join below.
    raise EnvironmentError("DATA_DIR environment variable is not set")
content_path = os.path.join(DATA_DIR, 
                            "preprocessed_content_store_wdetails_june_en_june.csv.gz")
# Load the gzipped content store export into a DataFrame.
df = pd.read_csv(content_path, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the loaded content store.
df.head()

Unnamed: 0,base_path,content_id,title,description,document_type,details,orgs_id,orgs_title,sbs_details,pages_part_of_step_nav,text,taxons,locale
0,/aaib-reports/aaib-investigation-to-hawker-sea...,96eacfbe-0385-45ef-9289-8428dacad258,"AAIB investigation to Hawker Sea Fury T MK 20,...","Engine failure and landing gear collapse, RNAS...",aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The aircraft was performing in a publ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
1,/aaib-reports/aaib-investigation-to-hph-glasfl...,1d697c99-b1d0-4855-b72d-a97d83a4fc91,"AAIB investigation to HPH Glasflugel 304 eS, G...",Front Electric Sustainer (FES) battery fire du...,aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: During a normal touchdown following a...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
2,/aaib-reports/aaib-investigation-to-ikarus-c42...,5814334a-77d0-426e-8e78-ff2b05ea6322,"AAIB investigation to Ikarus C42 FB UK, G-IKUS\t",Aircraft crashed whilst avoiding a hedge when ...,aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot was attempting to take off ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
3,/aaib-reports/aaib-investigation-to-ikarus-c42...,c8f31c76-eab0-4be6-95a5-5e6e7f32056c,"AAIB investigation to Ikarus C42 FB100, G-CEHG\t","Overturned on landing, Farm Strip, Hardwicke, ...",aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: G-CEHG was landing on a private airst...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
4,/aaib-reports/aaib-investigation-to-ikarus-c42...,4cc0ae15-ad87-42ce-8a89-2bdb14e19e26,"AAIB investigation to Ikarus C42 FB100, G-ZAVI","Aircraft struck sheep on landing, Lundy Island...",aaib_report,{'metadata': {'date_of_occurrence': '2014-06-2...,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot reported that this was his ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en


In [4]:
# Parse the stringified `details` values back into Python objects.
# Values that are not strings are passed through untouched, so re-running this
# cell on an already-parsed column is a no-op instead of a ValueError raised by
# literal_eval when it is handed a dict.
df['details'] = df['details'].map(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)

In [5]:
def flatten_dict(parts_dict, content_type=None):
    """
    Collapse a page's list of parts into a {slug: content} mapping.
    :param parts_dict: iterable of part dicts, each holding a 'slug' and a
        'body' list whose entries carry a 'content' string.
    :param content_type: optional <str>. When given, only body entries whose
        'content_type' equals it are joined; the default (None) joins every
        entry, as before.
    :return: <dict> mapping each part's slug to its concatenated content.

    NOTE(review): body entries appear to hold the same text in more than one
    content type (e.g. 'text/govspeak' and 'text/html'), which would explain
    matches being found twice per page further down - confirm, then consider
    passing content_type='text/html' at the call site.
    """
    return {
        part['slug']: "".join(
            entry['content']
            for entry in part['body']
            if content_type is None or entry.get('content_type') == content_type
        )
        for part in parts_dict
    }

In [6]:
# Pages whose details hold a 'parts' list get a {slug: content} dict;
# every other page gets NaN.
df['details_parts'] = df['details'].map(
    lambda details: flatten_dict(details['parts']) if 'parts' in details else np.nan
)

In [7]:
# Pull out the 'eligibility' part where a page has one. Float entries are the
# NaN placeholders for pages without parts and stay NaN.
df['eligibility_html'] = df['details_parts'].map(
    lambda parts: np.nan if isinstance(parts, float) else parts.get('eligibility', np.nan)
)

### Set up dataframe with `base_paths` that include non-null eligibility requirements

In [8]:
# Keep only the pages that actually have an eligibility part.
df_welig = df[df['eligibility_html'].notna()]

In [9]:
# Renumber the rows 0..n-1 after filtering, discarding the old index.
df_welig = df_welig.reset_index(drop=True)

In [10]:
def extract_text(body):
    """
    Flatten an html fragment into one line of plain text.
    :param body: <str> containing html.
    :return: <str> the text nodes joined by single spaces, with commas removed
        and whitespace runs collapsed; a single space when there is nothing
        to extract.
    """
    blank = ' '
    # Empty or whitespace-only input has nothing to parse.
    if not body or body.isspace():
        return blank
    try:
        text_nodes = etree.HTML(body).xpath('//text()')
    except ValueError:
        print("exception @ extract:", type(body), body)
        return blank
    # Commas become spaces; split()/join then collapses every whitespace run
    # (newlines, carriage returns, tabs) and trims both ends.
    flattened = ' '.join(' '.join(text_nodes).replace(',', ' ').split())
    return flattened or blank

In [11]:
# First five base paths among the pages that have an eligibility part.
df_welig.base_path.values[0:5]

array(['/1619-bursary-fund', '/additional-state-pension',
       '/access-to-elected-office-fund', '/access-to-work',
       '/attendance-allowance'], dtype=object)

## 1. Regex from the previous notebook iteration to detect age mentions within listings of eligibility requirements
The same regex can now be applied to individual detected sentences/bullets.

In [12]:
# Map each page's base_path to the raw text of its eligibility part.
elig_texts = dict(zip(df_welig['base_path'], df_welig['eligibility_html']))

In [13]:
# Age-mention pattern. "." does not cross newlines, so each match is the whole
# line that contains the age phrase.
# The pattern is assembled from adjacent raw strings so that the line break in
# the source is not part of the regex (previously the newline and indentation
# were, which made the "age(d)" alternative unmatchable). Other fixes:
#   * \b around age(d) so that "wage", "page", "manage" ... do not count;
#   * optional whitespace between over/under and the digits ("are over 18");
#   * the typographic apostrophe used in page text ("you’re") is accepted.
AGE_PATTERN = re.compile(
    r".*(born on|(reached|over|under) State Pension age|"
    r"\bage(d)?\b|(['’]re|were|are|be( at least)?|have been)\s?(over|under)?\s?\d\d).*"
)
FUND_CLOSED_NOTICE = "This fund is currently closed."

# One row per age mention; pages that are closed or have no age mention get a
# single placeholder row so that every page is represented.
rowlist = []
for key, value in elig_texts.items():
    if FUND_CLOSED_NOTICE in value:
        rowlist.append({'base_path':key, 'age_text':'Fund closed','text':value})
        continue

    # re.finditer always returns an iterator (never None), so the matches are
    # materialised to tell "no age mention" apart from "some age mentions".
    age_matches = list(AGE_PATTERN.finditer(value))
    if not age_matches:
        rowlist.append({'base_path':key, 'age_text':'Age requirements not detected',
                        'text':value})
        continue

    for instance, match in enumerate(age_matches, 1):
        rowlist.append({'base_path':key,
                        'instance #':instance,
                        'start':match.start(),
                        'end':match.end(),
                        'age_text':match.group(0),
                        'text':value})

# Passing `columns` fixes the column order and keeps the frame well-formed even
# when no row carries the per-instance keys.
df_eligibility = pd.DataFrame(
    rowlist,
    columns=['base_path', 'text','instance #', 'start', 'end','age_text'])

## 2. Detect date dependencies in requirements text

An example: https://www.gov.uk/1619-bursary-fund/eligibility
       
       Eligibility
       You must:

        + be at least 16 and under 19 on 31 August 2019
        + study at a publicly funded school or college, or be on an unpaid training course
        + meet the residency requirements - your school or college can check this
In the above, the age requirement `at least 16 and under 19` depends on the date that follows it, `on 31 August 2019`. The preposition used (examples include `on|before|by|after|between|from|until|in|up to`) determines the timeframe within which the age requirement applies.

In [71]:
# Date-dependency pattern: whitespace, a temporal preposition ("on",
# "on or after", "on or before", "before", "by", ...), whitespace, up to 20
# arbitrary characters, whitespace, then four digits (taken to be the year).
cond = r"\s(on( or (after|before))?|before|by|after|between|from|until|in|up to)\s.{0,20}\s\d{4}"

In [72]:
# Quick check of `cond` against a sentence holding a date-dependent phrase.
text = "this happened before April 2019"
bool(re.search(cond, text))

True

In [73]:
# Length of a "day month" string - presumably used to size the .{0,20} window
# in `cond`; confirm.
len("1 October")

9

In [79]:
# Count (and print) the pages whose eligibility text holds a date-dependent
# phrase, listing every place the pattern fires.
counter = 0
for i, row in df_welig.iterrows():
    text = row['eligibility_html']
    first_hit = re.search(cond, text)
    if first_hit is None:
        continue
    # The pattern alone sometimes captures phone numbers, so a page is only
    # kept when datefinder confirms a real date inside the first hit.
    detected_dates = [found.strftime("%d/%m/%Y")
                      for found in datefinder.find_dates(first_hit.group(0))]
    if not detected_dates:
        continue
    print(i,":",row['base_path'])
    for hit_no, hit in enumerate(re.finditer(cond, text)):
        print(hit_no, hit.group(), hit.start(), hit.end())
    counter += 1
    print(detected_dates)
    print("####")
counter, df_welig.shape

0 : /1619-bursary-fund
0  on 31 August 2019 42 60
1  on 31 August 2019 1805 1823
['31/08/2019']
####
1 : /additional-state-pension
0  on or after 6 April 2016 31 56
1  on or after 6 April 2016 163 188
2  before 6 April 2016 281 301
3  before 6 April 2016 338 358
4  before 6 April 2016 1237 1257
5  between 6 April 2002 1392 1413
6  in the 2015 to 2016 1518 1538
7  on or after 6 April 2016 1988 2013
8  on or after 6 April 2016 2137 2162
9  before 6 April 2016 2324 2344
10  before 6 April 2016 2386 2406
11  before 6 April 2016 3705 3725
12  between 6 April 2002 3941 3962
13  in the 2015 to 2016 4078 4098
['06/04/2016']
####
6 : /bereavement-allowance
0  before 6 April 2017 107 127
1  on or after 6 April 2017 444 469
2  before 6 April 2017 2051 2071
3  on or after 6 April 2017 2517 2542
['06/04/2017']
####
7 : /bereavement-payment
0  before 6 April 2017 88 108
1  on or after 6 April 2017 197 222
2  before 6 April 2017 2054 2074
3  on or after 6 April 2017 2249 2274
['06/04/2017']
####
8 : 

(38, (127, 15))

## 3. Set up `Section` class to contain eligibility sections detected and extracted from page html
Hierarchical reference: `<h2> > <p> > <ul> > un-attached <p>`
An example at https://www.gov.uk/api/content/1619-bursary-fund
#### Section 1
    <p>You must:</p> 
        <ul> 
            <li>be at least 16 and under 19 on 31 August 2019</li> 
            <li>study at a publicly funded school or college, or be on an unpaid training course</li> 
            <li>meet the residency requirements - your school or college can check this</li> 
        </ul> 
#### Section 2
**This section in particular has a not immediately obvious hierarchy of bullet lists and paragraphs**

    <h2 id="bursary-for-students-in-vulnerable-groups">Bursary for students in vulnerable groups</h2> 
        <p>You could get up to £1,200 if at least one of the following applies:</p> 
            <ul> 
                <li>you’re in or recently left local authority care</li> 
                <li>you get Income Support or Universal Credit because you’re financially supporting yourself</li> 
                <li>you get Disability Living Allowance (<abbr title="Disability Living Allowance">DLA</abbr>) in your name and either Employment and Support Allowance (<abbr title="Employment and Support Allowance">ESA</abbr>) or Universal Credit</li> 
                <li>you get Personal Independence Payment (<abbr title="Personal Independence Payment">PIP</abbr>) in your name and either <abbr title="Employment and Support Allowance">ESA</abbr> or Universal Credit</li> 
            </ul> 
        <p>You may get the full amount if you have expenses and study full-time on a course of at least 30 weeks.</p> 
**This is a list of things that actually make an applicant ineligible for a bursary**

        <p>You’ll usually get less than the full amount, or no bursary, if one of the following apply:</p> 
            <ul> 
                <li>your course is shorter than 30 weeks</li> 
                <li>you study part time</li> 
                <li>you have few expenses</li> 
            </ul> 
        <p>You’ll be told what evidence you need, for example benefit letters.</p> 
#### Section 3
    <h2 id="discretionary-bursary">Discretionary bursary</h2> 
        <p>Your school or college will have their own criteria for discretionary bursaries. They’ll look at your individual circumstances - this usually includes your family income.</p> 
        <p>Ask student services about their criteria and any evidence you’ll need.</p> 
        <p>You can apply to a discretionary bursary if you’re over 19 and either:</p> 
            <ul> 
                <li>continuing on a course you started aged 16 to 18 (known as being a ‘19+ continuer’)</li> 
                <li>have an <a href="/children-with-special-educational-needs/extra-SEN-help">Education, Health and Care Plan (EHCP)</a> </li> 
            </ul>

From the 3 sections identified above, extract the bullet lists where applicable and attach them to their introductory paragraph/sentence.

In [17]:
def specialized_scenario(header_string):
    """
    Tell whether a header contains one of the marker substrings
    ' for ' or 'group'.
    :param header_string: <str> header text to inspect.
    :return: <bool> True when at least one marker occurs in the header.
    """
    markers = (' for ', 'group')
    return any(marker in header_string for marker in markers)

In [18]:
# "group" is one of the marker substrings, so this returns True.
specialized_scenario("group")

True

In [19]:
class Section:
    '''
    One eligibility section extracted from a page: its header, its bulleted
    lists of requirements and any un-attached paragraphs.

    `bullets` holds one `[list_header, is_exclusion, [bullet, ...]]` entry per
    bulleted list, in the order the lists were opened.
    '''

    # Phrases that mark a list as reasons for NOT being eligible. Both the
    # straight and the typographic apostrophe are accepted, since page text
    # uses "’" (e.g. "you’re").
    _EXCLUSION_PATTERN = re.compile(
        r"not (eligible|qualify)|can(['’]t|not)\s?(get|apply)?")
    # A TextBlob polarity below this value also marks a list as an exclusion.
    _NEGATIVE_POLARITY = -0.2
    
    def __init__(self, section_type="", header=""):
        self.section_type = section_type
        self.header = header
        self.bullets = []
        self.paragraphs = []
    
    def set_type(self, section_type):
        '''
        Base or secondary
        '''
        self.section_type = section_type
        
    def set_header(self, header):
        '''The text tagged as <h2> or the first <p> of a section.'''
        self.header = header
    
    def set_list_header(self, paragraph_text):
        '''Open a new bulleted list introduced by `paragraph_text`, the text
        tagged as <p> preceding a <ul> bulleted list.'''
        self.bullets.append([paragraph_text,
                             self._is_exclusion(paragraph_text),
                             []])
        
    def add_bullet(self, bullet):
        '''Add a bullet requirement to the most recent bulleted list.

        A bullet that arrives before any list header opens an unnamed list
        (empty header, not an exclusion) instead of raising IndexError.
        Duplicate bullets within one list are ignored.'''
        if not self.bullets:
            self.bullets.append(["", False, []])
        current_list = self.bullets[-1][2]
        if bullet not in current_list:
            current_list.append(bullet)
            
    def add_paragraph(self, paragraph):
        '''Add an un-attached paragraph mentioned within section text.
        Probably does not influence any bulleted lists of requirements.
        Duplicate paragraphs are ignored.'''
        if paragraph not in self.paragraphs:
            self.paragraphs.append(paragraph)   
            
    def to_string(self):
        '''Multi-line, human-readable dump of the section.'''
        return "type: {}\nheader: {}\nbullets: {}\nparagraphs: {}"\
                    .format(self.section_type, self.header, self.bullets, self.paragraphs)
    
    def to_dict(self):
        '''The section as a plain dict with keys type/header/bullets/paragraphs.'''
        return {'type': self.section_type, 
                'header': self.header,
                'bullets': self.bullets,
                'paragraphs': self.paragraphs}
    
    def _is_exclusion(self,text):
        '''Check whether a list of requirements is posed as anti-eligibility/reasons for rejection.

        True when the text contains an exclusion phrase, or failing that when
        its TextBlob sentiment polarity is below the negative threshold.'''
        if self._EXCLUSION_PATTERN.search(text):
            return True
        return TextBlob(text).sentiment.polarity < self._NEGATIVE_POLARITY

In [27]:
class Requirement:
    '''
    A single requirement (the text of one tag) with age detection.

    `is_age` is computed on construction; `min_age` / `max_age` stay None
    until `_extract_age_range` is called.
    '''

    # Age-mention pattern, built from adjacent raw strings so that no line
    # break or indentation leaks into the regex. \b keeps "wage"/"page" from
    # matching, whitespace is allowed between over/under and the digits, and
    # the typographic apostrophe used in page text ("you’re") is accepted.
    _AGE_PATTERN = re.compile(
        r".*(born on|(reached|over|under) State Pension age|"
        r"\bage(d)?\b|(['’]re|were|are|be( at least)?|have been)\s?(over|under)?\s?\d\d).*"
    )
    # Temporal preposition, up to 20 characters, then a four-digit year.
    _DATE_PATTERN = re.compile(
        r"\s(on( or (after|before))?|before|by|after|between|from|until|in|up to)"
        r"\s.{0,20}\s\d{4}"
    )
    # Wording that makes a lone age an upper / lower bound.
    _UPPER_BOUND = re.compile(r"\b(under|below)\b")
    _LOWER_BOUND = re.compile(r"\b(over|above|at least|reached|have been)\b")
    
    def __init__(self, tag="", text=""):
        self.tag = tag
        self.text = text
        self.min_age = None
        self.max_age = None
        self.is_age = self._contains_age(text)
    
    def to_string(self):
        '''Tab-separated, human-readable dump of the requirement.'''
        return f'tag: {self.tag}\ttext: {self.text}\tis_age: {self.is_age}'
    
    def _contains_age(self, text):
        '''True when `text` contains an age phrase.'''
        return bool(self._AGE_PATTERN.search(text))
    
    def _detect_date_dependency(self, requirement_text):
        '''Return the dates (as dd/mm/YYYY strings) that `requirement_text`
        depends on, e.g. "on 31 August 2019".

        The list is empty when there is no preposition + year phrase, or when
        datefinder finds no real date inside it (the pattern sometimes
        captures phone numbers).'''
        detected_dates = []
        for match in self._DATE_PATTERN.finditer(requirement_text):
            detected_dates.extend(found.strftime("%d/%m/%Y")
                                  for found in datefinder.find_dates(match.group(0)))
        return detected_dates
    
    def _resolve_prepositions(self, text):
        '''Classify the bound expressed by the wording around a lone age.

        Returns 'max' for under/below, 'min' for over/above/at least/reached/
        have been, and None when neither wording is present.'''
        if self._UPPER_BOUND.search(text):
            return 'max'
        if self._LOWER_BOUND.search(text):
            return 'min'
        return None
    
    def _extract_age_range(self, age_text):
        '''Set `min_age` / `max_age` (two-digit strings, or None) from `age_text`.

        Two ages: the smaller is the minimum and the larger the maximum.
        One age: the surrounding wording decides which bound it is.
        Any other count leaves both bounds as None.'''
        ages = re.findall(r"\d{2}", age_text)
        self.min_age = None
        self.max_age = None
        if len(ages) == 2:
            self.min_age, self.max_age = sorted(ages, key=int)
        elif len(ages) == 1:
            bound = self._resolve_prepositions(age_text)
            if bound == 'min':
                self.min_age = ages[0]
            elif bound == 'max':
                self.max_age = ages[0]

In [21]:
def extract_age_range(age_text):
    """
    List every two-digit number found in the text, in order of appearance.
    :param age_text: <str> text expected to mention one or more ages.
    :return: <list> of two-character digit strings (empty if none found).
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return re.findall(r"\d{2}", age_text)

**TODO:** At this point we need to account for the various ways an age may be defined, given final output will be in a `min_age` and `max_age` format

1. Two ages, leftmost = min, rightmost = max. Ensure that min < max
2. Age may be stated as "state pension age"; account for this (unfortunately this value changes over time)
3. Phrasing:
        1. Over, above, have been, at least, reached? = min
        2. Under, below = max

In [22]:
## Two-age happy path, reusing extract_age_range instead of repeating its regex.
## The unpacking raises ValueError unless exactly two ages are found, so a
## length check is still needed for the general case.
min_age, max_age = extract_age_range("this is one 19 this is other 20")
min_age, max_age

('19', '20')

In [23]:
# Smoke test: "born on" is one of the age phrases, so is_age should be True.
r = Requirement("<li>", "born on 1983")
print(r.to_string())

tag: <li>	text: born on 1983	is_age: True


In [25]:
# Two lists opened in turn: each add_bullet call lands in the most recently
# opened list, so the same bullet text can appear once per list.
section = Section()
section.set_list_header("you must")
section.add_bullet("womp")
section.set_list_header("wat")
section.add_bullet("womp")
section.bullets

[['you must', False, ['womp']], ['wat', False, ['womp']]]

In [26]:
# Check that the date-dependency pattern `cond` fires on a sample requirement.
# (Section has no `_contains_date` method, and IPython's `_` must not be
# relied on in a saved notebook.)
bool(re.search(cond, "be at least 16 and under 19 on 31 August 2019"))

True

In [None]:
# Dict form of the demo `section` object.
section.to_dict()

## 4. BeautifulSoup attempt to get all the bulleted lists/headers out, split into sections 
**TODO: Include age + date dependency detection in this extract step as well (probably within the `Section` class)**

In [None]:
# Parse the first page's eligibility text with BeautifulSoup for manual inspection.
test = df_welig.iloc[0]
text = test.eligibility_html
soup = BeautifulSoup(text, "html.parser")

In [None]:
# base_path of the sample page.
test.base_path

In [None]:
# First ten characters of the sample page's eligibility text.
text[0:10]

    if header -> new scenario
    first header (overall?)
    other headers -> secondary scenarios, unless first header mentions a requirement disputed by scenario, then it's a different class

In [None]:
def get_type(header):
    """
    Map a tag name to a section type.
    :param header: <str> tag name of the element that opens the section.
    :return: <str> "base" for a "p" tag, "secondary" for anything else.
    """
    return "base" if header == "p" else "secondary"

In [None]:
# Walk each page's headers, paragraphs and lists in document order and group
# them into Section objects. all_stacks holds one [base_path, [Section, ...]]
# entry per page.
all_stacks = []
for row_idx, row in df_welig.iterrows():
    soup = BeautifulSoup(row['eligibility_html'], "html.parser")
    print("https://www.gov.uk"+row['base_path']+"/eligibility")
    items = soup.find_all(["h1","h2","h3", "p", "ul"])

    section_stack = []
    # A page may contain none of the tags above; it then keeps an empty stack
    # instead of raising IndexError on items[0].
    if items:
        # first element on eligibility page
        first_text = items[0].text.strip()
        section_stack.append(Section(get_type(items[0].name), first_text))
        if len(items)>1 and items[1].name == "ul":
            section_stack[-1].set_list_header(first_text)

    for j, element in enumerate(items[1:],1):
        print(f'at index: {j} type: <{element.name}>\nwords: {element.text}')
        element_text = element.text.strip()
        # An element directly followed by a <ul> introduces that list.
        introduces_list = j+1 < len(items) and items[j+1].name == "ul"

        if element.name.startswith("h"):
            # Every header opens a new secondary section.
            section_stack.append(Section("secondary", element_text))
            if introduces_list:
                section_stack[-1].set_list_header(element_text)

        elif element.name == "p":
            if introduces_list:
                section_stack[-1].set_list_header(element_text)
            else:
                section_stack[-1].add_paragraph(element_text)
        else:
            current = section_stack[-1]
            # A <ul> with no introducing element would otherwise reach
            # add_bullet with no list to append to; open an unnamed one.
            if not current.bullets:
                current.set_list_header("")
            for bullet in element.find_all(["li"]):
                current.add_bullet(bullet.text.strip())
    print("####")
    all_stacks.append([row['base_path'],section_stack])

In [None]:
# Dump every extracted section, page by page.
for base_path, sections in all_stacks:
    print(base_path)
    for extracted in sections:
        print(extracted.to_string(), "###", sep="\n")
    print("###\n")