Eligibility requirements are mentioned in page text: some in specialised `Eligibility` sections, others within the context of `smart_answers` or `answer` document_types. This notebook extracts those requirements, with a focus on age requirements.

In [23]:
import os
import pandas as pd
import numpy as np

import re
from ast import literal_eval
from pprint import pprint

import spacy
from textblob import TextBlob

from lxml import etree
from bs4 import BeautifulSoup

from dateutil.parser import parse
import datefinder

## Set up data directories 

In [3]:
# Folder holding the content store export. It is read from the DATA_DIR
# environment variable so that no local path is hard-coded in the notebook.
DATA_DIR = os.getenv("DATA_DIR")
if not DATA_DIR:
    # os.path.join(None, ...) would otherwise fail with an opaque TypeError.
    raise EnvironmentError(
        "The DATA_DIR environment variable is not set; point it at the folder "
        "containing the preprocessed content store export."
    )
content_path = os.path.join(DATA_DIR,
                            "preprocessed_content_store_wdetails_june_en_june.csv.gz")
# The export is a gzip-compressed CSV.
df = pd.read_csv(content_path, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.head()

Unnamed: 0,base_path,content_id,title,description,document_type,details,orgs_id,orgs_title,sbs_details,pages_part_of_step_nav,text,taxons,locale
0,/aaib-reports/aaib-investigation-to-hawker-sea...,96eacfbe-0385-45ef-9289-8428dacad258,"AAIB investigation to Hawker Sea Fury T MK 20,...","Engine failure and landing gear collapse, RNAS...",aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The aircraft was performing in a publ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
1,/aaib-reports/aaib-investigation-to-hph-glasfl...,1d697c99-b1d0-4855-b72d-a97d83a4fc91,"AAIB investigation to HPH Glasflugel 304 eS, G...",Front Electric Sustainer (FES) battery fire du...,aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: During a normal touchdown following a...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
2,/aaib-reports/aaib-investigation-to-ikarus-c42...,5814334a-77d0-426e-8e78-ff2b05ea6322,"AAIB investigation to Ikarus C42 FB UK, G-IKUS\t",Aircraft crashed whilst avoiding a hedge when ...,aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot was attempting to take off ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
3,/aaib-reports/aaib-investigation-to-ikarus-c42...,c8f31c76-eab0-4be6-95a5-5e6e7f32056c,"AAIB investigation to Ikarus C42 FB100, G-CEHG\t","Overturned on landing, Farm Strip, Hardwicke, ...",aaib_report,"{'body': [{'content_type': 'text/govspeak', 'c...",{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: G-CEHG was landing on a private airst...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
4,/aaib-reports/aaib-investigation-to-ikarus-c42...,4cc0ae15-ad87-42ce-8a89-2bdb14e19e26,"AAIB investigation to Ikarus C42 FB100, G-ZAVI","Aircraft struck sheep on landing, Lundy Island...",aaib_report,{'metadata': {'date_of_occurrence': '2014-06-2...,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot reported that this was his ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en


In [5]:
# `details` is stored in the CSV as the string repr of a dict; parse it back.
# Only strings are parsed so that re-running this cell is a no-op: literal_eval
# raises ValueError when it is handed an already-parsed dict.
df['details'] = df['details'].map(lambda x: literal_eval(x) if isinstance(x, str) else x)

In [6]:
def flatten_dict(parts_dict, content_type=None):
    """
    Collapse a page's list of parts into a ``{slug: text}`` mapping.

    :param parts_dict: iterable of part dicts, each with a 'slug' key and a
        'body' list whose entries carry a 'content' string.
    :param content_type: optional <str>. When given, only body entries whose
        'content_type' equals this value are kept (e.g. 'text/html').
        The default ``None`` concatenates every entry, which is the historical
        behaviour; note that this glues all renderings of the same part
        together in one string.
        NOTE(review): assumes part bodies carry a 'content_type' key like the
        `details` body entries shown by `df.head()` - confirm against the data.
    :return: <dict> mapping each part slug to its concatenated content.
    """
    return {
        part['slug']: "".join(
            entry['content']
            for entry in part['body']
            if content_type is None or entry.get('content_type') == content_type
        )
        for part in parts_dict
    }

In [7]:
def _parts_or_nan(details):
    """Flatten the 'parts' of a details dict; NaN when the page has no parts."""
    if 'parts' in details.keys():
        return flatten_dict(details['parts'])
    return np.nan


# Pages without a 'parts' entry get NaN so they can be filtered out later on.
df['details_parts'] = df['details'].map(_parts_or_nan)

In [8]:
def _eligibility_or_nan(parts):
    """Pick the 'eligibility' part out of a flattened parts dict; NaN when absent."""
    if isinstance(parts, float):
        # float means the NaN placeholder written for pages that have no parts
        return np.nan
    return parts.get('eligibility', np.nan)


df['eligibility_html'] = df['details_parts'].map(_eligibility_or_nan)

### Set up dataframe with `base_paths` that include non-null eligibility requirements

In [9]:
# Keep only the pages that have an eligibility part. The explicit copy makes
# df_welig independent of df, so later in-place edits of df_welig cannot alias
# the full frame or trigger chained-assignment warnings.
df_welig = df[df['eligibility_html'].notna()].copy()

In [None]:
# Renumber rows 0..n-1. Reassigning (instead of inplace=True) avoids mutating a
# frame that was produced by filtering df and keeps the cell safe to re-run.
df_welig = df_welig.reset_index(drop=True)

In [10]:
def extract_text(body):
    """
    Pull the visible text out of an html string and normalise its whitespace.

    :param body: <str> containing html.
    :return: <str> the text nodes joined by single spaces, with commas removed;
        a single space ' ' when body is empty/whitespace-only, when parsing
        raises ValueError, or when no text is found.
    """
    blank = ' '
    # Nothing to parse: empty, None, or whitespace-only input.
    if not body or body == "\n" or body.isspace():
        return blank
    try:
        fragments = etree.HTML(body).xpath('//text()')
        joined = ' '.join(fragments).strip()
        # Line breaks, tabs and commas all become plain spaces...
        for unwanted in ('\n', '\r', '\t', ','):
            joined = joined.replace(unwanted, ' ')
        # ...and runs of whitespace collapse to a single space.
        cleaned = ' '.join(joined.split())
    except ValueError:
        print("exception @ extract:", type(body), body)
        return blank
    return cleaned if cleaned else blank

In [51]:
df_welig.base_path.values[0:5]

array(['/1619-bursary-fund', '/additional-state-pension',
       '/access-to-elected-office-fund', '/access-to-work',
       '/attendance-allowance'], dtype=object)

## 1. Regex from previous nb iteration to detect age mentions within listings of eligibility requirements. 
This can now be applied to single detected sentences/bullets

In [43]:
# base_path -> raw eligibility text, for every page that has an eligibility part
elig_texts = {
    path: html
    for path, html in zip(df_welig['base_path'], df_welig['eligibility_html'])
}

In [44]:
# Age-mention pattern. It is assembled from adjacent raw strings rather than one
# triple-quoted literal: without re.VERBOSE a line break inside the literal (and
# the indentation after it) becomes part of the pattern, which silently disabled
# the `age(d)?` alternative.
# Other adjustments to the pattern:
#   * `\b` around age/aged so that words such as "page" or "manage" do not match
#   * both straight and curly apostrophes are accepted in "'re"
#   * optional whitespace between over/under and the number ("are over 18")
# The surrounding `.*` make each match span the whole line holding the mention.
AGE_PATTERN = re.compile(
    r".*("
    r"born on"
    r"|(reached|over|under) State Pension age"
    r"|\bage(d)?\b"
    r"|(['’]re|were|are|be( at least)?|have been)\s?((over|under)\s?)?\d\d"
    r").*"
)
CLOSED_NOTICE = "This fund is currently closed."
ELIGIBILITY_COLUMNS = ['base_path', 'text', 'instance #', 'start', 'end', 'age_text']

rowlist = []
for base_path, page_text in elig_texts.items():
    if CLOSED_NOTICE in page_text:
        rowlist.append({'base_path': base_path, 'age_text': 'Fund closed', 'text': page_text})
        continue

    # re.finditer always returns an iterator (never None), so the matches have
    # to be materialised before "nothing found" can be recognised.
    age_matches = list(AGE_PATTERN.finditer(page_text))
    if not age_matches:
        rowlist.append({'base_path': base_path,
                        'age_text': 'Age requirements not detected',
                        'text': page_text})
        continue

    # One row per matched line; 'instance #' is 1-based within the page.
    for instance, match in enumerate(age_matches, 1):
        rowlist.append({'base_path': base_path,
                        'instance #': instance,
                        'start': match.start(),
                        'end': match.end(),
                        'age_text': match.group(0),
                        'text': page_text})

# reindex (instead of plain column selection) keeps the expected column layout
# even when some columns never occur, e.g. when no page produced a match.
df_eligibility = pd.DataFrame(rowlist).reindex(columns=ELIGIBILITY_COLUMNS)

## 2. Detect date dependencies in requirements text

An example: https://www.gov.uk/1619-bursary-fund/eligibility
       
       Eligibility
       You must:

        + be at least 16 and under 19 on 31 August 2019
        + study at a publicly funded school or college, or be on an unpaid training course
        + meet the residency requirements - your school or college can check this
In the above, the age requirement `at least 16 and under 19` depends on the date that follows it, `on 31 August 2019`. The preposition used (examples include `on|before|by|after|between|from|until|in|up to`) determines the timeframe within which the age requirement applies.

In [46]:
cond = r"(on|before|by|after|between|from|until|in|up to).*\s\d{4}"

In [52]:
text = "this happened before April 2019"
bool(re.search(cond, text))

True

In [41]:
# Count the pages whose eligibility text contains a date dependency, printing
# the first matched span and the dates parsed out of it for each such page.
counter = 0
for idx, page in df_welig.iterrows():
    match = re.search(cond, page['eligibility_html'])
    if match is None:
        continue
    snippet = match.group(0)
    detected_dates = [found.strftime("%d/%m/%Y")
                      for found in datefinder.find_dates(snippet)]
    # The regex alone sometimes captures phone numbers, so a page only counts
    # when datefinder extracts at least one date from the matched span.
    if not detected_dates:
        continue
    counter += 1
    print(idx, ":", page['base_path'])
    print("first:", snippet)
    print(detected_dates)
    print("####")
counter, df_welig.shape

20 : /1619-bursary-fund
first: on 31 August 2019
['31/08/2019']
####
8732 : /additional-state-pension
first: on age on or after 6 April 2016
['06/04/2016']
####
11303 : /bereavement-allowance
first: before 6 April 2017
['06/04/2017']
####
11310 : /bereavement-payment
first: before 6 April 2017
['06/04/2017']
####
11311 : /bereavement-support-payment
first: on or after 6 April 2017
['06/04/2017']
####
11456 : /agricultural-sick-pay
first: only have the right to Agricultural Sick Pay if you were employed before the [rules changed on 1 October 2013
['01/10/2013']
####
11792 : /advanced-learner-loan
first: before 1 August 2016
['01/08/2016']
####
11843 : /ancestry-visa
first: before 31 March 1922
['31/03/1922']
####
11867 : /apply-citizenship-born-uk
first: on or after 1 January 1983
['01/01/1983']
####
11875 : /apply-citizenship-british-parent
first: on or after 1 July 2006
['01/07/2006']
####
12055 : /care-to-learn
first: ing in the UK until 30 June 2021, or 31 December 2020
['31/12/2020

(37, (127, 15))

## 3. Set up `Section` class to contain eligibility sections detected and extracted from page html
Hierarchical reference: `<h2> > <p> > <ul> > un-attached <p>`
An example at https://www.gov.uk/api/content/1619-bursary-fund
#### Section 1
    <p>You must:</p> 
        <ul> 
            <li>be at least 16 and under 19 on 31 August 2019</li> 
            <li>study at a publicly funded school or college, or be on an unpaid training course</li> 
            <li>meet the residency requirements - your school or college can check this</li> 
        </ul> 
#### Section 2
**This section in particular has a not immediately obvious hierarchy of bullet lists and paragraphs**

    <h2 id="bursary-for-students-in-vulnerable-groups">Bursary for students in vulnerable groups</h2> 
        <p>You could get up to £1,200 if at least one of the following applies:</p> 
            <ul> 
                <li>you’re in or recently left local authority care</li> 
                <li>you get Income Support or Universal Credit because you’re financially supporting yourself</li> 
                <li>you get Disability Living Allowance (<abbr title="Disability Living Allowance">DLA</abbr>) in your name and either Employment and Support Allowance (<abbr title="Employment and Support Allowance">ESA</abbr>) or Universal Credit</li> 
                <li>you get Personal Independence Payment (<abbr title="Personal Independence Payment">PIP</abbr>) in your name and either <abbr title="Employment and Support Allowance">ESA</abbr> or Universal Credit</li> 
            </ul> 
        <p>You may get the full amount if you have expenses and study full-time on a course of at least 30 weeks.</p> 
**This is a list of things that actually make an applicant ineligible for a bursary**

        <p>You’ll usually get less than the full amount, or no bursary, if one of the following apply:</p> 
            <ul> 
                <li>your course is shorter than 30 weeks</li> 
                <li>you study part time</li> 
                <li>you have few expenses</li> 
            </ul> 
        <p>You’ll be told what evidence you need, for example benefit letters.</p> 
#### Section 3
    <h2 id="discretionary-bursary">Discretionary bursary</h2> 
        <p>Your school or college will have their own criteria for discretionary bursaries. They’ll look at your individual circumstances - this usually includes your family income.</p> 
        <p>Ask student services about their criteria and any evidence you’ll need.</p> 
        <p>You can apply to a discretionary bursary if you’re over 19 and either:</p> 
            <ul> 
                <li>continuing on a course you started aged 16 to 18 (known as being a ‘19+ continuer’)</li> 
                <li>have an <a href="/children-with-special-educational-needs/extra-SEN-help">Education, Health and Care Plan (EHCP)</a> </li> 
            </ul>

From the above 3 identified sections, extract the bullet lists where applicable and attach them to their introductory paragraph/sentence.

In [16]:
def specialized_scenario(header_string):
    """Return True when the header text contains ' for ' or 'group'."""
    for marker in (' for ', 'group'):
        if marker in header_string:
            return True
    return False

In [17]:
specialized_scenario("group")

True

In [60]:
class Section:
    """
    One section of an eligibility page: a header, the bulleted lists found
    under it and any paragraphs not attached to a list.

    ``bullets`` is a list of ``[list_header_text, is_exclusion, [bullet, ...]]``
    entries, one per bulleted list; ``paragraphs`` is a list of strings.
    """

    # Phrases that present a list as reasons for NOT being eligible. Raw string
    # so that `\s` is a regex class, and both straight and curly apostrophes are
    # accepted (page text uses the curly form, e.g. "can’t").
    _EXCLUSION_PATTERN = re.compile(
        r"not (eligible|qualify)|can(['’]t|not)\s?(get|apply)?")

    # A temporal preposition (whole word) followed later on the same line by a
    # four-digit year.
    _DATE_PATTERN = re.compile(
        r"\b(on|before|by|after|between|from|until|in|up to)\b.*\s\d{4}\b")

    def __init__(self, section_type="", header=""):
        self.section_type = section_type
        self.header = header
        self.bullets = []
        self.paragraphs = []

    def set_type(self, section_type):
        '''
        Base or secondary
        '''
        self.section_type = section_type

    def set_header(self, header):
        '''The text tagged as <h2> or the first <p> of a section.'''
        self.header = header

    def set_list_header(self, paragraph_text):
        '''Start a new bulleted list, introduced by the text preceding the <ul>.'''
        self.bullets.append([paragraph_text,
                             self._is_exclusion(paragraph_text),
                             []])

    def add_bullet(self, bullet):
        '''Add a bullet requirement to the most recent bulleted list.

        Duplicates within that list are ignored. When no list has been started
        yet, a list with an empty header (not an exclusion) is created first
        instead of raising IndexError.
        '''
        if not self.bullets:
            self.bullets.append(["", False, []])
        current_list = self.bullets[-1][2]
        if bullet not in current_list:
            current_list.append(bullet)

    def add_paragraph(self, paragraph):
        '''Add an un-attached paragraph mentioned within section text.
        Probably does not influence any bulleted lists of requirements'''
        if paragraph not in self.paragraphs:
            self.paragraphs.append(paragraph)

    def to_string(self):
        '''Multi-line, human-readable dump of the section.'''
        return "type: {}\nheader: {}\nbullets: {}\nparagraphs: {}"\
                    .format(self.section_type, self.header, self.bullets, self.paragraphs)

    def to_dict(self):
        '''The section as a plain dict (type, header, bullets, paragraphs).'''
        return {'type': self.section_type,
                'header': self.header,
                'bullets': self.bullets,
                'paragraphs': self.paragraphs}

    def _is_exclusion(self, text):
        '''Check whether a list of requirements is posed as anti-eligibility/reasons for rejection.

        True when the text contains one of the exclusion phrases, or when
        TextBlob scores its sentiment polarity below -0.2.
        '''
        return bool(Section._EXCLUSION_PATTERN.search(text)) or \
                        TextBlob(text).sentiment.polarity < -0.2

    def _contains_date(self, text):
        '''Check whether the text ties a requirement to a date, i.e. contains a
        temporal preposition followed by a four-digit year.

        NOTE(review): a later cell of this notebook calls this method but no
        definition was present in the class; this regex-only implementation
        reproduces the result shown for that call - confirm it matches the
        intended behaviour.
        '''
        return bool(Section._DATE_PATTERN.search(text))

In [79]:
class Requirement:
    """
    A single requirement (e.g. one bullet or paragraph) with the tag it came
    from, its text, and a flag telling whether the text mentions an age.
    """

    # Assembled from adjacent raw strings: a triple-quoted literal without
    # re.VERBOSE turns its line break and indentation into part of the pattern,
    # which silently disabled the `age(d)?` alternative.
    # Also: `\b` keeps "page"/"manage" from matching, curly apostrophes are
    # accepted in "'re", and whitespace is allowed between over/under and the
    # number ("be over 18").
    _AGE_PATTERN = re.compile(
        r"born on"
        r"|(reached|over|under) State Pension age"
        r"|\bage(d)?\b"
        r"|(['’]re|were|are|be( at least)?|have been)\s?((over|under)\s?)?\d\d"
    )

    def __init__(self, tag="", text=""):
        self.tag = tag
        self.text = text
        self.is_age = self._contains_age(text)

    def to_string(self):
        """Tab-separated, human-readable dump of the requirement."""
        return f'tag: {self.tag}\ttext: {self.text}\tis_age: {self.is_age}'

    def _contains_age(self, text):
        """Return True when the text contains an age-related phrase."""
        return bool(Requirement._AGE_PATTERN.search(text))

In [84]:
def check_age_range(age_text):
    """
    List the stand-alone two-digit numbers mentioned in a piece of text.

    The word boundaries keep longer numbers from being cut into pairs (a year
    such as 2019 used to yield '20' and '19'). Any other two-digit number, such
    as a day of the month, is still returned.

    :param age_text: <str> text of a requirement.
    :return: <list> of two-character digit strings, in order of appearance.
    """
    return re.findall(r"\b\d{2}\b", age_text)

In [86]:
## test the thing
[s.group() for s in re.finditer("\d{2}","this is one 19 this is other 19")]

['19', '19']

In [80]:
r = Requirement("<li>", "born on 1983")
print(r.to_string())

tag: <li>	text: born on 1983	is_age: True


In [45]:
# matches = datefinder.find_dates("You could get up to £1,200 if at least one of the following")
# for match in matches:
#     print(match)

# Section._contains_date(_,"you were over under 18 on August 2019")

# Section._is_exclusion(_,"You could get up to £1,200 if at least one of the following applies:")
# TextBlob("You could get up to £1,200 if at least one of the following applies:").sentiment.polarity

# samples = ['''You’ll usually get less than the full amount, or no bursary, '''
#             '''if one of the following apply''',
#           'you must', 
#            'If you’re not automatically a citizen, you may be eligible to apply to ‘register’ as one.',
#           'you are not eligible if', 
#            'Who is not eligible',
#            'Who can’t get Care to Learn',
#            'You won’t be paid for the first 3 days off sick unless you’re away for longer than 14 working days in total.']
# for s in samples:
#     print(s)
#     print(TextBlob(s).sentiment.polarity)
#     print(Section._is_exclusion(_,s))
#     print("###")

In [26]:
section = Section()
section.set_list_header("you must")
section.add_bullet("womp")
section.set_list_header("wat")
section.add_bullet("womp")
section.bullets

[['you must', False, ['womp']], ['wat', False, ['womp']]]

In [55]:
Section._contains_date(_, "be at least 16 and under 19 on 31 August 2019")

True

In [27]:
section.to_dict()

{'type': '',
 'header': '',
 'bullets': [['you must', False, ['womp']], ['wat', False, ['womp']]],
 'paragraphs': []}

## 4. BeautifulSoup attempt to get all the bulleted lists/headers out, split into sections 
**TODO: Include age + date dependency detection in this extract step as well (probably within the `Section` class)**

In [13]:
test = df_welig.iloc[0]
text = test.eligibility_html
soup = BeautifulSoup(text, "html.parser")

In [14]:
test.base_path

'/1619-bursary-fund'

In [15]:
text[0:10]

'You must:\r'

    if header -> new scenario
    first header (overall?)
    other headers -> secondary scenarios, unless first header mentions a requirement disputed by scenario, then it's a different class

In [29]:
def get_type(header):
    """Map a tag name to a section type: 'base' for "p", 'secondary' otherwise."""
    return "base" if header == "p" else "secondary"

In [54]:
# Split every eligibility page into Section objects.
# all_stacks collects one [base_path, [Section, ...]] entry per page.
all_stacks = []
for idx, row in df_welig.iterrows():
    print("https://www.gov.uk"+row['base_path']+"/eligibility")
    soup = BeautifulSoup(row['eligibility_html'], "html.parser")
    # headings, paragraphs and lists, in document order
    items = soup.find_all(["h1", "h2", "h3", "p", "ul"])

    if not items:
        # No heading/paragraph/list tags at all: record the page with no
        # sections instead of failing on items[0].
        print("####")
        all_stacks.append([row['base_path'], []])
        continue

    # The first element on the eligibility page opens the first section; when
    # a list follows it directly, it is also that list's introduction.
    first_text = items[0].text.strip()
    section_stack = [Section(get_type(items[0].name), first_text)]
    if len(items) > 1 and items[1].name == "ul":
        section_stack[-1].set_list_header(first_text)

    for j, element in enumerate(items[1:], 1):
        print(f'at index: {j} type: <{element.name}>\nwords: {element.text}')
        element_text = element.text.strip()
        # True when the next element is a bulleted list, i.e. this element
        # introduces that list.
        introduces_list = j + 1 < len(items) and items[j + 1].name == "ul"

        if element.name.startswith("h"):
            # every heading opens a new (secondary) section
            section_stack.append(Section("secondary", element_text))
            if introduces_list:
                section_stack[-1].set_list_header(element_text)
        elif element.name == "p":
            if introduces_list:
                section_stack[-1].set_list_header(element_text)
            else:
                section_stack[-1].add_paragraph(element_text)
        else:
            # <ul>: its items go to the most recently opened list
            for bullet in element.find_all(["li"]):
                section_stack[-1].add_bullet(bullet.text.strip())
    print("####")
    all_stacks.append([row['base_path'], section_stack])

https://www.gov.uk/1619-bursary-fund/eligibility
at index: 1 type: <ul>
words: 
be at least 16 and under 19 on 31 August 2019
study at a publicly funded school or college, or be on an unpaid training course
meet the residency requirements - your school or college can check this

at index: 2 type: <h2>
words: Bursary for students in vulnerable groups
at index: 3 type: <p>
words: You could get up to £1,200 if at least one of the following applies:
at index: 4 type: <ul>
words: 
you’re in or recently left local authority care
you get Income Support or Universal Credit because you’re financially supporting yourself
you get Disability Living Allowance (DLA) in your name and either Employment and Support Allowance (ESA) or Universal Credit
you get Personal Independence Payment (PIP) in your name and either ESA or Universal Credit

at index: 5 type: <p>
words: You may get the full amount if you have expenses and study full-time on a course of at least 30 weeks.
at index: 6 type: <p>
words: Yo

https://www.gov.uk/diffuse-mesothelioma-payment/eligibility
at index: 1 type: <p>
words: You can claim a one-off payment if you:
at index: 2 type: <ul>
words: 
are not entitled to a payment under the 1979 Pneumoconiosis Act
have not been given a payment for the disease from an employer, a civil claim or elsewhere
are not entitled to compensation from a Ministry of Defence scheme

at index: 3 type: <p>
words: You must have been exposed to asbestos in the United Kingdom.
at index: 4 type: <p>
words: Examples of exposure include:
at index: 5 type: <ul>
words: 
you came into contact with asbestos from a relative, for instance by washing their clothes
you were exposed to asbestos in the environment, for instance you lived near a factory using asbestos
you were exposed to asbestos while self-employed
your exposure cannot be specified but it occurred in the United Kingdom

at index: 6 type: <p>
words: You must claim within 12 months of diagnosis.
at index: 7 type: <h2>
words: Diffuse Mesothel

words: You can still claim Married Couple’s Allowance if you’re unable to live with your spouse or civil partner because of:
at index: 3 type: <ul>
words: 
illness or old age, for example where your spouse or partner is in residential care
working away from home
an armed forces posting
being in prison
training or education

at index: 4 type: <p>
words: Use the Married Couple’s Allowance calculator to work out what you could get.
####
https://www.gov.uk/masters-loan/eligibility
at index: 1 type: <ul>
words: 
your course
your age
your nationality or residency status

at index: 2 type: <p>
words: You will not be able to get a Postgraduate Master’s Loan if:
at index: 3 type: <ul>
words: 
you’re already getting payments from student finance for another course that you’re studying
you received a Postgraduate Master’s Loan before - unless you left your course due to illness, bereavement or another serious personal reason
you already have a master’s degree, or a qualification that’s equivalent

https://www.gov.uk/state-pension/eligibility
at index: 1 type: <ul>
words: 
6 April 1951 if you’re a man
6 April 1953 if you’re a woman

at index: 2 type: <p>
words: If you were born on or after these dates you must claim the new State Pension.
at index: 3 type: <p>
words: The earliest you can get the basic State Pension is when you reach State Pension age.
at index: 4 type: <p>
words: To get the full basic State Pension you need a total of 30 qualifying years of National Insurance contributions or credits. This means you were either:
at index: 5 type: <ul>
words: 
working and paying National Insurance

getting National Insurance Credits, for example for unemployment, sickness or as a parent or carer
paying voluntary National Insurance contributions


at index: 6 type: <p>
words: If you have fewer than 30 qualifying years, your basic State Pension will be less than £129.20 per week but you might be able to top up by paying voluntary National Insurance contributions.
at index: 7 type: <

at index: 1 type: <ul>
words: 
be a Turkish national
have legally worked in the UK for at least 1 year

at index: 2 type: <p>
words: You must have worked for the same employer for the period you’re quoting in your application.
at index: 3 type: <p>
words: Your application may be refused if you’ve broken any immigration laws in the UK. Instead, you may need to apply for a working visa.
####
https://www.gov.uk/universal-credit/eligibility
at index: 1 type: <ul>
words: 
you’re on a low income or out of work
you’re 18 or over (there are some exceptions if you’re 16 to 17)
you’re under State Pension age (or your partner is)
you and your partner have £16,000 or less in savings between you
you live in the UK

at index: 2 type: <p>
words: The number of children you have does not affect your eligibility for Universal Credit, but it may affect how much you get.
at index: 3 type: <p>
words: Use a benefits calculator to check what benefits you could get if you’re not eligible for Universal Credit.

In [None]:
# Dump every page's sections: the base path first, then each section followed
# by a "###" separator, and a closing "###" plus blank line per page.
for base_path, page_sections in all_stacks:
    print(base_path)
    for page_section in page_sections:
        print(page_section.to_string())
        print("###")
    print("###\n")