In [1]:
import pandas as pd
import numpy as np
import pickle, logging, spacy
import matplotlib.pyplot as plt
from helpers.classes import Collection
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json

In [None]:
# Restore the scraped bulletin collection from its pickled snapshot.
# (A previously used snapshot: pickles/collection_20210622_220215.pkl)
# NOTE: pickle.load executes code embedded in the file; only load trusted snapshots.
with open('pickles/collection_20210624_194932.pkl', 'rb') as snapshot_file:
    collection = pickle.load(snapshot_file)

In [3]:
# bulletin_names: every bulletin gathered by the scraper, in the collection's own order
bulletin_names = [*collection.bulletins.keys()]

# target_bulletins: the subset that has at least one main point
# and at least one related dataset
target_bulletins = [
    name
    for name in bulletin_names
    if len(collection.bulletins.get(name).get('main-points')) > 0
    and len(collection.bulletins.get(name).get('related-datasets')) > 0
]

In [4]:
# Peek at the stored record of the first target bulletin to check its structure.
collection.bulletins.get(target_bulletins[0])

{'main-points': ['In 2019, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be £1,313.9 billion; an increase of £42.8 billion (3.4%) compared with 2018.',
  'The non-financial services sector, which accounted for over half (56.7%) of total aGVA in 2019, increased by £25 billion (3.5%) to £744.4 billion; transport and storage saw the highest increase in aGVA growth at £7.2 billion (8.6%) increasing from £84.4 billion to £91.6 billion.',
  'Total turnover and purchases of the UK non-financial business economy were estimated to be £4,101.5 billion and £2,761.4 billion respectively; an increase of £70.5 billion (1.7%) and £25.8 billion (0.9%) compared with 2018.',
  'Out of the 12 UK regions, 8 regions experienced growth in aGVA; the South East experienced the largest increase in aGVA of £17.3 billion to £214.4 billion, which was as increase of 8.8% in 2019.',
  'West Midlands, Yorkshire and The Humber, Scotland and East Midland

In [192]:
# THE FOLLOWING WAS ADDED IN THE
# CORE FUNCTIONALITY IN
# helpers/classes.py
#
# For every bulletin, run NER over each main point and split the points into
#   - points mentioning both a DATE and a PERCENT entity, and
#   - points mentioning a DATE entity but no PERCENT entity.
# The matching point texts are stored back on the bulletin record under
# 'date-percent' and 'date'; the (bulletin, indices) pairs are collected in
# datepercent_bulletins / date_bulletins.
#
# NOTE(review): `nlp` is not defined in any visible cell; it presumably has to be
# a loaded spaCy pipeline before this cell runs -- confirm.
# NOTE(review): the data-generation cell reads 'date-and-percent' / 'just-dates'
# and uses their entries as indices, whereas this cell writes point texts under
# 'date-percent' / 'date' -- confirm which keys helpers/classes.py produces.

datepercent_bulletins = []
date_bulletins = []

# Not referenced by the loop below; kept because other code may rely on it.
keywords = [
    'fall', 'rise', 'increase', 'decrease', 'low', 'high',
    'compare', 'most', 'least',
    'growth', 'grow', 'decline', 'fall', 'rise',
    'double', 'triple',
    'million', 'billion', 'trillion', 'thousand', 'hundred'
    ]

for bulletin in tqdm(bulletin_names):
    main_points = collection.bulletins.get(bulletin).get('main-points')
    date_percent_points = []
    date_points = []
    for i, point in enumerate(main_points):
        labels = {ent.label_ for ent in nlp(point).ents}

        # Both labels must be tested explicitly: `'DATE' and 'PERCENT' in labels`
        # only checks for 'PERCENT', because the non-empty string 'DATE' is truthy.
        if 'DATE' in labels and 'PERCENT' in labels:
            date_percent_points.append(i)
        elif 'DATE' in labels:
            date_points.append(i)

    if date_percent_points:
        datepercent_bulletins.append((bulletin, date_percent_points))
    if date_points:
        date_bulletins.append((bulletin, date_points))

    # Always write both keys (possibly as empty lists) so that a bulletin with
    # date-and-percent points but no date-only points keeps its 'date-percent' list.
    collection.bulletins[bulletin]['date-percent'] = \
        [main_points[x] for x in date_percent_points]
    collection.bulletins[bulletin]['date'] = \
        [main_points[x] for x in date_points]

100%|██████████| 625/625 [00:52<00:00, 11.80it/s]


## Data generation
### creates a dataframe with cols {bulletin, type, point, data}

In [107]:
# Build one row per tagged main point: {bulletin, type, point, data}.
# Each tuple is (value written to the 'type' column, key read from the bulletin record).
point_sources = (
    ('date_and_percent', 'date-and-percent'),
    ('just_dates', 'just-dates'),
)

dataset = []
for bulletin in bulletin_names:
    content = collection.bulletins.get(bulletin)
    related_datasets = content.get('related-datasets')
    # Bulletins without related datasets contribute no rows.
    if len(related_datasets) <= 0:
        continue

    main_points = content.get('main-points')
    for point_type, source_key in point_sources:
        entries = content.get(source_key)
        if len(entries) == 0:
            continue
        # Each entry is used to index into main_points.
        dataset.extend(
            {'bulletin': bulletin, 'type': point_type,
             'point': main_points[entry], 'data': related_datasets}
            for entry in entries
        )

# save to drive, tagging the file name with the current timestamp
df = pd.DataFrame(dataset)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
df.to_pickle(f"pickles/dataset_{timestamp}.pkl")

## View data

In [4]:
# Reload a previously saved dataset snapshot instead of regenerating it.
with open('pickles/dataset_20210625_184837.pkl', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

In [7]:
# Display the loaded dataset (columns: bulletin, type, point, data).
df

Unnamed: 0,bulletin,type,point,data
0,businessindustryandtrade/business/businessserv...,date_and_percent,"In 2019, approximate gross value added at basi...",[/businessindustryandtrade/business/businessse...
1,businessindustryandtrade/business/businessserv...,date_and_percent,"The non-financial services sector, which accou...",[/businessindustryandtrade/business/businessse...
2,businessindustryandtrade/business/businessserv...,date_and_percent,Total turnover and purchases of the UK non-fin...,[/businessindustryandtrade/business/businessse...
3,businessindustryandtrade/business/businessserv...,date_and_percent,"Out of the 12 UK regions, 8 regions experience...",[/businessindustryandtrade/business/businessse...
4,businessindustryandtrade/business/businessserv...,date_and_percent,"West Midlands, Yorkshire and The Humber, Scotl...",[/businessindustryandtrade/business/businessse...
...,...,...,...,...
2010,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Trust in others in their neighbourhood was hig...,[/peoplepopulationandcommunity/wellbeing/datas...
2011,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Trust in others in their neighbourhood was hig...,[/peoplepopulationandcommunity/wellbeing/datas...
2012,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,People in higher managerial occupations were m...,[/peoplepopulationandcommunity/wellbeing/datas...
2013,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Around 6 in 10 people (61%) reported feeling s...,[/peoplepopulationandcommunity/wellbeing/datas...


## Generate CLOZES

The following few cells show how to create Cloze-style questions from a point.

The output of `generate_clozes_from_point` is a generator; each cloze it yields exposes several useful attributes, including:

    - cloze_id
    - original point
    - answer
    - ...

In [8]:
from generate_cloze import generate_clozes_from_point, named_entity_answer_generator as answer_generator

In [57]:
# Generate cloze questions for every point in `df`, flatten them into a table
# and save the table as JSON.
#
# The generator is called with `answer_generator`, the alias under which
# named_entity_answer_generator is imported; the earlier names
# `ne_answer_generator` and `tokenizer` are not defined in any visible cell and
# raised NameError on a fresh kernel.
# NOTE(review): the signature of generate_clozes_from_point is not visible here;
# this matches the two-argument call used by the later cloze cell -- confirm.
all_clozes = []
for _, row in tqdm(df.iterrows()):
    point = row.point
    clozes = list(generate_clozes_from_point(point, answer_generator))
    all_clozes.extend(clozes)

# One list per output column, all in the order of all_clozes.
ids = [cloze.cloze_id for cloze in all_clozes]
cloze_text = [cloze.cloze_text for cloze in all_clozes]
source_text = [cloze.source_text for cloze in all_clozes]
answer = [cloze.answer_text for cloze in all_clozes]
types = [cloze.answer_type for cloze in all_clozes]

clozes_df = pd.DataFrame(
    {'ids': ids, 'cloze_text': cloze_text,
    'source_text': source_text, 'answer_text': answer,
    'answer_type': types})

# default_handler=str stringifies any value the JSON encoder cannot serialise natively.
clozes_df.to_json('pickles/clozes_20210715_212425.json', default_handler=str)

['In TEMPORALMASK, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be £1,313.9 billion; an increase of £42.8 billion (3.4%) compared with 2018.',
 'In 2019, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be NUMERICMASK; an increase of £42.8 billion (3.4%) compared with 2018.',
 'In 2019, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be £1,313.9 billion; an increase of NUMERICMASK (3.4%) compared with 2018.',
 'In 2019, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be £1,313.9 billion; an increase of £42.8 billion (NUMERICMASK) compared with 2018.',
 'In 2019, approximate gross value added at basic prices (aGVA) of the UK non-financial business economy was estimated to be £1,313.9 billion; an increase of £42.8 billion (3.4%) compared 

In [37]:
# Inspect the text of a single example point (positional row 22) from the dataset.
df.iloc[22].point

'London had both the highest business birth rate at 15.7%, and death rate at 13.1%.'

In [20]:
# Collect the cloze questions generated from every point in the dataset.
all_clozes = []

for _, record in tqdm(df.iterrows()):
    # Exhaust the generator for this point before adding its clozes to the pool.
    point_clozes = list(generate_clozes_from_point(record.point, answer_generator))
    all_clozes += point_clozes

2015it [06:51,  4.89it/s]


In [60]:
# Flatten the cloze objects into parallel columns, one list per attribute.
ids = [item.cloze_id for item in all_clozes]
cloze_text = [item.cloze_text for item in all_clozes]
# source_text is spacy.span convert to str
source_text = [item.source_text.text for item in all_clozes]
answer = [item.answer_text for item in all_clozes]
types = [item.answer_type for item in all_clozes]

# Assemble the table column by column and write it out as JSON.
clozes_columns = {
    'ids': ids,
    'cloze_text': cloze_text,
    'source_text': source_text,
    'answer_text': answer,
    'answer_type': types,
}
clozes_df = pd.DataFrame(clozes_columns)

clozes_df.to_json('pickles/clozes_20210824.json', default_handler=str)

# Download XLS files

In [35]:
# Download a spreadsheet for every dataset related to a target bulletin.
#
# For each related dataset page on ons.gov.uk, the first primary download button
# is followed (per the original author, this is the most recent release) and the
# file is saved under datasets/.
# Failed requests are logged and skipped, so one bad page does not abort the crawl
# and HTTP error pages are never written to disk as spreadsheets.
ONS_BASE_URL = 'https://www.ons.gov.uk'
REQUEST_TIMEOUT = 30  # seconds; without a timeout, requests can block indefinitely

with requests.Session() as session:  # reuse connections across the many requests
    for name in tqdm(target_bulletins):
        related_datasets = collection.bulletins.get(name).get('related-datasets')
        # `dataset_path` (not `dataset`) so the rows list built by the
        # data-generation cell is not overwritten.
        for dataset_path in related_datasets:
            try:
                page = session.get(ONS_BASE_URL + dataset_path, timeout=REQUEST_TIMEOUT)
                page.raise_for_status()

                soup = BeautifulSoup(page.content, features = 'html')
                download_links = soup.find_all(
                    'a', {'class': 'btn btn--primary btn--thick'}, href = True)
                if not download_links:
                    continue

                # take the first (most recent) one
                excel_file = ONS_BASE_URL + download_links[0]['href']
                download = session.get(excel_file, timeout=REQUEST_TIMEOUT)
                download.raise_for_status()
            except requests.RequestException as error:
                logging.warning('Skipping %s: %s', dataset_path, error)
                continue

            # change / to _ so you can save them
            # also ignore the first / for naming convenience
            # NOTE(review): every file is saved with an .xls extension regardless of
            # the type of the linked file -- confirm this is what downstream code expects.
            savename = dataset_path.replace('/', '_')[1:]
            with open(f'datasets/{savename}.xls', 'wb') as f:
                f.write(download.content)

100%|██████████| 328/328 [11:53<00:00,  2.17s/it]
