In [17]:
import pandas as pd
import numpy as np
import pickle, logging, spacy
import matplotlib.pyplot as plt
from helpers.classes import Collection
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json

In [4]:
with open('pickles/collection_20210624_194932.pkl', 'rb') as f:
    collection = pickle.load(f)
# with open('pickles/collection_20210622_220215.pkl', 'rb') as f:
#     collection = pickle.load(f)

In [5]:
# bulletin_names contains all bulletins collected through scrapping
bulletin_names = list(collection.bulletins.keys())

# target bulletins are those with usable related datasets and main points
target_bulletins = []
for bulletin in bulletin_names:
    if len(collection.bulletins.get(bulletin).get('main-points')) and len(collection.bulletins.get(bulletin).get('related-datasets')) > 0:
        target_bulletins.append(bulletin)

In [192]:
# THE FOLLOWING WAS ADDED IN THE
# CORE FUNCTIONALITY IN
# helpers/classes.py

datepercent_bulletins = []
date_bulletins = []

keywords = [
    'fall', 'rise', 'increase', 'decrease', 'low', 'high',
    'compare', 'most', 'least',
    'growth', 'grow', 'decline', 'fall', 'rise',
    'double', 'triple',
    'million', 'billion', 'trillion', 'thousand', 'hundred'
    ] 

for bulletin in tqdm(bulletin_names):
    main_points = collection.bulletins.get(bulletin).get('main-points')
    date_percent_points = []
    date_points = []
    for i, point in enumerate(main_points):
        entities = nlp(point).ents
        labels = [x.label_ for x in entities]
        lemmatised = [x.lemma_ for x in entities]

        if 'DATE' and 'PERCENT' in labels:
            date_percent_points.append(i)
        elif 'DATE' in labels:
            date_points.append(i)

    if len(date_percent_points) != 0:
        datepercent_bulletins.append((bulletin, date_percent_points))
        collection.bulletins[bulletin]['date-percent'] = \
            [main_points[x] for x in date_percent_points]
    if len(date_points) != 0:
        date_bulletins.append((bulletin, date_points))
        collection.bulletins[bulletin]['date'] = \
            [main_points[x] for x in date_points]
    else:
        collection.bulletins[bulletin]['date'] = []
        collection.bulletins[bulletin]['date-percent'] = []

100%|██████████| 625/625 [00:52<00:00, 11.80it/s]


## Data generation
### creates a dataframe with cols {bulletin, type, point, data}

In [107]:
dataset = []
for bulletin in bulletin_names:
    content = collection.bulletins.get(bulletin)
    related_datasets = content.get('related-datasets')
    if len(related_datasets) > 0:
        date_and_percent = content.get('date-and-percent')
        just_dates = content.get('just-dates')
        main_points = content.get('main-points')
        if len(date_and_percent) != 0:
            for entry in date_and_percent:
                point = main_points[entry]
                row = {'bulletin': bulletin, 'type': 'date_and_percent', 'point': point, 'data': related_datasets}
                dataset.append(row)
        if len(just_dates) != 0:
            for entry in just_dates:
                point = main_points[entry]
                row = {'bulletin': bulletin, 'type': 'just_dates','point': point, 'data': related_datasets}
                dataset.append(row)

# save to drive
df = pd.DataFrame(dataset)
df.to_pickle(f"pickles/dataset_{str(datetime.now().strftime('%Y%m%d_%H%M%S'))}.pkl")

## View data

In [6]:
with open('pickles/dataset_20210625_184837.pkl', 'rb') as f:
    df = pickle.load(f)

In [7]:
df

Unnamed: 0,bulletin,type,point,data
0,businessindustryandtrade/business/businessserv...,date_and_percent,"In 2019, approximate gross value added at basi...",[/businessindustryandtrade/business/businessse...
1,businessindustryandtrade/business/businessserv...,date_and_percent,"The non-financial services sector, which accou...",[/businessindustryandtrade/business/businessse...
2,businessindustryandtrade/business/businessserv...,date_and_percent,Total turnover and purchases of the UK non-fin...,[/businessindustryandtrade/business/businessse...
3,businessindustryandtrade/business/businessserv...,date_and_percent,"Out of the 12 UK regions, 8 regions experience...",[/businessindustryandtrade/business/businessse...
4,businessindustryandtrade/business/businessserv...,date_and_percent,"West Midlands, Yorkshire and The Humber, Scotl...",[/businessindustryandtrade/business/businessse...
...,...,...,...,...
2010,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Trust in others in their neighbourhood was hig...,[/peoplepopulationandcommunity/wellbeing/datas...
2011,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Trust in others in their neighbourhood was hig...,[/peoplepopulationandcommunity/wellbeing/datas...
2012,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,People in higher managerial occupations were m...,[/peoplepopulationandcommunity/wellbeing/datas...
2013,peoplepopulationandcommunity/wellbeing/article...,date_and_percent,Around 6 in 10 people (61%) reported feeling s...,[/peoplepopulationandcommunity/wellbeing/datas...


## Generate CLOZES

The following few cells show to handle the creation of Cloze-style questions from a point.

The output of `generate_clozes_from_point` is a generator, and contains as attributes various useful properties

    - cloze_id
    - original point
    - answer
    - ...

In [12]:
from generate_cloze import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator

In [47]:
all_clozes = []
all_bulletins = []
all_datasets = []
for _, row in tqdm(df.iterrows()):
    point = row.point
    clozes = [c for c in \
        generate_clozes_from_point(point, ne_answer_generator)]
    bulletins = [row.bulletin] * len(clozes)
    datasets = [row.data] * len(clozes)
    all_clozes.extend(clozes)
    all_bulletins.extend(bulletins)
    all_datasets.extend(datasets)

ids = []
cloze_text = []
source_text = []
answer = []
types = []
for cloze in all_clozes:
    ids.append(cloze.cloze_id)
    cloze_text.append(cloze.cloze_text)
    source_text.append(cloze.source_text.text)
    answer.append(cloze.answer_text)
    types.append(cloze.answer_type)

2015it [05:12,  6.45it/s]


In [48]:
clozes_df = pd.DataFrame(
    {'cloze_text': cloze_text,
    'source_text': source_text, 'answer_text': answer, 
    'answer_type': types,
    'bulletin':all_bulletins, 'ids': ids, 'data': all_datasets})

clozes_df.to_json('pickles/clozes_20210825.json', default_handler=str)

## This generates false statements by sampling answers from the same entity group

In [49]:
answer_types = clozes_df['answer_type'].unique()
entities = dict.fromkeys(answer_types)
unique_entities = dict.fromkeys(answer_types)
for anstype in answer_types:
    subdf = clozes_df[clozes_df['answer_type'] == anstype]
    answers = subdf.answer_text.values
    entities[anstype] = answers
    unique_entities[anstype] = np.unique(answers)

MASKS = ['IDENTITYMASK', 'NOUNPHRASEMASK', 'NUMERICMASK', 'PLACEMASK',
       'TEMPORALMASK', 'THINGMASK']

In [57]:
results = []
for _, item in clozes_df.iterrows():
    cloze = item.cloze_text
    source = item.source_text
    anstype = item.answer_type
    random_replacement = np.random.choice(unique_entities[anstype])
    bulletin_id = item.bulletin
    related_data = item.data

    
    # cant think of smarter way
    # check which mask is used
    for term in cloze.split(' '):
        for mask in MASKS:
            if mask in term:
                target_mask = mask
    
    false = cloze.replace(target_mask, random_replacement)
    true = item.source_text

    row = {
        'cloze_text':cloze, 'source_text':source, 'true':true, 'false':false, 
        'answer_text':item.answer_text, 'answer_type': anstype, 
        'answer_length': len(item.answer_text.split(' ')),
        'ids':item.ids, 'bulletin':bulletin_id, 'data':related_data}
    results.append(row)

clozes_with_false = pd.DataFrame(results)
clozes_with_false.to_json('pickles/clozes_with_false_20210825.json', default_handler=str)

In [56]:
clozes_with_false

Unnamed: 0,cloze_text,source_text,true,false,answer_text,answer_type,answer_length,ids,bulletin,data
0,"In TEMPORALMASK, approximate gross value added...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In between 2001 and 2011, approximate gross va...",2019,DATE,1,ccf75a2c157eacc5253f0e8a55b1ded9f5386d58,businessindustryandtrade/business/businessserv...,[/businessindustryandtrade/business/businessse...
1,"In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","£1,313.9 billion",MONEY,2,fc3346f2545ca5059bdb5150b998300c5001bea5,businessindustryandtrade/business/businessserv...,[/businessindustryandtrade/business/businessse...
2,"In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...",£42.8 billion,MONEY,2,81aa32270d547a368e7b806646c72069ba4fac43,businessindustryandtrade/business/businessserv...,[/businessindustryandtrade/business/businessse...
3,"In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...",3.4%,PERCENT,1,f308eb661ad1d6c8021cf9e22d8e8c7d9c5ff50e,businessindustryandtrade/business/businessserv...,[/businessindustryandtrade/business/businessse...
4,"In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...","In 2019, approximate gross value added at basi...",2018,DATE,1,0a0bcf4b990f1926bc87fa662badadaf7e23d5fa,businessindustryandtrade/business/businessserv...,[/businessindustryandtrade/business/businessse...
...,...,...,...,...,...,...,...,...,...,...
6179,Around 6 in 10 people (61%) reported feeling s...,Around 6 in 10 people (61%) reported feeling s...,Around 6 in 10 people (61%) reported feeling s...,Around 6 in 10 people (61%) reported feeling s...,76%,PERCENT,1,dce8c5e1d183452a3cc887d90ff0f64ef4bb82b5,peoplepopulationandcommunity/wellbeing/article...,[/peoplepopulationandcommunity/wellbeing/datas...
6180,"In comparison, NUMERICMASK of people did not f...","In comparison, around 14% of people did not fe...","In comparison, around 14% of people did not fe...","In comparison, An estimated 3.1% of people did...",around 14%,PERCENT,2,e7c1fce9fcd8c918a07029ba4266c18391b1aa75,peoplepopulationandcommunity/wellbeing/article...,[/peoplepopulationandcommunity/wellbeing/datas...
6181,"In comparison, around 14% of people did not fe...","In comparison, around 14% of people did not fe...","In comparison, around 14% of people did not fe...","In comparison, around 14% of people did not fe...",38%,PERCENT,1,874948014cfa36eff9f493b57e8077f3bda23019,peoplepopulationandcommunity/wellbeing/article...,[/peoplepopulationandcommunity/wellbeing/datas...
6182,Some of the main UK findings (Source: IDENTITY...,Some of the main UK findings (Source: Understa...,Some of the main UK findings (Source: Understa...,Some of the main UK findings (Source: the Lond...,Understanding Society,ORG,2,fab3e03bd458df568f3d110940583bc1251dfd90,peoplepopulationandcommunity/wellbeing/article...,[/peoplepopulationandcommunity/wellbeing/datas...


# Download XLS files

In [2]:
import pickle

with open('pickles/collection_20210624_194932.pkl', 'rb') as f:
    collection = pickle.load(f)

In [3]:
# bulletin_names contains all bulletins collected through scrapping
bulletin_names = list(collection.bulletins.keys())

# target bulletins are those with usable related datasets and main points
target_bulletins = []
for bulletin in bulletin_names:
    if len(collection.bulletins.get(bulletin).get('main-points')) and len(collection.bulletins.get(bulletin).get('related-datasets')) > 0:
        target_bulletins.append(bulletin)

In [35]:
for name in tqdm(target_bulletins):
    bulletin = collection.bulletins.get(name)
    related_datasets = bulletin.get('related-datasets')
    for dataset in related_datasets:
        url = 'https://www.ons.gov.uk' + dataset
        log = requests.get(url)
        soup = BeautifulSoup(log.content, features = 'html')
        list_of_datsets = soup.findAll('a', {'class':'btn btn--primary btn--thick'}, href = True)
        # take the most recent one
        if len(list_of_datsets):
            excel_file = 'https://www.ons.gov.uk' + list_of_datsets[0]['href']
            download = requests.get(excel_file)

            # change / to _ so you can save them 
            # also ignore the first / for naming convenience
            savename = dataset.replace('/', '_')[1:]
            with open(f'datasets/{savename}.xls', 'wb') as f:
                f.write(download.content)

100%|██████████| 328/328 [11:53<00:00,  2.17s/it]


# THERE WERE SOME MISSING DFS LETS DOWNLOAD THEM

In [12]:
import numpy as np
bulletin_names = np.genfromtxt('secondary/bulletins_with_missing_dfs.txt', dtype = 'str').tolist()

In [13]:
# bulletin_names contains all bulletins collected through scrapping
bulletin_names = list(collection.bulletins.keys())

# target bulletins are those with usable related datasets and main points
target_bulletins = []
for bulletin in bulletin_names:
    if len(collection.bulletins.get(bulletin).get('main-points')) and len(collection.bulletins.get(bulletin).get('related-datasets')) > 0:
        target_bulletins.append(bulletin)

In [18]:
for name in tqdm(target_bulletins):
    bulletin = collection.bulletins.get(name)
    related_datasets = bulletin.get('related-datasets')
    for dataset in related_datasets:
        url = 'https://www.ons.gov.uk' + dataset
        log = requests.get(url)
        soup = BeautifulSoup(log.content, features = 'html')
        list_of_datsets = soup.findAll('a', {'class':'btn btn--primary btn--thick'}, href = True)
        # take the most recent one
        if len(list_of_datsets):
            excel_file = 'https://www.ons.gov.uk' + list_of_datsets[0]['href']
            download = requests.get(excel_file)

            # change / to _ so you can save them 
            # also ignore the first / for naming convenience
            savename = dataset.replace('/', '_')[1:]
            with open(f'datasets/missing/{savename}.xls', 'wb') as f:
                f.write(download.content)

100%|██████████| 405/405 [19:52<00:00,  2.94s/it]
