In [4]:
%load_ext autoreload

In [57]:
%autoreload 2
from utils.news_api import download_articles
from utils.keyword_filter import expand_terms, filter_articles, datapath, save_excel
import json

# Care-setting phrases. The expanded list is OR-ed together to form the care
# half of the API query (the query echoed further down the notebook shows the
# expansion: 'care home', 'care homes', 'nursing home', ...).
core_care_terms = expand_terms([
    [['assisted living']],
    [['care', 'nursing', 'retirement'], ['home']],
])

# National-context phrases, kept as a single group nested inside an outer list.
# TODO: expand this to include a longer list
national_terms = [[
    'nhs',
    'national health service',
    'uk',
    'united kingdom',
    'britain',
]]


# Each side of the query is an OR of double-quoted, parenthesised phrases.
care_clauses = [f'("{term}")' for term in core_care_terms]
care_query = ' OR '.join(care_clauses)

# Only the first (and currently only) national group is used, exactly as written.
national_clauses = [f'("{term}")' for term in national_terms[0]]
national_query = ' OR '.join(national_clauses)

# The two sides are combined with AND.
query = f'({care_query}) AND ({national_query})'


# Keyword arguments forwarded to download_articles in addition to the label
# and the start date.
initial_kwargs = {
    'page_size': 100,
    'language': 'en',
    'q': query,
    'sort_by': 'publishedAt',
    'verbose': True,
}

# Download the articles matching the query, starting from 14 July 2020.
# `label` is passed to download_articles and reused in later output file names.
label = 'eol_test'
articles = download_articles(label, start='14 July, 2020', **initial_kwargs)

In [54]:
query

'(("assisted living") OR ("care home") OR ("care homes") OR ("nursing home") OR ("nursing homes") OR ("retirement home") OR ("retirement homes")) AND (("nhs") OR ("national health service") OR ("uk") OR ("united kingdom") OR ("britain"))'

In [55]:
# Term-group format: each group is a pair of lists (A=[a0,..,an], B=[b0,...,bn]).
#   1. B is first expanded to include plural forms.
#   2. Every combination of an A entry with a B entry is then generated,
#      e.g. ['video'], ['chat', 'call'] --> ['video chat', 'video chats',
#                                            'video call', 'video calls']
# If B is missing it is ignored and the A entries are used on their own,
#      e.g. ['triage online'] --> 'triage online'
#
# These terms are searched for AFTER the API query, to filter the articles.

seed_terms = [
    # A-only group: no B list, so these are used on their own
    # (both 'nurse' and 'nurses' are spelled out here).
    [['hospital', 'end of life', 'palliative', 'dying', 'supported death',
      'nurse', 'nurses']],
    # A x B groups
    [['community'], ['service']],
    [['end of life', 'palliative'], ['care']],
    [['care', 'health'], ['worker']],
]

# Filter/rank the downloaded articles against the core and seed terms.
# The min_core_df / min_seed_df thresholds used here are the "less tight"
# requirements.
ranked_articles = filter_articles(articles, core_care_terms, seed_terms,
                                  min_core_df=4, min_seed_df=2)

# Save the result as JSON under the 'processed' data path. The encoding is
# given explicitly so the file does not depend on the platform default, and
# json.dump streams straight to the file handle.
filename = datapath('processed', f'filtered_{label}.json')
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(ranked_articles, f)

# The spreadsheet name is derived from `label` (as the JSON name is) so the
# two outputs stay in step if the label changes; for 'eol_test' this is the
# same name as before, 'ranked_eol_test'.
save_excel(ranked_articles, articles, f'ranked_{label}')

In [58]:
# Unranked export: map every article position in `articles` to the same
# rank value (1) and write it out with save_excel.
dummy_rank = dict.fromkeys(range(len(articles)), 1)

# Name derived from `label` rather than hard-coded; for 'eol_test' this is the
# same name as before, 'unranked_eol_test'.
save_excel(dummy_rank, articles, f'unranked_{label}')