In [1]:
%matplotlib inline
%load_ext autoreload       
%autoreload 2              
from news_articles import *

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
from wordcloud import WordCloud

Note that the search terms used to seed the 'background' dataset are:

    ("nhs") AND ("consultation")

and the search terms used to seed the 'signal' dataset are:

    ("nhs") AND ("video consultations" OR "skype consultations" OR "video consulation" OR "skype consultation" OR "remote consultation" OR "remote consultations")

# Open data files and preprocess for analysis

In [69]:
def get_data(label, _tokenize=True, field='content'):
    """Load a de-duplicated, size-balanced pair of (signal, background) article lists.

    Reads ``{label}_search.json`` (signal) and ``nhs_{label}_search.json``
    (background) from the current working directory. Both files are expected
    to hold a JSON list of article dicts, each with a ``'title'`` key.

    Parameters
    ----------
    label : str
        Dataset label used to build the two file names.
    _tokenize : bool, default True
        When true, both lists are passed through ``tokenize(..., field=field)``
        (a helper defined outside this notebook) before being returned.
    field : str, default 'content'
        Article field forwarded to ``tokenize``.

    Returns
    -------
    (signal, background) : tuple
        ``signal`` keeps the first article seen for each title.
        ``background`` keeps articles, in file order, whose title is neither
        in the signal set nor already in the background, and is capped at
        ``len(signal)`` entries.
    """
    with open(f'{label}_search.json') as f:
        _signal = json.load(f)
    with open(f'nhs_{label}_search.json') as f:
        _background = json.load(f)

    signal, background = [], []
    seen_titles = set()
    for article in _signal:
        if article['title'] in seen_titles:
            continue
        seen_titles.add(article['title'])
        signal.append(article)
    for article in _background:
        # Checked before appending so that an empty signal set yields an
        # empty background instead of the whole background file.
        if len(background) >= len(signal):
            break
        # Skip anything already used as signal *or* already taken as
        # background, so neither class contains repeated titles.
        if article['title'] in seen_titles:
            continue
        seen_titles.add(article['title'])
        background.append(article)

    if _tokenize:
        signal = tokenize(signal, field=field)
        background = tokenize(background, field=field)
    return signal, background
    
def save_processed(data, label):
    """Write ``data`` as JSON to ``processed/{label}.json``, replacing any existing file."""
    with open(f'processed/{label}.json', 'w') as handle:
        serialised = json.dumps(data)
        handle.write(serialised)

def load_processed(label):
    """Return the object parsed from the JSON file ``processed/{label}.json``."""
    with open(f'processed/{label}.json') as handle:
        return json.load(handle)

## Comment out if don't need to re-run
#with open('final_large_search.json') as f:
#     _final_signal = json.load(f)    
#
# signal, background = get_data('big')  # Train
# valid_signal, valid_background = get_data('small')  # Test
# extrap_signal, extrap_background = get_data('very_small')  # Valid
# final_signal = tokenize(_final_signal, field='content') # Extrap
#
# save_processed(signal, 'signal')
# save_processed(background, 'background')
# save_processed(valid_signal, 'valid_signal')
# save_processed(valid_background, 'valid_background')
# save_processed(extrap_signal, 'extrap_signal')
# save_processed(extrap_background, 'extrap_background')
# save_processed(final_signal, 'final_signal')

In [9]:
# Load the cached, tokenised datasets written by save_processed
signal = load_processed('signal')
background = load_processed('background')
valid_signal = load_processed('valid_signal')
valid_background = load_processed('valid_background')
extrap_signal = load_processed('extrap_signal')
extrap_background = load_processed('extrap_background')
final_signal = load_processed('final_signal')

# Later cells read the title / url / source of the raw final articles through
# `_final_signal`, which was otherwise only assigned in the commented-out
# preprocessing block; load it here so the notebook runs on a fresh kernel.
# NOTE(review): cells that zip `_final_signal` with scores derived from
# `final_signal` assume tokenize() kept article order and count - confirm.
with open('final_large_search.json') as f:
    _final_signal = json.load(f)

# Build a predictive model for 'remote consultation' articles

In [None]:
# Brute force optimisers
def optimise_preprocessing(signal, background, 
                           valid_signal, valid_background,
                           min_df_range, max_df_range, 
                           ngram_range_range, n_estimators_range, 
                           max_dept_range):
    """Grid-search TF-IDF and random-forest settings, printing each new best.

    Every ``*_range`` argument is a tuple unpacked into ``np.arange``
    (``start, stop, step``; ``stop`` exclusive). For ``ngram_range_range``
    each value ``n`` is used as ``ngram_range=(1, n)``.

    Parameters
    ----------
    signal, background : list of token lists
        Training documents for the positive (1) and negative (0) class.
    valid_signal, valid_background : list of token lists
        Documents used to score each configuration.
    min_df_range, max_df_range, ngram_range_range : tuple
        Grids for the ``TfidfVectorizer`` parameters.
    n_estimators_range, max_dept_range : tuple
        Grids for the ``RandomForestClassifier`` parameters.

    Returns
    -------
    (best_params, best_score) : tuple
        ``best_params`` is a dict of the first configuration that reached the
        highest validation accuracy (``None`` if no configuration scored
        above 0); ``best_score`` is that accuracy.
    """
    # Joined documents and labels do not depend on the grid, so build them once.
    train_docs = [' '.join(doc) for doc in signal + background]
    y = [1]*len(signal) + [0]*len(background)
    valid_docs = [' '.join(doc) for doc in valid_signal + valid_background]
    y0 = [1]*len(valid_signal) + [0]*len(valid_background)

    best_score = 0
    best_params = None
    for min_df in np.arange(*min_df_range):
        for max_df in np.arange(*max_df_range):
            for ngram_range in np.arange(*ngram_range_range):
                vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=(1,ngram_range))
                # Training set: the vocabulary is fitted here only
                X = vectorizer.fit_transform(train_docs)
                # Validation set: transformed with the training vocabulary
                X0 = vectorizer.transform(valid_docs)
                for n_estimators in np.arange(*n_estimators_range):
                    for max_depth in np.arange(*max_dept_range):                
                        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0, min_samples_split=2, )
                        clf = clf.fit(X, y)
                        score = clf.score(X0, y0)
                        # Strict '>' keeps the first configuration on ties
                        if score > best_score:
                            best_score = score
                            best_params = {'min_df': min_df, 'max_df': max_df,
                                           'ngram_range': (1, ngram_range),
                                           'n_estimators': n_estimators,
                                           'max_depth': max_depth}
                            print(min_df, max_df, ngram_range, n_estimators, max_depth, '--->', score)
    return best_params, best_score
                            

def optimise_model(X, y, X0, y0, n_estimators_range, max_dept_range, min_samples_split_range):
    """Grid-search random-forest hyperparameters on already-vectorised data.

    Each ``*_range`` tuple is unpacked into ``np.arange``. A forest is fitted
    on ``(X, y)`` for every combination and scored on ``(X0, y0)``; whenever
    the score strictly beats the best so far, the combination and its score
    are printed. Nothing is returned.
    """
    top_accuracy = 0
    for n_trees in np.arange(*n_estimators_range):
        for depth in np.arange(*max_dept_range):
            for split_size in np.arange(*min_samples_split_range):
                forest = RandomForestClassifier(
                    n_estimators=n_trees,
                    max_depth=depth,
                    random_state=0,
                    min_samples_split=split_size,
                ).fit(X, y)
                accuracy = forest.score(X0, y0)
                if accuracy > top_accuracy:
                    top_accuracy = accuracy
                    print(n_trees, depth, split_size, '--->', accuracy)

## a) Predictive model based on the main body

In [11]:
## Commented out when not optimising
## Initially try to optimise the preprocessing
# optimise_preprocessing(signal, background, valid_signal, valid_background,
#                        min_df_range=(2, 10, 2), 
#                        max_df_range=(0.8, 0.96, 0.05), 
#                        ngram_range_range=(1,4,1),
#                        n_estimators_range=(5, 206, 20), 
#                        max_dept_range=(2, 11, 2))

# TF-IDF settings picked for the article bodies
vectorizer = TfidfVectorizer(min_df=8, max_df=0.8, ngram_range=(1,3))

# Training split: the only place the vocabulary is fitted
X = vectorizer.fit_transform(list(map(' '.join, signal + background)))
y = [1 for _ in signal] + [0 for _ in background]

# Validation split, projected onto the training vocabulary
X0 = vectorizer.transform(list(map(' '.join, valid_signal + valid_background)))
y0 = [1 for _ in valid_signal] + [0 for _ in valid_background]

# Held-out split used for the reported confusion matrix
X1 = vectorizer.transform(list(map(' '.join, extrap_signal + extrap_background)))
y1 = [1 for _ in extrap_signal] + [0 for _ in extrap_background]

## Commented out when not optimising
## Dig a little deeper to optimise the model given the optimal preprocessing
# optimise_model(X, y, X0, y0, 
#                n_estimators_range=(40, 55, 1), 
#                max_dept_range=(7, 14, 1), 
#                min_samples_split_range=(2, 14, 2))

### Show the performance

In [12]:
# Fit the content model, then report accuracy on the three splits
# (train, validation, held-out) and the held-out confusion matrix.
clf = RandomForestClassifier(
    n_estimators=41,
    max_depth=11,
    random_state=0,
    min_samples_split=2,
).fit(X, y)
cm = confusion_matrix(y1, clf.predict(X1))

print(*(clf.score(features, labels) for features, labels in ((X, y), (X0, y0), (X1, y1))))
print(cm)

0.9825949367088608 0.8669354838709677 0.8133333333333334
[[67  8]
 [20 55]]


In [66]:
# Probability of the positive ('signal') class for every final article
Xf = vectorizer.transform(list(map(' '.join, final_signal)))
probs = list(clf.predict_proba(Xf)[:, 1])

In [67]:
# Show the five final articles with the highest content-model probability
sorted_data = sorted(zip(_final_signal, probs), key=lambda pair: pair[1])[::-1]
for art, _y in sorted_data[:5]:
    print(_y)
    print(art['title'], "\n", art['url'])    
    print()

0.8995941022149845
Chelsea pays for hospital staff to stay in club hotel amid coronavirus outbreak 
 https://edition.cnn.com/2020/03/18/football/chelsea-football-club-coronavirus-spt-intl-gbr/index.html

0.8484855616189834
Wakefield boss in hands on approach as RL helps vulnerable in coronavirus crisis 
 https://www.mirror.co.uk/sport/rugby-league/wakefield-boss-takes-hands-approach-21722164

0.8332894637859707
Britain's Prince Charles tests positive for new coronavirus 
 https://www.startribune.com/britain-s-prince-charles-tests-positive-for-new-coronavirus/569086732/

0.8264349499136028
Coronavirus death toll in UK rises to 137 in past 24 hours 
 https://www.mirror.co.uk/news/uk-news/breaking-coronavirus-death-toll-england-21719638

0.8201129013202308
10 Cup Zojirushi NHS-18 (Uncooked) Rice Cooker $42.74 @ Amazon / Home Depot 
 https://slickdeals.net/f/13891418-10-cup-zojirushi-nhs-18-uncooked-rice-cooker-42-74-amazon-home-depot



## b) Predictive model based on titles

In [None]:
# Rebuild every split from the raw search files, tokenising the article
# titles instead of the bodies.
signal_title, background_title = get_data('big', field='title')  # Training split
valid_signal_title, valid_background_title = get_data('small', field='title')  # Validation split (scored as X0_title / y0_title)
extrap_signal_title, extrap_background_title = get_data('very_small', field='title')  # Held-out split (scored as X1_title / y1_title)
final_signal_title = tokenize(_final_signal, field='title') # Unlabelled final articles to be ranked

In [None]:
## Commented out when not optimising
## Initially try to optimise the preprocessing
# optimise_preprocessing(signal_title, background_title, valid_signal_title, valid_background_title,
#                        min_df_range=(4, 14, 2), 
#                        max_df_range=(0.8, 0.96, 0.05), 
#                        ngram_range_range=(1,4,1),
#                        n_estimators_range=(5, 206, 20), 
#                        max_dept_range=(2, 11, 2))

# TF-IDF settings picked for the article titles
vectorizer_title = TfidfVectorizer(min_df=8, max_df=0.6, ngram_range=(1,2))

# Training split: the only place the title vocabulary is fitted
X_title = vectorizer_title.fit_transform(list(map(' '.join, signal_title + background_title)))
y_title = [1 for _ in signal_title] + [0 for _ in background_title]

# Validation split, projected onto the training vocabulary
X0_title = vectorizer_title.transform(list(map(' '.join, valid_signal_title + valid_background_title)))
y0_title = [1 for _ in valid_signal_title] + [0 for _ in valid_background_title]

# Held-out split used for the reported confusion matrix
X1_title = vectorizer_title.transform(list(map(' '.join, extrap_signal_title + extrap_background_title)))
y1_title = [1 for _ in extrap_signal_title] + [0 for _ in extrap_background_title]

### Show the performance

In [None]:
# Train a separate classifier on the title features.
# The fit and predict calls must go through `clf_title`: calling `clf.fit(...)`
# here would refit the content model on title features and make both names
# refer to the same estimator, breaking the later content-based scoring.
clf_title = RandomForestClassifier(n_estimators=25, max_depth=6, random_state=0, min_samples_split=2, )
clf_title = clf_title.fit(X_title, y_title)
cm_title = confusion_matrix(y1_title, clf_title.predict(X1_title))

# Accuracy on the train / validation / held-out title splits, then the held-out confusion matrix
print(clf_title.score(X_title, y_title), clf_title.score(X0_title, y0_title), clf_title.score(X1_title, y1_title))
print(cm_title)

In [None]:
# Held-out accuracy of each model; passed to combine() as the score0 / score1 arguments.
content_score = clf.score(X1, y1)
title_score = clf_title.score(X1_title, y1_title)

In [None]:
# Project the final article titles onto the vocabulary fitted on the training titles
Xf_title = vectorizer_title.transform([' '.join(doc) for doc in final_signal_title])

In [None]:
# Positive-class probability per final article, from the content and title models
probs = clf.predict_proba(Xf)[:, 1].copy()
probs_title = clf_title.predict_proba(Xf_title)[:, 1].copy()

## c) Combine main and title bodies in reasonably arbitrary way

The logic here is purely anecdotal: ranking by the main body gives reasonable results; however, there are also good articles that are more difficult to classify correctly, e.g. because they have very long text bodies. In these cases, the title seems to be a better indicator. In lieu of combining bodies and titles in a statistically meaningful way (which I really don't have time for), I made up a combination function in order to extract around 10 articles which have both a high 'title score' and a high 'body score'. After this, I revert to the 'body' score and select articles above a threshold picked by eye, based on the sensibleness of the articles.

Not a science, but not without any logic!

In [None]:
from scipy.stats import percentileofscore
def combine(score0, probs0, p0, score1, probs1, p1):
    """Combine two model probabilities into a single ranking value.

    Each probability is converted to its percentile rank (0-1) within its own
    population, weighted by the corresponding model score, and the two
    weighted ranks are combined as a Euclidean norm.

    Parameters
    ----------
    score0, score1 : float
        Weight applied to the first / second model's percentile rank.
    probs0, probs1 : array-like
        Population of probabilities that ``p0`` / ``p1`` is ranked within.
    p0, p1 : float
        The probabilities being ranked.

    Returns
    -------
    float
        ``sqrt((score0*rank0)**2 + (score1*rank1)**2)``.
    """
    # score0 was previously accepted but ignored, leaving the first rank
    # unweighted while the second was weighted; both are now treated alike.
    rank0 = score0*percentileofscore(probs0, p0)/100
    rank1 = score1*percentileofscore(probs1, p1)/100
    return np.sqrt(rank0*rank0 + rank1*rank1)

# Combined rank for every final article, in the same order as probs / probs_title
ranks = [
    combine(content_score, probs, p0, title_score, probs_title, p1)
    for p0, p1 in zip(probs, probs_title)
]

In [None]:
# Selection for the pilot:
#   1. always keep the 12 articles with the highest combined (title + content) rank;
#   2. from the rest, keep those whose content probability reaches an arbitrary cut-off;
#   3. emit the kept articles in descending content probability, skipping repeated titles.

sorted_data = sorted(zip(_final_signal, ranks, probs), key=lambda row: row[1])[::-1]
rank_sorted_data = sorted_data[:12]
prob_sorted_data = sorted(sorted_data[12:], key=lambda row: row[2])[::-1]
sorted_data = rank_sorted_data + [row for row in prob_sorted_data if row[2] >= 0.7523]
found_titles = set()
output = []
for art, rank, p0 in sorted(sorted_data, key=lambda row: row[2])[::-1]:
    if art['title'] not in found_titles:
        # Flatten the nested source record to just its name for the export
        _art = {**art, 'source': art['source']['name']}
        output.append(_art)
        found_titles.add(art['title'])

### Save the data

In [None]:
# Tabulate the selected articles using the listed fields as columns, then write the spreadsheet
df = pd.DataFrame(output, columns=['publishedAt', 'source', 'title', 'author', 'description', 'content', 'url'])
df.to_excel('nhsx_digital_shift_news.xlsx')

# Main publishers in the discovery dataset

In [None]:
# Raw articles from the 'big' background and signal searches
with open('nhs_big_search.json') as background_file:
    _raw_background = json.load(background_file)
with open('big_search.json') as signal_file:
    _raw_signal = json.load(signal_file)

# Count articles per publisher; publishers with fewer than 15 articles are pooled as 'Other'
publisher_counts = Counter(row['source']['name'] for row in _raw_background + _raw_signal + _final_signal)
words_for_cloud = defaultdict(int)
for term, count in publisher_counts.most_common():
    words_for_cloud[term if count >= 15 else 'Other'] += count
# Log-scale the counts before they are used as word-cloud frequencies
words_for_cloud = {publisher: np.log(total) for publisher, total in words_for_cloud.items()}

wordcloud = WordCloud(max_font_size=40).generate_from_frequencies(words_for_cloud)
fig, ax = plt.subplots(figsize=(12,12))
plt.xticks([])
plt.yticks([])
ax.imshow(wordcloud, interpolation='bilinear')