In [1]:
import os, time
import requests
import pickle
import spacy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint

# Guardian API key, read from the environment so it is never hard-coded in the
# notebook. os.getenv returns None when GUARDIAN_APIKEY is not set.
apikey = os.getenv('GUARDIAN_APIKEY')

## Scraping the inputs

Reuse the scraping code from the dataset-creation step to fetch just 5 new articles, this time from any section of the Guardian.

In [2]:
# Helper functions
def get_results(base_url, params, timeout=10):
    """Query a search endpoint and return the list of results it reports.

    Parameters
    ----------
    base_url : str
        URL of the endpoint to query.
    params : dict
        Query-string parameters sent with the GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    list
        A new list holding the items found under ``response -> results`` in
        the JSON payload.

    Raises
    ------
    SystemExit
        If the request fails or the server answers with an error status.
    """
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        raise SystemExit(err) from err

    payload = response.json()
    return list(payload['response']['results'])


def results_to_html(results, max_attempts=5, retry_delay=2, timeout=10):
    """Download the web page behind every search result.

    Parameters
    ----------
    results : list of dict
        Search results; each item must carry a ``'webUrl'`` key.
    max_attempts : int, optional
        Maximum number of passes over the URLs that are still missing. This
        bounds the retry loop so a permanently failing URL (for example a
        404) cannot keep the notebook spinning forever.
    retry_delay : float, optional
        Seconds to pause before each retry pass.
    timeout : float, optional
        Seconds to wait for each individual request.

    Returns
    -------
    dict
        Maps the position of a result in ``results`` to the response object
        fetched for it. Positions whose download never succeeded are absent.
    """
    import warnings  # local import: only used on the give-up path

    urls = [result['webUrl'] for result in results]

    html_files = {}
    for attempt in range(max_attempts):
        pending = [i for i in range(len(urls)) if i not in html_files]
        if not pending:
            break
        if attempt > 0:
            # back off before hitting the failing URLs again
            time.sleep(retry_delay)
        for i in pending:
            try:
                response = requests.get(urls[i], timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                continue  # leave it pending for the next pass
            html_files[i] = response

    failed = [url for i, url in enumerate(urls) if i not in html_files]
    if failed:
        warnings.warn(
            "Gave up on {} URL(s) after {} attempts: {}".format(
                len(failed), max_attempts, failed
            )
        )

    return html_files
        
        
def html_to_text(html_files):
    """Extract the article text from each downloaded page.

    Parameters
    ----------
    html_files : dict
        Maps an identifier to a response-like object exposing ``.content``.

    Returns
    -------
    list of str
        One string per page whose article body was found and is non-empty.
        Pages are skipped when they do not contain exactly one
        ``div.article-body-commercial-selector`` element, so the result can
        be shorter than ``html_files``.
    """
    all_texts = []

    for file in html_files.values():
        soup = BeautifulSoup(file.content, 'html.parser')
        body = soup.find_all('div', class_='article-body-commercial-selector')
        if len(body) != 1:
            # Without this guard the paragraphs of the previous page would be
            # reused (or a NameError raised on the very first page).
            continue

        paragraphs = body[0].find_all('p')
        text = " ".join(p.text for p in paragraphs)
        text = text.replace('\xa0', ' ')  # non-breaking spaces -> plain spaces
        if text:
            all_texts.append(text)

    return all_texts

# Main function to scrape and collect news articles for inference
def get_text():
    """Scrape a handful of recent Guardian articles for inference.

    Queries the Guardian content search endpoint, downloads the page behind
    each result and extracts its article text.

    Returns
    -------
    pandas.DataFrame
        A single ``'Content'`` column with one row per article whose text
        could be extracted (this may be fewer rows than the requested page
        size).

    Raises
    ------
    SystemExit
        If the API key is missing, or if the search request fails.
    """
    if not apikey:
        # Fail with an explicit message rather than an opaque HTTP error.
        raise SystemExit("GUARDIAN_APIKEY environment variable is not set")

    # HTTPS so the API key in the query string is not sent in clear text.
    API_ENDPOINT = "https://content.guardianapis.com/search"
    my_params = {
        'api-key': apikey,
        'order-by': 'relevance',
        'from-date': "2020-01-01",  # zero-padded ISO date
        'page-size': 5,
    }

    results = get_results(API_ENDPOINT, my_params)
    html_files = results_to_html(results)
    texts = html_to_text(html_files)

    return pd.DataFrame({'Content': texts})

In [3]:
# Scrape a fresh batch of articles; the bare `df` on the last line displays it.
df = get_text()
df

Unnamed: 0,Content
0,There are still plenty of unanswered questions...
1,"Margaret Nolan, the actor best known for appea..."
2,Morning everyone: I’m Martin Farrer and these ...
3,Vlad has been setting cryptics and Genius puzz...
4,Marcus Rashford has revealed a pep talk from t...


## From inputs to category prediction

Load the trained SVM classifier, the fitted TF-IDF vectoriser and the spaCy English model.

In [4]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only load these files if they come from a trusted source.

# Trained classifier used for category prediction.
with open("models/best_svm.pickle", 'rb') as model:
    svc = pickle.load(model)
    
# TF-IDF vectoriser used to turn cleaned text into feature vectors.
with open("processed/tfidf_vectoriser.pickle", 'rb') as f:
    vectoriser = pickle.load(f)
    
# spaCy English pipeline used to tokenise and lemmatise the articles.
nlp = spacy.load("en_core_web_sm")

Create a dictionary that maps the classifier's numeric predictions to section names.

In [5]:
# Map each integer label to its section name.
# NOTE(review): presumably this order matches the label encoding used when
# the classifier was trained - confirm against the training notebook.
sections = ['environment', 'business', 'film', 'culture', 'education']
d = dict(enumerate(sections))

# Extra label 5 -> "other", used when the model is unsure.
d[5] = 'other'
print(d)

{0: 'environment', 1: 'business', 2: 'film', 3: 'culture', 4: 'education', 5: 'other'}


In [6]:
# Helper functions
def keep_token(t):
    """Return True for alphabetic tokens that are not whitespace, punctuation or stop words."""
    if not t.is_alpha:
        return False
    is_noise = t.is_space or t.is_punct or t.is_stop
    return not is_noise


def lemmatised_string(doc):
    """Join the lemmas of the tokens that pass ``keep_token`` with single spaces."""
    kept_lemmas = [token.lemma_ for token in filter(keep_token, doc)]
    return " ".join(kept_lemmas)


def predict_category(features, threshold=0.7, fallback_label=5):
    """Predict a section name for each row, falling back to "other" when unsure.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to the classifier's ``predict`` and
        ``predict_proba``.
    threshold : float, optional
        A prediction is kept only if the highest class probability for that
        row is strictly greater than this value.
    fallback_label : int, optional
        Key of ``d`` used for rows that do not clear the threshold
        (5 maps to 'other' in the dictionary defined in this notebook).

    Returns
    -------
    list of str
        One section name per row of ``features``.
    """
    initial_preds = svc.predict(features)
    # Confidence of a row = its largest class probability.
    confidences = svc.predict_proba(features).max(axis=1)

    return [
        d[pred if confidence > threshold else fallback_label]
        for confidence, pred in zip(confidences, initial_preds)
    ]

# Main function 
def clean_text_and_predict(df_0):
    """Clean the articles in ``df_0['Content']`` and predict a section for each.

    Parameters
    ----------
    df_0 : pandas.DataFrame
        Must contain a ``'Content'`` column of raw article text. It is not
        modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_0`` with two extra columns: ``'Content_parsed'`` (the
        lower-cased, lemmatised text) and ``'Prediction'`` (the section name
        returned by ``predict_category``).
    """
    df = df_0.copy()

    if len(df) == 0:
        # Nothing to classify (e.g. no article text could be scraped). Return
        # the expected columns without calling the NLP pipeline or the model.
        df['Content_parsed'] = pd.Series(index=df.index, dtype=object)
        df['Prediction'] = pd.Series(index=df.index, dtype=object)
        return df

    df['Content_parsed'] = df['Content'].str.lower().str.strip()

    # parse and clean articles
    # NOTE(review): these steps presumably mirror the preprocessing used when
    # the vectoriser was fitted - keep them in sync with the training notebook.
    docs = list(nlp.pipe(df['Content_parsed'], disable=['tok2vec','ner','tagger','parser']))
    df['Content_parsed'] = [lemmatised_string(doc) for doc in docs]

    # numericalise with learnt transformer
    features = vectoriser.transform(df['Content_parsed']).toarray()

    df['Prediction'] = predict_category(features)
    return df

In [7]:
# Clean, vectorise and classify the scraped articles; the result is displayed, not stored.
clean_text_and_predict(df)

Unnamed: 0,Content,Content_parsed,Prediction
0,There are still plenty of unanswered questions...,plenty unanswered question follow tuesday news...,other
1,"Margaret Nolan, the actor best known for appea...",margaret nolan actor well know appear title se...,film
2,Morning everyone: I’m Martin Farrer and these ...,morning martin farrer story start week boris j...,business
3,Vlad has been setting cryptics and Genius puzz...,vlad set cryptics genius puzzle guardian know ...,culture
4,Marcus Rashford has revealed a pep talk from t...,marcus rashford reveal pep talk manchester uni...,other


## Create app in Dash

In [11]:
import dash
import dash_core_components as dcc
import dash_html_components as html

In [12]:
app = dash.Dash(__name__)

# Placeholder layout: a single container holding only the page heading.
app.layout = html.Div([html.H1('My app')])

In [13]:
# Start the Dash server with debug mode off. The cell keeps running while the
# app is being served (the log below shows "Press CTRL+C to quit").
app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [27/Sep/2021 16:34:29] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:34:29] "GET /_dash-component-suites/dash_core_components/plotly-1.v1_3_1m1576595950.50.1.min.js HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:34:30] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:34:30] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:47:30] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:47:31] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [27/Sep/2021 16:47:31] "GET /_dash-layout HTTP/1.1" 200 -
