In [None]:
import os, time
import requests
import pickle
import spacy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint

# API key sent as the 'api-key' request parameter in get_text. It is read from
# the environment rather than hardcoded; os.getenv returns None when the
# GUARDIAN_APIKEY variable is not set.
apikey = os.getenv('GUARDIAN_APIKEY')

## Scraping the inputs

Re-purpose the code from dataset creation to scrape just 5 new articles, this time from any section of the Guardian.

In [None]:
# Helper functions
def get_results(base_url, params, timeout=10):
    """Query a JSON search endpoint and return its list of results.

    Parameters
    ----------
    base_url : str
        URL of the endpoint to query.
    params : dict
        Query-string parameters sent with the GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    list
        The items stored under ``response -> results`` in the JSON payload.

    Raises
    ------
    SystemExit
        If the request fails or the server answers with an HTTP error status.
    KeyError
        If the payload lacks the ``response`` / ``results`` keys.
    """
    try:
        # `params` is passed by keyword so it cannot be mistaken for another
        # positional argument of requests.get
        r = requests.get(base_url, params=params, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        raise SystemExit(err)

    data = r.json()
    # copy into a fresh list so callers never share the payload's own list
    return list(data['response']['results'])


def results_to_html(results, max_attempts=5, retry_delay=2, timeout=10):
    """Download the web page behind every search result.

    Each URL is tried up to ``max_attempts`` times. Failed downloads are
    retried in later rounds instead of looping forever, so a permanently
    broken URL can no longer hang the caller.

    Parameters
    ----------
    results : list of dict
        Search results; each one must carry a ``'webUrl'`` entry.
    max_attempts : int, optional
        Maximum number of download rounds (default 5).
    retry_delay : float, optional
        Seconds to sleep before each retry round (default 2).
    timeout : float, optional
        Per-request timeout in seconds (default 10).

    Returns
    -------
    dict
        Maps the position of a result in ``results`` to the successful
        response object. Positions whose download failed in every round are
        left out.
    """
    urls = [result['webUrl'] for result in results]

    html_files = {}
    for attempt in range(max_attempts):
        # only the positions that have not been downloaded yet
        pending = [i for i in range(len(urls)) if i not in html_files]
        if not pending:
            break

        # back off before every round except the first one
        if attempt > 0:
            time.sleep(retry_delay)

        for i in pending:
            try:
                file = requests.get(urls[i], timeout=timeout)
                file.raise_for_status()
            except requests.exceptions.RequestException:
                # leave this position pending; it is retried in the next round
                continue
            html_files[i] = file

    return html_files
        
        
def html_to_text(html_files):
    """Extract the article text from downloaded pages.

    Parameters
    ----------
    html_files : dict
        Maps an identifier to a response-like object whose ``content``
        attribute holds the page HTML.

    Returns
    -------
    list of str
        One string per page that has exactly one
        ``div.article-body-commercial-selector`` element and non-empty
        paragraph text. Paragraphs are joined with single spaces and
        non-breaking spaces are replaced by regular ones. Pages that do not
        match are skipped.
    """
    all_texts = []

    for file in html_files.values():
        soup = BeautifulSoup(file.content, 'html.parser')
        body = soup.find_all('div', class_='article-body-commercial-selector')

        # Skip pages without a single, unambiguous article body. Falling
        # through here would reuse the paragraphs of the previous page (or
        # raise NameError on the first page).
        if len(body) != 1:
            continue

        paragraphs = body[0].find_all('p')
        text = " ".join(p.text for p in paragraphs).replace('\xa0', ' ')
        if text:
            all_texts.append(text)

    return all_texts

# Main function to scrape and collect news articles for inference
def get_text(page_size=5, from_date="2020-1-1"):
    """Scrape Guardian articles and return their text as a DataFrame.

    Parameters
    ----------
    page_size : int, optional
        Number of search results requested from the API (default 5).
    from_date : str, optional
        Value sent as the ``from-date`` search parameter
        (default ``"2020-1-1"``).

    Returns
    -------
    pandas.DataFrame
        A single ``'Content'`` column with one row per article whose text
        could be extracted; this may be fewer rows than ``page_size``.
    """
    # HTTPS so the API key in the query string is not sent in clear text
    API_ENDPOINT = "https://content.guardianapis.com/search"
    my_params = {
        'api-key': apikey,
        'order-by': 'relevance',
        'from-date': from_date,
        'page-size': page_size,
    }

    results = get_results(API_ENDPOINT, my_params)
    html_files = results_to_html(results)
    texts = html_to_text(html_files)
    df = pd.DataFrame({'Content': texts})

    return df

In [None]:
# Scrape a fresh batch of articles; the bare `df` on the last line displays
# the resulting DataFrame as the cell output.
df = get_text()
df

## From inputs to category prediction

Load the trained classifier, the fitted TF-IDF vectoriser and the spaCy English model.

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load pickle files that come from a trusted source.

# Classifier used by predict_category (svc.predict / svc.predict_proba)
with open("models/best_svm.pickle", 'rb') as model:
    svc = pickle.load(model)
    
# Fitted vectoriser used by clean_text_and_predict to turn text into features
with open("processed/tfidf_vectoriser.pickle", 'rb') as f:
    vectoriser = pickle.load(f)
    
# spaCy pipeline used to tokenise and lemmatise the articles
nlp = spacy.load("en_core_web_sm")

Create a dictionary that maps the classifier's integer labels to section names.

In [None]:
# Section names; their position in the list is the integer label used as key
sections = ['environment', 'business', 'film', 'culture', 'education']
d = dict(enumerate(sections))

# label 5 is the fallback category used when the model is unsure
d[5] = "other stuff"
print(d)

In [None]:
# Helper functions
def keep_token(t):
    """Tell whether a token is alphabetic and neither whitespace, punctuation nor a stop word."""
    return t.is_alpha and not t.is_space and not t.is_punct and not t.is_stop


def lemmatised_string(doc):
    """Join the lemmas of all tokens accepted by keep_token into one space-separated string."""
    lemmas = []
    for token in doc:
        if keep_token(token):
            lemmas.append(token.lemma_)
    return " ".join(lemmas)


def predict_category(features, threshold=0.7, fallback_label=5):
    """Predict a section name for every row of ``features``.

    The classifier's prediction is kept only when its highest class
    probability is above ``threshold``; otherwise the row is assigned
    ``fallback_label``.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``svc.predict`` and ``svc.predict_proba``.
    threshold : float, optional
        Minimum top-class probability (exclusive) required to trust the
        prediction (default 0.7).
    fallback_label : int, optional
        Key of ``d`` used for low-confidence rows (default 5).

    Returns
    -------
    list of str
        The entry of ``d`` for each row's final label.
    """
    initial_preds = svc.predict(features)
    # confidence of a row = largest probability across the classes
    probs = svc.predict_proba(features).max(axis=1)

    preds = [
        in_pred if prob > threshold else fallback_label
        for prob, in_pred in zip(probs, initial_preds)
    ]
    return [d[pred] for pred in preds]

# Main function 
def clean_text_and_predict(df_0):
    """Clean the articles in ``df_0`` and predict a section for each one.

    Parameters
    ----------
    df_0 : pandas.DataFrame
        Must contain a ``'Content'`` column holding the raw article text.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_0`` with two added columns: ``'Content_parsed'``
        (lower-cased, stripped, lemmatised text) and ``'Prediction'``
        (section name from ``predict_category``). An input with no rows
        yields the same columns with no rows.
    """
    df = df_0.copy()

    # Nothing to classify (e.g. no article text could be scraped): return the
    # expected columns instead of passing an empty matrix to the classifier.
    if len(df) == 0:
        df['Content_parsed'] = []
        df['Prediction'] = []
        return df

    df['Content_parsed'] = df['Content'].str.lower()
    df['Content_parsed'] = df['Content_parsed'].str.strip()

    # parse and clean articles
    docs = list(nlp.pipe(df['Content_parsed'], disable=['tok2vec','ner','tagger','parser']))
    df['Content_parsed'] = [lemmatised_string(doc) for doc in docs]

    # numericalise with learnt transformer
    features = vectoriser.transform(df['Content_parsed']).toarray()

    df['Prediction'] = predict_category(features)
    return df

In [None]:
# Run the clean-and-predict pipeline on the scraped articles; the returned
# frame (with 'Content_parsed' and 'Prediction' added) is the cell output.
clean_text_and_predict(df)

## Create app in Dash

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

In [None]:
# Dash application object; the callbacks below are registered on it.
app = dash.Dash(__name__)

# Colour palette for the page. The commented-out values next to each entry
# are alternative colours that are currently not in use.
colors = {
    'background': '#FFFFFF',#'#212F3D',
    'text': '#000000' #'#FBFCFC'
}

# Page layout: a title, a row with the two input panels (free-text input on
# the left, news scraping on the right) and a row with the matching output
# panels. Component ids defined here are the ones the callbacks refer to.
app.layout = html.Div(style={'backgroundColor': colors['background']}, children=[
    
    #########
    # TITLE
    #########
    html.H1(
        children='Wanna see if I can guess...',
        style={
            'textAlign': 'center',
            'color': colors['text']
        }
    ),
    
    # space
    html.Br(),
    
    #########
    # DECIDE-WHAT-TO-DO SECTION
    #########
    html.Div([
        
        # left panel (prediction on user input)
        # 'input-state' is read as State and 'submit-button-1' as Input by
        # predict_from_input
        html.Div([
            html.H3("...what you are writing about?"),
            html.Div([
                dcc.Input(id='input-state', value='write here', type='text'),
                html.Button(id='submit-button-1', n_clicks=0, children='Let me guess')
            ]),
        ],
        style={'width': '49%', 'display': 'inline-block', 'textAlign': 'center'}),
        
        # right panel (prediction on scraped news)
        # 'submit-button-2' is the Input of scrape_and_predict
        html.Div([
            html.H3("...what the Guardian is writing about?"),
            html.Div([
                html.Button(id='submit-button-2', n_clicks=0, children='Fetch some news')            
            ]),
        ],
        style={'width': '49%', 'display': 'inline-block', 'textAlign': 'center'})
    ]),
    
    # space
    html.Br(),
    
    #########
    # OUTPUTS AND GRAPHS
    #########
    html.Div([
        
        # left panel
        # 'output-1' receives the text returned by predict_from_input; no
        # callback in the visible code writes to 'graph-1'
        html.Div([
            html.Div(id='output-1'),
            dcc.Graph(id='graph-1')],
            style={'width': '49%', 'display': 'inline-block', 'textAlign': 'center', 'float': 'left'}
        ),

        # right panel
        # 'output-2' and 'graph-2' are the two Outputs of scrape_and_predict
        html.Div([
            html.Div(id='output-2'),
            dcc.Graph(id='graph-2')],
            style={'width': '49%','display': 'inline-block', 'textAlign': 'center', 'float': 'right'}
        ),
    ])
    # storing temporary values
    #html.Div(id='temp', style={'display': 'none'})
    
    
])

# callbacks
@app.callback(
    Output('output-1', 'children'),
    Input('submit-button-1', 'n_clicks'),
    State('input-state', 'value')
)
def predict_from_input(n_clicks, input_value):
    """Predict the section of the text typed by the user.

    Runs when 'submit-button-1' is clicked; the text box content is read as
    State, so typing alone does not trigger it.

    Parameters
    ----------
    n_clicks : int or None
        Number of clicks on the submit button.
    input_value : str or None
        Current content of the 'input-state' text box.

    Returns
    -------
    str
        A prompt when there is nothing to classify yet (no click, or an
        empty / whitespace-only text box), otherwise a sentence containing
        the predicted section.
    """
    # Nothing to classify: the button was never clicked, or the box was
    # cleared (a None or blank value would break the text pipeline).
    if not n_clicks or not input_value or not input_value.strip():
        return "Try me! I'll surprise ya."

    df = pd.DataFrame({'Content': input_value}, index=[0])
    df = clean_text_and_predict(df)
    return f"Are you perhaps talking about...{df.loc[0, 'Prediction']}?"

#
@app.callback(
    Output('output-2', 'children'),
    Output('graph-2', 'figure'),
    Input('submit-button-2', 'n_clicks')
)
def scrape_and_predict(n_clicks):
    """Scrape fresh articles, predict their sections and chart the result.

    Parameters
    ----------
    n_clicks : int or None
        Number of clicks on 'submit-button-2'.

    Returns
    -------
    tuple of (str, dict)
        The text for 'output-2' (the list of predictions rendered as a
        string) and the figure for 'graph-2' (a pie chart of how many
        articles fell into each category). Before the first click, or when
        no article text could be scraped, the figure has no data.
    """
    empty_figure = {'data': []}

    # Two Outputs are declared, so every return must supply two values.
    if not n_clicks:
        return "", empty_figure

    df = get_text()
    if len(df) == 0:
        # no article text was extracted, so there is nothing to classify
        return "[]", empty_figure
    df = clean_text_and_predict(df)

    # Count the predictions per category, keeping the categories in the
    # order of `d` and showing 0 for those that did not occur.
    labels = list(d.values())
    counts = df['Prediction'].value_counts()
    figure = {
        'data': [
            {'values': [int(counts.get(label, 0)) for label in labels],
             'labels': labels,
             'type': 'pie'}
        ]
    }
    return f"{df['Prediction'].tolist()}", figure
    
    
    

In [None]:
# Start the Dash server with debug mode switched off.
# NOTE(review): newer Dash releases favour `app.run(...)` over
# `app.run_server(...)` — confirm against the installed Dash version.
app.run_server(debug=False)

In [None]:
# Quick manual check of the prediction pipeline on a hand-written string,
# built into a one-row frame the same way predict_from_input does.
inp = 'hello football game soccer'
df1 = pd.DataFrame({'Content': inp}, index=[0])
df1 = clean_text_and_predict(df1)

# the frame has a single row at index 0, so this prints its predicted section
print(df1.loc[0, 'Prediction'])