In [1]:
import os, time
import requests
import random
import pickle
import spacy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pprint import pprint

import dash
import dash_table as dt
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px

from PIL import Image
import glob

apikey = os.getenv('GUARDIAN_APIKEY')

## Scraping the inputs

Re-purpose the code from dataset creation to scrape a small batch of new articles (one page of up to 10 search results), this time from any section of the Guardian.

In [2]:
# Helper functions
def get_results(base_url, params, timeout=10):
    """Query a JSON search endpoint and return its list of results.

    Parameters
    ----------
    base_url : str
        URL of the search endpoint.
    params : dict
        Query-string parameters forwarded to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    list
        A new list holding the items of ``data['response']['results']``
        from the JSON reply.

    Raises
    ------
    SystemExit
        If the request fails, times out, or returns an HTTP error status.
    """
    try:
        r = requests.get(base_url, params, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        raise SystemExit(err)

    data = r.json()
    return list(data['response']['results'])


def results_to_html(results, max_attempts=5, retry_delay=2, timeout=10):
    """Download the web page behind every search result.

    Parameters
    ----------
    results : list of dict
        Search results; each must carry a ``'webUrl'`` entry.
    max_attempts : int, optional
        Maximum number of download rounds. URLs that still fail after the
        last round are dropped instead of being retried forever.
    retry_delay : float, optional
        Seconds to pause between two rounds when some URLs failed.
    timeout : float, optional
        Per-request timeout in seconds.

    Returns
    -------
    dict
        Maps the position of each result in ``results`` to its successful
        ``requests`` response, ordered by position. Positions whose download
        never succeeded are absent.
    """
    # grab the article urls
    urls = [result['webUrl'] for result in results]

    html_files = {}
    pending = list(range(len(urls)))

    for attempt in range(max_attempts):
        failed = []
        for i in pending:
            try:
                response = requests.get(urls[i], timeout=timeout)
                response.raise_for_status()
                html_files[i] = response
            except requests.exceptions.RequestException:
                failed.append(i)

        if not failed:
            break

        pending = failed
        # no point in waiting after the final round
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)

    # return in result order regardless of which round each download succeeded in
    return {i: html_files[i] for i in sorted(html_files)}
        
        
def html_to_text(html_files):
    """Extract the article text from downloaded pages.

    Parameters
    ----------
    html_files : dict
        Maps an identifier to a response object exposing the raw page in
        its ``content`` attribute.

    Returns
    -------
    list of str
        One string per page whose article body was found and is not empty:
        the text of all ``<p>`` tags in the body joined by single spaces,
        with non-breaking spaces replaced by regular ones.
    """
    all_texts = []

    for response in html_files.values():
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all('div', class_='article-body-commercial-selector')

        # Skip pages without exactly one article body. Previously the
        # paragraphs of the *previous* page were silently reused here
        # (or a NameError was raised on the first page).
        if len(body) != 1:
            continue

        paragraphs = [p.text for p in body[0].find_all('p')]
        text = " ".join(paragraphs).replace('\xa0', ' ')
        if text != '':
            all_texts.append(text)

    return all_texts

# Main function to scrape and collect news articles for inference
def get_text(page_size=10, max_page=15):
    """Scrape one random page of Guardian search results into a DataFrame.

    Parameters
    ----------
    page_size : int, optional
        Number of search results requested from the API (default 10).
    max_page : int, optional
        Highest page number that may be drawn (default 15).

    Returns
    -------
    pandas.DataFrame
        A single ``'Content'`` column with one row per article whose text
        could be extracted; it can hold fewer than ``page_size`` rows.
    """
    # https keeps the API key out of clear-text traffic
    API_ENDPOINT = "https://content.guardianapis.com/search"
    # pick a random page, to display different results (pages start at 1)
    page_number = random.randint(1, max_page)
    my_params = {
        'api-key': apikey,
        'order-by': 'relevance',
        'from-date': "2020-01-01",
        'page-size': page_size,
        'page': page_number
    }

    results = get_results(API_ENDPOINT, my_params)
    html_files = results_to_html(results)
    texts = html_to_text(html_files)

    return pd.DataFrame({'Content': texts})

In [3]:
# Scrape a fresh batch of articles; the bare `df` on the last line displays the frame.
df = get_text()
df

Unnamed: 0,Content
0,"Jazz bassist Eugene Wright, who was the last s..."
1,"For France’s heliciculteurs, or snail farmers,..."
2,"In parts of England, parents face more uncerta..."
3,Far be it from the Rumour Mill to claim to kno...
4,Far be it from the Rumour Mill to claim to kno...
5,New Zealand has further tightened border contr...
6,Lawyers for a Georgia father and son accused o...
7,The only black composer has been dropped from ...
8,I lived with an alcoholic for a decade without...
9,New South Wales has recorded two locally acqui...


## From inputs to category prediction

Load trained classifier and TF-IDF vectoriser.

In [4]:
# Load the pickled classifier (paths are relative to the working directory).
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open("models/best_svm.pickle", 'rb') as model:
    svc = pickle.load(model)
    
# Load the pickled vectoriser that turns cleaned text into model features.
with open("processed/tfidf_vectoriser.pickle", 'rb') as f:
    vectoriser = pickle.load(f)
    
# spaCy English pipeline, used further down to tokenise and lemmatise the text.
nlp = spacy.load("en_core_web_sm")

Create a dictionary that converts the classifier's integer predictions into section names.

In [5]:
# map each integer label to its section name
# (presumably the same order used when the classifier was trained; verify)
sections = ['environment', 'business', 'film', 'culture', 'education']
d = dict(enumerate(sections))

# extra label used when the model is unsure
d[5] = "not sure"
print(d)

{0: 'environment', 1: 'business', 2: 'film', 3: 'culture', 4: 'education', 5: 'not sure'}


In [6]:
# Helper functions
def keep_token(t):
    """Tell whether a token is alphabetic and not whitespace, punctuation or a stop word."""
    return (
        t.is_alpha
        and not t.is_space
        and not t.is_punct
        and not t.is_stop
    )


def lemmatised_string(doc):
    """Join the lemmas of all tokens accepted by ``keep_token`` with single spaces."""
    lemmas = []
    for token in doc:
        if keep_token(token):
            lemmas.append(token.lemma_)
    return " ".join(lemmas)


def predict_category(features, threshold=0.5):
    """Predict a section name for every row of ``features``.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``svc.predict`` and ``svc.predict_proba``.
    threshold : float, optional
        A prediction is kept only when the highest class probability is
        strictly greater than this value (default 0.5); otherwise the row
        is labelled ``"not sure"``.

    Returns
    -------
    tuple
        ``(labels, probs)``: ``labels`` is a list of section names looked up
        in ``d``, and ``probs`` is the array of highest class probabilities,
        one per row.
    """
    NOT_SURE = 5  # key of the "not sure" entry in ``d``

    initial_preds = svc.predict(features)
    # NOTE(review): for some classifiers the class with the highest
    # predict_proba value can differ from the class returned by predict,
    # so the confidence may not belong to the reported label. Confirm for
    # the pickled model.
    probs = svc.predict_proba(features).max(axis=1)

    preds = [
        pred if prob > threshold else NOT_SURE
        for prob, pred in zip(probs, initial_preds)
    ]
    return [d[pred] for pred in preds], probs

# Main function 
def clean_text_and_predict(df_0):
    """Clean the ``'Content'`` column, vectorise it and attach predictions.

    Works on a copy of ``df_0`` and returns it with three added columns:
    ``'Content_parsed'`` (lower-cased, stripped, lemmatised text),
    ``'Prediction'`` (section name) and ``'Confidence'`` (highest class
    probability formatted as a whole-number percentage string).
    """
    out = df_0.copy()

    # normalise case and surrounding whitespace
    out['Content_parsed'] = out['Content'].str.lower()
    out['Content_parsed'] = out['Content_parsed'].str.strip()

    # run the spaCy pipeline with the listed components switched off, then lemmatise
    disabled = ['tok2vec','ner','tagger','parser']
    parsed_docs = list(nlp.pipe(out['Content_parsed'], disable=disabled))
    out['Content_parsed'] = [lemmatised_string(parsed) for parsed in parsed_docs]

    # turn the cleaned text into a dense feature matrix with the fitted vectoriser
    features = vectoriser.transform(out['Content_parsed']).toarray()

    labels, confidences = predict_category(features)
    out['Prediction'] = labels
    out['Confidence'] = confidences
    out['Confidence'] = out['Confidence'].mul(100).map('{:,.0f}%'.format)

    return out

In [7]:
# Run cleaning, vectorising and prediction on the scraped articles and display the result.
clean_text_and_predict(df)

Unnamed: 0,Content,Content_parsed,Prediction,Confidence
0,"Jazz bassist Eugene Wright, who was the last s...",jazz bassist eugene wright survive member dave...,culture,63%
1,"For France’s heliciculteurs, or snail farmers,...",france heliciculteurs snail farmer desperately...,business,81%
2,"In parts of England, parents face more uncerta...",part england parent face uncertainty local aut...,education,99%
3,Far be it from the Rumour Mill to claim to kno...,far rumour mill claim know go mind roman abram...,not sure,45%
4,Far be it from the Rumour Mill to claim to kno...,far rumour mill claim know go mind roman abram...,not sure,45%
5,New Zealand has further tightened border contr...,new zealand tighten border control amid mount ...,not sure,45%
6,Lawyers for a Georgia father and son accused o...,lawyer georgia father son accuse pursue shoot ...,not sure,50%
7,The only black composer has been dropped from ...,black composer drop syllabus popular level mus...,education,99%
8,I lived with an alcoholic for a decade without...,live alcoholic decade realise hold office job ...,culture,63%
9,New South Wales has recorded two locally acqui...,new south wale record locally acquire case lin...,not sure,45%


## Create app in Dash

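Create a Dash app with two panels: one predicts the topic of text typed by the user, the other fetches Guardian articles and predicts their topics.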

In [8]:
# Look up the file name of the image stored for the "business" category.
# The path is relative to the working directory (like the model paths above)
# so the cell does not depend on one user's home directory.
os.path.basename(glob.glob(os.path.join('assets', 'business.*'))[0])

'business.jpeg'

In [9]:
# Open every PNG found in the local "images" folder (relative to the working
# directory rather than a user-specific absolute path).
image_list = []
for filename in glob.glob(os.path.join('images', '*.png')):
    im = Image.open(filename)
    image_list.append(im)

image_list

[]

In [71]:
# External CSS applied to the whole app
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# The Dash application object; callbacks and the layout are attached to it below
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Colour palette used by the layout (the trailing comments keep the alternative values)
colors = {
    'background': '#FFFFFF',  # alternative: '#212F3D'
    'text': '#000000'  # alternative: '#FBFCFC'
}

#current_dir = os.getcwd() + "/images/"

# Short description of the app; NOTE(review): not referenced by the layout visible here
markdown = """
This app predicts the topic of different bodies of text, either input by the user or news articles scraped from the Guardian.
"""

# Page layout: a title followed by two side-by-side panels.
# The component ids set here are the ones the callbacks below refer to.
app.layout = html.Div(style={'backgroundColor': colors['background']}, children=[   

    # title
    html.H1(
        children='Wanna see if I can guess...',
        style={
            'textAlign': 'center',
            'color': colors['text']
        }
    ),
    
    # vertical space between title and panels
    html.Br(),
    
    # main section
    html.Div([
        
        # LEFT PANEL (prediction on user input)
        html.Div([
            html.H3("...what you are writing about?"),
            
            # free-text input, read as State by the first callback
            dcc.Textarea(id='input-state',
                         placeholder='Write here',
                         style={'width': 500, 'height': 120}),
            
            # button whose click count triggers the first callback
            html.Div([
                html.Button(id='submit-button-1', n_clicks=0, children='Let me guess',
                        style={'marginBottom':50})
            ]),
                       
            # placeholder for the prediction message
            html.H5(id='output-1'),
            
            html.Br(),
            
            # placeholder for the image matching the predicted category
            html.Div(id='image-1', style={'marginTop': 100})
        ],
        style={'width': '45%', 'display': 'inline-block', 'textAlign': 'center', 'verticalAlign': 'top'}),
        
        
        # RIGHT PANEL (prediction on scraped news)
        html.Div([
            html.H3("...what the Guardian is writing about?"),
            
            # button whose click count triggers the second callback
            html.Button(id='submit-button-2', n_clicks=0, children='Fetch some news'),
            
            # pie chart filled in by the second callback
            dcc.Graph(id='graph-2'),
            
            # table of article text, prediction and confidence, five rows per page
            dt.DataTable(id='table',
                         columns=[{'name':'Content', 'id':'Content'},
                                  {'name':'Prediction', 'id':'Prediction'},
                                  {'name':'Confidence', 'id':'Confidence'}],
                         page_size=5,
                         style_data={'whiteSpace':'normal',
                                     'height':'auto'},
                         style_cell={'textAlign':'left'}
            )],
            style={'width': '45%', 'display': 'inline-block', 'textAlign': 'center', 'verticalAlign': 'top'}),

    ]),
    
])

# callbacks

@app.callback(
    Output('output-1', 'children'),
    Output('image-1', 'children'),
    Input('submit-button-1', 'n_clicks'),
    State('input-state', 'value')
)
def predict_from_input(n_clicks, input_value):
    """Predict the topic of the text typed by the user.

    Parameters
    ----------
    n_clicks : int
        Click count of the 'Let me guess' button; 0 means not clicked yet.
    input_value : str or None
        Current content of the text area.

    Returns
    -------
    tuple
        ``(message, image)``: the text for the ``output-1`` heading and the
        content of the ``image-1`` container. The image is an ``html.Img``
        for the predicted category, or ``''`` when there is nothing to show.
    """
    if n_clicks == 0:
        return "Try me! I'll surprise ya.", ''

    # Guard against a click on an empty text area (no initial value is set
    # on the Textarea, so nothing typed arrives here as None).
    if input_value is None or not input_value.strip():
        return "Write something first, then I'll have a guess.", ''

    df = pd.DataFrame({'Content': input_value}, index=[0])
    df = clean_text_and_predict(df)
    pred = df.loc[0, 'Prediction']

    if pred == 'not sure':
        message = "I'm not sure this time..."
    else:
        message = f"Are you perhaps talking about {pred}?"

    # Look for the category image inside the app's own assets folder rather
    # than a user-specific absolute path.
    pattern = os.path.join(app.config.assets_folder, f'{glob.escape(pred)}.*')
    matches = glob.glob(pattern)
    if not matches:
        # no image available for this category: show the message alone
        return message, ''

    src = app.get_asset_url(os.path.basename(matches[0]))
    img = html.Img(src=src, style={'width':'60%', 'height':'60%'})
    return message, img

#
@app.callback(
   Output('table', 'data'),
   Output('graph-2', 'figure'),
   Input('submit-button-2', 'n_clicks')
)
def scrape_and_predict(n_clicks):
    """Fetch Guardian articles, predict their topics and fill table and chart.

    Parameters
    ----------
    n_clicks : int
        Click count of the 'Fetch some news' button; 0 means not clicked yet.

    Returns
    -------
    tuple
        ``(records, figure)``: the table rows (article text shortened to 150
        characters plus ``'...'``, prediction, confidence) and a pie chart of
        the number of articles per predicted category. Before the first click,
        or when no article text could be scraped, an empty row and a
        placeholder pie are returned.
    """
    def _placeholder():
        # one empty row and a single-slice pie, shown when there is no data
        return [{}], px.pie(values=[1], names=[''], hole=.3)

    if n_clicks == 0:
        return _placeholder()

    # get dataframe
    df = get_text()
    if df.empty:
        # nothing was scraped; predicting on zero rows would fail
        return _placeholder()
    df = clean_text_and_predict(df)

    # update table
    df['Content'] = df['Content'].str.slice(stop=150)
    df['Content'] = df['Content'].astype(str) + '...'
    records = df[['Content', 'Prediction', 'Confidence']].to_dict('records')

    # update pie chart: one slice per predicted category.
    # The aggregated counts are passed directly; the article frame is left
    # out because its rows do not line up with them.
    counts = df['Prediction'].value_counts()
    fig = px.pie(values=counts.tolist(), names=counts.index.tolist(), hole=.3)

    return records, fig

In [72]:
# Start the Dash server with debug mode switched off.
app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is run

 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [01/Oct/2021 17:16:34] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:37] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:37] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:37] "GET /_favicon.ico?v=1.19.0 HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:37] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:37] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:39] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [01/Oct/2021 17:16:44] "POST /_dash-update-component HTTP/1.1" 200 -
