# News Classification App

References:

* https://www.w3schools.com/colors/colors_picker.asp

In [1]:
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table

## 1. Importing inputs

### 1.1. Trained Model

The best performing model is the SVM. We'll use it in the app.

In [2]:
path_models = "C:/Users/migue/Data Science/Master Data Science/KSCHOOL/9. TFM/0. Latest News Classifier/04. Model Training/Models/"

# SVM
path_svm = path_models + 'best_svc.pickle'
with open(path_svm, 'rb') as data:
    svc_model = pickle.load(data)

### 1.2. TF-IDF object

In [3]:
path_tfidf = "C:/Users/migue/Data Science/Master Data Science/KSCHOOL/9. TFM/0. Latest News Classifier/03. Feature Engineering/Pickles/tfidf.pickle"

with open(path_tfidf, 'rb') as data:
    tfidf = pickle.load(data)

### 1.3. Category mapping dictionary

In [4]:
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

## 2. Definition of functions

### 2.1. Web Scraping Functions

In [5]:
# El Pais
def get_news_elpais():
    
    # url definition
    url = "https://elpais.com/elpais/inenglish.html"
    
    # Request
    r1 = requests.get(url)
    r1.status_code

    # We'll save in coverpage the cover page content
    coverpage = r1.content

    # Soup creation
    soup1 = BeautifulSoup(coverpage, 'html5lib')

    # News identification
    coverpage_news = soup1.find_all('h2', class_='articulo-titulo')
    len(coverpage_news)
    
    number_of_articles = 5

    # Empty lists for content, links and titles
    news_contents = []
    list_links = []
    list_titles = []

    for n in np.arange(0, number_of_articles):

        # only news articles (there are also albums and other things)
        if "inenglish" not in coverpage_news[n].find('a')['href']:  
            continue

        # Getting the link of the article
        link = coverpage_news[n].find('a')['href']
        list_links.append(link)

        # Getting the title
        title = coverpage_news[n].find('a').get_text()
        list_titles.append(title)

        # Reading the content (it is divided in paragraphs)
        article = requests.get(link)
        article_content = article.content
        soup_article = BeautifulSoup(article_content, 'html5lib')
        body = soup_article.find_all('div', class_='articulo-cuerpo')
        x = body[0].find_all('p')

        # Unifying the paragraphs
        list_paragraphs = []
        for p in np.arange(0, len(x)):
            paragraph = x[p].get_text()
            list_paragraphs.append(paragraph)
            final_article = " ".join(list_paragraphs)

        news_contents.append(final_article)

    # df_features
    df_features = pd.DataFrame(
         {'Content': news_contents 
        })

    # df_show_info
    df_show_info = pd.DataFrame(
        {'Article Title': list_titles,
         'Article Link': list_links})
    
    return (df_features, df_show_info)

### 2.2. Feature Engineering Functions

In [6]:
punctuation_signs = list("?:!.,;")
stop_words = list(stopwords.words('english'))

def create_features_from_df(df):
    
    df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')
    
    df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()
    
    df['Content_Parsed_3'] = df['Content_Parsed_2']
    for punct_sign in punctuation_signs:
        df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
        
    df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")
    
    wordnet_lemmatizer = WordNetLemmatizer()
    nrows = len(df)
    lemmatized_text_list = []
    for row in range(0, nrows):

        # Create an empty list containing lemmatized words
        lemmatized_list = []
        # Save the text and its words into an object
        text = df.loc[row]['Content_Parsed_4']
        text_words = text.split(" ")
        # Iterate through every word to lemmatize
        for word in text_words:
            lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        # Join the list
        lemmatized_text = " ".join(lemmatized_list)
        # Append to the list containing the texts
        lemmatized_text_list.append(lemmatized_text)
    
    df['Content_Parsed_5'] = lemmatized_text_list
    
    df['Content_Parsed_6'] = df['Content_Parsed_5']
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')
        
    df = df['Content_Parsed_6']
    df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})
    
    # TF-IDF
    features = tfidf.transform(df).toarray()
    
    return features

In [7]:
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

### 2.3. Prediction Functions

In [8]:
def predict_from_features(features):
    
    # Predict using the input model
    predictions = svc_model.predict(features)
    
    # Return result
    categories = [get_category_name(x) for x in predictions]
    
    return categories

In [9]:
def complete_df(df, categories):
    df['Prediction'] = categories
    return df

Finally, the whole process can be written in these 4 lines of code:

```python
# Get the scraped dataframes
df_features, df_show_info = get_news_elpais()

# Create features
features = create_features_from_df(df_features)

# Predict
predictions = predict_from_features(features)

# Put into dataset
df = complete_df(df_show_info, predictions)
```

## 3. Dash App

In [11]:
# Stylesheet
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Colors
colors = {
    'background': '#fffaea',
    'text': '#696969'
}

app.layout = html.Div(style={'backgroundColor':colors['background']}, children=[
    
    # Title
    html.H1(children='News Classification App',
            style={
                'textAlign': 'left',
                'color': colors['text']

            }),

    # Sub-title
    html.Div(children='''
        Scrape the latest news from different newspapers and show a dashboard.
    '''),
    
    # Checkbox
    dcc.Checklist(
        options=[
            {'label': 'El Pais English', 'value': 'EPE'},
            {'label': 'The New York Times', 'value': 'NYT'}
        ],
        values=['EPE', 'NYT'],
        id='checklist'),

    # Button
    html.Button('Submit', id='submit', type='submit'),
    
    # Output Block
    html.Div(id='output-container-button',
             children='Enter a value and press submit')
    

])

@app.callback(
    dash.dependencies.Output('output-container-button', 'children'),
    [],
    [dash.dependencies.State('checklist', 'values')],
    [dash.dependencies.Event('submit', 'click')])

def update_output(values):
    
    if 'EPE' in values:
        # Get the scraped dataframes
        df_features, df_show_info = get_news_elpais()
    
    # Create features
    features = create_features_from_df(df_features)
    # Predict
    predictions = predict_from_features(features)
    # Put into dataset
    df = complete_df(df_show_info, predictions)
    
    return df

In [12]:
app.run_server(debug=False)

 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [29/Dec/2018 19:49:50] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [29/Dec/2018 19:49:51] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [29/Dec/2018 19:49:51] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [29/Dec/2018 19:50:00] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [29/Dec/2018 19:50:00] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [29/Dec/2018 19:50:00] "GET /_dash-dependencies HTTP/1.1" 200 -
[2018-12-29 19:50:08,595] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "C:\Users\migue\Anaconda3\lib\site-packages\dash\dash.py", line 918, in add_context
    cls=plotly.utils.PlotlyJSONEncoder
  File "C:\Users\migue\Anaconda3\lib\json\__init__.py", line 238, in dumps
    **kw).encode(obj)
  File "C:\Users\migue\Anaconda3\lib\site-packages\plotly\utils.py", line 168, in encode
    encoded_o = super(PlotlyJSONEncoder, self).encode(o)
  File "C:\Users\migue\Anaco