# Extracting article links from a set of news websites:
A classification approach


# Initialisation
We will use Python 3 with Requests and BeautifulSoup to fetch and parse web pages, pandas for data processing and CSV file I/O, and scikit-learn (built on NumPy) for predictions.

In [3]:
import requests 
import re
from bs4 import BeautifulSoup
import datetime
import urllib
import requests
import lxml.html
import json
import csv
import sys
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

# Reading data 

Let's define some functions that extract the links (and their text) from a website and write them to a CSV file.

In [177]:
# Module-level store filled by get_all_website_links(): maps each link's
# cleaned text (title) to its URL. That function never resets it, so entries
# accumulate across calls made in the same kernel session.
internal_urls = {}

def is_valid(url):
    """
    Checks whether `url` is a valid URL.

    A URL counts as valid when it parses and has both a scheme
    (e.g. "https") and a network location (e.g. "example.com").
    Returns False, instead of raising, when `urlparse` rejects the
    string with a ValueError (e.g. an unbalanced IPv6 bracket).
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse refuses some malformed strings outright; for a validity
        # check that simply means "not valid".
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url, timeout=10):
    """
    Collects the titled links found on `url` that belong to the same website.

    Every `<a>` tag of the page is resolved against `url`, stripped of its
    query string and fragment, and kept only when it points to the host of
    `url` (or one of its subdomains) and has a non-empty link text.

    Parameters:
        url: address of the page to scan.
        timeout: seconds to wait for the server before `requests` raises a
            timeout error.

    Returns:
        The module-level `internal_urls` dict, mapping cleaned link text to
        URL. The dict is updated in place and is not cleared here, so it also
        holds entries from earlier calls; a repeated link text keeps the URL
        seen last.

    Side effects:
        Prints one `title;url;` line per recorded link.
    """
    # domain name of the URL without the protocol (hosts are case-insensitive)
    domain_name = urlparse(url).netloc.lower()
    # a timeout keeps an unresponsive server from blocking the notebook forever
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href attribute
            continue

        try:
            href = urljoin(url, href)
            parsed_href = urlparse(href)
        except ValueError:
            # malformed href: skip this link instead of aborting the scan
            continue

        # Validate BEFORE rebuilding the URL: gluing "://" onto links such as
        # "mailto:someone@site" would otherwise make them look like valid URLs.
        if not is_valid(href):
            continue

        # Compare hosts, not substrings of the whole URL, so that an external
        # page whose path merely mentions our domain is not taken as internal.
        link_host = parsed_href.netloc.lower()
        if link_host != domain_name and not link_host.endswith("." + domain_name):
            # external link
            continue

        # drop the query string and fragment
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        title = a_tag.getText().strip()
        # Remove the CSV delimiter and layout characters from the link text.
        # NOTE(review): '""' only removes doubled double-quotes; confirm
        # whether a single '"' was meant to be stripped as well.
        for unwanted in (';', '""', "\n", "\t"):
            title = title.replace(unwanted, '')
        if not title:
            # links without text (e.g. image-only anchors) carry no headline
            continue

        internal_urls[title] = href
        print(title + ";" + href + ";")

    return internal_urls

def write_in_csv(url_dict, path='training_2.csv'):
    """
    Appends the {title: link} pairs of `url_dict` to a `;`-separated CSV file.

    Parameters:
        url_dict: mapping of link text to URL, e.g. the result of
            get_all_website_links().
        path: file to append to (created if it does not exist). Defaults to
            'training_2.csv'.

    Each pair becomes one `title;link` row; no label column is written.
    """
    # newline='' lets the csv module control line endings itself
    with open(path, mode='a', encoding="utf-8", newline='') as csv_file:
        csv.writer(csv_file, delimiter=';').writerows(url_dict.items())

In [179]:
# Scratch code: empty training_2.csv, then scrape a site (URL to be filled in) and append its links
# f = open("training_2.csv", "w")
# f.truncate()
# f.close()

# write_in_csv(get_all_website_links(""))

# Building the training data

The training.csv file has been built by hand in the following manner: given a set of websites, we extracted all links from those websites and the text associated with each link, using the functions above. One by one, each {title, link} pair was given the label 0 or 1: 0 if we decide it is not a relevant headline, 1 if it is.
We used headlines and links from around 5 websites, giving us 537 rows for our training set. It is not much, but I will add a lot more later to make the model more accurate.

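The training data looks like this: the file has no header line, and each row holds three `;`-separated fields — the link text (title), the link URL, and the hand-assigned label (0 or 1).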

# Training our classifier

Using the training.csv file we built earlier, we train a Multinomial Naive Bayes classifier (`MultinomialNB`) on the TF-IDF-weighted word counts of the link titles.

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

# Load the hand-labelled rows (no header line, ';'-separated), drop embedded
# newlines and replace missing cells with 0.
df = (
    pd.read_csv('training.csv', sep=';', header=None)
    .replace('\n', '', regex=True)
    .fillna(0)
)

# First column: link text, forced to str for the vectorizer.
# Third column: the 0/1 label.
title_columns_data = df.iloc[:, 0].astype(str)
classification_columns_data = df.iloc[:, 2]

# Word counts per title, then tf-idf re-weighting of those counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(title_columns_data)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Fit the Naive Bayes model on the weighted counts.
clf = MultinomialNB().fit(X_train_tfidf, classification_columns_data)

Let's test our classifier with some sentences, for example a headline and an unrelated link title:

In [85]:
# One real headline and one boilerplate link title.
docs_new = [
    'Why Are Election Results Taking So Long? State-by-State Look at Electoral Votes',
    'Subscriber Agreement & Terms of Use',
]

# Reuse the already fitted vectorizer and tf-idf transformer: transform only.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, label in zip(docs_new, predicted):
    print(doc, " - prediction : ", label)

Why Are Election Results Taking So Long? State-by-State Look at Electoral Votes  - prediction :  1.0
Subscriber Agreement & Terms of Use  - prediction :  0.0


# Building a pipeline

In [86]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
# Bundle vectorizer, tf-idf weighting and classifier into a single estimator
# that takes raw titles as input. The classifier is an SGDClassifier with
# hinge loss and L2 penalty, seeded so that repeated fits give the same model.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
]
text_clf = Pipeline(steps=pipeline_steps)

# Last expression of the cell: fit() returns the pipeline, which gets displayed.
text_clf.fit(title_columns_data, classification_columns_data)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

# Testing our classifier

We created a testing file (testing.csv) to see whether our classifier is able to predict if a title is a headline or not. We generated the links from a website (here The Economist), and we annotated the 0 or 1 label by hand. Let's see how well our classifier fares:

In [87]:
from sklearn.metrics import f1_score

# Load the hand-labelled test rows (no header line, ';'-separated) and clean
# them the same way as the training rows.
df_t = pd.read_csv('testing.csv', sep=';', header=None)

df_t = df_t.replace('\n', '', regex=True)
df_t = df_t.fillna(0)
# Cast the titles to str, as done for the training titles: fillna(0) turns a
# missing title into the integer 0, which the text vectorizer cannot process.
title_columns_data_testing = df_t.iloc[:, 0].astype(str)
classification_columns_data_testing = df_t.iloc[:, 2]

predicted = text_clf.predict(title_columns_data_testing)

# f1_score expects (y_true, y_pred). The micro average is symmetric, so the
# printed value is unchanged, but the order now matches the API and stays
# correct if another `average` is chosen later.
print(f1_score(classification_columns_data_testing, predicted, average='micro'))

0.7792207792207793


We get a score of 0.78, which is pretty good but could be better. We can use hyper-parameter tuning (for example a grid search over the pipeline parameters) to make the classifier more accurate.

# Generating a useful list of headline-links

Using our classifier, we can input a website and get as output the list of {headline, link} pairs our classifier has deemed acceptable (meaning those with a label of "1"). Here we're using the news.sky.com website as an example:

In [159]:
# Site to scan. Named `site_url` rather than `input` so that the builtin
# input() function is not shadowed for the rest of the session.
site_url = "https://news.sky.com/"

# get_all_website_links() fills and returns the shared `internal_urls` dict;
# empty it first so links from previously scanned sites are not mixed in.
internal_urls.clear()
pair_dict = get_all_website_links(site_url)
dict_df = pd.DataFrame(list(pair_dict.items()), columns=['title', 'link'])

# Classify every link text with the fitted pipeline.
title_columns = dict_df['title']
predicted = text_clf.predict(title_columns)
dict_df['prediction'] = predicted

# Keep only the rows classified as headlines (label 1), then drop the helper
# column so the result is a plain {title, link} table.
rslt_df = dict_df[dict_df['prediction'] == 1.0].drop(['prediction'], axis=1)
rslt_df

Unnamed: 0,title,link
1,Watch Live,https://news.sky.com/watch-live
14,Analysis,https://news.sky.com/analysis
17,Weather,https://news.sky.com/weather
19,Biden closes in on Trump in next battleground ...,https://news.sky.com/story/us-election-joe-bid...
20,Trump supporters - some armed with rifles - ga...,https://news.sky.com/story/us-election-2020-li...
21,Which states haven't declared a winner yet - a...,https://news.sky.com/story/us-election-2020-se...
22,"If Biden wins Arizona and Georgia, history wil...",https://news.sky.com/story/us-election-2020-if...
23,"Sainsbury's warns 3,500 jobs could go in super...",https://news.sky.com/story/sainsburys-reveals-...
24,"'Reckless' rave organiser fined £10,000 after ...",https://news.sky.com/story/coronavirus-reckles...
25,Bank of England unleashes £150bn of new QE as ...,https://news.sky.com/story/coronavirus-bank-of...


In [183]:
# Row counts before and after keeping only the predicted headlines.
print("Initial number of links : ", len(dict_df))
print("Resulting number of links : ", len(rslt_df))

Initial number of links :  103
Resulting number of links :  72


As we can see, the initial set had 103 {headline, link} rows and, thanks to the classification algorithm, only 72 are left. I would say that among those 72, around 5 or 6 are not really useful links ("About Us", "Sky News For Your Phone"...). This is because our model is not yet optimal: we do not have a lot of training data (only 537 {headline, link} pairs). We need to add more rows to the training set (not really difficult to do, just a bit time-consuming) and improve the classification of the text in the headlines.