In [371]:
#%pip install pandas

import newspaper
from newspaper import Config
import pandas as pd
from dataclasses import make_dataclass
from bs4 import BeautifulSoup
import re
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score

In [372]:
News = make_dataclass("News", [("title", str), ("category", str)])

In [373]:
#url_dict = {
#    'politics' : ['https://www.dw.com/en/top-stories/germany/s-1432', 
#                 'https://www.dw.com/en/top-stories/world/s-1429',
#                 'https://www.spiegel.de/international/world/',
#                 'https://www.spiegel.de/international/europe/',
#                 'https://www.spiegel.de/international/germany/'
#                 'https://edition.cnn.com/europe'],
#    
#    'business' : ['https://www.dw.com/en/top-stories/business/s-1431',
#                  'https://www.spiegel.de/international/business/'
#                  'https://www.bbc.com/news/business',
#                  'https://www.nbcnews.com/business'],
#    
#    'sports' : ['https://www.dw.com/en/top-stories/sports/s-8171',
#                'https://www.skysports.com/',
#                'https://www.bbc.com/sport'],
#    
#    'economy' : ['https://www.cnbc.com/economy/',
#                'https://www.bbc.com/news/business/economy',
#                'https://www.wsj.com/news/economy']
#}

In [374]:
#config = Config()
#config.fetch_images = False

In [375]:
# newspaper3k problems:
# random resultset
# case 1: scrape almost all where result set size = number of articles
# case 2: result set size is reducing on each execution and 
# after certain execution the result set size becomes 0(zero)

#news_list_to_train = []

#for category, urls in url_dict.items():
#    for url in urls:
#        online_paper = newspaper.build(url, config)
#        print("#blackdiamond " + url)
#        print("#blackdiamond " + str(len(online_paper.articles)))
#        for article in online_paper.articles:
#            article.download()
#            article.parse()
#           news_list.append(News(article.title, category))           

In [376]:
# news scraper using beautiful soup
def scrape_dw_news(news_list, file_name, category):
    with open(file_name) as fp:
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all("div", "news")
        for nc in news_tags:
            if nc is not None:
                title = nc.find("h2", "linkable")
                if title is not None:
                    news_list.append(News(title.next_element.strip(), category))

In [377]:
def scrape_spiegel_news(news_list, file_name, category):
    with open(file_name) as fp:
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all(attrs={"data-area": re.compile("news")})
        for nc in news_tags:
            if nc is not None:
                title = nc.find_all_next("span", string=True, limit=2)
                if title is not None:
                    title = title[0].string.strip() + " - " + title[1].string.strip()
                    news_list.append(News(title, category))

In [378]:
def scrape_bbc_news(news_list, file_name, category):
    if category == 'sports':
        scrape_bbc_sports_news(news_list, file_name, category)
    else:
        scrape_others_news_one(news_list, file_name, category, "div", "News", "h3")

In [379]:
def has_title_attr(tag):
    return tag.has_attr('data-bbc-title')

def scrape_bbc_sports_news(news_list, file_name, category):
    with open(file_name) as fp:
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all(has_title_attr)
        for nc in news_tags:
            if nc is not None:
                news_list.append(News(nc["data-bbc-title"].strip(), category))

In [380]:
def scrape_cnbc_news(news_list, file_name, category):
    with open(file_name) as fp:    
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all("a", "Card-title")
        for nc in news_tags:
            if nc is not None:
                title = nc.find("div", string=True)
                if title is not None:
                    news_list.append(News(title.string.strip(), category))

In [381]:
def scrape_others_news_one(news_list, file_name, 
                          category, parent_tag, 
                          parent_css_class, child_tag):
    with open(file_name) as fp:    
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all(parent_tag, re.compile(parent_css_class))
        for nc in news_tags:
            if nc is not None:
                title = nc.find_all_next(child_tag, string=True, limit=1)
                if title is not None:
                    title = title[0].string.strip()
                    news_list.append(News(title, category))

In [382]:
def scrape_others_news_two(news_list, file_name, 
                          category, parent_tag, 
                          parent_css_class, parent_is_string=True):
    with open(file_name) as fp:
        soup = BeautifulSoup(fp, "lxml")
        news_tags = soup.find_all(parent_tag, re.compile(parent_css_class), string=parent_is_string)
        for nc in news_tags:
            if nc is not None:
                if len(nc.contents) == 4:
                    title = nc.contents[2].string.strip()
                    news_list.append(News(title, category))
                elif nc.string is not None and nc.string.strip() != "":
                    title = nc.string.strip()
                    news_list.append(News(title, category))

In [383]:
# collect news data for train and test datasets
def collect_news(dir_name, news_list):
    for category in os.listdir(dir_name):
        for news_site in os.listdir(dir_name + "/" + category):
            for file_name in os.listdir(dir_name + "/" + category + "/" + news_site):
                if ".html" in file_name:
                    full_file_name = dir_name + "/" + category + "/" + news_site + "/" + file_name
                    if news_site == "dw":
                        scrape_dw_news(news_list, full_file_name, category)
                    elif news_site == "spiegel":
                        scrape_spiegel_news(news_list, full_file_name, category)
                    elif news_site == "bbc":
                        scrape_bbc_news(news_list, full_file_name, category)
                    elif news_site == "cnbc":
                        scrape_cnbc_news(news_list, full_file_name, category)
                    elif news_site == "the_local_de":
                        scrape_others_news_one(news_list, full_file_name, category, "div", "article-content", "a")
                    elif news_site == "reuters":
                        scrape_others_news_two(news_list, full_file_name, category, "h3", "story-title")
                    elif news_site == "yahoo":
                        scrape_others_news_two(news_list, full_file_name, category, "a", "js-content-viewer", parent_is_string=False)    
                    else: 
                        # cnn, wsj, nbcnews, skysport
                        scrape_others_news_two(news_list, full_file_name, category, "span", "headline")

In [384]:
# train dataset
news_list_to_train = []
collect_news("news_html_train", news_list_to_train)

news_df_to_train = pd.DataFrame(news_list_to_train)
news_df_to_train

#news_df_to_train.loc[:, 'title']
#news_df_to_train.loc[:, 'category']

Unnamed: 0,title,category
0,New UK laws to sweep away EU state aid rules,business
1,New UK laws to sweep away EU state aid rules,business
2,UK space start-up gets $500m cash injection,business
3,Portugal exempts under-18s from quarantine,business
4,"Name all firms receiving furlough cash, say MPs",business
...,...,...
538,G20 ministers call for better COVID cooperatio...,politics
539,Eswatini: Anti-monarchy protests rock African ...,politics
540,SPACs: Europe plays catch-up over blank-check ...,politics
541,Opinion: Samia must bring Tanzania to post-Mag...,politics


In [385]:
X = news_df_to_train.loc[:, "title"]
y = news_df_to_train.loc[:, "category"]

In [386]:
news_category_classifier = Pipeline([("word_vectorizer", HashingVectorizer(n_features=2**14, ngram_range=(1, 4))), 
                                     ("decission_tree_classifier", DecisionTreeClassifier(splitter="random", random_state=2))])
#news_category_classifier.fit(X, y)

In [387]:
def calculate_classifier_metric(X, y, label):
    print("-----------------" + label + "------------------------------------------------------")
    print("#blackdiamond " + label +  " Score: " + str(news_category_classifier.score(X, y)))
    y_true = y
    y_pred = news_category_classifier.predict(X) 
    #y_pred_proba = news_category_classifier.predict_proba(X)
    #print("#blackdiamond " + label +  " Prediction Probability: " + str(y_pred_proba))
    print("#blackdiamond " + label +  " Precision: " + str(precision_score(y_true, y_pred, average="micro")))
    print("#blackdiamond " + label +  " Recall: " + str(recall_score(y_true, y_pred, average="micro")))
    print("#blackdiamond " + label +  " F1-Score: " + str(f1_score(y_true, y_pred, average="micro")))
    print("---------------------------------------------------------------------------------")

In [388]:
cross_validator = KFold(n_splits=14, shuffle=True, random_state=2)

for train_index, validation_index in cross_validator.split(X):
    X_train, X_validation = X[train_index], X[validation_index]
    y_train, y_validation = y[train_index], y[validation_index]
    
    news_category_classifier.fit(X_train, y_train)
    
    calculate_classifier_metric(X_train, y_train, "Train")
    calculate_classifier_metric(X_validation, y_validation, "Validation")

-----------------Train------------------------------------------------------
#blackdiamond Train Score: 0.9742063492063492
#blackdiamond Train Precision: 0.9742063492063492
#blackdiamond Train Recall: 0.9742063492063492
#blackdiamond Train F1-Score: 0.9742063492063492
---------------------------------------------------------------------------------
-----------------Validation------------------------------------------------------
#blackdiamond Validation Score: 0.5128205128205128
#blackdiamond Validation Precision: 0.5128205128205128
#blackdiamond Validation Recall: 0.5128205128205128
#blackdiamond Validation F1-Score: 0.5128205128205128
---------------------------------------------------------------------------------
-----------------Train------------------------------------------------------
#blackdiamond Train Score: 0.9761904761904762
#blackdiamond Train Precision: 0.9761904761904762
#blackdiamond Train Recall: 0.9761904761904762
#blackdiamond Train F1-Score: 0.9761904761904762
----

In [389]:
print(news_category_classifier.predict([news_df_to_train.loc[2, "title"]]))
print(news_category_classifier.predict_proba([news_df_to_train.loc[2, "title"]]))

['business']
[[1. 0. 0. 0.]]


In [390]:
news_df_to_train.loc[2, :]

title       UK space start-up gets $500m cash injection
category                                       business
Name: 2, dtype: object

In [396]:
news_titles_test = ["Queen Elizabeth to host Germany’s Merkel during UK visit", # politics
                    "Biden and DeSantis present a united front in response to deadly condo collapse", # politics
                    "Mashburn shares Larry Bird trash talk story from Dream Team scrimmage", # sports
                    "Glass Lewis recommends shareholders don't ratify Volkswagen's board at AGM"] # business  
print(news_category_classifier.predict(news_titles_test))
print(news_category_classifier.predict_proba(news_titles_test))

['politics' 'business' 'sports' 'sports']
[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [392]:
# test dataset
news_list_to_test = []
collect_news("news_html_test", news_list_to_test)

news_list_to_test = pd.DataFrame(news_list_to_test)
news_list_to_test

#news_list_to_test.loc[:, 'title']
#news_list_to_test.loc[:, 'category']

Unnamed: 0,title,category
0,U.S. weekly jobless claims fall; layoffs hit 2...,business
1,Trump company's CFO surrenders ahead of expect...,business
2,Oil climbs while OPEC+ considers output increase,business
3,Roche to cut 300-400 product development jobs ...,business
4,UniCredit to allow more remote working post-COVID,business
...,...,...
173,Six stress-busting steps for moving to Germany,politics
174,Brexit: What happens if you don’t exchange you...,politics
175,Today in Denmark: A round-up of the latest new...,politics
176,‘Health pass’: What documents do Americans nee...,politics


In [393]:
X_test = news_list_to_test.loc[:, "title"]
y_test = news_list_to_test.loc[:, "category"]

In [394]:
calculate_classifier_metric(X_test, y_test, "Test")

-----------------Test------------------------------------------------------
#blackdiamond Test Score: 0.5449438202247191
#blackdiamond Test Precision: 0.5449438202247191
#blackdiamond Test Recall: 0.5449438202247191
#blackdiamond Test F1-Score: 0.5449438202247191
---------------------------------------------------------------------------------
