Submit request to website

In [1]:
url = "positive.news"

In [59]:
import requests

def request(url):
    """sends a request to the URL"""

    # add https if not in there at start
    if url[0:8] != "https://" and url[0:7] != "http://":
        url = "https://" + url

    my_session = requests.session()
    
    # these settings help avoid getting blocked by site
    for_cookies = requests.get(url, timeout=5).cookies
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
    }

    return my_session.get(url, headers=headers, cookies=for_cookies, timeout=5)

response = request(url)

Check the request worked

In [60]:
print(response.status_code)
print(response.text[0:400])

200
<!DOCTYPE html>
<!--[if lt IE 7]><html class="no-js ie ie6 lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]><html class="no-js ie ie7 lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]><html class="no-js ie ie8 lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en"> <!--<![endif]-->
<head>
	
    
    

    
            <!-- Google Tag Manager -->
        <


Export the response text into a file so you don't get different results when you query again

In [61]:
with open(url+"-html_text.txt", "w") as text_file:
    text_file.write(response.text)

Start here if you've already submitted the request

In [2]:
with open(url+"-html_text.txt","r") as text_file:
    response_text = text_file.read()

Now we need to pull out all the relevant text content from the HTML of the websites. We'll start by splitting the HTML text into an array of text pieces

In [3]:
from bs4 import BeautifulSoup as bs

soup_li = bs(response_text, "lxml").body.get_text(separator="||").split("||")

Check if it worked

In [4]:
print(len(soup_li))
print(soup_li[0:20])

697
['\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n']


OK, there's a lot of generic text that pertains to navigation links. Let's filter for text that has more than 5 words and which doesn't include generic keywords like `sign up` or `newsletter`

In [5]:
KEYWORDS = ["cookie",
            "newsletter",
            "copyright",
            "trademark",
            "mailing list",
            "subscribe",
            "sign up",
            "rights reserved",
            "this site",
            "©",
            "ltd",
            "llp",
            "inc"
           ]

def is_generic(text):
    if len(text.split()) < 5:
        return True
    
    lower_text = text.lower()
    
    for word in KEYWORDS:
        if word in lower_text:
            return True
        
    return False

long_text = [x for x in soup_li if not is_generic(x)]

More generic text that can be filtered out by keyword

In [6]:
long_text[0:20]

['What can I do about climate change? 14 ways to take positive action',
 'After the UN issued a ‘code red for humanity’ last week, many people are asking — what can I do about climate change? Quite a lot, actually ',
 'The cookbook for people who have long Covid',
 '\n          The authors of a new, free cookbook hope it will improve taste for Covid patients\n        ',
 'Wind power firm aims to nip nimbyism in the bud with tulip-shaped turbines',
 "\n          Want to improve your business's eco-credentials, and ward off nimby naysayers? Flower Turbines may be the ticket \n        ",
 'Meet the plastic-hunting ‘pirates’ of Cornwall',
 '\n          The pirates’ bounty is melted down to make sea kayaks, which are then used to collect more rubbish   \n        ',
 'What went right this week: Australia’s ‘healing journey’, plus more positive news',
 '\n          Australia pledged reparations for indigenous people, wildlife returned to Scottish rivers, plus coffee shops that help homeless p

Seems like there's some duplicates. Let's remove them.

In [7]:
unique_li = []

for text_l in long_text:
    unique = True
    
    for text_u in unique_li:
        if text_l in text_u:
            unique = False
        
    if unique:
        unique_li.append(text_l)

In [8]:
unique_li

['What can I do about climate change? 14 ways to take positive action',
 'After the UN issued a ‘code red for humanity’ last week, many people are asking — what can I do about climate change? Quite a lot, actually ',
 'The cookbook for people who have long Covid',
 '\n          The authors of a new, free cookbook hope it will improve taste for Covid patients\n        ',
 'Wind power firm aims to nip nimbyism in the bud with tulip-shaped turbines',
 "\n          Want to improve your business's eco-credentials, and ward off nimby naysayers? Flower Turbines may be the ticket \n        ",
 'Meet the plastic-hunting ‘pirates’ of Cornwall',
 '\n          The pirates’ bounty is melted down to make sea kayaks, which are then used to collect more rubbish   \n        ',
 'What went right this week: Australia’s ‘healing journey’, plus more positive news',
 '\n          Australia pledged reparations for indigenous people, wildlife returned to Scottish rivers, plus coffee shops that help homeless p

In [12]:
import re

def text_transform(text_input):
    encoded_text = text_input.encode("ascii", "ignore")
    decoded_text = encoded_text.decode("unicode_escape")
    stripped_text = re.sub(
        r"\r|\n|\t| \(link opens in a new browser window\)", "", decoded_text
    ).strip()
    return stripped_text

processed_li = [text_transform(x) for x in unique_li]

In [15]:
processed_li

['What can I do about climate change? 14 ways to take positive action',
 'After the UN issued a code red for humanity last week, many people are asking  what can I do about climate change? Quite a lot, actually',
 'The cookbook for people who have long Covid',
 'The authors of a new, free cookbook hope it will improve taste for Covid patients',
 'Wind power firm aims to nip nimbyism in the bud with tulip-shaped turbines',
 "Want to improve your business's eco-credentials, and ward off nimby naysayers? Flower Turbines may be the ticket",
 'Meet the plastic-hunting pirates of Cornwall',
 'The pirates bounty is melted down to make sea kayaks, which are then used to collect more rubbish',
 'What went right this week: Australias healing journey, plus more positive news',
 'Australia pledged reparations for indigenous people, wildlife returned to Scottish rivers, plus coffee shops that help homeless people',
 'Up for grabs: how litter-pickers are taking the rubbish crisis into their own hand

In [16]:
import pandas as pd

pd.set_option("max_row",None)
pd.set_option('display.max_colwidth', None)

token_df = pd.DataFrame(processed_li,columns=["text"])
token_df["site"] = url

In [17]:
token_df

Unnamed: 0,text,site
0,What can I do about climate change? 14 ways to take positive action,positive.news
1,"After the UN issued a code red for humanity last week, many people are asking what can I do about climate change? Quite a lot, actually",positive.news
2,The cookbook for people who have long Covid,positive.news
3,"The authors of a new, free cookbook hope it will improve taste for Covid patients",positive.news
4,Wind power firm aims to nip nimbyism in the bud with tulip-shaped turbines,positive.news
5,"Want to improve your business's eco-credentials, and ward off nimby naysayers? Flower Turbines may be the ticket",positive.news
6,Meet the plastic-hunting pirates of Cornwall,positive.news
7,"The pirates bounty is melted down to make sea kayaks, which are then used to collect more rubbish",positive.news
8,"What went right this week: Australias healing journey, plus more positive news",positive.news
9,"Australia pledged reparations for indigenous people, wildlife returned to Scottish rivers, plus coffee shops that help homeless people",positive.news


In [18]:
results = pd.DataFrame(columns=["text","site"])

In [None]:
results = results.append(token_df)

In [29]:
results.drop([1])

Unnamed: 0.1,Unnamed: 0,text,site
0,0.0,Struggle to evacuate Afghans from Kabul airport,bbc.co.uk
2,2.0,Banksy: How much do we really know about him?,bbc.co.uk
3,3.0,Raab rejects calls to quit over Afghan interpreters,bbc.co.uk
4,4.0,Transfer latest and Premier League build-up,bbc.co.uk
5,5.0,Facebook removes abusive comments on Lizzo's pages,bbc.co.uk
6,6.0,'The Taliban have moved into the house next door' Video,bbc.co.uk
7,7.0,How many Afghan asylum seekers has the UK taken in?,bbc.co.uk
8,8.0,'It was a choice to live or be beheaded',bbc.co.uk
9,9.0,No planes leaving Afghanistan empty - UK minister,bbc.co.uk
10,10.0,Gunman argued with mum before mass shooting,bbc.co.uk


In [30]:
results.to_csv("test.csv")

The values can be split into 3 groups: negative, neutral and positive. But it's more optimal to have only 2 groups, so we'll group together the positive and neutral scores. 

With that in mind, the models seem to be performing pretty well. Only a couple of misses:
- `Why the Tourette's...` was scored negatively by AFINN
- `We never knew...` was scored positively by VADER

Let's check every place where the models are disagreeing...

In [None]:
token_df[((token_df["afinn_score"]<0) & (token_df["vader_score"]>=0)) | ((token_df["afinn_score"]>=0) & (token_df["vader_score"]<0))]

Unnamed: 0,token,afinn_score,vader_score
6,UK records a further 100 Covid deaths,-0.285714,0.0
7,"Germany fears thousands got saline, not vaccine",0.0,-0.4215
11,"Murder-accused boy, 14, in court after stabbing",-0.571429,0.0
12,"England lose two wickets in two balls - clips, radio & text",0.0,-0.4019
13,Women's Hundred: Trent Rockets struggle in must-win game against Birmingham Phoenix,0.181818,-0.3182
28,Casualty's 10 most memorable episodes,-0.2,0.0
31,'I'm just not ready to buy an electric car',0.0,-0.2755
35,Why the Tourette's queen of Twitch hasn't been banned,-0.222222,0.357
36,'We never knew how dangerous Loch Lomond was',-0.25,0.3724
62,Â© 2021 BBC. The BBC is not responsible for the content of external sites.,0.142857,-0.2411


There's 10 disagreements (out of 64 phrases). AFINN got 3 right, VADER 7 - hence VADER seems to be the superior model!

But what if we could make it even more accurate with a pre-trained, state-of-the-art model? Enter Hugging Face

In [None]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis")

Let's see if the model works

In [None]:
classifier('We are very happy to show you the 🤗 Transformers library.')

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

So far so good. Let's now apply it to the dataset.

In [None]:
for token in scored_tokens:
    bert = classifier(token["token"])[0]
    token["bert_score"] =  bert["score"] * (-1 if bert["label"] == "NEGATIVE" else 1)
    
token_df = pd.DataFrame(scored_tokens)

In [None]:
token_df[["token","bert_score"]][10:20]

Unnamed: 0,token,bert_score
10,Panic as thousands flee Taliban onslaught,-0.975269
11,"Murder-accused boy, 14, in court after stabbing",-0.982073
12,"England lose two wickets in two balls - clips, radio & text",-0.996979
13,Women's Hundred: Trent Rockets struggle in must-win game against Birmingham Phoenix,0.963504
14,'One signing could decide the title race - but it's not Harry Kane',-0.98869
15,Holiday 'stress' over paper vaccine certificate,-0.994894
16,"Woman arrested in murder probe after boy, 2, dies",-0.960317
17,Medics warn of more cancelled operations,-0.999388
18,3 things we love today,0.999752
19,Three of the strangest organs in the animal kingdom,0.993196


BERT has perfect accuracy on the slice, impressive! Let's see how it compares to VADER...

In [None]:
filter = token_df[((token_df["bert_score"]<0) & (token_df["vader_score"]>=0)) | ((token_df["bert_score"]>=0) & (token_df["vader_score"]<0))][["token","vader_score","bert_score"]]
print(filter.count())
filter.to_csv("bert-vader.csv")

token          19
vader_score    19
bert_score     19
dtype: int64


Turns out DistilBERT has lower accuracy than VADER?!

What if we use another model, tailored to twitter?

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
text = "Good night 😊"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4362,  0.5167,  2.2756]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [None]:
import numpy as np
from scipy.special import softmax

In [None]:
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores

array([0.00760985, 0.14581196, 0.8465782 ], dtype=float32)

In [None]:
scores[0]

0.007609855

In [None]:
token_df

Unnamed: 0,token,afinn_score,vader_score,bert_score
0,Plymouth mass shooter was licensed gun holder,-0.142857,-0.34,-0.702464
1,PM calls emergency meeting to discuss Afghanistan,-0.285714,-0.3818,-0.804552
2,England bat after Anderson takes five wickets in second Test,0.0,0.0,0.941195
3,Teachers: 'It's been hell grading exams',-0.666667,-0.6808,-0.995785
4,"'One signing could decide the title race, but it's not Harry Kane'",0.0,0.0,-0.983202
5,Lure of the island with no electricity or wi-fi,-0.111111,-0.296,-0.999425
6,UK records a further 100 Covid deaths,-0.285714,0.0,-0.977418
7,"Germany fears thousands got saline, not vaccine",0.0,-0.4215,-0.998494
8,'We had to tip milk down the drain - now we sell 200-400 bottles a day',0.0,0.0,-0.998709
9,Gunman's victims include three-year-old girl,-0.6,-0.3182,-0.959833


In [None]:
for token in scored_tokens:
    encoded_input = tokenizer(token["token"], return_tensors='pt')
    output = model(**encoded_input)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    
    if scores[0] > scores[1] and scores[0] > scores[2]:
        roberta_score = -1
    elif scores[1] > scores[0] and scores[1] > scores[2]:
        roberta_score = 0
    else:
        roberta_score = 1
        
    token["roberta_score"] = roberta_score

In [None]:
token_df = pd.DataFrame(scored_tokens)
token_df[["token","roberta_score"]][25:35]

Unnamed: 0,token,roberta_score
25,Britney Spears' father to step down as conservator,0
26,Grimmy on leaving Radio 1 and the 'instant bad mood' song,-1
27,Olympian Adam Peaty joins the all-star Strictly 2021 line-up,0
28,Casualty's 10 most memorable episodes,1
29,Marvel launches new Disney+ show featuring Chadwick Boseman,1
30,First batch of student's washing machines shipped,0
31,'I'm just not ready to buy an electric car',-1
32,Olympian given new medal after first got bitten,0
33,'When you're on a BMX 20 feet in the air there's no room for error' Video,0
34,Swimmer taking on 'coldest swim on Earth' to highlight climate change,0


In [None]:
filter = token_df[((token_df["roberta_score"]<0) & (token_df["vader_score"]>=0)) | ((token_df["roberta_score"]>=0) & (token_df["vader_score"]<0))][["token","vader_score","roberta_score"]]
print(filter.count())
filter.to_csv("roberta-vader.csv")

token            14
vader_score      14
roberta_score    14
dtype: int64
