In [24]:
import pandas as pd

# Load the headline dataset. The frame displayed below has `text`, `site` and
# `sentiment` columns; `sentiment` looks like a reference label of -1 / 1
# (assumption based on the displayed rows - confirm against how test.csv was built).
results = pd.read_csv("test.csv")

In [26]:
results

Unnamed: 0,text,site,sentiment
0,Struggle to evacuate Afghans from Kabul airport,bbc.co.uk,-1
1,Decision on who gets third vaccine dose due im...,bbc.co.uk,1
2,Banksy: How much do we really know about him?,bbc.co.uk,1
3,Raab rejects calls to quit over Afghan interpr...,bbc.co.uk,-1
4,Transfer latest and Premier League build-up,bbc.co.uk,1
...,...,...,...
112,Age is just a number: Rankin portraits celebra...,positive.news,1
113,Renowned photographer Rankin has teamed up wit...,positive.news,1
114,Catch up on what's been going right in the world,positive.news,1
115,Get Positive News stories in your inbox each week,positive.news,1


In [27]:
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Lexicon-based baselines. Each continuous score is collapsed to a binary label:
# -1 when the score is negative, 1 otherwise (neutral is grouped with positive).
# Only the `text` column is needed, so map over that column instead of building
# a row object for every record with DataFrame.apply(axis=1).
afinn = Afinn()
results["afinn_score"] = results["text"].map(
    lambda text: 1 if afinn.score(text) >= 0 else -1
)

vader = SentimentIntensityAnalyzer()
results["vader_score"] = results["text"].map(
    lambda text: 1 if vader.polarity_scores(text)["compound"] >= 0 else -1
)

In [28]:
results

Unnamed: 0,text,site,sentiment,afinn_score,vader_score
0,Struggle to evacuate Afghans from Kabul airport,bbc.co.uk,-1,-1,-1
1,Decision on who gets third vaccine dose due im...,bbc.co.uk,1,1,1
2,Banksy: How much do we really know about him?,bbc.co.uk,1,1,1
3,Raab rejects calls to quit over Afghan interpr...,bbc.co.uk,-1,-1,-1
4,Transfer latest and Premier League build-up,bbc.co.uk,1,1,1
...,...,...,...,...,...
112,Age is just a number: Rankin portraits celebra...,positive.news,1,1,1
113,Renowned photographer Rankin has teamed up wit...,positive.news,1,1,1
114,Catch up on what's been going right in the world,positive.news,1,1,1
115,Get Positive News stories in your inbox each week,positive.news,1,1,1


In [35]:
from transformers import pipeline
# Pin the checkpoint instead of relying on the task's implicit default, so the
# results stay reproducible if the library ever changes that default. This is
# the DistilBERT SST-2 model that "sentiment-analysis" resolves to by default.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# The pipeline accepts a list of texts and returns one {"label", "score"} dict
# per input, in order, so the whole column can be scored in a single call.
bert_predictions = classifier(results["text"].tolist())

# Binary label: -1 for NEGATIVE, 1 for anything else.
results["bert_score"] = [
    -1 if prediction["label"] == "NEGATIVE" else 1
    for prediction in bert_predictions
]

In [36]:
results

Unnamed: 0,text,site,sentiment,afinn_score,vader_score,bert_score
0,Struggle to evacuate Afghans from Kabul airport,bbc.co.uk,-1,-1,-1,-1
1,Decision on who gets third vaccine dose due im...,bbc.co.uk,1,1,1,-1
2,Banksy: How much do we really know about him?,bbc.co.uk,1,1,1,-1
3,Raab rejects calls to quit over Afghan interpr...,bbc.co.uk,-1,-1,-1,1
4,Transfer latest and Premier League build-up,bbc.co.uk,1,1,1,1
...,...,...,...,...,...,...
112,Age is just a number: Rankin portraits celebra...,positive.news,1,1,1,1
113,Renowned photographer Rankin has teamed up wit...,positive.news,1,1,1,1
114,Catch up on what's been going right in the world,positive.news,1,1,1,1
115,Get Positive News stories in your inbox each week,positive.news,1,1,1,1


In [37]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

# Hub id of the Twitter-tuned RoBERTa sentiment checkpoint (plain string: there
# is nothing to interpolate, so no f-string is needed).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the hub
# id, and from_pretrained() prefers an existing local directory over the hub.
# Saving only the model leaves that directory without tokenizer files, so the
# tokenizer load above fails on the next run. Save both so a re-run works.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [46]:
import numpy as np
from scipy.special import softmax

def roberta(text):
    """Return a binary sentiment label for ``text`` using the RoBERTa model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The string to classify.

    Returns:
        -1 when the first class (treated as "negative") has strictly the
        highest probability of the three classes, otherwise 1 (neutral and
        positive are grouped together).
    """
    # Cap the input at 512 tokens: longer sequences overflow the model's
    # position embeddings and raise at inference time. max_length is given
    # explicitly so truncation does not depend on the tokenizer config.
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the row for the single input.
    scores = softmax(output[0][0].detach().numpy())
    negative, neutral, positive = scores

    if negative > neutral and negative > positive:
        return -1
    return 1
    
# Score every headline with the RoBERTa helper. Only the `text` column is read,
# so map over it directly rather than materialising each row via apply(axis=1).
results["roberta_score"] = results["text"].map(roberta)

In [48]:
results[10:20]

Unnamed: 0,text,site,sentiment,afinn_score,vader_score,bert_score,roberta_score
10,Gunman argued with mum before mass shooting,bbc.co.uk,-1,1,-1,-1,-1
11,Arrest in manhunt for murdered nurse's husband,bbc.co.uk,-1,-1,-1,-1,-1
12,Women's Open: Sagstrom leads with Korda chasin...,bbc.co.uk,1,1,1,1,1
13,Man Utd great Law diagnosed with dementia,bbc.co.uk,-1,1,1,-1,1
14,Mother in court charged with murder of her son,bbc.co.uk,-1,-1,-1,-1,-1
15,NI health service facing 'a very difficult win...,bbc.co.uk,-1,-1,-1,-1,-1
16,Newquay has highest Covid rates in England,bbc.co.uk,-1,1,1,1,1
17,'We just can't take any risks',bbc.co.uk,-1,-1,1,-1,1
18,Man takes legal action over quarantine hotel stay,bbc.co.uk,-1,1,1,-1,1
19,Hampshire & Isle of Wight,bbc.co.uk,1,1,1,1,1


The scores fall into 3 groups: negative, neutral and positive. To keep the comparison binary we'll use only 2 groups, merging the neutral scores into the positive group (i.e. any score >= 0 counts as positive).

With that in mind, the models seem to be performing pretty well. Only a couple of misses:
- `Why the Tourette's...` was scored negatively by AFINN
- `We never knew...` was scored positively by VADER

Let's check every place where the models are disagreeing...

In [20]:
# Rows where AFINN and VADER land on opposite sides of zero
# (one strictly negative, the other neutral-or-positive).
token_df.query(
    "(afinn_score < 0 and vader_score >= 0)"
    " or (afinn_score >= 0 and vader_score < 0)"
)

Unnamed: 0,token,afinn_score,vader_score
6,UK records a further 100 Covid deaths,-0.285714,0.0
7,"Germany fears thousands got saline, not vaccine",0.0,-0.4215
11,"Murder-accused boy, 14, in court after stabbing",-0.571429,0.0
12,"England lose two wickets in two balls - clips, radio & text",0.0,-0.4019
13,Women's Hundred: Trent Rockets struggle in must-win game against Birmingham Phoenix,0.181818,-0.3182
28,Casualty's 10 most memorable episodes,-0.2,0.0
31,'I'm just not ready to buy an electric car',0.0,-0.2755
35,Why the Tourette's queen of Twitch hasn't been banned,-0.222222,0.357
36,'We never knew how dangerous Loch Lomond was',-0.25,0.3724
62,Â© 2021 BBC. The BBC is not responsible for the content of external sites.,0.142857,-0.2411


There are 10 disagreements (out of 64 phrases). AFINN got 3 right and VADER got 7, so VADER seems to be the superior model!

But what if we could make it even more accurate with a pre-trained, state-of-the-art model? Enter Hugging Face

In [21]:
from transformers import pipeline
# Pin the checkpoint instead of relying on the task's implicit default, so the
# results stay reproducible if the library ever changes that default. This is
# the DistilBERT SST-2 model that "sentiment-analysis" resolves to by default.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

Let's see if the model works

In [34]:
classifier('We are very happy to show you the 🤗 Transformers library.')[0]

{'label': 'POSITIVE', 'score': 0.9997795224189758}

So far so good. Let's now apply it to the dataset.

In [23]:
# Attach a signed classifier confidence to every record: the pipeline's score is
# negated for a NEGATIVE label and kept as-is for any other label.
for token in scored_tokens:
    prediction = classifier(token["token"])[0]
    sign = -1 if prediction["label"] == "NEGATIVE" else 1
    token["bert_score"] = prediction["score"] * sign

# Rebuild the frame so it includes the newly added "bert_score" key.
token_df = pd.DataFrame(scored_tokens)

In [24]:
token_df[["token","bert_score"]][10:20]

Unnamed: 0,token,bert_score
10,Panic as thousands flee Taliban onslaught,-0.975269
11,"Murder-accused boy, 14, in court after stabbing",-0.982073
12,"England lose two wickets in two balls - clips, radio & text",-0.996979
13,Women's Hundred: Trent Rockets struggle in must-win game against Birmingham Phoenix,0.963504
14,'One signing could decide the title race - but it's not Harry Kane',-0.98869
15,Holiday 'stress' over paper vaccine certificate,-0.994894
16,"Woman arrested in murder probe after boy, 2, dies",-0.960317
17,Medics warn of more cancelled operations,-0.999388
18,3 things we love today,0.999752
19,Three of the strangest organs in the animal kingdom,0.993196


BERT has perfect accuracy on the slice, impressive! Let's see how it compares to VADER...

In [25]:
# Rows where the classifier and VADER land on opposite sides of zero.
# Stored under a descriptive name: the previous name `filter` shadowed the
# Python built-in for the rest of the kernel session.
bert_vader_disagreements = token_df.query(
    "(bert_score < 0 and vader_score >= 0)"
    " or (bert_score >= 0 and vader_score < 0)"
)[["token", "vader_score", "bert_score"]]

# Per-column non-null counts, i.e. how many phrases the two models disagree on.
print(bert_vader_disagreements.count())
bert_vader_disagreements.to_csv("bert-vader.csv")

token          19
vader_score    19
bert_score     19
dtype: int64


Turns out DistilBERT has lower accuracy than VADER?!

What if we use another model, one tailored to Twitter?

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

# Hub id of the Twitter-tuned RoBERTa sentiment checkpoint (plain string: there
# is nothing to interpolate, so no f-string is needed).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the hub
# id, and from_pretrained() prefers an existing local directory over the hub.
# Saving only the model leaves that directory without tokenizer files, so the
# tokenizer load above fails on the next run. Save both so a re-run works.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [5]:
# Smoke-test the tokenizer and model on one short example before scoring the dataset.
text = "Good night 😊"
# return_tensors='pt' asks the tokenizer for PyTorch tensors, which are then
# unpacked as keyword arguments into the model call.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [6]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4362,  0.5167,  2.2756]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [7]:
import numpy as np
from scipy.special import softmax

In [8]:
# output[0] is the logits tensor; [0] picks the row for the single input.
# detach() drops the autograd graph so the tensor can be converted to NumPy.
scores = output[0][0].detach().numpy()
# Turn the three logits into probabilities that sum to 1.
scores = softmax(scores)
scores

array([0.00760985, 0.14581196, 0.8465782 ], dtype=float32)

In [28]:
scores[0]

0.007609855

In [27]:
token_df

Unnamed: 0,token,afinn_score,vader_score,bert_score
0,Plymouth mass shooter was licensed gun holder,-0.142857,-0.34,-0.702464
1,PM calls emergency meeting to discuss Afghanistan,-0.285714,-0.3818,-0.804552
2,England bat after Anderson takes five wickets in second Test,0.0,0.0,0.941195
3,Teachers: 'It's been hell grading exams',-0.666667,-0.6808,-0.995785
4,"'One signing could decide the title race, but it's not Harry Kane'",0.0,0.0,-0.983202
5,Lure of the island with no electricity or wi-fi,-0.111111,-0.296,-0.999425
6,UK records a further 100 Covid deaths,-0.285714,0.0,-0.977418
7,"Germany fears thousands got saline, not vaccine",0.0,-0.4215,-0.998494
8,'We had to tip milk down the drain - now we sell 200-400 bottles a day',0.0,0.0,-0.998709
9,Gunman's victims include three-year-old girl,-0.6,-0.3182,-0.959833


In [30]:
# Three-way RoBERTa label for every record:
# -1 = first class (negative) wins, 0 = second class (neutral) wins, 1 = otherwise.
for token in scored_tokens:
    # Cap the input at 512 tokens: longer sequences overflow the model's
    # position embeddings and raise at inference time. max_length is given
    # explicitly so truncation does not depend on the tokenizer config.
    encoded_input = tokenizer(
        token["token"], return_tensors='pt', truncation=True, max_length=512
    )
    output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the row for the single input.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    # A class only "wins" when it is strictly greater than both others;
    # any tie falls through to the final (positive) branch.
    if scores[0] > scores[1] and scores[0] > scores[2]:
        roberta_score = -1
    elif scores[1] > scores[0] and scores[1] > scores[2]:
        roberta_score = 0
    else:
        roberta_score = 1

    token["roberta_score"] = roberta_score

In [32]:
# Rebuild the frame so it includes the new "roberta_score" key, then preview
# ten rows (positions 25 through 34) of the phrase and its RoBERTa label.
token_df = pd.DataFrame(scored_tokens)
token_df[["token", "roberta_score"]].iloc[25:35]

Unnamed: 0,token,roberta_score
25,Britney Spears' father to step down as conservator,0
26,Grimmy on leaving Radio 1 and the 'instant bad mood' song,-1
27,Olympian Adam Peaty joins the all-star Strictly 2021 line-up,0
28,Casualty's 10 most memorable episodes,1
29,Marvel launches new Disney+ show featuring Chadwick Boseman,1
30,First batch of student's washing machines shipped,0
31,'I'm just not ready to buy an electric car',-1
32,Olympian given new medal after first got bitten,0
33,'When you're on a BMX 20 feet in the air there's no room for error' Video,0
34,Swimmer taking on 'coldest swim on Earth' to highlight climate change,0


In [33]:
# Rows where RoBERTa and VADER land on opposite sides of zero. RoBERTa's
# neutral label (0) counts as non-negative here.
# Stored under a descriptive name: the previous name `filter` shadowed the
# Python built-in for the rest of the kernel session.
roberta_vader_disagreements = token_df.query(
    "(roberta_score < 0 and vader_score >= 0)"
    " or (roberta_score >= 0 and vader_score < 0)"
)[["token", "vader_score", "roberta_score"]]

# Per-column non-null counts, i.e. how many phrases the two models disagree on.
print(roberta_vader_disagreements.count())
roberta_vader_disagreements.to_csv("roberta-vader.csv")

token            14
vader_score      14
roberta_score    14
dtype: int64
