In [1]:
import eredivisie_nlp as enlp
import numpy as np
import pandas as pd
from pattern.nl import sentiment
from random import randrange
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Pattern

In [2]:
test = pd.read_csv(enlp.determine_root() + "/data/test_set.csv")

Pattern <> manual annotations

In [3]:
test['sent_pattern'] = [sentiment(text)[0] for text in test.text]
test

Unnamed: 0.1,Unnamed: 0,text,hashtag,annotation,annotation_std,sent_pattern
0,0,ajaaz benieuwd of ten hag nu een laat zien dat...,#ajaaz,-0.3,2,-0.175000
1,1,ik ben er klaar voor.,#pecspa,0.3,4,0.366667
2,2,"ten hag zwaar teleurgesteld in zijn spelers, m...",#ajaaz,-0.6,1,0.138333
3,3,typisch dat ajax deze dan weer laat liggen nat...,#ajaaz,-0.3,2,-0.008333
4,4,genoeg kansen gecreëerd hoor vandaag. maar wat...,#feyrkc,-0.3,2,0.000000
...,...,...,...,...,...,...
95,95,ook weer geregeld.,#ajaaz,0.3,4,0.150000
96,96,antony_aja is 21 en ziyech was al 24/25 die wa...,#ajaaz,0.0,3,0.000000
97,97,veronicainside espn ajautr maher_utr beging ee...,#ajautr,-0.3,2,0.050000
98,98,"69. kramer, elbouchataoui_rkc en bakari eraf, ...",#feyrkc,0.0,3,0.000000


In [4]:
mean_absolute_error(test.annotation, test.sent_pattern)

0.3869361111111111

Let's compare this with a naive approach

In [5]:
test['naive_6d'] = [randrange(1000000)/1000000 for p in range(test.shape[0])]  # pattern gives 6 decimals
test['naive_1d'] = [randrange(21) / 10 - 1 for x in range(test.shape[0])]  # standard SA is 1 decimal

In [6]:
mean_absolute_error(test.annotation, test.naive_6d)

0.69059834

In [7]:
mean_absolute_error(test.annotation, test.naive_1d)

0.6419999999999999

The number of digits does not have an influence on the performance of a naive classifier
Errors of 0.3 vs 0.6 are less bad than -0.3 vs 0.3. Let's see how often pattern gets the polarity wrong.

In [8]:
# percentage agreement manual <> pattern
agreement = [1 if enlp.is_positive(p) == enlp.is_positive(m) else 0 for p, m in zip(test.sent_pattern, test.annotation)]
sum(agreement) / test.shape[0]

0.51

In [9]:
# cohens kappa manual <> pattern
pd.Series([enlp.is_positive(m) for m in test.annotation]).value_counts()
cohen_kappa_score([enlp.is_positive(a) for a in test.annotation], [enlp.is_positive(p) for p in test.sent_pattern])

0.2665768597515342

Now when transforming pattern.nl annotations to the same format as the manual ones.

In [10]:
test['pattern_round'] = [enlp.round_score(p) for p in test.sent_pattern]

In [11]:
mean_absolute_error(test.annotation, test.pattern_round)

0.36899999999999994

In [12]:
test['annotation_pol'] = [enlp.is_positive(a) for a in test.annotation]
test['pattern_pol'] = [enlp.is_positive(p) for p in test.sent_pattern]
mean_absolute_error(test.annotation_pol, test.pattern_pol)

0.69

# RobBERT

In [13]:
# load models and tokenizer
models = {
    'oneshot': RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base"),
    '150': RobertaForSequenceClassification.from_pretrained("./robbert_150"),
    '300': RobertaForSequenceClassification.from_pretrained("./robbert_300"),
    '600': RobertaForSequenceClassification.from_pretrained("./robbert_600")
}
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

Some weights of the model checkpoint at pdelobelle/robbert-v2-dutch-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classif

In [14]:
# SA process
model_choice = '150'
model = models[model_choice]

encoded_tweets = [tokenizer(tweet, return_tensors='pt') for tweet in test.text]
outputs = [model(**et)[0].detach().numpy() for et in tqdm(encoded_tweets)]
sentiments = []
for output in outputs:
    robbert_score = np.where(output[0] == max(output[0]))[0].item()
    sentiments.append(enlp.transform_score(robbert_score))
test[model_choice] = sentiments

100%|██████████| 100/100 [00:05<00:00, 18.23it/s]


In [15]:
mean_absolute_error(test.annotation, test[model_choice])

0.315