In [1]:
import eredivisie_nlp as enlp
import pandas as pd
from pattern.nl import sentiment
from random import randrange
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
from sklearn.model_selection import train_test_split

# Pattern

In [2]:
df = pd.read_excel(enlp.determine_root() + "/data/sentiment_annotations_manual_no_leakage.xlsx")
df = df[df.annotation != 100]
train, test = train_test_split(df, test_size=100, random_state=237)

Pattern <> manual annotations

In [3]:
test['sent_pattern'] = [sentiment(text)[0] for text in test.text]
test

Unnamed: 0,text,hashtag,annotation,annotation_std,sent_pattern
137,ajaaz benieuwd of ten hag nu een laat zien dat...,#ajaaz,-0.3,2,-0.175000
101342,ik ben er klaar voor.,#pecspa,0.3,4,0.366667
76,"ten hag zwaar teleurgesteld in zijn spelers, m...",#ajaaz,-0.6,1,0.138333
269,typisch dat ajax deze dan weer laat liggen nat...,#ajaaz,-0.3,2,-0.008333
54512,genoeg kansen gecreëerd hoor vandaag. maar wat...,#feyrkc,-0.3,2,0.000000
...,...,...,...,...,...
27,ook weer geregeld.,#ajaaz,0.3,4,0.150000
71,antony_aja is 21 en ziyech was al 24/25 die wa...,#ajaaz,0.0,3,0.000000
20369,veronicainside espn ajautr maher_utr beging ee...,#ajautr,-0.3,2,0.050000
54497,"69. kramer, elbouchataoui_rkc en bakari eraf, ...",#feyrkc,0.0,3,0.000000


In [4]:
mean_absolute_error(test.annotation, test.sent_pattern)

0.3869361111111111

Let's compare this with a naive approach

In [5]:
test['naive_6d'] = [randrange(1000000)/1000000 for p in range(test.shape[0])]  # pattern gives 6 decimals
test['naive_1d'] = [randrange(21) / 10 - 1 for x in range(test.shape[0])]  # standard SA is 1 decimal

In [6]:
mean_absolute_error(test.annotation, test.naive_6d)

0.63826296

In [7]:
mean_absolute_error(test.annotation, test.naive_1d)

0.564

The number of digits does not have an influence on the performance of a naive classifier
Errors of 0.3 vs 0.6 are less bad than -0.3 vs 0.3. Let's see how often pattern gets the polarity wrong.

In [8]:
# percentage agreement manual <> pattern
agreement = [1 if enlp.is_positive(p) == enlp.is_positive(m) else 0 for p, m in zip(test.sent_pattern, test.annotation)]
sum(agreement) / test.shape[0]

0.51

In [9]:
# cohens kappa manual <> pattern
pd.Series([enlp.is_positive(m) for m in test.annotation]).value_counts()
cohen_kappa_score([enlp.is_positive(a) for a in test.annotation], [enlp.is_positive(p) for p in test.sent_pattern])

0.2665768597515342

Now when transforming pattern.nl annotations to the same format as the manual ones.

In [17]:
test['pattern_round'] = [enlp.round_score(p) for p in test.sent_pattern]

In [18]:
mean_absolute_error(test.annotation, test.pattern_round)

0.36899999999999994

In [23]:
test['annotation_pol'] = [enlp.is_positive(a) for a in test.annotation]
test['pattern_pol'] = [enlp.is_positive(p) for p in test.sent_pattern]
mean_absolute_error(test.annotation_pol, test.pattern_pol)

0.69

# RobBERT