Importing and cleaning

In [28]:
# importing libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM

In [25]:
# Register tqdm with pandas so that Series/DataFrame objects gain
# `progress_apply`, which the scoring cells use to show a progress bar.
tqdm.pandas()

In [5]:
# Load the training text data.
# The data directory defaults to the original local path, but can be pointed
# elsewhere through the POLLOB_DATA_DIR environment variable so the notebook
# is not tied to one machine's directory layout.
DATA_DIR = Path(os.environ.get('POLLOB_DATA_DIR', '/Users/Niek/pollob/data'))
df = pd.read_csv(DATA_DIR / 'training_text.csv', index_col=0)

In [11]:
# Strip the HTML markup from every document with BeautifulSoup, then split the
# remaining plain text on '.' into sentence-like fragments. Fragments that are
# empty after stripping whitespace are discarded. Rows whose text is missing
# (NaN) are skipped, because BeautifulSoup cannot parse a non-string value.
text_fragments = [
    {'text': fragment.strip()}
    for text in df['text'].dropna()
    for fragment in BeautifulSoup(text, 'html.parser').get_text().split('.')
    if fragment.strip()
]

In [13]:
# Cleaned frame: one row per text fragment produced by the splitting step.
# The column is named explicitly so the 'text' column exists even when no
# fragments were produced (an empty list would otherwise yield a frame with
# no columns and break the scoring cells that read cleaned_df['text']).
cleaned_df = pd.DataFrame(text_fragments, columns=['text'])

Importing the BERT transformer models from Hugging Face

In [None]:
# NLPTown: load the tokenizer and the sequence-classification model from the
# same checkpoint name.
NLPTOWN_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizernlptown = AutoTokenizer.from_pretrained(NLPTOWN_CHECKPOINT)
modelnlptown = AutoModelForSequenceClassification.from_pretrained(NLPTOWN_CHECKPOINT)

def sentiment_score(text):
    """Score ``text`` with the nlptown model.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the highest logit plus one (so the smallest possible
        score is 1).
    """
    # Truncate to 512 tokens: longer inputs would exceed the length the BERT
    # checkpoint can process and make the forward pass fail.
    tokens = tokenizernlptown.encode(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = modelnlptown(tokens)
    return int(torch.argmax(result.logits)) + 1

# Score every fragment with the nlptown model, showing a tqdm progress bar.
# Author's timing note: 32 seconds
cleaned_df['sentiment_score_multi'] = cleaned_df['text'].progress_apply(sentiment_score)

def analysis_multi(score):
    """Collapse a numeric score onto a three-way sentiment label.

    Returns 0 when the score equals 3, -1 when it is below 3, and 1 in every
    other case.
    """
    if score == 3:
        return 0
    return -1 if score < 3 else 1
      
# Convert the nlptown scores into -1 / 0 / 1 sentiment labels.
cleaned_df['sentiment_multi'] = cleaned_df['sentiment_score_multi'].progress_apply(analysis_multi)

  0%|                                    | 830/442909 [01:07<8:56:30, 13.73it/s]

In [None]:
# DTAI: load the tokenizer and the sequence-classification model from the
# same checkpoint name.
DTAI_CHECKPOINT = "DTAI-KULeuven/robbert-v2-dutch-sentiment"
tokenizerDTAI = AutoTokenizer.from_pretrained(DTAI_CHECKPOINT)
modelDTAI = AutoModelForSequenceClassification.from_pretrained(DTAI_CHECKPOINT)

def softmax(z):
    """Return the softmax of ``z``, normalised over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the quotient from becoming NaN) when the inputs are large.

    Args:
        z: Array-like of scores.

    Returns:
        numpy.ndarray: Non-negative values with the same shape as ``z`` that
        sum to 1 (an empty array for empty input).
    """
    z = np.asarray(z)
    if z.size == 0:
        # np.max raises on empty input; mirror the empty result instead.
        return np.exp(z)
    exps = np.exp(z - np.max(z))
    return exps / exps.sum()

def sentiment_score_DTAI(text):
    """Classify ``text`` with the DTAI model and return a sentiment label.

    Args:
        text: The string to classify.

    Returns:
        int: -1, 0 or 1, whichever label has the highest softmax probability.
    """
    # Use the DTAI tokenizer/model loaded in this notebook; the bare names
    # `tokenizer` and `model` are not defined here.
    # Truncate to 512 tokens so over-long inputs do not make the forward pass fail.
    encoded_input = tokenizerDTAI(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = modelDTAI(**encoded_input)
    scores = softmax(output[0][0].detach().numpy())
    # NOTE(review): this mapping assumes the model emits three logits ordered
    # negative, neutral, positive - confirm against modelDTAI.config.id2label;
    # with fewer than three outputs scores[2] raises IndexError.
    scores_dict = {
        -1: scores[0],
        0: scores[1],
        1: scores[2],
    }
    return max(scores_dict, key=scores_dict.get)
  
# Label every fragment with the DTAI model, showing a tqdm progress bar.
cleaned_df['sentiment_score_DTAI'] = cleaned_df['text'].progress_apply(sentiment_score_DTAI)

In [None]:
# gilesitorr fine-tuned: load the tokenizer and the sequence-classification
# model from the same checkpoint name.
GIL_CHECKPOINT = "gilesitorr/bert-base-multilingual-uncased-sentiment-3labels"
tokenizergil = AutoTokenizer.from_pretrained(GIL_CHECKPOINT)
modelgil = AutoModelForSequenceClassification.from_pretrained(GIL_CHECKPOINT)

def sentiment_score_gil(text):
    """Score ``text`` with the gilesitorr model.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the highest logit plus one (so the smallest possible
        score is 1).
    """
    # Truncate to 512 tokens: longer inputs would exceed the length the BERT
    # checkpoint can process and make the forward pass fail.
    tokens = tokenizergil.encode(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = modelgil(tokens)
    return int(torch.argmax(result.logits)) + 1
  
# Score every fragment with the gilesitorr model, showing a tqdm progress bar.
# Author's timing note: took 30 seconds
cleaned_df['sentiment_score_gil'] = cleaned_df['text'].progress_apply(sentiment_score_gil)

def analysis_gil(score):
    """Translate a gilesitorr score into a sentiment label.

    3 maps to 1, 2 maps to 0 and 1 maps to -1; any other value yields None.
    """
    for raw_score, label in ((3, 1), (2, 0), (1, -1)):
        if score == raw_score:
            return label
    return None
    
# Convert the gilesitorr scores into -1 / 0 / 1 sentiment labels.
cleaned_df['sentiment_gil'] = cleaned_df['sentiment_score_gil'].progress_apply(analysis_gil)

In [None]:
# Test the agreement of the three models on the annotated sample.
# NOTE(review): `annotated` is not defined in the visible cells; it is
# presumably a frame that already holds 'text', 'sentiment_score_multi' and
# 'sentiment_score_gil' - confirm before running.
annotated['sentiment_multi'] = annotated['sentiment_score_multi'].progress_apply(analysis_multi)
annotated['sentiment_score_DTAI'] = annotated['text'].progress_apply(sentiment_score_DTAI)
annotated['sentiment_gil'] = annotated['sentiment_score_gil'].progress_apply(analysis_gil)

# True on the rows where all three models produced the same label.
models_agree = (
    (annotated['sentiment_multi'] == annotated['sentiment_score_DTAI'])
    & (annotated['sentiment_score_DTAI'] == annotated['sentiment_gil'])
)
# Keep the shared label where the models agree, None otherwise.
annotated['agreement_models'] = np.where(models_agree, annotated['sentiment_multi'], None)

# Agreement rate = share of rows on which all three models agree. Summing the
# agreed labels themselves (-1 / 0 / 1) would let negative and positive rows
# cancel out and ignore neutral ones, so the boolean mask is averaged instead.
models_agree.mean()