# Sentiment analysis on document level


## Baseline for binary classification

In [18]:
from transformers import pipeline
import numpy as np

# Pin the checkpoint and revision explicitly so the baseline is reproducible.
# These are the values the unpinned call resolved to, according to the
# library's own "No model was supplied" warning.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Device set to use mps:0


In [32]:
def load_data(file_path):
    """Read the file at ``file_path`` in text mode and return its whole content.

    The file is opened with the platform's default text encoding.
    """
    with open(file_path, mode='r') as text_file:
        contents = text_file.read()
    return contents

In [33]:
# Load the complete article text as a single string.
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine. Consider a configurable, project-relative data directory.
text_whole = load_data("/Users/danielbischoff/Documents/MasterInformatik/MSE/Best-search-engine-in-the-world/sentiment_text/Experience_a_postcard_delivered_121_years_late_led_me_to_my_long-lost_family.txt")

In [2]:
# Load data from a file as chunks (one per non-empty line) into a list.

def load_data_as_chunks(file_path):
    """Return the non-empty lines of the file at ``file_path``.

    Each line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are left out.
    """
    chunks = []
    with open(file_path, mode='r') as text_file:
        for raw_line in text_file:
            stripped = raw_line.strip()
            if stripped:
                chunks.append(stripped)
    return chunks


In [None]:
import pickle as pkl

# Helpers to load the pickle file and split its tokens into chunks.
def load_pickle(file_path):
    """Deserialize and return the object stored in the pickle file at ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file_path, mode='rb') as pickle_file:
        loaded = pkl.load(pickle_file)
    return loaded
    
def join_data(tokens, chunk_size=512):
    """Group ``tokens`` into consecutive runs and join each run with spaces.

    Every chunk holds ``chunk_size`` tokens except possibly the last one,
    which holds whatever remains. An empty token list yields an empty list.
    """
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        chunks.append(' '.join(window))
    return chunks
        

# Load the pickled crawl results.
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine. Unpickling executes code embedded in the file, so the
# pickle must come from a trusted source.
data = load_pickle("/Users/danielbischoff/Documents/MasterInformatik/MSE/Best-search-engine-in-the-world/test_data/crawled_data.pkl")

In [128]:
# Split the tokens of entry 33 into space-joined chunks of up to 120 tokens.
# NOTE(review): the index 33 and the chunk size 120 are unexplained magic
# numbers; presumably 120 keeps each chunk within the model's input limit.
# Confirm, and consider naming both as constants.
test_data = join_data(data[33]["tokens"], chunk_size=120)

In [157]:
# Seed NumPy's global RNG so the sample below is the same on every run.
np.random.seed(0)
# Draw 10 random chunk indices in [0, len(test_data)). np.unique sorts them
# and drops duplicates, so fewer than 10 distinct indices may remain.
# NOTE(review): despite its name, `random_scores` holds indices into
# test_data, not scores.
random_scores = np.unique(np.random.randint(0, len(test_data), 10))
test_data_split = [test_data[i] for i in random_scores]

In [161]:
# Number of sampled chunks; below 10 whenever the random draw repeated an index.
len(test_data_split)

9

In [162]:

def document_sentiment_analysis_binary(data : list[str], pipeline=None, seed= 0, random_aprox=False, n_samples=10):
    """Aggregate chunk-level binary sentiment into one document-level verdict.

    Each chunk is classified by ``pipeline``. For each of the two labels the
    scores of the chunks carrying that label are summed and divided by the
    total number of classified chunks; the label with the larger share wins.

    Parameters
    ----------
    data : list[str]
        Text chunks that make up the document.
    pipeline : callable, optional
        Called as ``pipeline(chunks)``; must return one
        ``{"label": ..., "score": ...}`` dict per chunk, with labels
        ``"NEGATIVE"`` / ``"POSITIVE"``. When omitted, the module-level
        ``sentiment_pipeline`` is looked up at call time. Inside this function
        the name shadows the imported ``pipeline`` factory.
    seed : int
        Seed for the chunk sampling; only used when ``random_aprox`` is true.
    random_aprox : bool
        When true, classify only a random subset of the chunks in ``data``.
    n_samples : int
        Number of random index draws for the subset. Duplicate draws are
        dropped, so at most ``n_samples`` chunks are classified.

    Returns
    -------
    dict
        ``{"label": "NEGATIVE" | "POSITIVE", "score": float}``. A tie goes to
        ``"POSITIVE"``.

    Raises
    ------
    ValueError
        If ``data`` is empty or the pipeline returns no predictions.
    """
    # Resolve the default lazily so the function can be defined before the
    # pipeline exists and always uses the current ``sentiment_pipeline``.
    if pipeline is None:
        pipeline = sentiment_pipeline

    if len(data) == 0:
        raise ValueError("data must contain at least one text chunk")

    if random_aprox:
        # A local RandomState gives the same draws as seeding the global RNG
        # with ``seed`` but leaves the caller's global random state untouched.
        rng = np.random.RandomState(seed)
        sampled_indices = np.unique(rng.randint(0, len(data), n_samples))
        # Sample from the argument itself, not from a notebook global.
        data = [data[i] for i in sampled_indices]

    analysis = pipeline(data)
    n_chunks = len(analysis)
    if n_chunks == 0:
        raise ValueError("pipeline returned no predictions")

    # np.sum of an empty list is 0.0, so a label that never occurs simply
    # contributes a share of 0 and needs no special-casing.
    negative_prob = float(np.sum([chunk["score"] for chunk in analysis if chunk["label"] == "NEGATIVE"])) / n_chunks
    positive_prob = float(np.sum([chunk["score"] for chunk in analysis if chunk["label"] == "POSITIVE"])) / n_chunks

    if negative_prob > positive_prob:
        return {"label": "NEGATIVE", "score": negative_prob}
    return {"label": "POSITIVE", "score": positive_prob}

In [166]:
# Document-level verdict computed from every chunk in test_data.
analysis = document_sentiment_analysis_binary(test_data)

In [165]:
# Approximate verdict: classify only a seeded random subset of at most 10 chunks.
analysis_approx = document_sentiment_analysis_binary(test_data,random_aprox=True)

In [167]:
# Show the verdict computed from all chunks.
analysis

{'label': 'NEGATIVE', 'score': 0.5276921604360852}

In [168]:
# Show the verdict computed from the random subset, for comparison with the full run.
analysis_approx

{'label': 'NEGATIVE', 'score': 0.46918564372592503}

In [None]:
# Subjectivity detection in newspaper sentences.
# A sentence is subjective if its content is based on or influenced by personal
# feelings, tastes, or opinions; otherwise it is objective (Antici et al., 2023).
# Model card: https://huggingface.co/GroNLP/mdebertav3-subjectivity-english
# Task description: https://checkthat.gitlab.io/clef2023/task2/
#
# NOTE(review): the model reports generic labels such as LABEL_0; which label
# means "subjective" is not visible in this notebook, so check the model card.
# NOTE(review): the name `pipe` is reassigned to a different model further down.
pipe = pipeline("text-classification", model="GroNLP/mdebertav3-subjectivity-english")

Device set to use mps:0


In [72]:
# Classify each entry of positive_text with the subjectivity model.
# NOTE(review): `positive_text` is not defined in any visible cell, so this
# fails on a fresh kernel; define it above or restore the missing cell.
pipe(positive_text)

[{'label': 'LABEL_0', 'score': 0.7450624704360962},
 {'label': 'LABEL_0', 'score': 0.7528149485588074},
 {'label': 'LABEL_0', 'score': 0.826611340045929},
 {'label': 'LABEL_0', 'score': 0.8640003800392151},
 {'label': 'LABEL_0', 'score': 0.9500696659088135},
 {'label': 'LABEL_0', 'score': 0.9030177593231201},
 {'label': 'LABEL_0', 'score': 0.9125306010246277}]

In [65]:
def document_sentiment_analysis(data : list[str], pipeline=None):
    """Aggregate chunk-level predictions into one document-level label.

    Works for any number of distinct labels. For every label that occurs, the
    scores of the chunks carrying it are summed and divided by the total
    number of classified chunks; the label with the largest share is returned.

    Parameters
    ----------
    data : list[str]
        Text chunks that make up the document.
    pipeline : callable, optional
        Called as ``pipeline(chunks)``; must return one
        ``{"label": ..., "score": ...}`` dict per chunk. When omitted, the
        module-level ``sentiment_pipeline`` is looked up at call time.

    Returns
    -------
    dict
        ``{"label": str, "score": float}`` for the winning label, or an empty
        dict when the pipeline returns no predictions. Ties go to the
        alphabetically first label.
    """
    if pipeline is None:
        pipeline = sentiment_pipeline

    analysis = pipeline(data)
    n_chunks = len(analysis)
    if n_chunks == 0:
        return {}

    # Accumulate the summed score per label in a single pass.
    score_sums = {}
    for chunk_result in analysis:
        label = chunk_result["label"]
        score_sums[label] = score_sums.get(label, 0.0) + chunk_result["score"]

    # max() keeps the first maximum, so iterating the labels in sorted order
    # makes tie-breaking deterministic.
    best_label = max(sorted(score_sums), key=score_sums.get)
    return {"label": best_label, "score": score_sums[best_label] / n_chunks}

## Multi-class Classification

In [None]:
# Distinct labels that occur in the per-chunk predictions.
# NOTE(review): this iterates `analysis` as a sequence of {"label": ...} dicts,
# but the visible cells bind `analysis` to a single result dict. Presumably it
# came from a cell that is no longer in the notebook; confirm and restore it.
unique_labels = np.unique([doc["label"] for doc in analysis])

array(['POSITIVE'], dtype='<U8')

In [73]:
# Multilingual GoEmotions classifier (multi-class emotion labels such as
# "neutral" or "optimism").
# NOTE(review): this rebinds `pipe`, which earlier held the subjectivity model;
# a distinct name would keep re-runs of earlier cells unambiguous.
pipe = pipeline("text-classification", model="AnasAlokla/multilingual_go_emotions")

Device set to use mps:0


In [74]:
# Classify each entry of negative_text with the GoEmotions model.
# NOTE(review): `negative_text` is not defined in any visible cell, so this
# fails on a fresh kernel; define it above or restore the missing cell.
pipe(negative_text)

[{'label': 'neutral', 'score': 0.9459356069564819},
 {'label': 'neutral', 'score': 0.8973721861839294},
 {'label': 'neutral', 'score': 0.9459723234176636},
 {'label': 'neutral', 'score': 0.8502932190895081},
 {'label': 'neutral', 'score': 0.946921169757843},
 {'label': 'neutral', 'score': 0.9041789770126343},
 {'label': 'neutral', 'score': 0.6865472793579102},
 {'label': 'neutral', 'score': 0.6713699102401733},
 {'label': 'neutral', 'score': 0.7492476105690002},
 {'label': 'neutral', 'score': 0.9403115510940552},
 {'label': 'neutral', 'score': 0.8996682167053223},
 {'label': 'optimism', 'score': 0.8210803866386414},
 {'label': 'neutral', 'score': 0.9212598204612732},
 {'label': 'neutral', 'score': 0.8895576596260071},
 {'label': 'neutral', 'score': 0.7807997465133667}]