# **SENTIMENT ANALYSIS - UNSUPERVISED**

In [2]:
import os
import json
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from textserver import TextServer
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from typing import Optional
import warnings




In [2]:
load_dotenv('login.env')
ts_password = os.getenv("PASSWORD_PAU")
ts_user = os.getenv("USER_PAU")

In [3]:
# Load datasets    
with open('./data/X_test.json', 'r') as file:
    X_test = json.load(file)
    
with open('./data/y_test.json', 'r') as file:
    y_test = json.load(file)

In [52]:
ts = TextServer(ts_user, ts_password, 'senses') 

In [53]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

example_sent = "i guess that if a very wild bachelor party had gone really bad , there would be broken furniture , traces of smack and cocaine on the floor , and a dead prostitute in the bathroom . i guess that if a movie had also gone really bad , there might be the same elements present . coincidence ? poor kyle ( a meek looking jon favreau ) . . . he is about to marry his radiant fiancee , laura ( cameron diaz ) . but before he exchanges his vows , he embarks to las vegas with his friends for one last blowout . but this bachelor party has gone about as bad as it could possibly get . the prostitute has met a horrible , though accidental death , and drugs are everywhere . the five friends agree that there is enough bad evidence here that will send them to jail for a very long time . a surprisingly calm robert boyd ( christian slater ) , who looks like he was groomed to make nefarious decisions , ponders their dilemma for a few minutes before deciding that the best thing to do is to bury the body in the desert where she ' ll never be found . although they stomach the gruesome deed of getting rid of the body ( which also disturbingly involves dismantling the body using power saws in order to stuff it into suitcases ) , when they return from their trip , guilt and paranoia begins to set in which slowly consumes some of the five friends . one is adam ( daniel stern ) he grows increasingly agitated . whenever people look at his van or whenever a cop glances his way , his blood pressure increases . or that just may be because of his dysfunctional family . another is michael , who was actually responsible for her death . he tries to bury his feelings , but the burden of guilt begins to affect his judgment as well . boyd is the ? doer ' of the group . seemingly suffering from a long psychosis , when he feels as if his secret is about to be exposed , he is apt to take extreme measures to cover up his tracks . kyle just hopes that his wedding will live up to laura ' s demanding expectations . then , there ' s moore ( leland orser ) who speaks 5 lines and walks around with a puzzled look on his face . the problem with this reprehensible movie is that it wants to be a cruel comedy , but it presents things in a manner that just aren ' t funny . drugs , mutilation , and killing your own friends isn ' t something to be laughed at . as a straight psychological drama , i could see how it might have worked , as each one tried to maneuver and overcome the weight of their own guilt in their own sometimes - sick ways . but this movie insults us by assuming that we could simply discard our values for 2 hours . if you do like this movie , i don ' t think that i want to know you . i did find slater a convincing leader who sways his friends to choose not the right thing but the ? smart play . ' and diaz adds some brightness to this film as a wedding - needing fiancee . but her talents are essentially wasted here . it ' s obvious that the film maker is trying to strike a certain tone . but the way that he chooses to do it is tasteless . do not make a very bad decision by seeing this film "

sent_text = nltk.sent_tokenize(example_sent)
sentences = []
no_stopwords_sentences = []
for sentence in sent_text:
    word_tokens = word_tokenize(sentence)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    no_stopwords_sentences.append(filtered_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
nlp = spacy.load("en_core_web_sm")

In [8]:
import time

In [10]:
def get_sentences(text:str, remove_stopwords:bool = False) -> None:
    sent_list = nltk.sent_tokenize(text)
    if remove_stopwords:
        no_stopwords_sentences = []
        for sentence in sent_text:
            word_tokens = word_tokenize(sentence)

            filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
            filtered_sentence = []
            for w in word_tokens:
                if w not in stop_words:
                    filtered_sentence.append(w)
            no_stopwords_sentences.append(filtered_sentence)
        return no_stopwords_sentences
    else:
        return sent_list

def get_lesk_synsets(text:str, lemmatize:bool = True, remove_stopwords:bool = False):
    tokens = word_tokenize(text)
    if lemmatize:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        tokens = [w for w in tokens if not w.lower() in stop_words]
    tagged_tokens = [(a.text, a.pos_) for a in nlp(text)]
    words = []
    for token, pos in tagged_tokens:
        if pos == "NOUN":
            syn = lesk(tokens, token, pos="n")
        elif pos == "ADJ":
            syn = lesk(tokens, token, pos="a")
        elif pos == "ADV":
            syn = lesk(tokens, token, pos="r")
        elif pos == "VERB":
            syn = lesk(tokens, token, pos="v")
        else:
            syn = None
        if syn is not None:
            words.append(syn)
    return words

def get_lesk_all_synsets(sentences:list, lemmatizer:bool = True) -> list:
    all = []
    for sentence in sentences:
        all.append(get_lesk_synsets(sentence, lemmatizer))
    return all
    
def all_synsets():
    all_synsets = []
    for opinion in X_test:
        s = get_sentences(opinion)
        syns = get_synsets(s)
        all_synsets.append(syns)
        with open('./data/ts_test_synsets.json', 'w') as file:
            json.dump(all_synsets, file)

def get_sentiment(synset:'Synset'):
    sentiment = swn.senti_synset(synset)
    return (sentiment.pos_score(), sentiment.neg_score(), sentiment.obj_score()) if sentiment else None

def score_synsets(synsets:list, score:str = 'obj', threshold:float = 0.25, merge_scores:str = 'mean') -> float:
    """
    Compute a score for each synset in a list of synsets and merge them into a single score.

    Parameters:
        synsets (list): List of synsets.
        score (str): Score to compute. One of 'pos', 'neg', 'obj', 'max_score', 'dif', 'dif2', 'dif_threshold', 'dif2_threshold', 'dif_obj', 'dif2_obj'.
        threshold (float): Threshold for 'dif_threshold' and 'dif2_threshold' scores.
        merge_scores (str): Method for merging scores into a single score. One of 'sum', 'mean', 'max', 'min', 'scale', 'scale_norm1_mean', 'scale_norm2_mean'.

    Returns:
        float: Merged score.
    """

    if score == 'max_score' and merge_scores not in ['sum', 'mean']:
        warnings.warn(f"Score 'max_score' is not compatible with '{merge_scores}'. Using 'sum' instead.", SyntaxWarning)
        merge_scores = 'sum'


    dict_score = {
        'pos': lambda s: s[0],
        'neg': lambda s: s[1], 
        'obj': lambda s: s[2],
        'max_score': lambda s: (-1 if s[0] > s[1] else 1) if s[0] != s[1] else 0,
        'dif': lambda s: s[0] - s[1],
        'dif2': lambda s: s[0]**2 - s[1]**2,
        'dif_threshold': lambda s: (s[0] if s[0] > threshold else 0) - (s[1] if s[1] > threshold else 0),
        'dif2_threshold': lambda s: (s[0]**2 if s[0] > threshold else 0) - (s[1]**2 if s[1] > threshold else 0),
        'dif_obj': lambda s: (s[0] - s[1]) * s[2],
        'dif2_obj': lambda s: (s[0]**2 - s[1]**2) * s[2]
        }
    
    assert score in dict_score.keys(), f"Score '{score}' not valid. Choose one of {list(dict_score.keys())}"
    
    def min_max_scale(scores:list) -> list:
        min_score = min(scores)
        max_score = max(scores)
        return [(s - min_score) / (max_score - min_score) for s in scores]

    dict_merge = {
        'sum': lambda sc: sum(sc),
        'mean': lambda sc: np.mean(sc),
        'max': lambda sc: max(sc),
        'min': lambda sc: min(sc),
        'scale_norm1_mean': lambda sc: np.mean(np.abs(min_max_scale(sc))),
        'scale_norm2_mean': lambda sc: np.linalg.norm(min_max_scale(sc)) / len(sc),
    }
    
    assert merge_scores in dict_merge.keys(), f"Merge score '{merge_scores}' not valid. Choose one of {list(dict_merge.keys())}"

    score_func = dict_score[score]
    scores = [score_func(get_sentiment(synset=s)) for s in synsets if s is not None]

    merge_func = dict_merge[merge_scores]
    scores_merged = merge_func(scores)

    return scores_merged

def discretize_scores(scores:list, threshold:float, positive_value = 1, negative_value = 0) -> list:
    """
    Returns a list of binary values based on a threshold.

    Parameters:
        scores (list): List of scores.
        threshold (float): Minimum value to consider a score as positive.
        positive_value: Value to assign to positive scores.
        negative_value: Value to assign to negative scores.
    """
    return list(map(lambda x: positive_value if x >= threshold else negative_value, scores))

In [9]:
print(get_sentences(X_test[0])[0])

i didn ' t realize how apt the name of this movie was until i called the mpaa ( the motion picture association of america - the folks who decide what ' s g , nc 17 , pg , r or x ) to ask why the preview was rated r .


In [10]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [None]:
s = get_sentences(X_test[0])
print(s)
syns = get_lesk_all_synsets(s)
print(syns)

In [13]:
import re
def fix_text(text):
    fixed_text = re.sub(r'[\\"\+\/]', '', text)
    fixed_text = re.sub(r'\s*([.,!\?;:])\s*', r'\1 ', fixed_text)
    fixed_text = re.sub(r'\s+\' s\s+', "'s ", fixed_text)
    fixed_text = re.sub(r'\s+\' t\s+', "'t ", fixed_text)
    fixed_text = re.sub(r'\s+\' re\s+', "'re ", fixed_text)
    fixed_text = re.sub(r'\s+\' ve\s+', "'ve ", fixed_text)
    fixed_text = re.sub(r'\s+\' ll\s+', "'ll ", fixed_text)
    fixed_text = re.sub(r'\s+', ' ', fixed_text)
    fixed_text = re.sub(r'\si\s', ' I ', fixed_text)
    fixed_text = re.sub(r'(?:^|(?<=[.!?]))\s*(\w)', lambda x: x.group(1).upper(), fixed_text)
    return fixed_text.strip()

In [145]:
test_synsets = []
for opinion in X_test:
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_lesk_all_synsets(s, lemmatizer=False)
    names = [[syn.name() for syn in ll] for ll in syns]
    test_synsets.append(names)

In [146]:
with open('./data/lesk_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

In [15]:
with open('./data/ukb2_test_synsets.json', 'r') as file:
    test_synsets = json.load(file)

In [61]:
allowed = ["v", "a", "s", "n"]

In [62]:
results = []
scores_obj = []
scores_res = []
for opinion in test_synsets:
    total_pos = 0
    total_neg = 0
    total_obj = 0
    for sentence in opinion:
        filter_sentence = [name for name in sentence if name.split('.')[1] in allowed]
        scores = [get_sentiment(syn) for syn in filter_sentence if get_sentiment(syn) != None]
        if len(scores) > 0:
            total_pos += sum(s[0] for s in scores)/len(scores)
            total_neg += sum(s[1] for s in scores)/len(scores)
            total_obj += sum(s[2] for s in scores) /len(scores)
    score = total_obj
    scores_obj.append(total_obj)
    scores_res.append(total_pos - total_neg)
    if score > 0.15:
        # print("Positive")
        results.append(1)
    elif score < 0.15:
        # print("Negative")
        results.append(0)
    else:
        # print("Neutral")
        results.append(0)


In [70]:
results = [0 if a < 0.1 else 1 for a in scores_res]
print(accuracy_score(y_test, results))

0.67


Own implementation of UKB since TextServer didn't allow us.

In [11]:
from ukb import *
nlp = spacy.load("en_core_web_sm")
try:
    ukb_graph = load_ukb_graph("ukb_graph.gexf")
except:
    print("Creating graph...")
    ukb_graph = build_ukb_graph()
    nx.write_gexf(ukb_graph, "ukb_graph.gexf")

ukb = UKB(ukb_graph)
def get_ukb_synsets(text:str):
    context_words = extract_context_words(text)
    disambiguated_senses = ukb.disambiguate_context(context_words, method=2)
    return list(disambiguated_senses.values())

def get_ukb_all_synsets(sentences:list) -> list:
    all = []
    for sentence in sentences:
        all.append([a for a in get_ukb_synsets(sentence) if a != None])
    return all

In [12]:
context_words = extract_context_words("my name is john and i work very hard at google")
disambiguated_senses = ukb.disambiguate_context(context_words, method=2)
print(disambiguated_senses)

{'name': 'name.n.01', 'john': 'toilet.n.01', 'work': 'work.v.01', 'very': 'very.r.01', 'hard': 'hard.r.01', 'google': 'google.n.01'}


In [13]:
test_synsets = []
for i, opinion in enumerate(X_test):
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_ukb_all_synsets(s)
    test_synsets.append(syns)
    print(i, end="\r")

499

In [14]:
with open('./data/ukb2_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

Using just the most freqüent synset

In [58]:
frequencies = json.load(open("./data/word_sense_frequencies_semcor.json"))

def get_freq_synsets(text:str):
    tokens = word_tokenize(text)
    tagged_tokens = [(a.text, a.pos_) for a in nlp(text)]
    words = []
    for token, pos in tagged_tokens:
        if token not in frequencies.keys():
            syn = None
        else:
            if pos == "NOUN":
                syn = max(frequencies[token], key=lambda key: frequencies[token][key])
            elif pos == "ADJ":
                syn = max(frequencies[token], key=lambda key: frequencies[token][key])
            elif pos == "ADV":
                syn = max(frequencies[token], key=lambda key: frequencies[token][key])
            elif pos == "VERB":
                syn = max(frequencies[token], key=lambda key: frequencies[token][key])
            else:
                syn = None
        if syn is not None:
            words.append(syn.name() if syn.__class__.__name__ == "Lemma" else syn)
    return words

def get_freq_all_synsets(sentences:list) -> list:
    all = []
    for sentence in sentences:
        all.append([a for a in get_freq_synsets(sentence) if a != None])
    return all



In [59]:
test_synsets = []
for i, opinion in enumerate(X_test):
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_freq_all_synsets(s)
    test_synsets.append(syns)
    print(i, end="\r")

499

In [60]:
with open('./data/freq_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

In [5]:
with open('./data/ukb1_test_synsets.json', 'r') as file:
	test_synsets = json.load(file)

In [6]:
allowed = [  "v", "a", "s", "n"]

In [18]:
scores_opinions = []
for opinion in test_synsets:
    scores_sentences = []
    for sentence in opinion:
        filter_sentence = [name for name in sentence if name != "NE" and name.split('.')[1] in allowed]
        new_filter_sentence = []
        for a in filter_sentence:
            if "Lemma('" in a :
                n = a.replace("Lemma('", "").replace("')", "")
                n = n.split(".")
                n.pop(-1)
                n = ".".join(n)
                new_filter_sentence.append(n)
            else:
                new_filter_sentence.append(a)
            
        filter_sentence = new_filter_sentence

        scores_sentences.append(score_synsets(synsets=filter_sentence, score='pos', merge_scores='mean'))

    scores_opinions.append(np.mean(scores_sentences))

results_opinions = discretize_scores(scores=scores_opinions, threshold=0.5)

accuracy_score(y_test, results_opinions)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0.5