# **SENTIMENT ANALYSIS - UNSUPERVISED**

# LESK

In [1]:
import os
import json
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from textserver import TextServer
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from typing import Optional
import warnings
import re

In [2]:
# Load variables from the local 'login.env' file into the process environment,
# then read the two credentials from it (presumably the TextServer login, given
# the `ts_` prefix -- confirm). `os.getenv` yields None for a missing variable.
load_dotenv('login.env')
ts_password = os.getenv("PASSWORD_PAU")
ts_user = os.getenv("USER_PAU")

In [3]:
# Load the test split from JSON: X_test holds the inputs (iterated as opinions
# by the cells below) and y_test the labels compared against the predictions.
with open('./data/X_test.json', 'r') as file:
    X_test = json.load(file)

with open('./data/y_test.json', 'r') as file:
    y_test = json.load(file)

In [4]:
import nltk
from nltk.corpus import stopwords
 
# Fetch the NLTK stopword corpus and keep the English stopwords as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# spaCy pipeline used by the functions below to obtain per-token POS tags.
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
nltk.download('universal_tagset')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [5]:
def fix_text(text):
    """
    Turn a whitespace-tokenized text into something closer to regular prose.

    The clean-up steps are applied in this order:
      1. backslashes, double quotes, plus signs and slashes are removed;
      2. punctuation (. , ! ? ; :) is attached to the preceding word and
         followed by exactly one space;
      3. contractions split as "it ' s" are re-joined ("it's"), for the
         suffixes s, t, re, ve and ll;
      4. runs of whitespace are collapsed to a single space;
      5. a standalone lowercase "i" becomes "I";
      6. the first word of the text and of every sentence is capitalized.

    Parameters:
        text (str): Text to clean.

    Returns:
        str: The cleaned text, stripped of leading/trailing whitespace.
    """
    fixed_text = re.sub(r'[\\"\+\/]', '', text)
    fixed_text = re.sub(r'\s*([.,!\?;:])\s*', r'\1 ', fixed_text)
    for suffix in ('s', 't', 're', 've', 'll'):
        fixed_text = re.sub(r"\s+' " + suffix + r"\s+", "'" + suffix + " ", fixed_text)
    fixed_text = re.sub(r'\s+', ' ', fixed_text)
    # Lookarounds instead of consuming the surrounding spaces, so that two
    # consecutive standalone "i" are both fixed (matches may not overlap).
    fixed_text = re.sub(r'(?<=\s)i(?=\s)', 'I', fixed_text)
    # Keep the whitespace that precedes the capitalized letter; dropping it
    # would glue sentences together ("good.It").
    fixed_text = re.sub(
        r'(?:^|(?<=[.!?]))(\s*)(\w)',
        lambda m: m.group(1) + m.group(2).upper(),
        fixed_text,
    )
    return fixed_text.strip()

def get_sentences(text:str, remove_stopwords:bool = False) -> list:
    """
    Split a text into sentences, optionally removing stopwords.

    Parameters:
        text (str): Text to split.
        remove_stopwords (bool): If True, each sentence is word-tokenized and
            the tokens whose lowercase form is in `stop_words` are dropped.

    Returns:
        list: The sentence strings when `remove_stopwords` is False; otherwise
            one list of remaining tokens per sentence.
    """
    sent_list = nltk.sent_tokenize(text)
    if not remove_stopwords:
        return sent_list
    # Compare case-insensitively (as get_lesk_synsets does) so capitalized
    # stopwords such as "The" are removed too.
    return [
        [w for w in word_tokenize(sentence) if w.lower() not in stop_words]
        for sentence in sent_list
    ]

def get_lesk_synsets(text:str, lemmatize:bool = True, remove_stopwords:bool = False):
    """
    Disambiguate the nouns, adjectives, adverbs and verbs of a text with Lesk.

    The Lesk context is the word-tokenized text, optionally lemmatized and
    stripped of stopwords; the words to disambiguate and their POS tags come
    from the spaCy pipeline `nlp`.

    Parameters:
        text (str): Text to process.
        lemmatize (bool): Lemmatize the context tokens.
        remove_stopwords (bool): Drop stopwords from the context tokens.

    Returns:
        list: The synsets returned by `lesk`, in token order; tokens for which
            `lesk` returns None, or with any other POS tag, are skipped.
    """
    context = word_tokenize(text)
    if lemmatize:
        context = [lemmatizer.lemmatize(word) for word in context]
    if remove_stopwords:
        context = [word for word in context if word.lower() not in stop_words]

    # spaCy coarse POS tag -> POS letter passed to lesk().
    wordnet_pos = {"NOUN": "n", "ADJ": "a", "ADV": "r", "VERB": "v"}

    synsets = []
    for tok in nlp(text):
        tag = wordnet_pos.get(tok.pos_)
        if tag is None:
            continue
        sense = lesk(context, tok.text, pos=tag)
        if sense is not None:
            synsets.append(sense)
    return synsets

def get_lesk_all_synsets(sentences:list, lemmatizer:bool = True) -> list:
    """
    Run `get_lesk_synsets` on every sentence.

    Parameters:
        sentences (list): Sentences to disambiguate.
        lemmatizer (bool): Forwarded as the `lemmatize` flag of `get_lesk_synsets`.

    Returns:
        list: One list of synsets per sentence, in input order.
    """
    return [get_lesk_synsets(sentence, lemmatizer) for sentence in sentences]
    

def get_sentiment(synset:'Synset'):
    """
    Look up the SentiWordNet scores of a synset.

    Parameters:
        synset: Value passed as-is to `swn.senti_synset` (the cells of this
            notebook call it with synset name strings).

    Returns:
        tuple | None: (positive, negative, objective) scores, or None when the
            lookup result is falsy.
    """
    senti = swn.senti_synset(synset)
    if not senti:
        return None
    return (senti.pos_score(), senti.neg_score(), senti.obj_score())

def score_synsets(synsets:list, score:str = 'obj', threshold:float = 0.25, merge_scores:str = 'mean') -> float:
    """
    Compute a score for each synset in a list of synsets and merge them into a single score.

    Parameters:
        synsets (list): List of synsets. None entries, and synsets for which `get_sentiment` returns None, are ignored.
        score (str): Score to compute. One of 'pos', 'neg', 'obj', 'max_score', 'dif', 'dif2', 'dif_threshold', 'dif2_threshold', 'dif_obj', 'dif2_obj'.
        threshold (float): Threshold for 'dif_threshold' and 'dif2_threshold' scores.
        merge_scores (str): Method for merging scores into a single score. One of 'sum', 'mean', 'max', 'min', 'scale_norm1_mean', 'scale_norm2_mean'.

    Returns:
        float: Merged score, or 0.0 when there is nothing to score.

    Raises:
        AssertionError: If `score` or `merge_scores` is not one of the supported names.
    """

    if score == 'max_score' and merge_scores not in ['sum', 'mean']:
        warnings.warn(f"Score 'max_score' is not compatible with '{merge_scores}'. Using 'sum' instead.", SyntaxWarning)
        merge_scores = 'sum'

    # Every entry maps a (pos, neg, obj) tuple to a number in which larger
    # means "more positive" (except the raw 'neg' and 'obj' components).
    dict_score = {
        'pos': lambda s: s[0],
        'neg': lambda s: s[1],
        'obj': lambda s: s[2],
        # +1 when the positive score dominates, -1 when the negative one does,
        # 0 on a tie -- same sign convention as the 'dif*' scores.
        'max_score': lambda s: (1 if s[0] > s[1] else -1) if s[0] != s[1] else 0,
        'dif': lambda s: s[0] - s[1],
        'dif2': lambda s: s[0]**2 - s[1]**2,
        'dif_threshold': lambda s: (s[0] if s[0] > threshold else 0) - (s[1] if s[1] > threshold else 0),
        'dif2_threshold': lambda s: (s[0]**2 if s[0] > threshold else 0) - (s[1]**2 if s[1] > threshold else 0),
        'dif_obj': lambda s: (s[0] - s[1]) * s[2],
        'dif2_obj': lambda s: (s[0]**2 - s[1]**2) * s[2]
        }

    assert score in dict_score.keys(), f"Score '{score}' not valid. Choose one of {list(dict_score.keys())}"

    def min_max_scale(scores:list) -> list:
        """Rescale scores to [0, 1]; constant input maps to all zeros."""
        min_score = min(scores)
        span = max(scores) - min_score
        if span == 0:
            # All scores are equal: avoid dividing by zero.
            return [0.0] * len(scores)
        return [(s - min_score) / span for s in scores]

    dict_merge = {
        'sum': lambda sc: sum(sc),
        'mean': lambda sc: np.mean(sc),
        'max': lambda sc: max(sc),
        'min': lambda sc: min(sc),
        'scale_norm1_mean': lambda sc: np.mean(np.abs(min_max_scale(sc))),
        'scale_norm2_mean': lambda sc: np.linalg.norm(min_max_scale(sc)) / len(sc),
    }

    assert merge_scores in dict_merge.keys(), f"Merge score '{merge_scores}' not valid. Choose one of {list(dict_merge.keys())}"

    score_func = dict_score[score]
    sentiments = (get_sentiment(synset=s) for s in synsets if s is not None)
    scores = [score_func(sentiment) for sentiment in sentiments if sentiment is not None]

    if not scores:
        # Nothing to merge: 'mean' would give nan and 'max'/'min' would raise.
        return 0.0

    merge_func = dict_merge[merge_scores]
    return merge_func(scores)

def discretize_scores(scores:list, threshold:float, positive_value = 1, negative_value = 0) -> list:
    """
    Map every score to one of two labels by comparing it with a threshold.

    Parameters:
        scores (list): Scores to label.
        threshold (float): A score greater than or equal to this value is labelled positive.
        positive_value: Label for scores at or above the threshold.
        negative_value: Label for all other scores.

    Returns:
        list: One label per score, in input order.
    """
    return [positive_value if value >= threshold else negative_value for value in scores]

In [None]:
# Disambiguate every test opinion with Lesk, one sentence at a time.
test_synsets = []
for opinion in X_test:
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_lesk_all_synsets(s, lemmatizer=False)
    # Store synset names (strings) instead of synset objects: one list of
    # names per sentence, one list of sentences per opinion.
    names = [[syn.name() for syn in ll] for ll in syns]
    test_synsets.append(names)

KeyboardInterrupt: 

In [146]:
# Persist the synset names computed with Lesk for later scoring.
with open('./data/lesk_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

# UKB

We use our own implementation of UKB, since TextServer did not allow us to use it.

In [None]:
from ukb import *
nlp = spacy.load("en_core_web_sm")
# Reuse the graph cached in "ukb_graph.gexf" when it can be loaded; otherwise
# build it from scratch and write it to that file for the next run.
try:
    ukb_graph = load_ukb_graph("ukb_graph.gexf")
except Exception:
    # `except Exception` instead of a bare `except` so that interrupting the
    # kernel (KeyboardInterrupt) or exiting does not trigger a graph rebuild.
    print("Creating graph...")
    ukb_graph = build_ukb_graph()
    nx.write_gexf(ukb_graph, "ukb_graph.gexf")

ukb = UKB(ukb_graph)
def get_ukb_synsets(text:str):
    """
    Disambiguate the context words of a text with the module-level `ukb` object.

    Parameters:
        text (str): Text passed to `extract_context_words`.

    Returns:
        list: The values of the mapping returned by
            `ukb.disambiguate_context(..., method=2)`, in its iteration order.
    """
    senses_by_word = ukb.disambiguate_context(extract_context_words(text), method=2)
    return [*senses_by_word.values()]

def get_ukb_all_synsets(sentences:list) -> list:
    """
    Run `get_ukb_synsets` on every sentence, dropping None senses.

    Parameters:
        sentences (list): Sentences to disambiguate.

    Returns:
        list: One list of senses per sentence, in input order.
    """
    return [
        [sense for sense in get_ukb_synsets(sentence) if sense is not None]
        for sentence in sentences
    ]

In [4]:
# Quick manual check of the UKB pipeline on a single sentence. Note this uses
# method=1, whereas get_ukb_synsets uses method=2.
context_words = extract_context_words("my name is john and i work very hard at google")
disambiguated_senses = ukb.disambiguate_context(context_words, method=1)
print(disambiguated_senses)

NameError: name 'extract_context_words' is not defined

In [13]:
# Disambiguate every test opinion with UKB, one sentence at a time.
test_synsets = []
for i, opinion in enumerate(X_test):
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_ukb_all_synsets(s)
    test_synsets.append(syns)
    # Progress indicator: the carriage return overwrites the previous index.
    print(i, end="\r")

499

In [14]:
# Persist the senses computed with UKB (get_ukb_synsets uses method=2).
with open('./data/ukb2_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

# FREQUENCIES

Using just the most frequent synset of each word.

In [524]:
# Load the word -> {sense: frequency} table used by get_freq_synsets. The
# context manager closes the file handle once the JSON has been parsed.
with open("./data/word_sense_frequencies_semcor.json") as freq_file:
    frequencies = json.load(freq_file)

def get_freq_synsets(text:str):
    """
    Pick the most frequent sense of every noun, adjective, adverb and verb.

    Tokens and POS tags come from the spaCy pipeline `nlp`; senses and their
    counts come from the module-level `frequencies` table, keyed by the exact
    token text.

    Parameters:
        text (str): Text to process.

    Returns:
        list: The highest-count sense of each content word found in
            `frequencies`, in token order. Words that are missing from the
            table, or have no senses recorded, are skipped.
    """
    content_pos = {"NOUN", "ADJ", "ADV", "VERB"}
    words = []
    for tok in nlp(text):
        if tok.pos_ not in content_pos:
            continue
        senses = frequencies.get(tok.text)
        if not senses:
            continue
        # NOTE(review): the POS tag only selects which tokens are looked up;
        # the chosen sense is not restricted to that POS -- confirm intended.
        syn = max(senses, key=senses.get)
        words.append(syn.name() if syn.__class__.__name__ == "Lemma" else syn)
    return words

def get_freq_all_synsets(sentences:list) -> list:
    """
    Run `get_freq_synsets` on every sentence, dropping None senses.

    Parameters:
        sentences (list): Sentences to process.

    Returns:
        list: One list of senses per sentence, in input order.
    """
    return [
        [sense for sense in get_freq_synsets(sentence) if sense is not None]
        for sentence in sentences
    ]



In [525]:
# Assign the most frequent sense to the words of every test opinion, one
# sentence at a time.
test_synsets = []
for i, opinion in enumerate(X_test):
    s = get_sentences(opinion)
    #s = [fix_text(t) for t in s]
    syns = get_freq_all_synsets(s)
    test_synsets.append(syns)
    # Progress indicator: the carriage return overwrites the previous index.
    print(i, end="\r")

499

In [526]:
# Persist the senses chosen by the most-frequent-sense baseline.
with open('./data/freq_test_synsets.json', 'w') as file:
    json.dump(test_synsets, file)

In [5]:
# Reload previously saved senses for clean-up.
# NOTE(review): this reads 'ukb1_test_synsets.json', while the filtered result
# is later written to 'freq_test_synsets.json' -- confirm the intended file.
with open('./data/ukb1_test_synsets.json', 'r') as file:
	test_synsets = json.load(file)

In [6]:
# Values accepted for the second dot-separated field of a synset name, i.e. the
# WordNet POS letter: verb, adjective, satellite adjective, noun. "r" is not listed.
allowed = [  "v", "a", "s", "n"]

In [11]:
# Clean the loaded senses: keep only names whose POS field is in `allowed`,
# unwrap "Lemma('...')" entries, and drop every name SentiWordNet cannot
# resolve (those names are printed for inspection).
new_test_synsets = []

for opinion in test_synsets:
    new_opinion = []
    for sentence in opinion:
        new_filter_sentence = []
        for a in sentence:
            if a == "NE":
                continue
            fields = a.split('.')
            # Names without a POS field are skipped rather than raising IndexError.
            if len(fields) < 2 or fields[1] not in allowed:
                continue
            if "Lemma('" in a:
                # Strip the "Lemma('...')" wrapper and the last dot-separated field.
                n = a.replace("Lemma('", "").replace("')", "").rsplit(".", 1)[0]
            else:
                n = a
            try:
                get_sentiment(n)
            except Exception:
                # Not resolvable by SentiWordNet: report the name and leave it out.
                print(n)
            else:
                new_filter_sentence.append(n)

        new_opinion.append(new_filter_sentence)
    new_test_synsets.append(new_opinion)

gifted.s.00
non.s.00
condensed.s.00
railway.n.1;2
anti.s.00
post.s.00
anti.s.00
growing.s.00
talented.s.00
ish.s.00
lingering.s.00
hearted.a.00
semi.s.00
post.s.00
post.s.00
gifted.s.00
post.s.00
bloated.s.00
touring.s.00
growing.s.00
anti.s.00
anti.s.00
spark.n.2;1
growing.s.00
talented.s.00
finest.s.00
biggest.s.00
talented.s.00
bloated.s.00


wares.n.00
growing.s.00
non.s.00
talented.s.00
guerrilla.s.00
depressing.s.00
post.s.00
anti.s.00
tuned.s.00
hearted.a.00
slightest.s.00
spark.n.2;1
multi.s.00
growing.s.00
furnishings.n.00
growing.s.00
anti.s.00
growing.s.00
tuned.s.00
growing.s.00
growing.s.00
depressing.s.00
sweaty.s.00
anti.s.00
growing.s.00
pre.s.00
spark.n.2;1
infested.s.00
semi.s.00
talented.s.00
talented.s.00
talented.s.00
biggest.s.00
haired.s.00
compassionate.s.00
sway.v.0;1
unbearable.s.00
non.s.00
depressing.s.00
goods.n.00
non.s.00
anti.s.00
scrap.s.00
rotting.s.00
smart.s.0;2
consist_of.v.00
talented.s.00
whining.s.00
rival.s.00
millions.n.00
post.s.00
semi.s.00
calming.s.00
growing.s.00
sandy.s.00
depressing.s.00
lingering.s.00
lower.s.00
post.s.00
birthe.v.00
lower.s.00
rival.s.00
spark.n.2;1
pre.s.00
spark.n.2;1
depressing.s.00
biggest.s.00
multi.s.00
semi.s.00
non.s.00
slightest.s.00
growing.s.00
consist_of.v.00
anti.s.00
consist_of.v.00
spoil.v.3;1
pilot.s.00
carping.s.00
sunshiny.s.00
reserve.s.00
p

In [12]:
# Persist the filtered senses.
# NOTE(review): this overwrites 'freq_test_synsets.json'; the cell that loaded
# `test_synsets` for filtering read 'ukb1_test_synsets.json' -- confirm the
# target file matches the data that was filtered.
with open('./data/freq_test_synsets.json', 'w') as file:
    json.dump(new_test_synsets, file)

# Testing

In [16]:
# Load the saved Lesk synset names to evaluate them.
with open('./data/lesk_test_synsets.json', 'r') as file:
    test_synsets = json.load(file)

In [17]:
# Values accepted for the second dot-separated field of a synset name, i.e. the
# WordNet POS letter: noun, adjective, satellite adjective, verb. "r" is not listed.
allowed = [ "n", "a","s", "v" ]

In [34]:
# Score every opinion: each sentence gets the sum of (pos - neg) over its
# allowed synsets, and the opinion score is the mean of its sentence scores.
scores_opinions = []
for opinion in test_synsets:
    scores_sentences = []
    for sentence in opinion:
        # Keep only synsets whose POS field is in `allowed`.
        filter_sentence = [name for name in sentence if name.split('.')[1] in allowed]

        scores_sentences.append(score_synsets(synsets=filter_sentence, score='dif', merge_scores='sum'))

    scores_opinions.append(np.mean(scores_sentences))

# Opinions scoring at least 0.06 are labelled 1, the rest 0.
results_opinions = discretize_scores(scores=scores_opinions, threshold=0.06)

# Last expression of the cell: the notebook displays the accuracy.
accuracy_score(y_test, results_opinions)


0.61

In [29]:
"""
results = []
scores_obj = []
scores_res = []
for opinion in test_synsets:
    total_pos = 0
    total_neg = 0
    total_obj = 0
    for sentence in opinion:
        filter_sentence = [name for name in sentence if name.split('.')[1] in allowed]
        scores = [get_sentiment(syn) for syn in filter_sentence if get_sentiment(syn) != None]
        if len(scores) > 0:
            total_pos += sum(s[0] for s in scores if s[0] > 0.5) / len(scores)
            total_neg += sum(s[1] for s in scores if s[1] > 0.5) /len(scores)
            total_obj += sum(s[2] for s in scores) /len(scores)
    score = total_obj
    scores_obj.append(total_obj)
    scores_res.append(total_pos - total_neg)
    if score > 0.15:
        # print("Positive")
        results.append(1)
    elif score < 0.15:
        # print("Negative")
        results.append(0)
    else:
        # print("Neutral")
        results.append(0)
results = [0 if a < 0  else 1 for a in scores_res]
print(accuracy_score(y_test, results))
"""
