Kellert, O., M. Zaman, N. Matlis and C. Gomez-Rodriguez (2023). Experimenting with UD Adaptation of an Unsupervised Rule-based Approach for Sentiment Analysis of Mexican Tourist Texts. In: Proceedings of IberLEF 2023, Rest-Mex 2023 shared task. A preprint is available on arXiv: https://arxiv.org/abs/2309.05312

In [None]:
#pip install stanza

In [None]:
import numpy as np
from statistics import mean
import stanza
stanza.download('es')
nlp = stanza.Pipeline('es')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.1.json:   0%|   …

2023-09-20 13:23:17 INFO: Downloading default packages for language: es (Spanish) ...
2023-09-20 13:23:18 INFO: File exists: /home/imran/stanza_resources/es/default.zip
2023-09-20 13:23:21 INFO: Finished downloading models and saved to /home/imran/stanza_resources.
2023-09-20 13:23:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.1.json:   0%|   …

2023-09-20 13:23:22 INFO: Loading these models for language: es (Spanish):
| Processor    | Package         |
----------------------------------
| tokenize     | ancora          |
| mwt          | ancora          |
| pos          | ancora_charlm   |
| lemma        | ancora_nocharlm |
| constituency | combined_charlm |
| depparse     | ancora_charlm   |
| sentiment    | tass2020        |
| ner          | conll02         |

2023-09-20 13:23:22 INFO: Using device: cuda
2023-09-20 13:23:22 INFO: Loading: tokenize
2023-09-20 13:23:22 INFO: Loading: mwt
2023-09-20 13:23:23 INFO: Loading: pos
2023-09-20 13:23:23 INFO: Loading: lemma
2023-09-20 13:23:23 INFO: Loading: constituency
2023-09-20 13:23:23 INFO: Loading: depparse
2023-09-20 13:23:23 INFO: Loading: sentiment
2023-09-20 13:23:24 INFO: Loading: ner
2023-09-20 13:23:24 INFO: Done loading processors!


In [None]:
# Rebuild the Spanish pipeline with an explicit processor list (tokenizer,
# POS tagger, multi-word-token expander, lemmatizer, dependency parser).
# This rebinds the `nlp` created by the default stanza.Pipeline('es') call in
# the import cell; createDic() uses whichever `nlp` is bound when it is called.
nlp = stanza.Pipeline(lang='es', processors='tokenize,pos,mwt,lemma,depparse')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |
| depparse  | ancora  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Process the dictionaries


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
def populateDict(fileLocation):
    """Load a sentiment lexicon from a whitespace-separated text file.

    Each non-blank line must hold exactly two fields: a key and an integer
    score (for example ``bueno 3``). The file is read as latin-1.

    Parameters
    ----------
    fileLocation : str
        Path of the lexicon file.

    Returns
    -------
    dict
        Mapping from key to integer score. If a key occurs several times,
        the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields, or the
        second field is not an integer.
    """
    dic = {}
    # The context manager closes the file even if a malformed line raises.
    with open(fileLocation, encoding='latin-1') as f:
        for line in f:
            # Tolerate blank lines (typically a trailing newline at end of file).
            if not line.strip():
                continue
            key, value = line.split()
            dic[key] = int(value)
    return dic

In [None]:
# Load the sentiment lexicons, one file per part of speech (adjectives,
# adverbs, nouns, verbs). populateDict returns a dict from key to integer
# score; calcbranch looks words up in these dicts by lemma.
dAdj = populateDict('adj.txt')
dAdv = populateDict('adv.txt')
dNoun = populateDict('noun.txt')
dVerb = populateDict('verb.txt')

In [None]:
# Negation lemmas. calcbranch only tests membership (`lemma in lstNeg`), so
# the duplicated 'no' and 'nunca' entries of the earlier list were redundant
# and have been removed; the set of recognised negators is unchanged.
lstNeg = [
    'nadie', 'tampoco', 'no', 'ez', 'non', 'nunca', 'não', 'nada', 'ni',
    'ningun', 'ninguno', 'ninguna', 'nisiquiera',
]

In [None]:
# Intensifier ("functional modifier") lemmas. A child word listed here that
# also has a lexicon score scales its head's score instead of contributing
# a polarity of its own (see calcbranch).
lstfmod = [
    "mucho", "super", "realmente", "tan", "tanto", "grande",
    "poco", "más", "simplemente", "absolutamente", "totalmente",
    "bastante", "menos", "definitivamente", "increíblemente",
    "especialmente", "incluso",
]

In [None]:
max(dAdv.values())

5

In [None]:
min(dAdv.values())

-5

In [None]:
max(dAdj.values())

5

In [None]:
min(dAdj.values())

-5

In [None]:
### analysing a string (=sent) with Stanza
def createDic(sent):
    """Parse ``sent`` with the module-level Stanza pipeline ``nlp``.

    Returns whatever the parsed document's ``to_dict()`` yields; in the
    example output of this notebook that is a list with one entry per
    sentence, each entry being a list of per-word dictionaries.
    """
    return nlp(sent).to_dict()

In [None]:
createDic("buena comida")

[[{'id': 1,
   'text': 'buena',
   'lemma': 'buen',
   'upos': 'ADJ',
   'xpos': 'aq0fs0',
   'feats': 'Gender=Fem|Number=Sing',
   'head': 2,
   'deprel': 'amod',
   'start_char': 0,
   'end_char': 5,
   'ner': 'O',
   'multi_ner': ('O',)},
  {'id': 2,
   'text': 'comida',
   'lemma': 'comida',
   'upos': 'NOUN',
   'xpos': 'ncfs000',
   'feats': 'Gender=Fem|Number=Sing',
   'head': 0,
   'deprel': 'root',
   'start_char': 6,
   'end_char': 12,
   'ner': 'O',
   'multi_ner': ('O',)}]]

In [None]:
### Creating dctChild dictionary: key= headid, value= a list of childrenids
def getChildParentDicts(dct):
    """Derive tree-navigation dictionaries from one parsed sentence.

    Parameters
    ----------
    dct : list of dict
        Word dictionaries of a single sentence; each must provide ``'id'``,
        ``'head'`` and ``'lemma'``.
        NOTE(review): the predecessor lookup assumes integer ids that match
        list positions (id ``i`` stored at index ``i-1``) - confirm this
        holds for sentences containing multi-word tokens.

    Returns
    -------
    tuple
        ``(dctChild, dctParent, dctSibling, dctBeforeToken)`` where

        * dctChild maps a head id to the list of ids attached to it,
        * dctParent maps a word id to its head id,
        * dctSibling maps a word id to the other ids sharing its head,
        * dctBeforeToken maps a lemma to the lemma of the preceding word
          (``"-"`` for the word with id 1). It is keyed by lemma, so a
          repeated lemma keeps only its last occurrence.
    """
    children_of = {}
    parent_of = {}
    lemma_before = {}

    for word in dct:
        word_id = word['id']
        head_id = word['head']
        lemma = word['lemma']

        # Word 1 has no predecessor; for any other word the predecessor sits
        # two slots below its 1-based id in the list.
        if word_id == 1:
            previous = "-"
        else:
            previous = dct[int(word_id) - 2]['lemma']

        parent_of[word_id] = head_id
        lemma_before[lemma] = previous
        children_of.setdefault(head_id, []).append(word_id)

    # Every word starts with no siblings; words that share a head then get
    # the remaining members of that head's child list.
    siblings_of = {word_id: [] for word_id in parent_of}
    for group in children_of.values():
        for member in group:
            siblings_of[member] = [other for other in group if other != member]

    return children_of, parent_of, siblings_of, lemma_before

In [None]:
### Example: build and print the navigation dictionaries for a two-word phrase
r1 = 'gran servicio'

dicts = createDic(r1)
# Each element of `dicts` is handed to getChildParentDicts as one sentence.
for dicVal in dicts:

    dctChild, dctParent, dctSibling, dctBeforeToken = getChildParentDicts(dicVal)

    # Printed in this order: parent map, child map, sibling map, previous-lemma map.
    print(dctParent)
    print(dctChild)
    print(dctSibling)
    print(dctBeforeToken)

{1: 2, 2: 0}
{2: [1], 0: [2]}
{1: [], 2: []}
{'gran': '-', 'servicio': 'gran'}


In [None]:
### Map each universal POS tag that can carry sentiment to its lexicon.
### calcbranch ignores words whose `upos` is not a key here, except for
### negation lemmas and "pero", which it checks separately.
sentdicts={"ADV": dAdv, "ADJ": dAdj, "NOUN": dNoun, "VERB": dVerb}

In [None]:
#### Define the order in which the key-value pairs of dctChild (each one a branch of the dependency tree) are processed.
def calcbranchorder(dctChild):
    """Return the ids of all branch heads, deepest level first.

    ``dctChild`` maps a head id to the list of its child ids; key ``0`` holds
    the child(ren) of the artificial root. The tree is walked level by level
    starting from ``dctChild[0]``; every visited node that is itself a key of
    ``dctChild`` is recorded, and the recorded list is reversed before it is
    returned, so the head attached to the root comes last.

    If ``0`` is the only key (nothing has children of its own), ``[0]`` is
    returned as a marker. Each recorded head is also printed.
    """
    heads = list(dctChild.keys())
    if heads == [0]:
        return [0]

    order = []
    level = dctChild[0]
    while len(level) > 0:
        following = []
        for node in level:
            if node not in heads:
                continue  # leaf: it heads no branch of its own
            print('headchild = ', node)
            order.append(node)
            following = following + dctChild[node]
        level = following

    order.reverse()
    return order

In [None]:
def calcbranch(dct, headId,childIds):
    """Score one branch (a head word and its direct children) of a parsed sentence.

    Parameters
    ----------
    dct : list of dict
        Word dictionaries of one sentence; the word with id ``i`` is read
        from ``dct[i-1]``. Entries are modified in place: ``"elementScore"``
        and ``"elementType"`` are written on the head and on some children.
    headId : int
        1-based id of the branch head.
    childIds : list of int
        1-based ids of the head's direct children (may be empty).

    Returns
    -------
    list of dict
        The same ``dct`` object that was passed in.

    Notes
    -----
    Reads the module-level ``lstNeg``, ``lstfmod``, ``sentdicts`` and
    ``scaleB``. When the head receives a score it is

        (a*(1+b) + sign(a*(1+b))*neg) * (1+p)

    where ``a`` is the base polarity, ``b`` the intensifier offset, ``neg``
    the negation shift (0 or -4) and ``p`` the "pero" weight (0 or 0.25).
    The head is tagged ``elementType == 'cnt'`` only if its own deprel does
    not contain "mod".
    """
    #### accumulators for the relevant elements found in the branch
    a=0  # base polarity entering the head formula
    b=0  # intensifier offset, set from scaleB() when an intensifier child is found
    neg=0  # negation shift; becomes negvalue when a negation is found
    p=0  # "pero" weight; becomes pvalue when a child with lemma "pero" is found
    pvalue=0.25  # marked "check!" by the original author - value still to be validated
    negvalue=-4
    Negstat=False  # set below but never read in this function
    fmodstat=False  # set below but never read in this function
    lexmodstat=False  # True once a lexical (non-intensifier) modifier child is seen
    lexstat=False  # only read by a branch that is currently disabled
    lstcntchildren=[]  # only read by a block that is currently disabled

    #### first pass: inspect the children of the branch
    for childId in childIds:
        print("childid=", childId)
        lemma=dct[childId-1]["lemma"]
        text=dct[childId-1]["text"]  # only referenced by a commented-out debug print
        pos=dct[childId-1]["upos"]
        if lemma in lstNeg:
            # negation child: remember the shift to apply to the head score
            Negstat=True
            neg=negvalue
        elif pos in sentdicts.keys():
            deprel= dct[childId-1]["deprel"]
            #### score of the child: a score stored by an earlier call wins over
            #### the raw lexicon value; 'none' marks "no score available"
            dsent= sentdicts[pos]
            if "elementScore" in dct[childId-1].keys():
                elementScore= dct[childId-1]["elementScore"]
            elif lemma in dsent.keys():
                elementScore=dsent[lemma]
            else:
                elementScore='none'
            #### properties of the child
            Condscoreexists=elementScore!="none"  # computed but never used
            # lexical modifier: "mod" relation, in the lexicon, not an intensifier
            Condlexmod= "mod" in deprel and lemma not in lstfmod and lemma in dsent.keys()
            # lexical (content) child: in the lexicon and not attached by a "mod" relation
            Condlex= "mod" not in deprel and lemma in dsent.keys()
            #### remember children already tagged 'cnt' (a filter on conjuncts and
            #### verbs was sketched here by the original author but is not active)
            if "elementType" in dct[childId-1].keys():
                if dct[childId-1]["elementType"]=='cnt':
                    lstcntchildren.append(childId)
            # Condlex is tested first, so an intensifier lemma attached by a
            # non-"mod" relation is handled as a lexical child.
            if Condlex:
                lexstat=True
                dct[childId-1]["elementType"]='cnt'
                # NOTE(review): this resets the child's stored score to the raw
                # lexicon value, discarding any score computed when the child
                # was processed as a head - confirm this is intended.
                dct[childId-1]["elementScore"]=dsent[lemma]
                lexscore=elementScore
                # NOTE(review): this `a` is overwritten whenever the head is
                # scored below, so it currently has no effect on the result.
                a=lexscore
            elif lemma in lstfmod and lemma in dsent.keys():
                # intensifier: convert its lexicon score into the offset b
                fmodstat=True
                fmodscore= elementScore
                b= scaleB(int(fmodscore))
            elif Condlexmod:
                # lexical modifier: its score replaces the head's own polarity;
                # with several such children the last one wins
                lexmodstat=True
                lexmodscore= elementScore
                c=lexmodscore  # never used
        #### contrastive conjunction "pero"
        elif lemma=="pero":
            p=pvalue

    #### second pass: score the head of the branch
    lemma=dct[headId-1]["lemma"]
    pos=dct[headId-1]["upos"]
    deprel= dct[headId-1]["deprel"]
    headmodstat= "mod" in deprel
    headscore="none"  # only referenced by a commented-out debug print
    childp=p  # only read by the disabled block at the end

    if lemma== "nada":
       # a head "nada" acts as a negation of its own branch
       neg=negvalue
    if lexmodstat:
        # a lexical modifier child supplies the polarity of the whole branch
        a=lexmodscore
        headsentimentscore=(a * (1+b)  + (np.sign(a*(1+b)))* neg)*(1+p)
        if headmodstat==False:
            dct[headId-1]["elementType"]='cnt'
        dct[headId-1]["elementScore"]= headsentimentscore
        childp=0

    elif pos in sentdicts.keys():
        dsent= sentdicts[pos]
        if lemma in dsent.keys():
            # the head itself is a sentiment word
            headscore=dsent[lemma]
            a=dsent[lemma]
            headsentimentscore=(a * (1+b)  + (np.sign(a*(1+b)))* neg)*(1+p)
            if headmodstat==False:
                dct[headId-1]["elementType"]='cnt'
            dct[headId-1]["elementScore"]= headsentimentscore
            childp=0

    # NOTE(review): disabled by `and False` - would let a lexical child
    # supply the head score when the head's POS is outside sentdicts.
    elif lexstat and False:
        a=lexscore
        headsentimentscore=(a * (1+b)  + (np.sign(a*(1+b)))* neg)*(1+p)
        if headmodstat==False:
            dct[headId-1]["elementType"]='cnt'
        dct[headId-1]["elementScore"]= headsentimentscore

    # NOTE(review): disabled by `if False` - would propagate the negation and
    # "pero" weight of this branch onto children already tagged 'cnt'.
    if False:
        for childId in lstcntchildren:
            dct[childId-1]["elementScore"]= (dct[childId-1]["elementScore"]+  (np.sign(dct[childId-1]["elementScore"]))* neg)*(1+childp)
    return dct

In [None]:
def scaleB(inputVal):
    """Linearly rescale a lexicon score to an intensifier offset.

    An input of -5 maps to -0.25, 0 maps to 0.0 and 5 maps to 0.25; values
    outside that range are extrapolated along the same line.
    """
    # Shift and divide first (-5..5 becomes 0..1), then stretch and re-centre.
    unit_position = (inputVal + 5) / 10
    return unit_position * 0.50 - 0.25

In [None]:
scaleB(-1)

-0.04999999999999999

Compute the Sentence Score

In [None]:
def calcSentenceScore(sent):
    """Compute an overall sentiment score for the text ``sent``.

    The text is parsed with ``createDic``. For every parsed sentence the
    branches of the dependency tree are scored with ``calcbranch`` in the
    order given by ``calcbranchorder``, and the score of every branch head
    tagged ``elementType == 'cnt'`` is collected. The collected scores are
    then combined:

    * more positive than negative scores -> the maximum score,
    * more negative than positive scores -> the minimum score,
    * otherwise                          -> the arithmetic mean.

    Each branch head contributes at most once, and the scores of all parsed
    sentences are pooled. Earlier revisions counted the last head twice and
    returned after the first parsed sentence.

    Parameters
    ----------
    sent : str
        Text to score.

    Returns
    -------
    number
        The combined score, or the sentinel ``-200`` when no scored head was
        found (including when the parser returns no sentences).
    """
    lstScores = []

    for dicVal in createDic(sent):
        dctChild = getChildParentDicts(dicVal)[0]

        # Step 2: decide the order in which the branches are processed.
        branchheadIds = calcbranchorder(dctChild)

        # Step 3: list the (head, children) pairs to score. [0] is the marker
        # calcbranchorder returns when no word has children of its own; the
        # first word is then scored as a branch without children.
        if branchheadIds == [0]:
            branches = [(1, [])]
        else:
            branches = [(headId, dctChild[headId]) for headId in branchheadIds]

        # Step 4: score each branch and collect its head's score exactly once.
        for headId, childIds in branches:
            dct = calcbranch(dicVal, headId, childIds)
            head = dct[headId - 1]
            if head.get("elementType") == 'cnt':
                lstScores.append(head["elementScore"])

    # Step 5: combine the collected scores.
    if not lstScores:
        return -200

    NumPosScores = sum(1 for score in lstScores if score > 0)
    NumNegScores = sum(1 for score in lstScores if score < 0)

    if NumPosScores > NumNegScores:
        SentenceScore = max(lstScores)
    elif NumPosScores < NumNegScores:
        SentenceScore = min(lstScores)
    else:
        SentenceScore = mean([float(score) for score in lstScores])

    print(lstScores)
    print("SentenceScore", SentenceScore)
    return SentenceScore

In [None]:

r1= "muy bueno"

In [None]:
calcSentenceScore(r1)

headchild =  2
childid= 1
[2.3, 2.3]
SentenceScore 2.3
[2.3, 2.3]
SentenceScore 2.3


2.3