### Script to produce the ReLi Corpus with PALAVRAS parser dependency output

Using PALAVRAS revision 10754, compiled on 2015-06-06.


Corpus reference:
 
          Freitas, C., Motta, E., Milidiú, R., & Cesar, J. (2012).
          Vampiro que brilha... rá! Desafios na anotação de opinião em um corpus
          de resenhas de livros. Proceedings do XI Encontro de Linguística de Corpus (XI ELC). São Carlos - SP.
          http://www.linguateca.pt/Repositorio/ReLi/

Parser reference:

          Bick, Eckhard (2000), The Parsing System "Palavras" - Automatic Grammatical 
          Analysis of Portuguese in a Constraint Grammar Framework
          Aarhus: Aarhus University Press -- dr.phil. thesis
          http://beta.visl.sdu.dk/constraint_grammar.html


In [1]:
from lxml import etree
from lxml.etree import XMLSyntaxError
from subprocess import Popen, PIPE
import re
import os
import logging
import unicodedata

In [7]:
# Input corpus: the ReLi book-review corpus in XML form.
RELI_CORPUS = '../corpus/ReLi.xml'

# Output corpus: written at the end of the script.
# (The closing quote was missing here, which made the whole cell a SyntaxError.)
CORPUS_PALAVRAS = '../corpus/ReLiPalavras.xml'

# Command lines of the three external programs each sentence is piped through,
# in this order.
PALAVRAS_CMD = ['/opt/palavras/por.pl', '--role']
PALAVRAS_MALT = ['/opt/palavras/bin/visldep2malt.pl']
PALAVRAS_EXTRA2SEM = ['/opt/palavras/bin/extra2sem']

# necessary to PALAVRAS
# (PERL_UNICODE is read by the perl interpreter; child processes inherit it.)
os.environ['PERL_UNICODE'] = 'SDA'

# Everything, down to DEBUG level, is logged to ReLiPalavras.log via the root
# logger.
logging.basicConfig(filename='ReLiPalavras.log', level=logging.DEBUG)
logger = logging.getLogger()

In [3]:
# detokenizer
def detokenizer(sentence):
    """Turn a space-separated token string back into running text.

    The rewrites are applied one after another, each on the result of the
    previous one:

    1. punctuation is attached to the word on its left;
    2. a hyphenated clitic pronoun is attached to its verb;
    3. the inner padding of ( ), " " and ' ' pairs is removed;
    4. emoticons that step 1 glued to the previous word are restored.

    :param sentence: tokens joined by single spaces.
    :return: the detokenized string.
    """
    rewrites = (
        # punctuation to keep close to the left word: . , .. ... : ? ! ;
        (r' (\.|,|\.\.|\.\.\.|:|\?|!|;)', r'\1'),
        # punctuation to keep close to both words: -se -me
        (r'(\w) (-) (a|as|o|os|se|me|te|vos|lhe|lha|lhes|lhas|na|nas|no|nos|la|las|lo|los)( |$)',
         r'\1\2\3\4'),
        # punctuation to keep in context: (x) "x" 'x'
        (r'\( (.*?) \)', r'(\1)'),
        (r'" (.*?) "', r'"\1"'),
        (r"' (.*?) '", r"'\1'"),
    )

    text = sentence
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text, flags=re.U)

    # correct emoticons: the first rewrite turned "x : )" into "x: )"
    for glued, emoticon in ((': )', ' :)'), (': (', ' :(')):
        text = text.replace(glued, emoticon)

    return text

In [4]:
# Portuguese contractions: contracted form -> (first part, second part).
# Looked up with lower-cased forms.
contractions = {
    # a + ...
    'ao': ('a', 'o'),
    'aonde': ('a', 'onde'),
    'aos': ('a', 'os'),
    # com + ...
    'comigo': ('com', 'mim'),
    'conosco': ('com', 'nós'),
    'contigo': ('com', 'ti'),
    'convosco': ('com', 'vós'),
    # de + ...
    'da': ('de', 'a'),
    'dacolá': ('de', 'acolá'),
    'dali': ('de', 'ali'),
    'daquela': ('de', 'aquela'),
    'daquelas': ('de', 'aquelas'),
    'daquele': ('de', 'aquele'),
    'daqueles': ('de', 'aqueles'),
    'daqui': ('de', 'aqui'),
    'daquilo': ('de', 'aquilo'),
    'das': ('de', 'as'),
    'daí': ('de', 'aí'),
    'dela': ('de', 'ela'),
    'delas': ('de', 'elas'),
    'dele': ('de', 'ele'),
    'deles': ('de', 'eles'),
    'dentre': ('de', 'entre'),
    'dessa': ('de', 'essa'),
    'dessas': ('de', 'essas'),
    'desse': ('de', 'esse'),
    'desses': ('de', 'esses'),
    'desta': ('de', 'esta'),
    'destas': ('de', 'estas'),
    'deste': ('de', 'este'),
    'destes': ('de', 'estes'),
    'disso': ('de', 'isso'),
    'disto': ('de', 'isto'),
    'do': ('de', 'o'),
    'donde': ('de', 'onde'),
    'dos': ('de', 'os'),
    'doutra': ('de', 'outra'),
    'doutras': ('de', 'outras'),
    'doutro': ('de', 'outro'),
    'doutros': ('de', 'outros'),
    # em + ...
    'na': ('em', 'a'),
    'naquela': ('em', 'aquela'),
    'naquelas': ('em', 'aquelas'),
    'naquele': ('em', 'aquele'),
    'naqueles': ('em', 'aqueles'),
    'naquilo': ('em', 'aquilo'),
    'nas': ('em', 'as'),
    'nela': ('em', 'ela'),
    'nelas': ('em', 'elas'),
    'nele': ('em', 'ele'),
    'neles': ('em', 'eles'),
    'nessa': ('em', 'essa'),
    'nessas': ('em', 'essas'),
    'nesse': ('em', 'esse'),
    'nesses': ('em', 'esses'),
    'nesta': ('em', 'esta'),
    'nestas': ('em', 'estas'),
    'neste': ('em', 'este'),
    'nestes': ('em', 'estes'),
    'nisso': ('em', 'isso'),
    'no': ('em', 'o'),
    'nos': ('em', 'os'),
    'noutra': ('em', 'outra'),
    'noutras': ('em', 'outras'),
    'noutro': ('em', 'outro'),
    'noutros': ('em', 'outros'),
    'num': ('em', 'um'),
    'numa': ('em', 'uma'),
    'numas': ('em', 'umas'),
    'nuns': ('em', 'uns'),
    # por + ...
    'pela': ('por', 'a'),
    'pelas': ('por', 'as'),
    'pelo': ('por', 'o'),
    'pelos': ('por', 'os'),
    # a + a... (crasis)
    'à': ('a', 'a'),
    'àquela': ('a', 'aquela'),
    'àquelas': ('a', 'aquelas'),
    'àquele': ('a', 'aquele'),
    'àqueles': ('a', 'aqueles'),
    'àquilo': ('a', 'aquilo'),
    'às': ('a', 'as'),
}

In [5]:
def _copy_annotation(target, source):
    """Copy the ``obj`` and ``opinion`` attributes of *source* onto *target*."""
    target.set('obj', source.get('obj'))
    target.set('opinion', source.get('opinion'))


def _joined_forms(tokens, separator):
    """Return the lower-cased ``form`` of each token, joined by *separator*."""
    return separator.join([t.get('form').lower() for t in tokens])


# map tokenization between reli tokenization and PALAVRAS tokenization
def transfer_annotation(sentence, parsed_sentence, j):
    """Copy ReLi annotations onto the tokens of one PALAVRAS sentence.

    The two tokenizations differ, so both token lists are walked in parallel
    and a series of heuristics decides, for each PALAVRAS token, which ReLi
    token(s) it corresponds to. The ``place`` and ``polarity`` attributes of
    *sentence* are copied to *parsed_sentence*, and ``obj`` / ``opinion`` of
    the matching ReLi token are copied to each aligned PALAVRAS token
    (modified in place).

    :param sentence: ReLi sentence element; its children carry ``form``,
        ``obj`` and ``opinion`` attributes.
    :param parsed_sentence: PALAVRAS sentence element; its children carry a
        ``form`` attribute.
    :param j: index of the first ReLi token to align. The caller passes the
        value returned for the previous PALAVRAS sentence when one ReLi
        sentence was split into several.
    :return: the index of the next unaligned ReLi token, or ``-1`` when the
        tokens cannot be aligned (the mismatch is logged) or when *j* is
        negative.
    """
    parsed_sentence.set('place', sentence.get('place'))
    parsed_sentence.set('polarity', sentence.get('polarity'))

    # A negative start (e.g. the -1 failure value of a previous call) would
    # silently index from the end of the sentence; report failure instead.
    if j < 0:
        return -1

    # list(element) replaces the deprecated getchildren().
    sentence = list(sentence)
    # j is the align factor for sentence
    size_j = len(sentence)

    parsed_sentence = list(parsed_sentence)
    i = 0  # align factor for parsed_sentence
    size_i = len(parsed_sentence)

    # annotation from ReLi is transferred to PALAVRAS tokenization
    while i < size_i and j < size_j:
        target = parsed_sentence[i]
        source = sentence[j]
        word = target.get('form').lower()
        candidate = source.get('form').lower()

        # align match!
        if word == candidate:
            _copy_annotation(target, source)
            i += 1
            j += 1
            continue

        # PALAVRAS changes symbol ' to `
        # PALAVRAS changes symbol ... to .
        # PALAVRAS changes symbol .. to .
        # PALAVRAS changes symbol - to --
        # PALAVRAS removes the dialog introduction symbol '-'
        # PALAVRAS removes symbol ' from the word
        rewritten = (
            candidate.replace('\'', '`'),
            candidate.replace('...', '.'),
            candidate.replace('..', '.'),
            candidate.replace('-', '--'),
            candidate.replace('-', ''),
            candidate.replace('\'', ''),
        )
        if word in rewritten:
            _copy_annotation(target, source)
            i += 1
            j += 1
            continue

        # PALAVRAS changes admirá - las to admirar- las; the ReLi '-' token
        # is skipped as well, hence j += 2.
        if word.endswith('-') and len(word) >= 4 and word[:-3] == candidate[:-1]:
            _copy_annotation(target, source)
            i += 1
            j += 2
            continue

        # PALAVRAS sometimes puts accents in the words: compare what is left
        # after decomposing and dropping every non-ASCII code point.
        w1 = unicodedata.normalize('NFD', word).encode('ascii', 'ignore')
        w2 = unicodedata.normalize('NFD', candidate).encode('ascii', 'ignore')
        if w1 == w2:
            _copy_annotation(target, source)
            i += 1
            j += 1
            continue

        # tokens don't match due to a collocation: hoje_em_dia
        if '_' in word:
            shift = word.count('_') + 1
            if word == _joined_forms(sentence[j:j + shift], '_'):
                # transfer annotation from first ReLi token to PALAVRAS
                _copy_annotation(target, source)
                i += 1
                j += shift
                continue

        # PALAVRAS concatenated two, three or four tokens
        if word.startswith(candidate):
            found = False
            for shift in (2, 3, 4):
                if word == _joined_forms(sentence[j:j + shift], ''):
                    # transfer annotation from first ReLi token to PALAVRAS
                    _copy_annotation(target, source)
                    i += 1
                    j += shift
                    found = True
                    break
            if found:
                continue

        # contractions! PALAVRAS expanded one ReLi token into its two parts.
        if candidate in contractions:
            first, second = contractions[candidate]
            # i + 1 must be a valid index before peeking at the next token.
            if (word == first and
                    i + 1 < size_i and
                    parsed_sentence[i + 1].get('form').lower() == second):
                _copy_annotation(target, source)
                _copy_annotation(parsed_sentence[i + 1], source)
                i += 2
                j += 1
                # Restart with fresh tokens; falling through would test the
                # remaining heuristics against the already consumed pair.
                continue

        # PALAVRAS split a token in two, three or four tokens
        if candidate.startswith(word):
            found = False
            for shift in (2, 3, 4):
                pieces = parsed_sentence[i:i + shift]
                if candidate == _joined_forms(pieces, ''):
                    # every piece receives the annotation of the ReLi token
                    for piece in pieces:
                        _copy_annotation(piece, source)
                    i += shift
                    j += 1
                    found = True
                    break
            if found:
                continue

        # Unknown match, but next candidate matches, so match annotations and
        # continue...
        if j < size_j - 1 and word == sentence[j + 1].get('form').lower():
            _copy_annotation(target, sentence[j + 1])
            i += 1
            j += 2
            continue

        # Unknown match, but next word matches, so match annotations and
        # continue...
        if (i < size_i - 1 and
                parsed_sentence[i + 1].get('form').lower() == candidate):
            _copy_annotation(parsed_sentence[i + 1], source)
            i += 2
            j += 1
            continue

        # Unknown match, but next words match, so match annotations and
        # continue...
        if (i < size_i - 1 and j < size_j - 1 and
                parsed_sentence[i + 1].get('form').lower() ==
                sentence[j + 1].get('form').lower()):
            _copy_annotation(target, source)
            i += 1
            j += 1
            continue

        # I don't know the problem
        logger.error('''Mismatch from PALAVRAS word "{0}"
                        with ReLi word "{1}".
                        Parsed Sentence: "{2}"
                        ReLi sentence: "{3}"'''.format(
            word,
            candidate,
            ' '.join([w.get('form') for w in parsed_sentence]),
            ' '.join([w.get('form') for w in sentence])))

        return -1
    return j

In [8]:
# Load the ReLi corpus. The parser was built but never handed to
# etree.parse(), so its remove_blank_text option had no effect; pass it
# explicitly.
parser = etree.XMLParser(remove_blank_text=True)
reviews = etree.parse(RELI_CORPUS, parser)

In [None]:
def _run_palavras(text):
    """Pipe *text* through PALAVRAS and its two converters; return the output.

    The three commands run one after another, each fed with the stdout of
    the previous one. A non-zero exit status is logged but does not stop the
    pipeline.
    """
    data = text.encode('utf8')
    # PALAVRAS parser -> visl-to-malt converter -> semantic-information filter
    for command in (PALAVRAS_CMD, PALAVRAS_MALT, PALAVRAS_EXTRA2SEM):
        process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        data, errors = process.communicate(input=data)
        if process.returncode != 0:
            logger.warning('%s exited with status %s: %s', command[0],
                           process.returncode,
                           errors.decode('utf8', 'replace'))
    return data.decode('utf8')


def _repair_palavras_xml(output):
    """Patch the XML text produced by the pipeline so that it can be parsed.

    Flags are passed by keyword: the fourth positional argument of re.sub()
    is ``count``, so the previous ``re.sub(p, r, s, re.U)`` calls applied no
    flag and stopped after 32 (re.U) or 8 (re.M) replacements.
    """
    # fix missing open <sentence> in xml
    if '<body>\n</body>' not in output:
        output = output.replace('<body>', '<body>\n<sentence>')

    # fix missing open <sentence> in xml between break sentences
    output = re.sub(r'/>[\n\t ]+<word id="1"',
                    '/>\n</sentence>\n<sentence>\n<word id="1"', output,
                    flags=re.M)

    # <lixo> tags
    output = re.sub(r'<lixo .+?>', '', output, flags=re.U)

    # remove xml declaration necessary to load from string in etree
    output = output.replace('<?xml version="1.0" encoding="UTF-8"?>', '')

    # bug for output semantic roles in xml format
    output = re.sub(r'form="(.+) <.*?"', r'form="\1"', output, flags=re.U)
    output = re.sub(r'form="(.+) PU.*?"', r'form="\1"', output, flags=re.U)

    # bug for output semantic roles showing  £CLE
    output = output.replace(' £CLE', '')

    # deal with semantic roles output in deprel. ex:
    # head="0" deprel="PU" obj="O"
    output = re.sub(r'deprel="([^§]+?)"', r'deprel="\1" srl=""', output,
                    flags=re.U)

    # head="0" deprel="STA §PRED" obj="O"
    output = re.sub(r'deprel="(.+) §(.+)?"', r'deprel="\1" srl="\2"', output,
                    flags=re.U)
    return output


# Build the output tree: one <review> per ReLi review, holding the PALAVRAS
# sentences that could be aligned with the ReLi annotation.
count = 0
xmldoc = etree.Element('reviews')
total = len(reviews.getroot())
parser = etree.XMLParser(remove_blank_text=True)
for review in reviews.getroot():
    count += 1
    logger.info('Processing review %s/%s', count, total)
    review_node = etree.SubElement(xmldoc, 'review')
    review_node.set('id', review.get('id'))
    review_node.set('book_title', review.get('book_title'))
    review_node.set('score', review.get('score'))

    for sentence in review:

        sentence_string = ' '.join([w.get('form') for w in sentence])

        # PALAVRAS has problems with a tokenized string since tokenization is
        # part of the parsing process; try to restore the untokenized form,
        # since only the tokenized corpus is available
        sentence_string = detokenizer(sentence_string)

        output = _repair_palavras_xml(_run_palavras(sentence_string))

        try:
            tree = etree.fromstring(output, parser)
        except XMLSyntaxError as err:
            logger.error(err)
            continue

        # position, in the ReLi sentence, of the next token to align
        index = 0
        for parsed_sentence in tree.xpath('/treebank/body/sentence'):

            # since PALAVRAS has its own (unique!) tokenization, it is
            # necessary to transfer the ReLi annotation to the PALAVRAS
            # parsed sentence
            index = transfer_annotation(sentence, parsed_sentence, index)
            if index == -1:
                # The alignment is lost, so the position of the remaining
                # PALAVRAS sentences inside the ReLi sentence is unknown;
                # feeding -1 back in would align them from the wrong token.
                break

            sent_node = etree.SubElement(review_node, 'sentence')
            sent_node.set('place', sentence.get('place'))
            sent_node.set('polarity', sentence.get('polarity'))

            # iterate over a copy: append() moves each word out of
            # parsed_sentence
            for word_node in list(parsed_sentence):
                sent_node.append(word_node)

In [None]:
# Serialize the assembled tree to CORPUS_PALAVRAS.
etree.ElementTree(xmldoc).write(
    CORPUS_PALAVRAS,
    encoding='utf8',
    xml_declaration=True,
    pretty_print=True,
)