### Script to produce the ReLi corpus in XML format, with lemmas and morphological features from the UNITEX dictionary.
   
The output format is similar to the MaltParser format. Corpus reference: 

     Freitas, C., Motta, E., Milidiú, R., & Cesar, J. (2012).
     Vampiro que brilha... rá! Desafios na anotação de opinião em um corpus
     de resenhas de livros. Proceedings do XI Encontro de Linguística de Corpus (XI ELC). São Carlos - SP.
     http://www.linguateca.pt/Repositorio/ReLi/

In [1]:
import os
import codecs
import gzip

from lxml import etree
import enchant
import unicodedata

The libraries used require some extra installation steps:

    pip install -U lxml
    pip install -U pyenchant
    apt install aspell-pt-br

In [2]:
# Parameters: input/output locations and the spell checker shared by the cells below
xml_filename = '../corpus/ReLi.xml'    # destination of the generated XML document
root = '../corpus/ReLi/'               # folder holding the ReLi '.txt' corpus files
spell_checker = enchant.Dict('pt_BR')  # Brazilian Portuguese dictionary (needs aspell-pt-br)

PoS Tags in ReLi (retrieved from corpus stats):

```
   FREQ    POSTAG       EXAMPLES
  44954       N         livro, história, mundo, vida, leitura, livros, pessoas, personagens, forma, tempo
  33068     PREP        de, em, a, por, com, para, como, Em, sobre, sem
  31175       V         é, ler, ser, tem, são, foi, É, era, li, faz
  29043      ART        o, a, um, os, uma, as, O, A, Um, Uma
  15158       ,         ,
  13572      ADJ        bom, primeiro, grande, melhor, primeira, humano, diferente, interessante, boa
  11959      ADV        não, mais, muito, bem, já, Não, tão, sempre, ainda, um
  11059     NPROP       Bella, Edward, Saramago, Orwell, Crepúsculo, de, Capitães, Partido, Pedro, Areia
   9693       .         .
   9296      KC         e, mas, ou, E, Mas, pois, tanto, quanto, porém, nem
   8498    PROADJ       sua, esse, seu, essa, cada, seus, todos, minha, este, suas
   7977    PROPESS      se, ele, ela, eu, me, você, nos, eles, Eu, o
   4670      KS         que, quando, se, como, porque, Se, Quando, enquanto, Como, já
   4628  PRO-KS-REL     que, qual, a, quem, o, quais, Que, as, cujo, cuja
   4321      PCP        escrito, lido, sido, feito, escrita, visto, apaixonada, proibido, chamada, publicado
   4282    PROSUB       isso, o, um, tudo, nada, todos, algo, que, O, outros
   3866     VAUX        é, pode, ser, foi, vai, são, ter, acaba, poderia, tinha
   2267       "         "
   1765     PDEN        também, só, mesmo, apenas, até, Só, somente, exemplo, assim, afinal
   1302       -         -
   1214       !         !
   1136    PRO-KS       que, o, quem, Quem, quanto, qual, O, tudo, quão, como
   1076      NUM        um, três, duas, dois, uma, quatro, 15, 5, 3, 12
    739      ...        ...
    703       :         :
    694       )         )
    638       (         (
    502       ?         ?
    317       ;         ;
    276  ADV-KS-REL     onde, quando, como
    235       '         '
    232    ADV-KS       como, onde, que, por, quando
    175      IN         ai, né, Ah, Ok, Ora, ah, Ia, oh, ok, hein
     37       [         [
     20       =         =
      8      CUR        R$, US$
      2       O         ...., Legal
      1       /         /
      1       $         $
      1      //         //
      1      ..         ..
```

The Unitex dictionary (DELAF) can be downloaded from 
http://www.nilc.icmc.usp.br/nilc/projects/unitex-pb/web/dicionarios.html

In [3]:
# Reading unitex files for lemmatization and morphological features. This may take some time and memory...
#
# Every dictionary line is expected to look like ``word,lemma.POSTAG+extra:code1:code2``.
# The cell builds:
#   unitex[postag][word] -> (lemma, morf)   words organized by PoS tag
#   unitex['all'][word]  -> (lemma, morf)   every word, regardless of the tag
# where ``morf`` is the ':'-joined list of inflection codes ('' when the entry has none).

unitex = dict()
unitex['all'] = dict()

with gzip.open('Delaf2015v04.dic.gz') as fp:
    for line in fp:
        line = line.decode('utf8').strip()

        # Split on the first ',' and on the last '.' instead of unpacking a plain
        # split(): blank or malformed lines are skipped rather than raising
        # ValueError, and an extra separator no longer aborts the whole load.
        # NOTE(review): escaped separators ('\,' / '\.') are not interpreted here.
        word, comma, info = line.partition(',')
        lemma, dot, info = info.rpartition('.')
        if not comma or not dot:
            continue

        # NOTE(review): Unitex appears to allow omitting the lemma when it equals
        # the inflected form -- confirm against the DELAF release in use.
        if not lemma:
            lemma = word

        codes = info.split(':')
        postag = codes[0].strip()

        # Keep every inflection code (e.g. 'P3s:Y2s'). The previous test
        # ``len(...) == 2`` silently discarded the morphology of any entry that
        # carried more than one code.
        morf = ':'.join(code.strip() for code in codes[1:])

        # get the first pos tag in case of multiple
        postag = postag.split('+')[0]

        # convert tag A to ADJ to keep similar with ReLi tags reported above
        if postag == 'A':
            postag = 'ADJ'

        # no disambiguation: the last entry read for a word wins
        # keep the words organized by postag
        unitex.setdefault(postag, dict())[word] = (lemma, morf)

        # keep the 'all' lexicon, despite the tag
        unitex['all'][word] = (lemma, morf)

In [4]:
# Reading files from root folder to build the xml. This operation may take some time...
#
# Expected input, one item per line:
#   #Livro_<title>    book title applied to the reviews that follow
#   #Resenha_<id>     starts a new <review>
#   #Nota_<score>     score of the current review
#   #Título / #Corpo  starts a <sentence> placed in the title / body
#   6 tab-separated columns: word, postag, obj, opinion, polarity, (ignored)
#   blank line        starts a new <sentence> in the same place


def _strip_accents(text):
    """Return ``text`` with its diacritics removed, as text (not bytes)."""
    decomposed = unicodedata.normalize('NFKD', text)
    # encode() drops the combining marks but yields bytes on Python 3; decode
    # back so the result can be compared with ordinary strings.
    return decomposed.encode('ASCII', 'ignore').decode('ASCII')


def _restore_accents(word):
    """Return ``word`` with missing accents restored, or ``word`` unchanged.

    The first spell-checker suggestion is accepted only when stripping its
    accents gives back the lower-cased word, so anything other than a missing
    accent is left alone. The casing of the original word (Title, UPPER or
    other) is re-applied to the accepted suggestion.
    """
    lower_word = word.lower()
    suggestions = spell_checker.suggest(lower_word)
    if not suggestions:
        # nothing to compare against; previously this hit an undefined/stale name
        return word

    best = suggestions[0]
    if _strip_accents(best) != lower_word:
        return word

    if word.istitle():
        return best.title()
    if word.isupper():
        return best.upper()
    return best


def _start_sentence(review_node, place):
    """Append an empty <sentence place=...> to ``review_node`` and return it.

    Returns None when there is no review to attach the sentence to.
    """
    if review_node is None:
        return None
    node = etree.SubElement(review_node, 'sentence')
    node.set('place', place)
    return node


xmldoc = etree.Element('reviews')

# sorted() keeps the order of the reviews in the XML independent of the
# arbitrary order returned by os.listdir
for filename in sorted(os.listdir(root)):
    if not filename.endswith('.txt'):
        continue

    filepath = os.path.join(root, filename)
    with codecs.open(filepath, encoding='utf8') as fp:
        book_name = ''
        sent_place = ''
        word_id = 0
        # reset per file so that nothing is attached to a node of a previous file
        review_node = None
        sent_node = None

        for line in fp:
            line = line.strip()
            fields = line.split('\t')

            # check if line contains #Livro
            if line.startswith('#Livro_'):
                book_name = line[7:]

            # check if line contains #Resenha
            elif line.startswith('#Resenha_'):
                review_node = etree.SubElement(xmldoc, 'review')
                review_node.set('id', line[9:])
                review_node.set('book_title', book_name)

            # check if line contains #Nota
            elif line.startswith('#Nota_'):
                # a score outside of any review has nothing to be attached to
                if review_node is not None:
                    review_node.set('score', line[6:])

            # check if line contains #Título or #Corpo
            elif line.startswith(('#Título', '#Corpo')):
                sent_place = 'title' if line.startswith('#Título') else 'body'
                sent_node = _start_sentence(review_node, sent_place)
                word_id = 0

            # check if line contains 6 elements, this is the corpus data
            elif len(fields) == 6:
                if sent_node is None:
                    # token seen before any sentence marker: open a sentence
                    # when a review exists, otherwise ignore the token
                    sent_node = _start_sentence(review_node, sent_place)
                    word_id = 0
                if sent_node is None:
                    continue

                word, pos, obj, opinion, polarity, _ = fields
                lower_word = word.lower()

                # if word not in unitex try to restore its accents using a spell checker
                if lower_word not in unitex['all']:
                    word = _restore_accents(word)
                    # look up the corrected form, not the misspelt one
                    lower_word = word.lower()

                # check unitex for lemma and morphological features
                if pos in unitex and lower_word in unitex[pos]:
                    base, morf = unitex[pos][lower_word]
                elif lower_word in unitex['all']:
                    base, morf = unitex['all'][lower_word]
                else:
                    base = word
                    morf = ''

                # build word node
                word_node = etree.SubElement(sent_node, 'word')
                word_node.set('id', str(word_id))
                word_id += 1
                word_node.set('form', word)
                word_node.set('base', base)
                word_node.set('morf', morf)
                word_node.set('postag', pos)
                word_node.set('obj', obj)
                word_node.set('opinion', opinion)

                # the polarity column of every token is written on the sentence,
                # so the value of the last token of the sentence is the one kept
                if polarity == '-':
                    sent_node.set('polarity', 'negative')
                elif polarity == '+':
                    sent_node.set('polarity', 'positive')
                else:
                    sent_node.set('polarity', 'neutral')

            # blank line: start a new sentence in the same place
            elif len(line) == 0:
                sent_node = _start_sentence(review_node, sent_place)
                word_id = 0

In [5]:
# Serialize the <reviews> tree to xml_filename: UTF-8, with an XML declaration, indented
xml_tree = etree.ElementTree(xmldoc)
xml_tree.write(
    xml_filename,
    encoding='utf8',
    xml_declaration=True,
    pretty_print=True,
)