# Statistics in ReLi Corpus annotated with PALAVRAS Parser

In [1]:
from __future__ import print_function
from __future__ import division

In [2]:
from lxml.etree import ElementTree
reviews = ElementTree().parse('../corpus/ReLiUniversalDependencies.xml')

## Token Type Ratio

In [3]:
from collections import Counter
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

print ('Total tokens: {}'.format(sum(freqlist.values())))
print ('Total types:  {}'.format(len(freqlist.keys())))
print ('Token/Type ratio:  {:.1f}%'.format(sum(freqlist.values())/len(freqlist.keys())))

Total tokens: 260561
Total types:  19300
Token/Type ratio:  13.5%


## Lexical Frequency

In [4]:
import ipy_table

In [5]:
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
15158,5.82%,","
14413,5.53%,de
10944,4.20%,a
10557,4.05%,o
9693,3.72%,.
7513,2.88%,que
6264,2.40%,e
4738,1.82%,em
3464,1.33%,é


In [6]:
freqlist = Counter([word_node.get('form').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'LowerCase Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,LowerCase Token
15158,5.82%,","
14579,5.60%,de
11895,4.57%,a
11695,4.49%,o
9693,3.72%,.
7591,2.91%,que
6667,2.56%,e
5261,2.02%,em
3887,1.49%,é


## Lemma Frequency

In [7]:
freqlist = Counter([word_node.get('base').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Lemma']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
25384,9.74%,o
15158,5.82%,","
14579,5.60%,de
9693,3.72%,.
7591,2.91%,que
7575,2.91%,ser
6667,2.56%,e
6309,2.42%,um
5261,2.02%,em


## Part-of-Speach Frequency

In [8]:
from operator import itemgetter
postag_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    postag = word_node.get('postag')
    
    postag_freq[postag] = postag_freq.get(postag, 0) + 1
    if postag not in freqlist:
        freqlist[postag] = dict()
    freqlist[postag][form] = freqlist[postag].get(form, 0) + 1 

In [9]:
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[pos].values()) * 100) 
                          for w,f in sorted(freqlist[pos].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(postag_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
45346,17.4%,NOUN,"livro(6.0%), história(1.9%), mundo(1.2%), vida(1.1%), .(0.9%)"
37349,14.3%,DET,"o(27.7%), a(23.9%), um(8.4%), os(7.4%), uma(6.5%)"
33835,13.0%,VERB,"é(10.1%), ler(1.7%), tem(1.5%), ser(1.4%), foi(1.3%)"
32634,12.5%,ADP,"de(44.7%), em(16.1%), a(8.3%), por(7.4%), com(6.6%)"
32069,12.3%,.,",(47.3%), .(25.3%), ""(7.0%), -(4.1%), !(3.5%)"
17406,6.7%,PRON,"que(28.1%), se(8.4%), o(7.3%), ele(6.0%), eu(5.8%)"
15581,6.0%,ADJ,".(4.4%), bom(2.1%), primeiro(1.3%), grande(1.2%), melhor(1.1%)"
14813,5.7%,ADV,"não(18.3%), mais(8.4%), muito(6.4%), já(3.3%), bem(3.0%)"
13293,5.1%,CONJ,"e(50.1%), que(19.4%), mas(10.0%), se(4.0%), ou(3.9%)"


## Dependency relation frequency

In [10]:
from operator import itemgetter
deprel_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    deprel = word_node.get('deprel')
    
    deprel_freq[deprel] = deprel_freq.get(deprel, 0) + 1
    if deprel not in freqlist:
        freqlist[deprel] = dict()
    freqlist[deprel][form] = freqlist[deprel].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'deprel', 'EXAMPLES']]

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[deprel].values()) * 100) 
                          for w,f in sorted(freqlist[deprel].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(deprel_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,deprel,EXAMPLES
34846,13.4%,det,"o(29.9%), a(25.8%), um(9.0%), os(7.9%), uma(6.9%)"
31852,12.2%,p,",(47.6%), .(25.4%), ""(6.7%), -(4.1%), !(3.5%)"
29938,11.5%,adpmod,"de(46.1%), em(16.9%), a(7.7%), por(6.9%), com(6.8%)"
27438,10.5%,adpobj,"livro(3.1%), ele(1.2%), vida(1.2%), mundo(1.2%), história(1.1%)"
15963,6.1%,nsubj,"que(20.3%), eu(5.9%), livro(4.5%), ele(3.9%), ela(3.6%)"
12843,4.9%,dobj,"que(8.3%), se(5.2%), me(4.1%), o(4.1%), livro(4.0%)"
12662,4.9%,ROOT,"é(14.6%), foi(1.8%), livro(1.6%), tem(1.5%), são(1.2%)"
11439,4.4%,amod,".(6.1%), grande(2.7%), primeiro(1.7%), bom(1.4%), mesmo(1.4%)"
11308,4.3%,conj,"é(4.2%), de(2.4%), em(0.9%), tem(0.9%), são(0.7%)"


## Dependency Root frequency

In [15]:
freqlist = Counter([word_node.get('form').lower() 
                    for word_node in reviews.iter('word') 
                        if word_node.get('deprel') =='ROOT'])

data = [['Freq', '% Freq', 'word']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,word
1852,14.63%,é
223,1.76%,foi
203,1.60%,livro
189,1.49%,tem
157,1.24%,são
123,0.97%,era
120,0.95%,""""
109,0.86%,há
99,0.78%,recomendo


## Morphological tags

In [14]:
morf_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    morf_tags = word_node.get('morf').split()
    
    for tag in morf_tags:
        morf_freq[tag] = morf_freq.get(tag, 0) + 1
        if tag not in freqlist:
            freqlist[tag] = dict()
        freqlist[tag][form] = freqlist[tag].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'MORF TAG', 'EXAMPLES']]

for tag, freq in sorted(morf_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[tag].values()) * 100) 
                          for w,f in sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(morf_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,MORF TAG,EXAMPLES
69387,41.1%,ms,"o(16.9%), que(10.9%), e(9.6%), um(5.2%), livro(4.0%)"
32454,19.2%,fs,"a(27.7%), uma(8.1%), história(2.7%), vida(1.5%), essa(1.3%)"
12812,7.6%,mp,"os(22.9%), todos(4.1%), livros(3.0%), personagens(2.5%), seus(2.4%)"
9031,5.4%,Y2s,"sua(8.5%), tem(6.3%), está(3.1%), faz(2.8%), pode(2.7%)"
7379,4.4%,W3s,"ser(9.9%), ler(8.0%), ter(4.7%), fazer(2.8%), ver(2.7%)"
7209,4.3%,fp,"as(24.3%), pessoas(4.5%), vezes(3.1%), coisas(2.4%), todas(2.2%)"
3949,2.3%,P3s,"é(98.4%), ideia(1.1%), fogem(0.1%), rodeia(0.1%), chateia(0.1%)"
2460,1.5%,J3s,"foi(26.7%), fez(3.9%), conseguiu(3.3%), teve(2.5%), disse(2.3%)"
2113,1.3%,P3p,"são(28.7%), estão(3.4%), vivem(3.2%), podem(2.7%), fazem(2.7%)"


## Aspect Frequency

In [16]:
# obj tags
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') if word_node.get('obj') != 'O' and
                        word_node.get('obj') != None])

data = [['freq', '%freq', 'unigram aspect']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram aspect
953,21.7%,livro
236,5.4%,história
229,5.2%,de
132,3.0%,o
127,2.9%,a
115,2.6%,leitura
100,2.3%,personagens
64,1.5%,romance
64,1.5%,crepúsculo


In [17]:
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O' and word_node.get('obj') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))
        
aspect_freq =  Counter(aspects_list)

data = [['freq', '%freq', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect chunk
916,33.1%,livro
208,7.5%,história
112,4.0%,leitura
85,3.1%,personagens
62,2.2%,crepúsculo
61,2.2%,narrativa
56,2.0%,final
55,2.0%,romance
48,1.7%,obra


In [18]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of aspects in the corpus
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O' and word_node.get('obj') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))

        
aspect_freq =  Counter(aspects_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%aspect', '%tokens', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%aspect,%tokens,aspect chunk
916,33.1%,28.7%,livro
208,7.5%,21.2%,história
112,4.0%,25.6%,leitura
85,3.1%,26.4%,personagens
62,2.2%,23.6%,crepúsculo
61,2.2%,42.4%,narrativa
56,2.0%,20.7%,final
55,2.0%,17.2%,romance
48,1.7%,14.1%,obra


## Predicate frequency

In [19]:
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') 
                            if word_node.get('opinion') != 'O' and 
                               word_node.get('opinion') != None])

data = [['freq', '%freq', 'unigram opinion']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram opinion
929,5.2%,de
637,3.6%,a
577,3.2%,o
394,2.2%,não
384,2.1%,que
274,1.5%,um
212,1.2%,em
211,1.2%,me
206,1.2%,bom


In [20]:
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O' and word_node.get('opinion') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))
        
opinions_freq =  Counter(opinions_list)

data = [['freq', '%freq', 'opinion chunk']]
for token, freq in opinions_freq.most_common(30):
    ratio = freq / sum(opinions_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,opinion chunk
140,2.6%,bom
113,2.1%,recomendo
92,1.7%,gostei
60,1.1%,interessante
46,0.9%,ótimo
45,0.8%,envolvente
43,0.8%,adorei
35,0.7%,amei
33,0.6%,perfeito


In [21]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of opinions in the corpus
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O' and word_node.get('opinion') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))

        
opinion_freq =  Counter(opinions_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%opinion', '%tokens', 'opinion chunk']]
for token, freq in opinion_freq.most_common(30):
    ratio = freq / sum(opinion_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%opinion,%tokens,opinion chunk
140,2.6%,36.0%,bom
113,2.1%,73.4%,recomendo
92,1.7%,56.1%,gostei
60,1.1%,35.5%,interessante
46,0.9%,61.3%,ótimo
45,0.8%,75.0%,envolvente
43,0.8%,68.3%,adorei
35,0.7%,63.6%,amei
33,0.6%,28.4%,perfeito
