# Statistics in ReLi Corpus annotated with PALAVRAS Parser

In [1]:
from __future__ import print_function
from __future__ import division

In [3]:
from lxml.etree import ElementTree
reviews = ElementTree().parse('../corpus/ReLiPalavras.xml')

## Token Type Ratio

In [4]:
from collections import Counter
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

print ('Total tokens: {}'.format(sum(freqlist.values())))
print ('Total types:  {}'.format(len(freqlist.keys())))
print ('Token/Type ratio:  {:.1f}%'.format(sum(freqlist.values())/len(freqlist.keys())))

Total tokens: 238435
Total types:  18878
Token/Type ratio:  12.6%


## Lexical Frequency

In [5]:
import ipy_table

In [6]:
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
14210,5.96%,","
12347,5.18%,de
10495,4.40%,.
9920,4.16%,a
9587,4.02%,o
6555,2.75%,que
5880,2.47%,e
4263,1.79%,em
3243,1.36%,é


In [7]:
freqlist = Counter([word_node.get('form').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'LowerCase Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,LowerCase Token
14210,5.96%,","
12470,5.23%,de
10773,4.52%,a
10588,4.44%,o
10495,4.40%,.
6628,2.78%,que
6258,2.62%,e
4738,1.99%,em
3639,1.53%,é


## Lemma Frequency

In [8]:
freqlist = Counter([word_node.get('base').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Lemma']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
32188,13.50%,--
22149,9.29%,o
12476,5.23%,de
6818,2.86%,ser
6629,2.78%,que
6258,2.62%,e
5439,2.28%,um
4738,1.99%,em
3002,1.26%,livro


## Part-of-Speach Frequency

In [9]:
from operator import itemgetter
postag_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    postag = word_node.get('postag')
    
    postag_freq[postag] = postag_freq.get(postag, 0) + 1
    if postag not in freqlist:
        freqlist[postag] = dict()
    freqlist[postag][form] = freqlist[postag].get(form, 0) + 1 

In [10]:
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[pos].values()) * 100) 
                          for w,f in sorted(freqlist[pos].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(postag_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
40404,16.9%,n,"livro(6.5%), história(2.0%), vida(1.1%), mundo(1.1%), leitura(1.0%)"
32229,13.5%,pu,",(44.1%), .(32.6%), ""(6.5%), !(3.6%), )(2.0%)"
30486,12.8%,pron-indef,"o(31.8%), a(26.0%), um(8.8%), os(7.6%), uma(7.2%)"
28969,12.1%,prp,"de(43.0%), em(16.4%), a(8.7%), com(6.9%), por(6.6%)"
28321,11.9%,v-fin,"é(12.5%), foi(2.1%), são(2.0%), tem(1.9%), era(1.3%)"
15611,6.5%,adv,"não(16.3%), mais(6.2%), muito(5.0%), como(4.0%), quando(3.2%)"
14045,5.9%,adj,"bom(2.5%), primeiro(1.4%), melhor(1.3%), grande(1.2%), interessante(1.0%)"
8021,3.4%,conj-c,"e(78.0%), mas(15.8%), ou(5.7%), nem(0.5%), and(0.0%)"
7769,3.3%,pron-pers,"se(22.9%), ele(12.9%), eu(11.8%), ela(11.0%), me(9.2%)"


## Dependency relation frequency

In [21]:
from operator import itemgetter
deprel_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    deprel = word_node.get('deprel')
    
    deprel_freq[deprel] = deprel_freq.get(deprel, 0) + 1
    if deprel not in freqlist:
        freqlist[deprel] = dict()
    freqlist[deprel][form] = freqlist[deprel].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'deprel', 'EXAMPLES']]

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[deprel].values()) * 100) 
                          for w,f in sorted(freqlist[deprel].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(deprel_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,deprel,EXAMPLES
58762,24.6%,DN,"o(16.5%), de(14.9%), a(13.8%), um(4.9%), os(4.0%)"
32229,13.5%,PU,",(44.1%), .(32.6%), ""(6.5%), !(3.6%), )(2.0%)"
25858,10.8%,DP,"livro(3.0%), ele(1.2%), vida(1.1%), mundo(1.0%), história(1.0%)"
24290,10.2%,fA,"em(12.4%), não(9.7%), a(4.9%), para(4.8%), de(4.6%)"
20036,8.4%,CJT,"é(7.5%), são(1.4%), foi(1.2%), tem(1.2%), de(1.2%)"
16084,6.7%,S,"que(12.5%), eu(5.5%), livro(4.7%), ele(4.1%), ela(3.8%)"
14723,6.2%,Od,"que(9.6%), se(6.1%), me(3.6%), livro(3.1%), o(2.8%)"
7788,3.3%,CO,"e(77.5%), mas(15.7%), ou(5.6%), nem(0.6%), mais(0.2%)"
6282,2.6%,STA,"é(21.3%), foi(3.1%), são(2.0%), tem(1.8%), recomendo(1.5%)"


## Dependency Root frequency

In [25]:
freqlist = Counter([word_node.get('form').lower() 
                    for word_node in reviews.iter('word') 
                        if word_node.get('head') =='0' and 
                           word_node.get('postag') !='pu'])

data = [['Freq', '% Freq', 'word']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,word
2086,13.51%,é
295,1.91%,foi
240,1.55%,são
228,1.48%,tem
181,1.17%,livro
156,1.01%,o
150,0.97%,era
148,0.96%,de
117,0.76%,há


## Morphological tags

In [27]:
morf_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    morf_tags = word_node.get('morf').split()
    
    for tag in morf_tags:
        morf_freq[tag] = morf_freq.get(tag, 0) + 1
        if tag not in freqlist:
            freqlist[tag] = dict()
        freqlist[tag][form] = freqlist[tag].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'MORF TAG', 'EXAMPLES']]

for tag, freq in sorted(morf_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[tag].values()) * 100) 
                          for w,f in sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(morf_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,MORF TAG,EXAMPLES
96687,22.0%,--,",(14.7%), de(12.9%), .(10.9%), e(6.5%), em(4.9%)"
87208,19.8%,S,"o(11.8%), a(9.2%), que(4.6%), um(3.6%), livro(3.0%)"
62566,14.2%,M,"o(16.9%), que(6.3%), um(5.1%), livro(4.2%), os(4.0%)"
44553,10.1%,F,"a(18.5%), uma(5.5%), as(3.4%), ela(1.9%), história(1.9%)"
24587,5.6%,VFIN,"é(14.5%), foi(2.4%), são(2.3%), tem(2.1%), era(1.5%)"
22339,5.1%,IND,"é(15.9%), foi(2.6%), são(2.6%), tem(2.4%), era(1.7%)"
20670,4.7%,P,"os(11.5%), as(7.3%), livros(1.7%), pessoas(1.5%), personagens(1.5%)"
20273,4.6%,3S,"é(17.5%), ele(4.9%), se(4.5%), ela(4.2%), foi(2.9%)"
16302,3.7%,PR,"é(21.5%), são(3.5%), tem(3.2%), está(1.7%), faz(1.5%)"


## Extra tags frequency

In [29]:
extra_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    extra_tags = word_node.get('extra').split()
    
    for tag in extra_tags:
        extra_freq[tag] = extra_freq.get(tag, 0) + 1
        if tag not in freqlist:
            freqlist[tag] = dict()
        freqlist[tag][form] = freqlist[tag].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'EXTRA TAG', 'EXAMPLES']]

for tag, freq in sorted(extra_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[tag].values()) * 100) 
                          for w,f in sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(extra_freq.values()) * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,EXTRA TAG,EXAMPLES
120345,33.88%,--,",(11.8%), .(8.7%), a(7.9%), o(7.5%), em(2.8%)"
31975,9.00%,mv,"é(9.9%), ler(1.8%), ser(1.5%), tem(1.5%), foi(1.3%)"
24232,6.82%,np-close,"de(37.0%), em(2.0%), com(1.5%), é(0.9%), para(0.8%)"
18827,5.30%,*,"o(5.3%), a(4.5%), em(2.5%), é(2.1%), e(2.0%)"
16281,4.58%,vH,"ler(3.5%), li(1.7%), ver(1.2%), acho(1.0%), gostei(0.9%)"
15929,4.48%,fmc,"é(17.7%), foi(2.7%), são(2.4%), tem(2.2%), era(1.4%)"
11819,3.33%,clb,"que(50.7%), como(4.8%), se(4.2%), mas(4.1%), quando(4.0%)"
8634,2.43%,clb-fs,"que(66.8%), como(6.2%), quando(5.2%), se(5.0%), porque(2.6%)"
8473,2.39%,-head,"é(8.7%), são(1.5%), tem(1.3%), foi(1.3%), livro(1.1%)"


## Semantic Role Label Frequency

In [31]:
srl_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    tag = word_node.get('srl')
    srl_freq[tag] = srl_freq.get(tag, 0) + 1
    if tag not in freqlist:
        freqlist[tag] = dict()
    freqlist[tag][form] = freqlist[tag].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'SRL TAG', 'EXAMPLES']]

for tag, freq in sorted(srl_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[tag].values()) * 100) 
                          for w,f in sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(srl_freq.values()) * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,SRL TAG,EXAMPLES
144013,60.40%,,",(9.4%), de(8.1%), .(6.9%), a(6.9%), o(6.5%)"
13326,5.59%,ATR,"livro(2.6%), com(1.8%), é(1.1%), de(1.0%), bom(0.9%)"
11757,4.93%,TH,"que(10.5%), livro(6.7%), ele(2.3%), história(2.3%), o(1.9%)"
9627,4.04%,PAT,"que(9.2%), livro(3.3%), o(2.8%), me(1.7%), se(1.5%)"
8244,3.46%,,",(8.9%), de(7.8%), .(7.2%), a(6.3%), o(5.5%)"
7760,3.25%,PRED,"é(16.0%), foi(1.9%), ser(1.5%), tem(1.4%), era(1.2%)"
7562,3.17%,AG,"que(14.1%), eu(7.1%), ele(4.5%), ela(4.2%), livro(2.9%)"
4132,1.73%,FOC,"não(60.5%), só(8.8%), também(7.4%), ainda(6.4%), apenas(4.8%)"
3997,1.68%,MNR,"já(9.4%), forma(4.3%), então(3.5%), assim(3.4%), sim(2.7%)"


## Aspect Frequency

In [12]:
# obj tags
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') if word_node.get('obj') != 'O' and
                        word_node.get('obj') != None])

data = [['freq', '%freq', 'unigram aspect']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram aspect
915,22.8%,livro
230,5.7%,história
203,5.1%,de
113,2.8%,o
110,2.7%,leitura
104,2.6%,a
97,2.4%,personagens
64,1.6%,narrativa
60,1.5%,romance


In [13]:
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O' and word_node.get('obj') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))
        
aspect_freq =  Counter(aspects_list)

data = [['freq', '%freq', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect chunk
881,33.3%,livro
202,7.6%,história
107,4.0%,leitura
82,3.1%,personagens
61,2.3%,narrativa
58,2.2%,crepúsculo
52,2.0%,final
51,1.9%,romance
47,1.8%,obra


In [14]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of aspects in the corpus
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O' and word_node.get('obj') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))

        
aspect_freq =  Counter(aspects_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%aspect', '%tokens', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%aspect,%tokens,aspect chunk
881,33.3%,29.1%,livro
202,7.6%,21.6%,história
107,4.0%,25.6%,leitura
82,3.1%,27.0%,personagens
61,2.3%,42.7%,narrativa
58,2.2%,23.5%,crepúsculo
52,2.0%,20.7%,final
51,1.9%,16.8%,romance
47,1.8%,14.5%,obra


## Predicate frequency

In [16]:
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') 
                            if word_node.get('opinion') != 'O' and 
                               word_node.get('opinion') != None])

data = [['freq', '%freq', 'unigram opinion']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram opinion
838,5.0%,de
585,3.5%,a
517,3.1%,o
382,2.3%,não
345,2.1%,que
236,1.4%,um
205,1.2%,me
198,1.2%,bom
197,1.2%,em


In [17]:
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O' and word_node.get('opinion') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))
        
opinions_freq =  Counter(opinions_list)

data = [['freq', '%freq', 'opinion chunk']]
for token, freq in opinions_freq.most_common(30):
    ratio = freq / sum(opinions_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,opinion chunk
134,2.6%,bom
109,2.1%,recomendo
88,1.7%,gostei
59,1.1%,interessante
45,0.9%,ótimo
42,0.8%,adorei
41,0.8%,envolvente
33,0.6%,amei
33,0.6%,perfeito


In [18]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of opinions in the corpus
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O' and word_node.get('opinion') != None:
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))

        
opinion_freq =  Counter(opinions_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%opinion', '%tokens', 'opinion chunk']]
for token, freq in opinion_freq.most_common(30):
    ratio = freq / sum(opinion_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%opinion,%tokens,opinion chunk
134,2.6%,36.4%,bom
109,2.1%,73.6%,recomendo
88,1.7%,57.1%,gostei
59,1.1%,35.5%,interessante
45,0.9%,60.8%,ótimo
42,0.8%,68.9%,adorei
41,0.8%,73.2%,envolvente
33,0.6%,68.8%,amei
33,0.6%,29.5%,perfeito
