# Statistics in ReLi Corpus

In [17]:
from __future__ import print_function
from __future__ import division

In [2]:
from lxml.etree import ElementTree
reviews = ElementTree().parse('../corpus/ReLi.xml')

## Token Type Ratio

In [4]:
from collections import Counter
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

print ('Total tokens: {}'.format(sum(freqlist.values())))
print ('Total types:  {}'.format(len(freqlist.keys())))
print ('Token/Type ratio:  {:.1f}%'.format(sum(freqlist.values())/len(freqlist.keys())))

Total tokens: 260561
Total types:  19300
Token/Type ratio:  13.5%


## Lexical Frequency

In [56]:
import ipy_table

In [64]:
freqlist = Counter([word_node.get('form') for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
15158,5.82%,","
14413,5.53%,de
10944,4.20%,a
10557,4.05%,o
9693,3.72%,.
7513,2.88%,que
6264,2.40%,e
4738,1.82%,em
3464,1.33%,é


In [69]:
freqlist = Counter([word_node.get('form').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'LowerCase Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,LowerCase Token
15158,5.82%,","
14579,5.60%,de
11895,4.57%,a
11695,4.49%,o
9693,3.72%,.
7591,2.91%,que
6667,2.56%,e
5261,2.02%,em
3887,1.49%,é


## Lemma Frequency

In [70]:
freqlist = Counter([word_node.get('base').lower() for word_node in reviews.iter('word')])

data = [['Freq', '% Freq', 'Lemma']]
for token, freq in freqlist.most_common(30):
    ratio = freq / sum(freqlist.values()) * 100
    data.append( [freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
25384,9.74%,o
15158,5.82%,","
14579,5.60%,de
9693,3.72%,.
7591,2.91%,que
7575,2.91%,ser
6667,2.56%,e
6309,2.42%,um
5261,2.02%,em


## Part-of-Speach Frequency

In [72]:
from operator import itemgetter
postag_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    postag = word_node.get('postag')
    
    postag_freq[postag] = postag_freq.get(postag, 0) + 1
    if postag not in freqlist:
        freqlist[postag] = dict()
    freqlist[postag][form] = freqlist[postag].get(form, 0) + 1 

In [73]:
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[pos].values()) * 100) 
                          for w,f in sorted(freqlist[pos].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(postag_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
44954,17.3%,N,"livro(6.0%), história(1.9%), mundo(1.2%), vida(1.1%), leitura(0.9%)"
33068,12.7%,PREP,"de(43.0%), em(15.7%), a(8.8%), por(6.9%), com(6.5%)"
31175,12.0%,V,"é(11.3%), ler(1.8%), tem(1.7%), ser(1.6%), foi(1.5%)"
29043,11.1%,ART,"o(35.8%), a(30.1%), um(10.1%), os(9.5%), uma(8.4%)"
15158,5.8%,",",",(100.0%)"
13572,5.2%,ADJ,"bom(2.3%), primeiro(1.4%), grande(1.3%), melhor(1.3%), primeira(1.1%)"
11959,4.6%,ADV,"não(22.4%), mais(8.9%), muito(8.0%), bem(3.9%), já(3.7%)"
11059,4.2%,NPROP,"bella(3.3%), edward(3.1%), saramago(2.2%), crepúsculo(2.0%), de(2.0%)"
9693,3.7%,.,.(100.0%)


## Morphological tags

In [88]:
morf_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    morf_tags = word_node.get('morf').split()
    
    for tag in morf_tags:
        morf_freq[tag] = morf_freq.get(tag, 0) + 1
        if tag not in freqlist:
            freqlist[tag] = dict()
        freqlist[tag][form] = freqlist[tag].get(form, 0) + 1 
    
data = [['FREQ', '% FREQ', 'MORF TAG', 'EXAMPLES']]

for tag, freq in sorted(morf_freq.items(), key=itemgetter(1), reverse=True):
    examples = ', '.join(['{}({:.1f}%)'.format(w,f / sum(freqlist[tag].values()) * 100) 
                          for w,f in sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]])
    ratio = freq / sum(morf_freq.values()) * 100
    data.append([freq, '{:.1f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,MORF TAG,EXAMPLES
69387,41.1%,ms,"o(16.9%), que(10.9%), e(9.6%), um(5.2%), livro(4.0%)"
32454,19.2%,fs,"a(27.7%), uma(8.1%), história(2.7%), vida(1.5%), essa(1.3%)"
12812,7.6%,mp,"os(22.9%), todos(4.1%), livros(3.0%), personagens(2.5%), seus(2.4%)"
9031,5.4%,Y2s,"sua(8.5%), tem(6.3%), está(3.1%), faz(2.8%), pode(2.7%)"
7379,4.4%,W3s,"ser(9.9%), ler(8.0%), ter(4.7%), fazer(2.8%), ver(2.7%)"
7209,4.3%,fp,"as(24.3%), pessoas(4.5%), vezes(3.1%), coisas(2.4%), todas(2.2%)"
3949,2.3%,P3s,"é(98.4%), ideia(1.1%), fogem(0.1%), baseia(0.1%), chateia(0.1%)"
2460,1.5%,J3s,"foi(26.7%), fez(3.9%), conseguiu(3.3%), teve(2.5%), disse(2.3%)"
2113,1.3%,P3p,"são(28.7%), estão(3.4%), vivem(3.2%), podem(2.7%), fazem(2.7%)"


## Aspect Frequency

In [82]:
# obj tags
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') if word_node.get('obj') != 'O'])

data = [['freq', '%freq', 'unigram aspect']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram aspect
953,21.7%,livro
236,5.4%,história
229,5.2%,de
132,3.0%,o
127,2.9%,a
115,2.6%,leitura
100,2.3%,personagens
64,1.5%,crepúsculo
64,1.5%,narrativa


In [83]:
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O':
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))
        
aspect_freq =  Counter(aspects_list)

data = [['freq', '%freq', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect chunk
916,33.1%,livro
208,7.5%,história
112,4.0%,leitura
85,3.1%,personagens
62,2.2%,crepúsculo
61,2.2%,narrativa
56,2.0%,final
55,2.0%,romance
48,1.7%,obra


In [84]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of aspects in the corpus
aspects_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('obj') != 'O':
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                aspects_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        aspects_list.append('_'.join(chunk))

        
aspect_freq =  Counter(aspects_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%aspect', '%tokens', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%aspect,%tokens,aspect chunk
916,33.1%,28.7%,livro
208,7.5%,21.2%,história
112,4.0%,25.6%,leitura
85,3.1%,26.4%,personagens
62,2.2%,23.6%,crepúsculo
61,2.2%,42.4%,narrativa
56,2.0%,20.7%,final
55,2.0%,17.2%,romance
48,1.7%,14.1%,obra


## Predicate frequency

In [85]:
aspect_freq =  Counter([word_node.get('form').lower() 
                        for word_node in reviews.iter('word') if word_node.get('opinion') != 'O'])

data = [['freq', '%freq', 'unigram opinion']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / sum(aspect_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram opinion
929,5.2%,de
637,3.6%,a
577,3.2%,o
394,2.2%,não
384,2.1%,que
274,1.5%,um
212,1.2%,em
211,1.2%,me
206,1.2%,bom


In [86]:
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O':
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))
        
opinions_freq =  Counter(opinions_list)

data = [['freq', '%freq', 'opinion chunk']]
for token, freq in opinions_freq.most_common(30):
    ratio = freq / sum(opinions_freq.values()) *100
    data.append([freq, '{:.1f}%'.format(ratio), token])
    
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,opinion chunk
140,2.6%,bom
113,2.1%,recomendo
92,1.7%,gostei
60,1.1%,interessante
46,0.9%,ótimo
45,0.8%,envolvente
43,0.8%,adorei
35,0.7%,amei
33,0.6%,perfeito


In [87]:
tokens = '_'.join([word_node.get('form').lower() 
                   for word_node in reviews.iter('word')])

# get chunks of opinions in the corpus
opinions_list = []
for sentence_node in reviews.iter('sentence'):    
    chunk = []
    for word_node in sentence_node.getchildren():
        if word_node.get('opinion') != 'O':
            chunk.append(word_node.get('form').lower())
        else:
            if len(chunk) != 0:
                opinions_list.append('_'.join(chunk))
                chunk = []
    if len(chunk) != 0:
        opinions_list.append('_'.join(chunk))

        
opinion_freq =  Counter(opinions_list)
# check relative frequency for these chunks in the corpus

data = [['freq', '%opinion', '%tokens', 'opinion chunk']]
for token, freq in opinion_freq.most_common(30):
    ratio = freq / sum(opinion_freq.values()) *100
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%opinion,%tokens,opinion chunk
140,2.6%,36.0%,bom
113,2.1%,73.4%,recomendo
92,1.7%,56.1%,gostei
60,1.1%,35.5%,interessante
46,0.9%,61.3%,ótimo
45,0.8%,75.0%,envolvente
43,0.8%,68.3%,adorei
35,0.7%,63.6%,amei
33,0.6%,28.4%,perfeito
