# Statistics of the ReLi Corpus

In [2]:
from __future__ import print_function
from __future__ import division

In [3]:
from lxml.etree import ElementTree

# Parse the corpus once; `reviews` holds whatever ElementTree.parse() returns
# and is the object every later cell walks with .iter(...).
# The relative path is resolved against the notebook's working directory.
corpus_path = '../corpus/ReLi.xml'
reviews = ElementTree().parse(corpus_path)

## Token Type Ratio

In [3]:
from collections import Counter

# Frequency of every surface form exactly as written in the corpus
# (no case folding here, unlike the lexical-frequency cell further down).
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# Compute both totals once instead of re-summing the counter for each print.
total_tokens = sum(freqlist.values())
total_types = len(freqlist)

print ('Total tokens: {}'.format(total_tokens))
print ('Total types:  {}'.format(total_types))
if total_types:
    # Tokens per type is a plain ratio (average occurrences of each type),
    # not a percentage, so it is printed without a '%' sign.
    # True division relies on `from __future__ import division` under Python 2.
    print ('Token/Type ratio:  {:.1f}'.format(total_tokens / total_types))
else:
    # A corpus without <word> nodes would otherwise raise ZeroDivisionError.
    print ('Token/Type ratio:  n/a')

Total tokens: 260561
Total types:  19300
Token/Type ratio:  13.5%


## Lexical Frequency

In [4]:
import ipy_table

In [5]:
# Frequency of every lower-cased surface form in the corpus.
freqlist = Counter(word_node.get('form').lower() for word_node in reviews.iter('word'))

# The corpus size is the same for every row, so sum it once instead of
# once per loop iteration.
total_tokens = sum(freqlist.values())

# First row is the table header; one row per token follows, most frequent first.
data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
15158,5.82%,","
14579,5.60%,de
11895,4.57%,a
11695,4.49%,o
9693,3.72%,.
7591,2.91%,que
6667,2.56%,e
5261,2.02%,em
3887,1.49%,é


## Lemma Frequency

In [6]:
# Frequency of every lower-cased lemma (the 'base' attribute of each word node).
freqlist = Counter(word_node.get('base').lower() for word_node in reviews.iter('word'))

# Loop-invariant: total number of counted lemmas, summed once.
total_lemmas = sum(freqlist.values())

# First row is the table header; one row per lemma follows, most frequent first.
data = [['Freq', '% Freq', 'Lemma']]
for lemma, freq in freqlist.most_common(20):
    ratio = freq / total_lemmas * 100
    data.append([freq, '{:.2f}%'.format(ratio), lemma])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
25430,9.76%,o
15158,5.82%,","
14579,5.60%,de
9693,3.72%,.
7591,2.91%,que
7577,2.91%,ser
6678,2.56%,e
6309,2.42%,um
5261,2.02%,em


## Part-of-Speech Frequency

In [7]:
from operator import itemgetter

# postag_freq: POS tag -> number of word nodes carrying that tag.
# freqlist:    POS tag -> {lower-cased form -> occurrences of that form under the tag}.
postag_freq = {}
freqlist = {}

for node in reviews.iter('word'):
    surface = node.get('form').lower()
    tag = node.get('postag')

    postag_freq[tag] = postag_freq.get(tag, 0) + 1
    # setdefault creates the per-tag dictionary the first time a tag is seen.
    forms_for_tag = freqlist.setdefault(tag, {})
    forms_for_tag[surface] = forms_for_tag.get(surface, 0) + 1

In [8]:
# Table of POS tags ordered by frequency, each with its five most common forms.
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

# Loop-invariant: the grand total over all tags is the same for every row.
total_tagged = sum(postag_freq.values())

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[pos]
    # Per-tag total is summed once per row rather than once per example.
    pos_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    # Each example shows the form and its share among tokens with this tag.
    examples = ', '.join('{}({:.1f}%)'.format(w, f / pos_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_tagged * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
44954,17.3%,N,"livro(6.0%), história(1.9%), mundo(1.2%), vida(1.1%), leitura(0.9%)"
33068,12.7%,PREP,"de(43.0%), em(15.7%), a(8.8%), por(6.9%), com(6.5%)"
31175,12.0%,V,"é(11.3%), ler(1.8%), tem(1.7%), ser(1.6%), foi(1.5%)"
29043,11.1%,ART,"o(35.8%), a(30.1%), um(10.1%), os(9.5%), uma(8.4%)"
15158,5.8%,",",",(100.0%)"
13572,5.2%,ADJ,"bom(2.3%), primeiro(1.4%), grande(1.3%), melhor(1.3%), primeira(1.1%)"
11959,4.6%,ADV,"não(22.4%), mais(8.9%), muito(8.0%), bem(3.9%), já(3.7%)"
11059,4.2%,NPROP,"bella(3.3%), edward(3.1%), saramago(2.2%), crepúsculo(2.0%), de(2.0%)"
9693,3.7%,.,.(100.0%)


## Morphological tags

In [9]:
# morf_freq: morphological tag -> number of occurrences of that tag.
# freqlist:  morphological tag -> {lower-cased form -> occurrences under the tag}.
morf_freq = dict()
freqlist = dict()

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    # The 'morf' attribute is split on whitespace, so one word can contribute
    # to several tags; totals below count tag occurrences, not words.
    morf_tags = word_node.get('morf').split()

    for tag in morf_tags:
        morf_freq[tag] = morf_freq.get(tag, 0) + 1
        forms_for_tag = freqlist.setdefault(tag, dict())
        forms_for_tag[form] = forms_for_tag.get(form, 0) + 1

data = [['FREQ', '% FREQ', 'MORF TAG', 'EXAMPLES']]

# Loop-invariant: the grand total over all tags is the same for every row.
total_tags = sum(morf_freq.values())

for tag, freq in sorted(morf_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[tag]
    # Per-tag total is summed once per row rather than once per example.
    tag_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    # Each example shows the form and its share among occurrences of this tag.
    examples = ', '.join('{}({:.1f}%)'.format(w, f / tag_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_tags * 100
    data.append([freq, '{:.1f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,MORF TAG,EXAMPLES
69638,41.1%,ms,"o(16.8%), que(10.9%), e(9.6%), um(5.2%), livro(3.9%)"
32670,19.3%,fs,"a(27.5%), uma(8.1%), história(2.6%), vida(1.5%), essa(1.3%)"
12880,7.6%,mp,"os(22.8%), todos(4.0%), livros(2.9%), personagens(2.5%), seus(2.3%)"
9155,5.4%,Y2s,"sua(8.4%), tem(6.2%), está(3.1%), faz(2.7%), pode(2.7%)"
7386,4.4%,W3s,"ser(9.9%), ler(8.0%), ter(4.7%), fazer(2.8%), ver(2.7%)"
7248,4.3%,fp,"as(24.2%), pessoas(4.5%), vezes(3.0%), coisas(2.4%), todas(2.2%)"
3949,2.3%,P3s,"é(98.4%), ideia(1.1%), fogem(0.1%), rodeia(0.1%), baseia(0.1%)"
2461,1.5%,J3s,"foi(26.7%), fez(3.9%), conseguiu(3.3%), teve(2.5%), disse(2.3%)"
2143,1.3%,P3p,"são(28.3%), estão(3.4%), vivem(3.1%), podem(2.7%), fazem(2.6%)"


## Aspect Frequency

In [10]:
# Frequency of each lower-cased 'target' attribute over all <opinion> nodes.
freqlist = Counter(node.get('target').lower() for node in reviews.iter('opinion'))

# Loop-invariant: total number of opinion targets, summed once.
total_targets = sum(freqlist.values())

# First row is the table header; one row per target follows, most frequent first.
data = [['freq', '%freq', 'aspect(OBJ)']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_targets * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect(OBJ)
916,33.1%,livro
208,7.5%,história
112,4.0%,leitura
85,3.1%,personagens
62,2.2%,crepúsculo
61,2.2%,narrativa
57,2.1%,final
55,2.0%,romance
48,1.7%,obra


In [11]:
import re

# Lower-cased corpus text with tokens separated by single spaces. The extra
# leading and trailing space let the very first and very last token be
# matched exactly like any other token.
sentences = ' ' + ' '.join(node.get('form').lower() for node in reviews.iter('word')) + ' '
freqlist = Counter(node.get('target').lower() for node in reviews.iter('opinion'))

# For each frequent target: how often the chunk occurs anywhere in the corpus
# versus how often it is annotated as an opinion target.
data = [['Total in Corpus', 'Total as target',  'Freq as target', 'aspect chunk']]
for token, freq in freqlist.most_common(20):
    # The lookarounds require a space on both sides without consuming it, so
    # back-to-back repetitions ("x x") are counted twice; a plain
    # str.count(' x ') consumes the shared space and reports only one.
    pattern = '(?<= )' + re.escape(token) + '(?= )'
    total = len(re.findall(pattern, sentences))
    # A target whose text never shows up verbatim in the joined corpus would
    # otherwise raise ZeroDivisionError.
    ratio_text = '{:.1f}%'.format(freq / total * 100) if total else 'n/a'
    data.append([total, freq, ratio_text, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as target,Freq as target,aspect chunk
2779,916,33.0%,livro
864,208,24.1%,história
409,112,27.4%,leitura
321,85,26.5%,personagens
260,62,23.8%,crepúsculo
141,61,43.3%,narrativa
193,57,29.5%,final
274,55,20.1%,romance
251,48,19.1%,obra


## Opinion frequency

In [12]:
import re

# Lower-cased corpus text with tokens separated by single spaces. The extra
# leading and trailing space let the very first and very last token be
# matched exactly like any other token.
sentences = ' ' + ' '.join(node.get('form').lower() for node in reviews.iter('word')) + ' '

# Collect opinion chunks: maximal runs of consecutive words, within one
# sentence, whose 'opinion' attribute is anything other than 'O'.
opinions_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    for word_node in sentence_node.iter('word'):
        if word_node.get('opinion') != 'O':
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # An 'O' word closes the chunk that was being built.
            opinions_list.append(' '.join(chunk))
            chunk = []
    if chunk:
        # The sentence ended while a chunk was still open.
        opinions_list.append(' '.join(chunk))

freqlist = Counter(opinions_list)

# The last column holds opinion chunks, so it is labelled accordingly
# (it was previously labelled 'aspect chunk').
data = [['Total in Corpus', 'Total as opinion',  'Freq as opinion', 'opinion chunk']]
for token, freq in freqlist.most_common(20):
    # The lookarounds require a space on both sides without consuming it, so
    # back-to-back repetitions ("x x") are counted twice; a plain
    # str.count(' x ') consumes the shared space and reports only one.
    pattern = '(?<= )' + re.escape(token) + '(?= )'
    total = len(re.findall(pattern, sentences))
    # Guard against a chunk that is never found, which would otherwise
    # raise ZeroDivisionError.
    ratio_text = '{:.1f}%'.format(freq / total * 100) if total else 'n/a'
    data.append([total, freq, ratio_text, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as opinion,Freq as opinion,aspect chunk
371,140,37.7%,bom
150,113,75.3%,recomendo
163,92,56.4%,gostei
147,60,40.8%,interessante
70,46,65.7%,ótimo
60,45,75.0%,envolvente
59,43,72.9%,adorei
50,35,70.0%,amei
106,33,31.1%,perfeito


### Aspect polarity

In [6]:
from collections import Counter

# targets:          lower-cased target -> number of opinions about it.
# targets_polarity: lower-cased target -> {polarity value -> count}.
# Opinions whose target is the literal string 'NULL' are left out of both.
#
# NOTE(review): this cell iterates over 'Opinion' (capital O) while the
# aspect-frequency cells iterate over 'opinion'. Tag matching is
# case-sensitive, so both cannot match the same corpus file -- confirm which
# spelling the current ReLi.xml uses.
targets = Counter()
targets_polarity = {}

for node in reviews.iter('Opinion'):
    raw_target = node.get('target')
    if raw_target == 'NULL':
        continue

    key = raw_target.lower()
    targets[key] += 1

    by_polarity = targets_polarity.setdefault(key, {})
    label = node.get('polarity')
    by_polarity[label] = by_polarity.get(label, 0) + 1

In [7]:
import ipy_table

# One row per frequent target: its count, its share of all counted targets,
# and how many of its opinions carry each polarity label.
data = [['freq', '%freq', 'target', 'positive', 'negative', 'neutral']]

# Loop-invariant: total number of counted targets, summed once.
total_targets = sum(targets.values())

for target, freq in targets.most_common(20):
    ratio = freq / total_targets * 100
    by_polarity = targets_polarity[target]
    # NOTE(review): in the recorded output the three columns do not add up to
    # `freq` (e.g. 761 + 144 + 0 vs 916), so the corpus presumably uses at
    # least one polarity value other than these three -- verify.
    positive = by_polarity.get('positive', 0)
    negative = by_polarity.get('negative', 0)
    neutral = by_polarity.get('neutral', 0)
    data.append([freq, '{:.1f}%'.format(ratio), target, positive, negative, neutral])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3,4,5
freq,%freq,target,positive,negative,neutral
916,33.1%,livro,761,144,0
208,7.5%,história,158,45,0
112,4.0%,leitura,89,23,0
85,3.1%,personagens,57,23,0
62,2.2%,crepúsculo,38,19,0
61,2.2%,narrativa,47,12,0
57,2.1%,final,35,20,0
55,2.0%,romance,47,8,0
48,1.7%,obra,46,2,0
