# Statistics of the ReLi Corpus Annotated with the PALAVRAS Parser

In [1]:
from __future__ import print_function
from __future__ import division

In [2]:
from lxml.etree import ElementTree

# Load the PALAVRAS-annotated ReLi corpus. The value returned by `parse`
# is what the rest of the notebook walks with `reviews.iter('word')`.
corpus_path = '../corpus/ReLiPalavras.xml'
reviews = ElementTree().parse(corpus_path)

## Token Type Ratio

In [3]:
from collections import Counter

# Count every surface form exactly as written (case-sensitive here; the
# later frequency tables lower-case the forms first).
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# Compute each total once and reuse it for both the report and the ratio.
total_tokens = sum(freqlist.values())
total_types = len(freqlist)

print('Total tokens: {}'.format(total_tokens))
print('Total types:  {}'.format(total_types))
# The ratio is a plain quotient (average number of tokens per distinct
# type), not a percentage, so it is printed without a '%' suffix.
print('Token/Type ratio:  {:.1f}'.format(total_tokens / total_types))

Total tokens: 231217
Total types:  18584
Token/Type ratio:  12.4%


## Lexical Frequency

In [4]:
import ipy_table

In [5]:
# Lexical frequency: lower-cased surface forms, 20 most frequent first.
freqlist = Counter(word_node.get('form').lower() for word_node in reviews.iter('word'))

# The corpus-wide token count is the same for every row, so take it once.
total_tokens = sum(freqlist.values())

data = [['Freq', '% Freq', 'Token']] + [
    [freq, '{:.2f}%'.format(freq / total_tokens * 100), token]
    for token, freq in freqlist.most_common(20)
]

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
13787,5.96%,","
12016,5.20%,de
10386,4.49%,a
10286,4.45%,.
10280,4.45%,o
6378,2.76%,que
6078,2.63%,e
4455,1.93%,em
3566,1.54%,é


## Lemma Frequency

In [6]:
# Lemma frequency: same table as the lexical one, but keyed on the
# lower-cased `base` attribute of each word node.
lemmas = (word_node.get('base').lower() for word_node in reviews.iter('word'))
freqlist = Counter(lemmas)

total_lemmas = sum(freqlist.values())

data = [['Freq', '% Freq', 'Lemma']]
for lemma, count in freqlist.most_common(20):
    share = '{:.2f}%'.format(count / total_lemmas * 100)
    data.append([count, share, lemma])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
31361,13.56%,--
21419,9.26%,o
12021,5.20%,de
6683,2.89%,ser
6379,2.76%,que
6078,2.63%,e
5267,2.28%,um
4455,1.93%,em
2949,1.28%,livro


## Part-of-Speech Frequency

In [7]:
from operator import itemgetter

# postag_freq: POS tag -> number of word nodes carrying that tag.
# freqlist:    POS tag -> {lower-cased form -> occurrences under that tag}.
postag_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    postag = word_node.get('postag')
    form = word_node.get('form').lower()

    postag_freq[postag] = 1 + postag_freq.get(postag, 0)

    # setdefault creates the per-tag dict the first time a tag shows up.
    forms_for_tag = freqlist.setdefault(postag, {})
    forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

In [8]:
# One row per POS tag, most frequent tag first. Each row also lists the
# five most frequent forms of that tag with their share within the tag.
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

all_tags_total = sum(postag_freq.values())
ranked_tags = sorted(postag_freq.items(), key=itemgetter(1), reverse=True)

for pos, freq in ranked_tags:
    forms = freqlist[pos]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_tags_total * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
39136,16.9%,n,"livro(6.6%), história(2.1%), vida(1.1%), mundo(1.0%), leitura(1.0%)"
31402,13.6%,pu,",(43.9%), .(32.8%), ""(6.4%), !(3.6%), )(2.0%)"
29529,12.8%,pron-indef,"o(31.9%), a(25.9%), um(8.8%), os(7.6%), uma(7.2%)"
27867,12.1%,prp,"de(43.1%), em(16.0%), a(8.7%), com(7.0%), por(6.7%)"
27471,11.9%,v-fin,"é(12.7%), foi(2.1%), são(2.1%), tem(1.9%), era(1.3%)"
15214,6.6%,adv,"não(16.3%), mais(6.3%), muito(5.0%), como(3.9%), quando(3.1%)"
13723,5.9%,adj,"bom(2.5%), primeiro(1.5%), melhor(1.4%), grande(1.2%), interessante(1.0%)"
7789,3.4%,conj-c,"e(78.0%), mas(15.8%), ou(5.6%), nem(0.5%), and(0.1%)"
7553,3.3%,pron-pers,"se(22.9%), ele(12.7%), eu(12.0%), ela(10.9%), me(9.4%)"


## Morphological tags

In [9]:
# morf_freq: morphological tag -> number of occurrences.
# freqlist:  morphological tag -> {lower-cased form -> occurrences}.
# A word's `morf` attribute is whitespace-separated, so a single word can
# contribute to several tags.
morf_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()

    for tag in word_node.get('morf').split():
        morf_freq[tag] = 1 + morf_freq.get(tag, 0)
        forms_for_tag = freqlist.setdefault(tag, {})
        forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

data = [['FREQ', '% FREQ', 'MORF TAG', 'EXAMPLES']]

# Percentages in the '% FREQ' column are relative to all tag occurrences.
all_tags_total = sum(morf_freq.values())

for tag, freq in sorted(morf_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[tag]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_tags_total * 100
    data.append([freq, '{:.1f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,MORF TAG,EXAMPLES
93776,22.0%,--,",(14.7%), de(12.8%), .(11.0%), e(6.5%), em(4.8%)"
84545,19.8%,S,"o(11.8%), a(9.1%), que(4.5%), um(3.6%), livro(3.1%)"
60688,14.2%,M,"o(16.9%), que(6.2%), um(5.1%), livro(4.3%), os(4.0%)"
43113,10.1%,F,"a(18.4%), uma(5.5%), as(3.4%), ela(1.9%), história(1.9%)"
23846,5.6%,VFIN,"é(14.6%), foi(2.4%), são(2.4%), tem(2.2%), era(1.5%)"
21666,5.1%,IND,"é(16.1%), foi(2.7%), são(2.6%), tem(2.4%), era(1.7%)"
20014,4.7%,P,"os(11.5%), as(7.3%), livros(1.7%), pessoas(1.5%), personagens(1.5%)"
19616,4.6%,3S,"é(17.8%), ele(4.9%), se(4.5%), ela(4.2%), foi(2.9%)"
15774,3.7%,PR,"é(21.8%), são(3.6%), tem(3.3%), faz(1.5%), está(1.4%)"


## Dependency relation frequency

In [10]:
from operator import itemgetter

# deprel_freq: dependency relation -> number of word nodes.
# freqlist:    dependency relation -> {lower-cased form -> occurrences}.
deprel_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    # Only the first whitespace-separated token of `deprel` is kept; the
    # original author notes this works around a bug in PALAVRAS output.
    deprel = word_node.get('deprel').split()[0]

    deprel_freq[deprel] = 1 + deprel_freq.get(deprel, 0)
    forms_for_rel = freqlist.setdefault(deprel, {})
    forms_for_rel[form] = 1 + forms_for_rel.get(form, 0)

data = [['FREQ', '% FREQ', 'DEPREL', 'EXAMPLES']]

all_rels_total = sum(deprel_freq.values())

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[deprel]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_rels_total * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,DEPREL,EXAMPLES
56940,24.6%,DN,"o(16.5%), de(14.9%), a(13.7%), um(4.9%), os(3.9%)"
31402,13.6%,PU,",(43.9%), .(32.8%), ""(6.4%), !(3.6%), )(2.0%)"
25218,10.9%,DP,"livro(3.1%), ele(1.2%), vida(1.1%), história(1.0%), mundo(1.0%)"
23705,10.3%,fA,"em(12.4%), não(9.8%), a(4.8%), para(4.8%), de(4.6%)"
19431,8.4%,CJT,"é(7.6%), são(1.4%), foi(1.3%), tem(1.2%), de(1.1%)"
15697,6.8%,S,"que(12.4%), eu(5.6%), livro(4.7%), ele(4.1%), ela(3.7%)"
14443,6.2%,Od,"que(9.6%), se(6.1%), me(3.6%), livro(3.1%), o(2.8%)"
7560,3.3%,CO,"e(77.5%), mas(15.8%), ou(5.6%), nem(0.6%), mais(0.2%)"
6169,2.7%,STA,"é(21.4%), foi(3.1%), são(2.1%), tem(1.8%), recomendo(1.5%)"


## Dependency Root frequency

In [17]:
# Dependency roots: word nodes whose `head` is '0', leaving out nodes
# tagged as punctuation ('pu'). Forms are lower-cased before counting.
freqlist = Counter()
for word_node in reviews.iter('word'):
    if word_node.get('head') != '0':
        continue
    if word_node.get('postag') == 'pu':
        continue
    freqlist[word_node.get('form').lower()] += 1

total_roots = sum(freqlist.values())

data = [['Freq', '% Freq', 'word']]
for token, freq in freqlist.most_common(20):
    share = '{:.2f}%'.format(freq / total_roots * 100)
    data.append([freq, share, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,word
2043,13.49%,é
293,1.94%,foi
238,1.57%,são
226,1.49%,tem
178,1.18%,livro
153,1.01%,o
148,0.98%,era
143,0.94%,de
112,0.74%,há


## Semantic tags frequency

In [18]:
# extra_freq: semantic tag -> number of occurrences.
# freqlist:   semantic tag -> {lower-cased form -> occurrences}.
extra_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    # A missing or empty `sem` attribute simply yields no tags.
    extra_tags = (word_node.get('sem') or '').split()

    for tag in extra_tags:
        extra_freq[tag] = 1 + extra_freq.get(tag, 0)
        forms_for_tag = freqlist.setdefault(tag, {})
        forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

data = [['FREQ', '% FREQ', 'SEM TAG', 'EXAMPLES']]

all_tags_total = sum(extra_freq.values())

for tag, freq in sorted(extra_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[tag]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_tags_total * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,SEM TAG,EXAMPLES
5573,9.34%,sem-r,"livro(44.2%), história(13.6%), livros(6.1%), leitura(5.9%), romance(3.9%)"
3634,6.09%,am,"amor(6.4%), tempo(6.2%), partido(5.9%), poder(3.1%), atenção(2.2%)"
3338,5.60%,ac,"amor(7.0%), coisa(6.0%), parte(4.9%), coisas(4.3%), verdade(4.2%)"
2889,4.84%,per,"história(26.3%), vida(14.8%), anos(7.9%), tempo(7.8%), dia(3.5%)"
2075,3.48%,sem-c,"obra(10.2%), fim(5.8%), visão(4.4%), trama(4.0%), opinião(3.7%)"
1830,3.07%,HH,"sociedade(9.9%), parte(9.0%), grupo(7.2%), família(6.3%), governo(4.5%)"
1790,3.00%,H,"pessoas(16.3%), amor(13.0%), pessoa(6.0%), crianças(4.9%), tipo(4.7%)"
1710,2.87%,temp,"anos(13.4%), tempo(13.1%), final(8.9%), vezes(8.4%), fim(7.1%)"
1667,2.79%,percep-f,"forma(15.2%), verdade(8.5%), realidade(7.7%), nome(4.9%), pena(4.7%)"


## Semantic tags category frequency

In [19]:
# from https://github.com/NAMD/pypln.backend/blob/develop/pypln/backend/workers/palavras_semantic_tagger.py
#
# SEMANTIC_TAGS maps a category name to a dict of semantic tags belonging to
# that category. Each inner dict maps the tag, written with its surrounding
# angle brackets (e.g. '<sem-r>'), to an English description that usually
# includes example words; some descriptions are empty strings.
#
# NOTE: several tags are listed under more than one category, e.g. '<genre>'
# appears under 'Abstract', 'Concept' and 'Genre', and '<HH>' under both
# 'Human' and 'Collective'. Any code that maps a tag back to a single
# category therefore has to pick one of them.
#
# NOTE: keys such as '<f...>', '<geom...>' or '<clo...>' are literal strings
# in this table; nothing in this literal expands them into tag families.
SEMANTIC_TAGS = \
{
    'Animal':
        {
        '<A>': u'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' ,
        '<AA>': u'Group of animals (cardume, enxame, passarada, ninhada)',
        '<Adom>': u'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)',
        '<AAdom>': u'Group of domestic animals (boiada)',
        '<Aich>': u'Water-animal (tubarão, delfim)',
        '<Amyth>': u'Mythological animal (basilisco)',
        '<Azo>': u'Land-animal (raposa)',
        '<Aorn>': u'Bird (águia, bem-te-vi)',
        '<Aent>': u'Insect (borboleta)',
        '<Acell>': u'Cell-animal (bacteria, blood cells: linfócito)',
        },
    'Plant':
        {
        '<B>': u'Plant, umbrella tag',
        '<BB>': u'Group of plants, plantation (field, forest etc.: mata, nabal)',
        '<Btree>': u'Tree (oliveira, palmeira)',
        '<Bflo>': u'Flower (rosa, taraxaco)',
        '<Bbush>': u'Bush, shrub (rododendro, tamariz)',
        '<fruit>': u'(fruit, berries, nuts: maçã, morango, avelã, melancia)',
        '<Bveg>': u'(vegetable espargo, funcho)',
        },
    'Human':
        {
        '<H>': u'Human, umbrella tag',
        '<HH>': u'Group of humans (organisations, teams, companies, e.g. editora)',
        '<Hattr>': u'Attributive human umbrella tag (many -ista, -ante)',
        '<Hbio>': u'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)',
        '<Hfam>': u'Human with family or other private relation (pai, noiva)',
        '<Hideo>': u'Ideological human (comunista, implies <Hattr>), also: follower, disciple (dadaista)',
        '<Hmyth>': u'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)',
        '<Hnat>': u'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)',
        '<Hprof>': u'Professional human (marinheiro, implies <Hattr>), also: sport, hobby (alpinista)',
        '<Hsick>': u'Sick human (few: asmático, diabético, cp <sick>)',
        '<Htit>': u'Title noun (rei, senhora)',
        },
    'Place and spatial':
        {
        '<L>': u'Place, umbrella tag',
        '<Labs>': u'Abstract place (anverso. auge)',
        '<Lciv>': u'Civitas, town, country, county (equals <L> + <HH>, cidade, país)',
        '<Lcover>': u'Cover, lid (colcha, lona, tampa)',
        '<Lh>': u'Functional place, human built or human-used (aeroporto, anfiteatro, cp. <build> for just a building)',
        '<Lopening>': u'opening, hole (apertura, fossa)',
        '<Lpath>': u'Path (road, street etc.: rua, pista)' ,
        '<Lstar>': u'Star object (planets, comets: planeta, quasar)',
        '<Lsurf>': u'surface (face, verniz, cp. <Lcover>)',
        '<Ltip>': u'tip place, edge (pico, pontinha, cp. <Labs>)',
        '<Ltop>': u'Geographical, natural place (promontório, pântano)',
        '<Ltrap>': u'trap place (armadilha, armazelo)',
        '<Lwater>': u'Water place (river, lake, sea: fonte, foz, lagoa)',
        '<bar>': u'barrier noun (dique, limite, muralha)',
        '<build>': u'(building)',
        '<inst>': u'(institution)',
        '<pict>': u'(picture)',
        '<sit>': u'(situation)',
        '<pos-an>': u'anatomical/body position (few: desaprumo)',
        '<pos-soc>': u'social position, job (emprego, condado, capitania, presidência)',
        },
    'Vehicle':
        {
        '<V>': u'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)',
        '<VV>': u'Group of vehicles (armada, convoy: frota, esquadra)',
        '<Vwater>': u'Water vehicle (ship: navio, submersível, canoa)',
        '<Vair>': u'Air vehicle (plane: hidroplano, jatinho)',
        },
    'Abstract':
        {
        '<ac>': u'Abstract countable, umbrella tag (alternativa, chance, lazer)',
        '<ac-cat>': u'Category word (latinismo, número atômico)',
        '<ac-sign>': u'sign, symbol (parêntese, semicolcheia)',
        '<am>': u'Abstract mass/non-countable, umbrella tag (still contains many cases that could be <f-..>, e.g. habilidade, legalidade)',
        '<ax>': u'Abstract/concept, neither countable nor mass (endogamia), cp. <f>, <sit> etc.',
        '<f...>': u'(features)',
        '<dir>': u'direction noun (estibordo, contrasenso, norte)',
        '<geom...>': u'(shapes)',
        '<meta>': u'meta noun (tipo, espécie)',
        '<brand>': u'(MARCA) brand',
        '<genre>': u'(DISCIPLINA) subject matter',
        '<school>': u'(ESCOLA) school of thought',
        '<idea>': u'(IDEA) idea, concept',
        '<plan>': u'(PLANO) named plan, project',
        '<author>': u'(OBRA) artist-s name, standing for body of work',
        '<absname>': u'(NOME)',
        '<disease>': u'(ESTADO) physiological state, in particular: disease',
        },
    'Concept':
        {
        '<conv>': u'convention (social rule or law, lei, preceito)',
        '<domain>': u'subject matter, profession, cf. <genre>, anatomia, citricultura, dactilografia)',
        '<ism>': u'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)',
        '<genre>': u'',
        '<ling>': u'language (alemão, catalão, bengali)',
        '<disease>': u'',
        '<state...>': u'',
        '<therapy>': u'therapy (also <domain> and <activity>, acupuntura, balneoterapia)',
        },
    'Game':
        {
        '<game>': u'play, game (bilhar, ioiô, poker, also <activity>)',
        },
    'Genre':
        {
        '<genre>': u'genre (especially art genre, cf. <domain>, modernismo, tropicalismo)',
        },
    'Quantity':
        {
        '<unit>': u'',
        '<amount>': u'quantity noun (bocada, teor, sem-fim)',
        '<cur>': u'currency noun (countable, implies <unit>, cf. <mon>, dirham, euro, real, dólar)',
        '<mon>': u'amount of money (bolsa, custo, imposto, cf. <cur>)',
        },
    'Action':
        {
        '<act>': u'Action umbrella tag (+CONTROL, PERFECTIVE)',
        '<act-beat>': u'beat-action (thrashing, pancada, surra)',
        '<act-d>': u'do-action (typically dar/fazer + N, tentativa, teste, homenagem)',
        '<act-s>': u'speech act or communicative act (proposta, ordem)',
        '<act-trick>': u'trick-action (cheat, fraud, ruse, jeito, fraude, similar to <act-d>)',
        '<activity>': u'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)',
        '<sport>': u'',
        '<game>': u'',
        '<therapy>': u'',
        '<dance>': u'dance (both <activity>, <genre> and <sem-l>, calipso, flamenco, forró)',
        '<fight>': u'fight, conflict (also <activity> and +TEMP, briga, querela)',
        '<talk>': u'speech situation, talk, discussion, quarrel (implies <activity> and <sd>, entrevista, lero-lero)',
        },
    'Anatomical':
        {
        '<an>': u'Anatomical noun, umbrella tag (carótida, clítoris, dorso)',
        '<anmov>': u'Movable anatomy (arm, leg, braço, bíceps, cotovelo)',
        '<anorg>': u'Organ (heart, liver, hipófise, coração, testículo)',
        '<anost>': u'Bone (calcâneo, fíbula, vértebra)',
        '<anzo>': u'Animal anatomy (rúmen, carapaça, chifres, tromba)',
        '<anorn>': u'Bird anatomy (bico, pluma)',
        '<anich>': u'Fish anatomy (few: bránquias, siba)',
        '<anent>': u'Insect anatomy (few: tentáculo, olho composto)',
        '<anbo>': u'Plant anatomy (bulbo, caule, folha)',
        '<f-an>': u'(human anatomical feature)',
        },
    'Thing':
        {
        '<cc>': u'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike <part-build>)',
        '<cc-h>': u'Artifact, umbrella tag (so far empty category in PALAVRAS)',
        '<cc-beauty>': u'ornamental object (few: guirlanda, rufo)',
        '<cc-board>': u'flat long object (few: board, plank, lousa, tabla)',
        '<cc-fire>': u'fire object (bonfire, spark, chispa, fogo, girândola)',
        '<cc-handle>': u'handle (garra, ansa, chupadouro)',
        '<cc-light>': u'light artifact (lampião, farol, projector) ',
        '<cc-particle>': u'(atomic) particle (few: cátion, eletrônio)',
        '<cc-r>': u'read object (carteira, cupom, bilhete, carta, cf. <sem-r>)',
        '<cc-rag>': u'cloth object (towel, napkin, carpet, rag) , cp. <mat-cloth>',
        '<cc-stone>': u'(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)',
        '<cc-stick>': u'stick object (long and thin, vara, lançe, paulito)',
        '<object>': u'(OBJECT) named object',
        '<common>': u'(OBJECT) common noun used as name',
        '<mat>': u'(SUBSTANCIA) substance',
        '<class>': u'(CLASSE) classification category for things',
        '<plant>': u'(CLASSE) plant name',
        '<currency>': u'(MOEDA) currency name (also marked on the number)',
        '<mass>': u'mass noun (e.g. "leite", "a-gua")',
        '<furn>': u'furniture (cama, cadeira, tambo, quadro)',
        '<con>': u'container (implies <num+> quantifying, ampola, chícara, aquário)',
        },
    'Substance':
        {
        '<cm>': u'concrete mass/non-countable, umbrella tag, substance (cf. <mat>, terra, choça, magma)',
        '<cm-h>': u'human-made substance (cf. <mat>, cemento)',
        '<cm-chem>': u'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina',
        '<cm-gas>': u'gas substance (so far few: argônio, overlap with. <cm-chem> and <cm>)',
        '<cm-liq>': u'liquid substance (azeite, gasolina, plasma, overlap with <food> and <cm-rem>)',
        '<cm-rem>': u'remedy (medical or hygiene, antibiótico, canabis, quinina, part of <cm-h>, overlap with <cm-chem>)',
        },
    'Materials':
        {
        '<mat>': u'material (argila, bronze, granito, cf. <cm>)',
        '<mat-cloth>': u'cloth material (seda, couro, vison, kevlar), cp. <cc-rag>',
        '<cord>': u'cord, string, rope, tape (previously <tool-tie>, arame, fio, fibrila)',
        },
    'Clothing':
        {
        '<cloA>': u'animal clothing (sela, xabraque)',
        '<cloH>': u'human clothing (albornoz, anoraque, babadouro, bermudas)',
        '<cloH-beauty>': u'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)',
        '<cloH-hat>': u'hat (sombrero, mitra, coroa)',
        '<cloH-shoe>': u'shoe (bota, chinela, patim)',
        '<mat-cloth>': u'cloth material (seda, couro, vison, kevlar), cp. <cc-rag>',
        '<clo...>': u'(clothing)',
        },
    'Collective':
        {
        '<coll>': u'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)',
        '<coll-cc>': u'thing collective, pile (baralho, lanço)',
        '<coll-B>': u'plant-part collective (buquê, folhagem)',
        '<coll-sem>': u'semantic collective, collection (arquivo, repertório)',
        '<coll-tool>': u'tool collective, set (intrumentário, prataria)',
        '<HH>': u'(group)',
        '<AA>': u'(herd)',
        '<BB>': u'(plantation)',
        '<VV>': u'(convoy)',
        },
    'Time_Event':
        {
        '<dur>': u'duration noun (test: durar+, implies <unit>, e.g. átimo, mês, hora)',
        '<temp>': u'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. <dur> and <per>)',
        '<event>': u'non-organised event  (-CONTROL, PERFECTIVE, milagre, morte)',
        '<occ>': u'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized <event>) ',
        '<process>': u'process (-CONTROL, -PERFECTIVE, cp. <event>, balcanização, convecção, estagnação)',
        '<act...>': u'',
        '<activity>': u'',
        '<history>': u'(EFEMERIDE) one-time [historical] occurrence',
        '<date>': u'(DATA) date',
        '<hour>': u'(HORA) hour',
        '<period>': u'(PERIODO) period',
        '<cyclic>': u'(CICLICO) cyclic time expression',
        '<month>': u'month noun/name (agosto, julho, part of <temp>)',
        '<per>': u'period of time (prototypical test: durante, e.g. guerra, década, cf. <dur> and <temp>)',
        },
    'Feature':
        {
        '<f>': u'feature/property, umbrella tag (problematicidade, proporcionalidade)',
        '<f-an>': u'anatomical "local" feature, includes countables, e.g. barbela, olheiras)',
        '<f-c>': u'general countable feature (vestígio, laivos, vinco)',
        '<f-h>': u'human physical feature, not countable (lindura, compleição, same as <f-phys-h>, cp. anatomical local features <f-an>)',
        '<f-phys-h>': u'',
        '<f-psych>': u'human psychological feature (passionalidade, pavonice, cp. passing states <state-h>)',
        '<f-q>': u'quantifiable feature (e.g. circunferência, calor, DanGram-s <f-phys> covers both <f> and <f-q>)',
        '<f-phys>': u'',
        '<f-right>': u'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)',
        '<state>': u'',
        '<state-h>': u'(human state)',
        },
    'Food':
        {
        '<food>': u'natural/simplex food (aveia, açúcar, carne, so far including <spice>)',
        '<food-c>': u'countable food (few: ovo, dente de alho, most are <fruit> or <food-c-h>)',
        '<food-h>': u'human-prepared/complex culinary food (caldo verde, lasanha)',
        '<food-c-h>': u'culinary countable food (biscoito, enchido, panetone, pastel)',
        '<drink>': u'drink (cachaça, leite, guaraná, moca)',
        '<fruit>': u'fruit, berry, nut (still mostly marked as <food-c>, abricote, amora, avelã, cebola)',
        '<spice>': u'condiments, pepper',
        },
    'Part':
        {
        '<part>': u'distinctive or functional part (ingrediente, parte, trecho)',
        '<part-build>': u'structural part of building or vehicle (balustrada, porta, estai)',
        '<piece>': u'indistinctive (little) piece (pedaço, raspa)',
        '<cc-handle>': u'',
        '<Ltip>': u'',
        },
    'Perception':
        {
        '<percep-f>': u'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with <state-h>)',
        '<percep-l>': u'sound (what you hear, apitadela, barrulho, berro, crepitação)',
        '<percep-o>': u'olfactory impression (what you smell, bafo, chamuscom fragrância)',
        '<percep-t>': u'what you taste (PALAVRAS: not implemented)',
        '<percep-w>': u'visual impression (what you see, arco-iris, réstia, vislumbre)',
        },
    'Semantic Product':
        {
        '<sem>': u'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)',
        '<sem-c>': u'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)',
        '<sem-l>': u'listen-work (music, cantarola, prelúdio, at the same time <genre>: bossa nova)',
        '<sem-nons>': u'nonsense, rubbish (implies <sem-s>, galimatias, farelório)',
        '<sem-r>': u'read-work (biografia, dissertação, e-mail, ficha cadastral)',
        '<sem-s>': u'speak-work (palestra, piada, exposto)',
        '<sem-w>': u'watch-work (filme, esquete, mininovela)',
        '<ac-s>': u'(speach act)',
        '<talk>': u'',
        },
    'Disease':
        {
        '<sick>': u'disease (acne, AIDS, sida, alcoolismo, cp. <Hsick>)',
        '<Hsick>': u'',
        '<sick-c>': u'countable disease-object (abscesso, berruga, cicatriz, gangrena)',
        },
    'State-of-affairs':
        {
        '<sit>': u'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than <state> and <state-h>',
        '<state>': u'state (of something, otherwise <sit>), abundância, calma, baixa-mar, equilíbrio',
        '<state-h>': u'human state (desamparo, desesperança, dormência, euforia, febre',
        '<f-psych>': u'',
        '<f-phys-h>': u'',
        },
    'Sport':
        {
        '<sport>': u'sport (capoeira, futebol, golfe, also <activity> and <domain>)',
        },
    'Tool':
        {
        '<tool>': u'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. <mach>)',
        '<tool-cut>': u'cutting tool, knife (canivete, espada)',
        '<tool-gun>': u'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: <tool-shoot>)',
        '<tool-mus>': u'musical instrument (clavicórdio, ocarina, violão)',
        '<tool-sail>': u'sailing tool, sail (vela latina, joanete, coringa)',
        '<mach>': u'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. <tool>)',
        '<tube>': u'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like <an> or <tool>)',
        },
    'Unit':
        {
        '<unit>': u'unit noun (always implying <num+>, implied by <cur> and <dur>, e.g. caloria, centímetro, lúmen))',
        },
    'Weather':
        {
        '<wea>': u'weather (states), umbrella tag (friagem, bruma)',
        '<wea-c>': u'countable weather phenomenon (nuvem, tsunami)',
        '<wea-rain>': u'rain and other precipitation (chuvisco, tromba d-água, granizo)',
        '<wea-wind>': u'wind, storm (brisa, furacão)',
        },
    'Person':
        {
        '<hum>': u'(INDIVIDUAL) person name (cp. <H>)',
        '<official>': u'(CARGO) official function (~ cp. <Htitle> and <Hprof>)',
        '<member>': u'(MEMBRO) member',
        },
    'Organization_Group':
        {
        '<admin>': u'(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)',
        '<org>': u'(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as <Linst>)',
        '<inst>': u'(EMPRESA) organized site (e.g. restaurant, cp. <Linst>)',
        '<media>': u'(EMPRESA) media organisation (e.g. newspaper, tv channel)',
        '<party>': u'(INSTITUICAO) political party',
        '<suborg>': u'(SUB) organized part of any of the above',
        '<company>': u'currently unsupported: (EMPRESA) company (not site-bound, unlike <inst>, now fused with. <org>)',
        },
    'Group':
        {
        '<groupind>': u'(GROUPOIND) people, family',
        '<groupofficial>': u'(GROUPOCARGO) board, government (not fully implemented)',
        '<grouporg>': u'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with <org>)',
        },
    'Place':
        {
        '<top>': u'(GEOGRAFICO) geographical location (cp. <Ltop>)',
        '<civ>': u'(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. <Lciv>)',
        '<address>': u'(CORREIO) address (including numbers etc.)',
        '<site>': u'(ALARGADO) functional place (cp. <Lh>)',
        '<virtual>': u'(VIRTUAL) virtual place',
        '<astro>': u'(OBJECTO) astronomical place (in HAREM object, not place)',
        '<road>': u'suggested (ALARGADO) roads, motorway (unlike <address>)',
        },
    'Work_of_Art':
        {
        '<tit>': u'(REPRODUZIDO) [title of] reproduced work, copy',
        '<pub>': u'(PUBLICACAO) [scientific] publication',
        '<product>': u'(PRODUTO) product brand',
        '<V>': u'(PRODUTO) vehicle brand (cp. <V>, <Vair>, <Vwater>)',
        '<artwork>': u'(ARTE) work of art',
        '<pict>': u'picture (combination of <cc>, <sem-w> and <L>, caricatura, cintilograma, diapositivo)',
        },
    'Colours':
        {
        '<col>': u'colours',
        },
    'Numeric_and_Math':
        {
        '<quantity>': u'(QUANTIDADE) simple measuring numeral',
        '<prednum>': u'(CLASSIFICADO) predicating numeral',
        '<currency>': u'(MOEDA) currency name (also marked on the unit)',
        '<geom>': u'geometry noun (circle, shape, e.g. losango, octógono, elipse)',
        '<geom-line>': u'line (few: linha, percentil, curvas isobáricas)',
        },
    'Modifying_Adjectives':
        {
        '<jh>': u'adjective modifying human noun',
        '<jn>': u'adjective modifying inanimate noun ',
        '<ja>': u'adjective modifying animal',
        '<jb>': u'adjective modifying plant',
        '<col>': u'color adjective',
        '<nat>': u'nationality adjective (also: from a certain town etc.)',
        '<attr>': u'(human) attributive adjective (not fully implemented, cp. <Hattr>, e.g. "um presidente COMUNISTA")',
        },
    'Verbs_related_human_things':
        {
        '<vH>': u'verb with human subject',
        '<vN>': u'verb with inanimate subject',
        },
}

In [20]:
# extra_freq: semantic category (or raw '<tag>') -> number of occurrences.
# freqlist:   same key -> {lower-cased form -> occurrences}.
extra_freq = dict()
freqlist = dict()

# Build the tag -> category index once instead of rescanning every category
# for every tag occurrence. Several tags are listed under more than one
# category in SEMANTIC_TAGS (e.g. '<HH>' under both 'Human' and
# 'Collective'); `setdefault` keeps the first category met while iterating
# SEMANTIC_TAGS, which is the same category the per-tag scan with `break`
# would have picked.
tag_to_category = dict()
for category, subcategories in SEMANTIC_TAGS.items():
    for known_tag in subcategories:
        tag_to_category.setdefault(known_tag, category)

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    # A missing or empty `sem` attribute simply yields no tags.
    extra_tags = (word_node.get('sem') or '').split()

    for tag in extra_tags:
        # SEMANTIC_TAGS keys carry angle brackets; the corpus values do not.
        tag = '<' + tag + '>'
        # Tags that belong to no category are counted under their own
        # bracketed name.
        tag = tag_to_category.get(tag, tag)

        extra_freq[tag] = extra_freq.get(tag, 0) + 1
        forms_for_tag = freqlist.setdefault(tag, dict())
        forms_for_tag[form] = forms_for_tag.get(form, 0) + 1

data = [['FREQ', '% FREQ', 'SEM TAG', 'EXAMPLES']]

# Loop-invariant: total number of tag occurrences across all categories.
all_tags_total = sum(extra_freq.values())

for tag, freq in sorted(extra_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[tag]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_tags_total * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,SEM TAG,EXAMPLES
8905,14.93%,Semantic Product,"livro(27.7%), história(8.5%), livros(3.8%), leitura(3.7%), narrativa(2.6%)"
8900,14.92%,Abstract,"amor(5.2%), tempo(5.0%), poder(2.6%), partido(2.4%), coisa(2.2%)"
8298,13.91%,Time_Event,"história(9.2%), anos(8.3%), tempo(5.4%), vida(5.1%), leitura(4.0%)"
7566,12.68%,Human,"personagens(7.2%), personagem(5.4%), mulher(5.4%), pessoas(3.8%), leitor(3.6%)"
4129,6.92%,Place and spatial,"mundo(9.3%), obra(5.1%), campo(4.8%), parte(4.0%), lugar(3.9%)"
3202,5.37%,Action,"trama(2.6%), palavras(2.4%), jeito(2.1%), guerra(1.9%), crítica(1.5%)"
2151,3.61%,Collective,"sociedade(8.4%), parte(7.7%), grupo(6.1%), família(5.4%), série(4.3%)"
2098,3.52%,Thing,"forma(12.1%), filme(10.3%), coisa(9.5%), coisas(6.8%), páginas(4.4%)"
1949,3.27%,State-of-affairs,"realidade(6.6%), falta(4.1%), pena(4.0%), situação(3.7%), situações(3.2%)"


## Extra tags frequency

In [21]:
# extra_freq: extra tag -> number of occurrences.
# freqlist:   extra tag -> {lower-cased form -> occurrences}.
# The `extra` attribute is whitespace-separated, so one word can add to
# several tags.
extra_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()

    for tag in word_node.get('extra').split():
        extra_freq[tag] = 1 + extra_freq.get(tag, 0)
        forms_for_tag = freqlist.setdefault(tag, {})
        forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

data = [['FREQ', '% FREQ', 'EXTRA TAG', 'EXAMPLES']]

all_tags_total = sum(extra_freq.values())

for tag, freq in sorted(extra_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[tag]
    forms_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / forms_total * 100)
                         for w, f in top_forms)
    ratio = freq / all_tags_total * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,EXTRA TAG,EXAMPLES
116527,33.82%,--,",(11.8%), .(8.8%), a(7.9%), o(7.5%), em(2.7%)"
31006,9.00%,mv,"é(10.0%), ler(1.8%), ser(1.6%), tem(1.5%), foi(1.3%)"
23435,6.80%,np-close,"de(37.0%), em(2.0%), com(1.5%), é(1.0%), para(0.8%)"
18394,5.34%,*,"o(5.3%), a(4.5%), em(2.5%), é(2.1%), e(2.0%)"
15782,4.58%,vH,"ler(3.6%), li(1.8%), ver(1.2%), acho(1.0%), achei(0.9%)"
15530,4.51%,fmc,"é(17.8%), foi(2.7%), são(2.4%), tem(2.2%), era(1.4%)"
11444,3.32%,clb,"que(50.7%), como(4.7%), se(4.3%), mas(4.2%), quando(3.9%)"
8292,2.41%,clb-fs,"que(66.8%), como(6.2%), se(5.2%), quando(5.1%), porque(2.6%)"
8211,2.38%,-head,"é(8.8%), são(1.5%), foi(1.4%), tem(1.4%), livro(1.1%)"


## Semantic Role Label Frequency

In [22]:
# Tally the value of the `srl` attribute of each word, together with the
# lower-cased word forms that carry it.
srl_freq = dict()  # tag -> total number of occurrences
freqlist = dict()  # tag -> {form -> occurrences of that form with the tag}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    # NOTE(review): `.get('srl')` yields None when the attribute is absent, so
    # None and '' would be tallied as two separate tags -- confirm whether
    # they should be merged.
    tag = word_node.get('srl')
    srl_freq[tag] = srl_freq.get(tag, 0) + 1
    tag_forms = freqlist.setdefault(tag, dict())
    tag_forms[form] = tag_forms.get(form, 0) + 1

data = [['FREQ', '% FREQ', 'SRL TAG', 'EXAMPLES']]

# The grand total does not change while the rows are built, so it is computed
# once here instead of once per row.
total_tags = sum(srl_freq.values())

for tag, freq in sorted(srl_freq.items(), key=itemgetter(1), reverse=True):
    # Per-tag total, computed once per tag rather than once per example.
    tag_total = sum(freqlist[tag].values())
    # Up to five most frequent forms for this tag.
    top_forms = sorted(freqlist[tag].items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join(['{}({:.1f}%)'.format(w, f / tag_total * 100)
                          for w, f in top_forms])
    ratio = freq / total_tags * 100
    data.append([freq, '{:.2f}%'.format(ratio), tag, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,SRL TAG,EXAMPLES
139801,60.46%,,",(9.4%), de(8.0%), .(7.0%), a(6.8%), o(6.5%)"
13055,5.65%,ATR,"livro(2.6%), com(1.8%), é(1.1%), de(1.0%), bom(0.9%)"
11556,5.00%,TH,"que(10.5%), livro(6.7%), história(2.3%), ele(2.3%), o(1.9%)"
9385,4.06%,PAT,"que(9.2%), livro(3.3%), o(2.8%), me(1.7%), se(1.5%)"
7629,3.30%,PRED,"é(16.1%), foi(1.9%), ser(1.5%), tem(1.4%), era(1.3%)"
7385,3.19%,AG,"que(14.1%), eu(7.2%), ele(4.5%), ela(4.2%), livro(3.0%)"
7199,3.11%,,",(9.7%), de(8.3%), .(7.7%), a(6.8%), o(6.0%)"
4080,1.76%,FOC,"não(60.6%), só(9.0%), também(7.4%), ainda(6.4%), apenas(4.7%)"
3921,1.70%,MNR,"já(9.5%), forma(4.2%), então(3.4%), assim(3.3%), sim(2.8%)"


## Aspect Frequency

In [25]:
# Frequency of each lower-cased `target` attribute over all Opinion nodes.
freqlist = Counter(node.get('target').lower() for node in reviews.iter('Opinion'))

data = [['freq', '%freq', 'aspect(OBJ)']]

# The total is invariant across rows, so compute it once.
total_targets = sum(freqlist.values())

# One row for each of the 20 most frequent targets.
for token, freq in freqlist.most_common(20):
    ratio = freq / total_targets * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect(OBJ)
864,33.4%,livro
199,7.7%,história
104,4.0%,leitura
81,3.1%,personagens
60,2.3%,narrativa
56,2.2%,crepúsculo
52,2.0%,final
52,2.0%,romance
46,1.8%,obra


In [29]:
def count_occurrences(text, chunk):
    """Return how often the space-delimited `chunk` occurs in `text`.

    Unlike ``str.count``, overlapping matches are counted too, so two adjacent
    occurrences that share a single separating space are both found.
    """
    needle = ' ' + chunk + ' '
    count = 0
    start = text.find(needle)
    while start != -1:
        count += 1
        # Advance by one character only, so matches that overlap the previous
        # one are not skipped.
        start = text.find(needle, start + 1)
    return count


# All lower-cased word forms of the corpus joined by single spaces.
sentences = ' '.join([node.get('form').lower() for node in reviews.iter('word')])
# Padded copy so that the first and the last token are delimited by spaces too.
padded_sentences = ' ' + sentences + ' '
freqlist = Counter(node.get('target').lower() for node in reviews.iter('Opinion'))

data = [['Total in Corpus', 'Total as target',  'Freq as target', 'aspect chunk']]
# One row for each of the 40 most frequent targets.
for token, freq in freqlist.most_common(40):
    total = count_occurrences(padded_sentences, token)
    # A target that never shows up as a contiguous chunk of word forms has no
    # meaningful ratio; report 'n/a' instead of dividing by zero.
    if total:
        ratio_text = '{:.1f}%'.format(freq / total * 100)
    else:
        ratio_text = 'n/a'
    data.append([total, freq, ratio_text, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as target,Freq as target,aspect chunk
2605,864,33.2%,livro
810,199,24.6%,história
385,104,27.0%,leitura
296,81,27.4%,personagens
133,60,45.1%,narrativa
231,56,24.2%,crepúsculo
175,52,29.7%,final
255,52,20.4%,romance
225,46,20.4%,obra


## Opinion Frequency

In [19]:
def count_occurrences(text, chunk):
    """Return how often the space-delimited `chunk` occurs in `text`.

    Unlike ``str.count``, overlapping matches are counted too, so two adjacent
    occurrences that share a single separating space are both found.
    """
    needle = ' ' + chunk + ' '
    count = 0
    start = text.find(needle)
    while start != -1:
        count += 1
        # Advance by one character only, so matches that overlap the previous
        # one are not skipped.
        start = text.find(needle, start + 1)
    return count


# All lower-cased word forms of the corpus joined by single spaces.
sentences = ' '.join([node.get('form').lower() for node in reviews.iter('word')])
# Padded copy so that the first and the last token are delimited by spaces too.
padded_sentences = ' ' + sentences + ' '

# Collect opinion chunks: within a sentence, each maximal run of consecutive
# words whose `opinion` attribute is not 'O' becomes one space-joined chunk.
opinions_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    for word_node in sentence_node.iter('word'):
        if word_node.get('opinion') != 'O':
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # An 'O' word closes the run collected so far.
            opinions_list.append(' '.join(chunk))
            chunk = []
    # A run that reaches the end of the sentence is closed here.
    if chunk:
        opinions_list.append(' '.join(chunk))

freqlist = Counter(opinions_list)

data = [['Total in Corpus', 'Total as opinion',  'Freq as opinion', 'opinion chunk']]
# One row for each of the 20 most frequent opinion chunks.
for token, freq in freqlist.most_common(20):
    total = count_occurrences(padded_sentences, token)
    # Guard against a chunk that is not found in the joined corpus string;
    # report 'n/a' instead of dividing by zero.
    if total:
        ratio_text = '{:.1f}%'.format(freq / total * 100)
    else:
        ratio_text = 'n/a'
    data.append([total, freq, ratio_text, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as opinion,Freq as opinion,aspect chunk
10387,776,7.5%,.
352,128,36.4%,bom
143,103,72.0%,recomendo
152,87,57.2%,gostei
145,57,39.3%,interessante
57,41,71.9%,envolvente
69,41,59.4%,ótimo
58,39,67.2%,adorei
45,33,73.3%,amei
