# Statistics in SemEval Train Corpus annotated with Stanford CoreNLP

In [1]:
from __future__ import print_function
from __future__ import division

In [6]:
from lxml.etree import ElementTree
# Load the annotated training corpus from its XML file. `reviews` holds the
# value returned by parse(); the cells below walk it with .iter('word') and
# .iter('sentence').
reviews = ElementTree().parse('../corpus/SemEvalABSA2016EnglishRestaurants_train.xml')

## Token Type Ratio

In [7]:
from collections import Counter
# Count how often each surface form occurs across every <word> node.
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# Compute both totals once instead of re-deriving them for every print.
total_tokens = sum(freqlist.values())
total_types = len(freqlist)

print('Total tokens: {}'.format(total_tokens))
print('Total types:  {}'.format(total_types))
# The value is "tokens per type" (a plain ratio, not a percentage), so it is
# printed without a trailing '%'.
print('Token/Type ratio:  {:.1f}'.format(total_tokens / total_types))

Total tokens: 29524
Total types:  3985
Token/Type ratio:  7.4%


## Lexical Frequency

In [8]:
import ipy_table

In [9]:
# Frequency of every surface form (case-sensitive) over all <word> nodes.
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# The corpus size does not change inside the loop, so sum it only once.
total_tokens = sum(freqlist.values())

# Header row followed by one row per token for the 30 most frequent tokens.
data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
1701,5.76%,.
1187,4.02%,the
1037,3.51%,","
879,2.98%,and
603,2.04%,I
589,1.99%,a
532,1.80%,is
523,1.77%,to
478,1.62%,was


In [10]:
# Frequency of every surface form after lower-casing, so that e.g. "The"
# and "the" are counted together.
freqlist = Counter(word_node.get('form').lower() for word_node in reviews.iter('word'))

# The corpus size does not change inside the loop, so sum it only once.
total_tokens = sum(freqlist.values())

# Header row followed by one row per token for the 30 most frequent tokens.
data = [['Freq', '% Freq', 'LowerCase Token']]
for token, freq in freqlist.most_common(30):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,LowerCase Token
1701,5.76%,.
1540,5.22%,the
1037,3.51%,","
894,3.03%,and
655,2.22%,i
620,2.10%,a
534,1.81%,is
530,1.80%,to
483,1.64%,was


## Lemma Frequency

In [11]:
# Frequency of every lower-cased lemma, read from the 'base' attribute of
# each <word> node.
freqlist = Counter(word_node.get('base').lower() for word_node in reviews.iter('word'))

# The corpus size does not change inside the loop, so sum it only once.
total_tokens = sum(freqlist.values())

# Header row followed by one row per lemma for the 30 most frequent lemmas.
data = [['Freq', '% Freq', 'Lemma']]
for token, freq in freqlist.most_common(30):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
1701,5.76%,.
1654,5.60%,be
1540,5.22%,the
1037,3.51%,","
894,3.03%,and
706,2.39%,i
681,2.31%,a
530,1.80%,to
400,1.35%,have


## Part-of-Speech Frequency

In [12]:
from operator import itemgetter
# Two tallies built in a single pass over the corpus:
#   postag_freq: POS tag -> number of words carrying that tag
#   freqlist:    POS tag -> {lower-cased word form -> count}
postag_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    postag = word_node.get('postag')

    postag_freq[postag] = 1 + postag_freq.get(postag, 0)
    # setdefault creates the per-tag dict the first time a tag is seen.
    forms_for_tag = freqlist.setdefault(postag, {})
    forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

In [13]:
# One row per POS tag, most frequent first: absolute count, share of all
# tagged words, the tag itself, and its five most frequent word forms.
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

# Loop-invariant: total number of tagged words across all POS tags.
total_words = sum(postag_freq.values())

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[pos]
    # Sum the per-tag total once instead of once per example.
    pos_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    # Each example is shown with its share among the words of this tag.
    examples = ', '.join('{}({:.1f}%)'.format(w, f / pos_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
4187,14.2%,NN,"food(6.6%), place(4.9%), service(3.3%), restaurant(2.7%), time(1.5%)"
2778,9.4%,DT,"the(55.3%), a(22.3%), this(8.8%), all(2.8%), an(2.2%)"
2710,9.2%,JJ,"great(7.1%), good(6.8%), delicious(2.1%), nice(1.8%), excellent(1.7%)"
2398,8.1%,IN,"of(15.0%), for(13.2%), in(12.6%), with(7.3%), on(7.3%)"
2059,7.0%,RB,"not(9.7%), n't(8.2%), very(5.9%), so(4.5%), here(3.7%)"
1921,6.5%,.,".(88.5%), !(10.4%), ?(1.1%)"
1815,6.1%,PRP,"i(33.2%), it(21.7%), we(14.5%), you(13.2%), they(6.8%)"
1331,4.5%,VBD,"was(36.3%), were(10.7%), had(9.4%), did(3.2%), went(3.1%)"
1214,4.1%,CC,"and(73.6%), but(18.3%), or(5.8%), both(0.7%), either(0.7%)"


## Dependency relation frequency

In [14]:
from operator import itemgetter
# Two tallies built in a single pass over the corpus:
#   deprel_freq: dependency relation -> number of words carrying it
#   freqlist:    dependency relation -> {lower-cased word form -> count}
deprel_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    deprel = word_node.get('deprel')

    deprel_freq[deprel] = deprel_freq.get(deprel, 0) + 1
    # setdefault creates the per-relation dict on first sight of a relation.
    forms = freqlist.setdefault(deprel, {})
    forms[form] = forms.get(form, 0) + 1

# One row per relation, most frequent first, with its top five word forms.
data = [['FREQ', '% FREQ', 'deprel', 'EXAMPLES']]

# Loop-invariant: total number of counted words across all relations.
total_words = sum(deprel_freq.values())

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[deprel]
    # Sum the per-relation total once instead of once per example.
    deprel_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / deprel_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,deprel,EXAMPLES
3446,11.7%,punct,".(49.4%), ,(30.1%), !(5.8%), -rrb-(2.8%), ...(2.8%)"
2925,9.9%,nsubj,"i(20.3%), it(8.2%), we(8.1%), you(6.8%), food(4.5%)"
2532,8.6%,det,"the(60.5%), a(24.2%), this(7.2%), an(2.4%), some(1.2%)"
2060,7.0%,root,"good(2.5%), had(2.4%), go(2.1%), place(1.9%), went(1.7%)"
2060,7.0%,case,"of(16.7%), for(14.1%), in(14.0%), to(8.3%), with(8.3%)"
1847,6.3%,nmod,"food(2.7%), place(2.2%), restaurant(1.9%), it(1.8%), restaurants(1.4%)"
1760,6.0%,advmod,"very(6.9%), so(5.3%), here(4.2%), just(4.1%), back(3.1%)"
1594,5.4%,amod,"great(8.3%), good(5.1%), best(2.8%), other(2.1%), nice(1.6%)"
1275,4.3%,cop,"is(35.9%), was(31.1%), are(8.8%), were(6.8%), be(5.6%)"


## Dependency Root frequency

In [20]:
from operator import itemgetter
# NOTE(review): this cell sits under the "Dependency Root frequency" heading
# but tallies every dependency relation, exactly like the cell in the
# "Dependency relation frequency" section; nothing here restricts the counts
# to the 'root' relation. Confirm whether a root-only table was intended.
#
# Two tallies built in a single pass over the corpus:
#   deprel_freq: dependency relation -> number of words carrying it
#   freqlist:    dependency relation -> {lower-cased word form -> count}
deprel_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    deprel = word_node.get('deprel')

    deprel_freq[deprel] = deprel_freq.get(deprel, 0) + 1
    # setdefault creates the per-relation dict on first sight of a relation.
    forms = freqlist.setdefault(deprel, {})
    forms[form] = forms.get(form, 0) + 1

# One row per relation, most frequent first, with its top five word forms.
data = [['FREQ', '% FREQ', 'deprel', 'EXAMPLES']]

# Loop-invariant: total number of counted words across all relations.
total_words = sum(deprel_freq.values())

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[deprel]
    # Sum the per-relation total once instead of once per example.
    deprel_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / deprel_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,deprel,EXAMPLES
3446,11.7%,punct,".(49.4%), ,(30.1%), !(5.8%), -rrb-(2.8%), ...(2.8%)"
2925,9.9%,nsubj,"i(20.3%), it(8.2%), we(8.1%), you(6.8%), food(4.5%)"
2532,8.6%,det,"the(60.5%), a(24.2%), this(7.2%), an(2.4%), some(1.2%)"
2060,7.0%,root,"good(2.5%), had(2.4%), go(2.1%), place(1.9%), went(1.7%)"
2060,7.0%,case,"of(16.7%), for(14.1%), in(14.0%), to(8.3%), with(8.3%)"
1847,6.3%,nmod,"food(2.7%), place(2.2%), restaurant(1.9%), it(1.8%), restaurants(1.4%)"
1760,6.0%,advmod,"very(6.9%), so(5.3%), here(4.2%), just(4.1%), back(3.1%)"
1594,5.4%,amod,"great(8.3%), good(5.1%), best(2.8%), other(2.1%), nice(1.6%)"
1275,4.3%,cop,"is(35.9%), was(31.1%), are(8.8%), were(6.8%), be(5.6%)"


## Named Entities Frequency

In [21]:
from operator import itemgetter
# Two tallies built in a single pass over the corpus:
#   ne_freq:  value of the 'ner' attribute -> number of words carrying it
#   freqlist: value of the 'ner' attribute -> {lower-cased form -> count}
ne_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    ne = word_node.get('ner')

    ne_freq[ne] = ne_freq.get(ne, 0) + 1
    # setdefault creates the per-label dict on first sight of a label.
    forms = freqlist.setdefault(ne, {})
    forms[form] = forms.get(form, 0) + 1

# One row per entity label, most frequent first, with its top five forms.
data = [['FREQ', '% FREQ', 'NAMED ENTITY', 'EXAMPLES']]

# Loop-invariant: total number of counted words across all labels.
total_words = sum(ne_freq.values())

for ne, freq in sorted(ne_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[ne]
    # Sum the per-label total once instead of once per example.
    ne_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / ne_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), ne, examples])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,NAMED ENTITY,EXAMPLES
28239,95.6%,O,".(6.0%), the(5.4%), ,(3.7%), and(3.2%), i(2.3%)"
200,0.7%,LOCATION,"new(10.0%), york(9.0%), manhattan(6.5%), brooklyn(5.0%), ny(3.0%)"
186,0.6%,NUMBER,"one(36.0%), two(11.3%), four(5.9%), 2(5.4%), three(4.3%)"
153,0.5%,DURATION,"minutes(10.5%), years(10.5%), day(7.2%), the(5.9%), hour(4.6%)"
140,0.5%,DATE,"once(14.3%), saturday(7.1%), now(7.1%), the(7.1%), last(5.0%)"
135,0.5%,MISC,"thai(15.6%), indian(14.8%), japanese(9.6%), italian(6.7%), chinese(5.9%)"
132,0.4%,PERSON,"suan(4.5%), mizu(3.8%), dokebi(3.0%), la(3.0%), saul(3.0%)"
100,0.3%,MONEY,"$(48.0%), 500(4.0%), 60(3.0%), 2(3.0%), 10(3.0%)"
83,0.3%,TIME,"night(37.3%), evening(15.7%), last(9.6%), afternoon(6.0%), late(6.0%)"


## Aspect Frequency

In [16]:
# Unigram aspect frequency: count the lower-cased form of every word whose
# 'obj' attribute is present and differs from 'O' (presumably the "outside"
# label of the aspect annotation -- confirm against the corpus schema).
aspect_freq = Counter(
    word_node.get('form').lower()
    for word_node in reviews.iter('word')
    # Single lookup; excludes both a missing attribute (None) and 'O'.
    if word_node.get('obj') not in ('O', None)
)

# Loop-invariant: total number of aspect-tagged words.
total_aspect_words = sum(aspect_freq.values())

# Header row followed by one row per word for the 30 most frequent words.
data = [['freq', '%freq', 'unigram aspect']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / total_aspect_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram aspect
953,21.7%,livro
236,5.4%,história
229,5.2%,de
132,3.0%,o
127,2.9%,a
115,2.6%,leitura
100,2.3%,personagens
64,1.5%,romance
64,1.5%,crepúsculo


In [17]:
# Collect aspect chunks: maximal runs of consecutive words within a sentence
# whose 'obj' attribute is present and differs from 'O'. The lower-cased
# forms of each run are joined with '_'.
aspects_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for word_node in sentence_node:
        if word_node.get('obj') not in ('O', None):
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # A non-aspect word closes the chunk that was being built.
            aspects_list.append('_'.join(chunk))
            chunk = []
    # Flush a chunk that runs up to the end of the sentence.
    if chunk:
        aspects_list.append('_'.join(chunk))

aspect_freq = Counter(aspects_list)

# Loop-invariant: total number of aspect chunks found.
total_chunks = sum(aspect_freq.values())

# Header row followed by one row per chunk for the 30 most frequent chunks.
data = [['freq', '%freq', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / total_chunks * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect chunk
916,33.1%,livro
208,7.5%,história
112,4.0%,leitura
85,3.1%,personagens
62,2.2%,crepúsculo
61,2.2%,narrativa
56,2.0%,final
55,2.0%,romance
48,1.7%,obra


In [18]:
# The whole corpus as one '_'-joined string of lower-cased forms; used below
# to count how often a chunk's text occurs anywhere in the corpus.
tokens = '_'.join(word_node.get('form').lower()
                  for word_node in reviews.iter('word'))

# Collect aspect chunks: maximal runs of consecutive words within a sentence
# whose 'obj' attribute is present and differs from 'O'. The lower-cased
# forms of each run are joined with '_'.
aspects_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for word_node in sentence_node:
        if word_node.get('obj') not in ('O', None):
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # A non-aspect word closes the chunk that was being built.
            aspects_list.append('_'.join(chunk))
            chunk = []
    # Flush a chunk that runs up to the end of the sentence.
    if chunk:
        aspects_list.append('_'.join(chunk))

aspect_freq = Counter(aspects_list)

# Loop-invariant: total number of aspect chunks found.
total_chunks = sum(aspect_freq.values())

# For each frequent chunk report its share among all aspect chunks
# ('%aspect') and how often its text is tagged as an aspect relative to all
# of its occurrences in the corpus string ('%tokens').
data = [['freq', '%aspect', '%tokens', 'aspect chunk']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / total_chunks * 100
    # NOTE(review): str.count matches substrings, so a chunk's text is also
    # counted when it appears inside a longer token; the denominator may be
    # inflated. Confirm whether whole-token matching was intended.
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%aspect,%tokens,aspect chunk
916,33.1%,28.7%,livro
208,7.5%,21.2%,história
112,4.0%,25.6%,leitura
85,3.1%,26.4%,personagens
62,2.2%,23.6%,crepúsculo
61,2.2%,42.4%,narrativa
56,2.0%,20.7%,final
55,2.0%,17.2%,romance
48,1.7%,14.1%,obra


## Predicate frequency

In [19]:
# Unigram opinion frequency: count the lower-cased form of every word whose
# 'opinion' attribute is present and differs from 'O'.
# NOTE(review): the result is stored in `aspect_freq` although it holds
# opinion counts, overwriting the aspect counter of the same name; the name
# is kept so the notebook's globals stay unchanged -- consider renaming.
aspect_freq = Counter(
    word_node.get('form').lower()
    for word_node in reviews.iter('word')
    # Single lookup; excludes both a missing attribute (None) and 'O'.
    if word_node.get('opinion') not in ('O', None)
)

# Loop-invariant: total number of opinion-tagged words.
total_opinion_words = sum(aspect_freq.values())

# Header row followed by one row per word for the 30 most frequent words.
data = [['freq', '%freq', 'unigram opinion']]
for token, freq in aspect_freq.most_common(30):
    ratio = freq / total_opinion_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,unigram opinion
929,5.2%,de
637,3.6%,a
577,3.2%,o
394,2.2%,não
384,2.1%,que
274,1.5%,um
212,1.2%,em
211,1.2%,me
206,1.2%,bom


In [20]:
# Collect opinion chunks: maximal runs of consecutive words within a
# sentence whose 'opinion' attribute is present and differs from 'O'. The
# lower-cased forms of each run are joined with '_'.
opinions_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for word_node in sentence_node:
        if word_node.get('opinion') not in ('O', None):
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # A non-opinion word closes the chunk that was being built.
            opinions_list.append('_'.join(chunk))
            chunk = []
    # Flush a chunk that runs up to the end of the sentence.
    if chunk:
        opinions_list.append('_'.join(chunk))

opinions_freq = Counter(opinions_list)

# Loop-invariant: total number of opinion chunks found.
total_chunks = sum(opinions_freq.values())

# Header row followed by one row per chunk for the 30 most frequent chunks.
data = [['freq', '%freq', 'opinion chunk']]
for token, freq in opinions_freq.most_common(30):
    ratio = freq / total_chunks * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,opinion chunk
140,2.6%,bom
113,2.1%,recomendo
92,1.7%,gostei
60,1.1%,interessante
46,0.9%,ótimo
45,0.8%,envolvente
43,0.8%,adorei
35,0.7%,amei
33,0.6%,perfeito


In [21]:
# The whole corpus as one '_'-joined string of lower-cased forms; used below
# to count how often a chunk's text occurs anywhere in the corpus.
tokens = '_'.join(word_node.get('form').lower()
                  for word_node in reviews.iter('word'))

# Collect opinion chunks: maximal runs of consecutive words within a
# sentence whose 'opinion' attribute is present and differs from 'O'. The
# lower-cased forms of each run are joined with '_'.
opinions_list = []
for sentence_node in reviews.iter('sentence'):
    chunk = []
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for word_node in sentence_node:
        if word_node.get('opinion') not in ('O', None):
            chunk.append(word_node.get('form').lower())
        elif chunk:
            # A non-opinion word closes the chunk that was being built.
            opinions_list.append('_'.join(chunk))
            chunk = []
    # Flush a chunk that runs up to the end of the sentence.
    if chunk:
        opinions_list.append('_'.join(chunk))

opinion_freq = Counter(opinions_list)

# Loop-invariant: total number of opinion chunks found.
total_chunks = sum(opinion_freq.values())

# For each frequent chunk report its share among all opinion chunks
# ('%opinion') and how often its text is tagged as an opinion relative to
# all of its occurrences in the corpus string ('%tokens').
data = [['freq', '%opinion', '%tokens', 'opinion chunk']]
for token, freq in opinion_freq.most_common(30):
    ratio = freq / total_chunks * 100
    # NOTE(review): str.count matches substrings, so a chunk's text is also
    # counted when it appears inside a longer token; the denominator may be
    # inflated. Confirm whether whole-token matching was intended.
    relat_freq = freq / tokens.count(token) * 100
    data.append([freq, '{:.1f}%'.format(ratio), '{:.1f}%'.format(relat_freq), token])

# Render the rows with ipy_table using its 'basic' theme.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
freq,%opinion,%tokens,opinion chunk
140,2.6%,36.0%,bom
113,2.1%,73.4%,recomendo
92,1.7%,56.1%,gostei
60,1.1%,35.5%,interessante
46,0.9%,61.3%,ótimo
45,0.8%,75.0%,envolvente
43,0.8%,68.3%,adorei
35,0.7%,63.6%,amei
33,0.6%,28.4%,perfeito
