# Statistics of the SemEval 2016 (ABSA) English Restaurants Train Corpus annotated with Stanford CoreNLP

In [1]:
from __future__ import print_function
from __future__ import division

In [2]:
from lxml.etree import ElementTree

# Location of the annotated training corpus, relative to this notebook.
corpus_path = '../corpus/SemEvalABSA2016EnglishRestaurants_train.xml'

# Build an empty tree and load the XML into it.  The value returned by
# ``parse`` is kept as ``reviews``; every later cell walks it with ``.iter``.
corpus_tree = ElementTree()
reviews = corpus_tree.parse(corpus_path)

## Token Type Ratio

In [3]:
from collections import Counter

# Count every surface form exactly as written (case-sensitive): each distinct
# key of the counter is one type, each counted occurrence is one token.
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# Compute both totals once instead of re-summing the counter for every print.
total_tokens = sum(freqlist.values())
total_types = len(freqlist)

print('Total tokens: {}'.format(total_tokens))
print('Total types:  {}'.format(total_types))

# The token/type ratio is the average number of tokens per type, not a
# percentage, so it is printed without a '%' suffix.  An empty corpus has no
# types, in which case the ratio is undefined rather than a division error.
if total_types:
    print('Token/Type ratio:  {:.1f}'.format(total_tokens / total_types))
else:
    print('Token/Type ratio:  n/a')

Total tokens: 28946
Total types:  3985
Token/Type ratio:  7.3%


## Lexical Frequency

In [4]:
import ipy_table

In [5]:
# Frequency of every lower-cased surface form in the corpus.
freqlist = Counter(word_node.get('form').lower() for word_node in reviews.iter('word'))

# The corpus size does not change inside the loop below, so sum it once
# instead of once per table row.
total_tokens = sum(freqlist.values())

# Header row followed by one row per form: absolute count, share of all
# tokens, and the form itself (20 most frequent forms).
data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
1661,5.74%,.
1512,5.22%,the
1019,3.52%,","
874,3.02%,and
642,2.22%,i
605,2.09%,a
525,1.81%,is
519,1.79%,to
468,1.62%,was


## Lemma Frequency

In [6]:
# Frequency of every lower-cased lemma (the 'base' attribute of each word).
freqlist = Counter(word_node.get('base').lower() for word_node in reviews.iter('word'))

# Loop-invariant: the number of counted lemmas is summed once, not per row.
total_lemmas = sum(freqlist.values())

# Header row followed by one row per lemma: absolute count, share of all
# counted words, and the lemma itself (20 most frequent lemmas).
data = [['Freq', '% Freq', 'Lemma']]
for lemma, freq in freqlist.most_common(20):
    ratio = freq / total_lemmas * 100
    data.append([freq, '{:.2f}%'.format(ratio), lemma])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
1661,5.74%,.
1621,5.60%,be
1512,5.22%,the
1019,3.52%,","
874,3.02%,and
693,2.39%,i
664,2.29%,a
519,1.79%,to
389,1.34%,have


## Part-of-Speech Frequency

In [7]:
from operator import itemgetter

# postag_freq: POS tag -> number of words carrying that tag.
# freqlist:    POS tag -> {lower-cased form -> occurrences under that tag}.
postag_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    postag = word_node.get('postag')

    # Bump the running total for this tag.
    postag_freq[postag] = 1 + postag_freq.get(postag, 0)

    # Fetch the per-tag form table (created the first time the tag is seen)
    # and bump the count of this form inside it.
    forms_for_tag = freqlist.setdefault(postag, {})
    forms_for_tag[form] = 1 + forms_for_tag.get(form, 0)

In [8]:
# One row per POS tag, most frequent first: absolute count, share of all
# words, the tag, and its five most frequent forms with their share
# inside that tag.
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

# Loop-invariant grand total: summed once rather than once per row.
total_tagged = sum(postag_freq.values())

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    # Per-tag total: summed once per tag rather than once per example.
    pos_total = sum(freqlist[pos].values())
    top_forms = sorted(freqlist[pos].items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join(['{}({:.1f}%)'.format(w, f / pos_total * 100)
                          for w, f in top_forms])
    ratio = freq / total_tagged * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
4087,14.1%,NN,"food(6.7%), place(5.0%), service(3.3%), restaurant(2.7%), time(1.5%)"
2721,9.4%,DT,"the(55.5%), a(22.2%), this(8.8%), all(2.9%), an(2.2%)"
2652,9.2%,JJ,"great(7.0%), good(6.7%), delicious(2.0%), nice(1.8%), excellent(1.7%)"
2357,8.1%,IN,"of(15.0%), for(13.2%), in(12.8%), on(7.3%), with(7.1%)"
2008,6.9%,RB,"not(9.8%), n't(8.2%), very(6.0%), so(4.5%), here(3.6%)"
1877,6.5%,.,".(88.5%), !(10.4%), ?(1.1%)"
1788,6.2%,PRP,"i(33.2%), it(21.6%), we(14.6%), you(13.1%), they(6.9%)"
1296,4.5%,VBD,"was(36.1%), were(10.9%), had(9.3%), did(3.2%), went(2.9%)"
1190,4.1%,CC,"and(73.4%), but(18.5%), or(5.9%), either(0.7%), both(0.7%)"


## Dependency Relation Frequency

In [9]:
from operator import itemgetter

# deprel_freq: dependency label -> number of words attached with that label.
# freqlist:    dependency label -> {lower-cased form -> occurrences}.
deprel_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    raw_deprel = word_node.get('deprel')
    # Keep only the first whitespace-separated piece of the label.  This was
    # written as a workaround for a bug in PALAVRAS output.
    # NOTE(review): the notebook title says this corpus is annotated with
    # Stanford CoreNLP -- confirm whether the workaround is still needed.
    deprel = raw_deprel.split()[0]

    # Bump the running total for this label.
    deprel_freq[deprel] = 1 + deprel_freq.get(deprel, 0)

    # Fetch the per-label form table (created on first sight) and bump the
    # count of this form inside it.
    forms_for_deprel = freqlist.setdefault(deprel, {})
    forms_for_deprel[form] = 1 + forms_for_deprel.get(form, 0)
    
# One row per dependency label, most frequent first: absolute count, share
# of all words, the label, and its five most frequent forms with their
# share inside that label.
data = [['FREQ', '% FREQ', 'DEPREL', 'EXAMPLES']]

# Loop-invariant grand total: summed once rather than once per row.
total_relations = sum(deprel_freq.values())

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    # Per-label total: summed once per label rather than once per example.
    deprel_total = sum(freqlist[deprel].values())
    top_forms = sorted(freqlist[deprel].items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join(['{}({:.1f}%)'.format(w, f / deprel_total * 100)
                          for w, f in top_forms])
    ratio = freq / total_relations * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,DEPREL,EXAMPLES
3376,11.7%,punct,".(49.2%), ,(30.2%), !(5.8%), -rrb-(2.8%), ...(2.8%)"
2865,9.9%,nsubj,"i(20.3%), we(8.2%), it(8.2%), you(6.8%), food(4.5%)"
2480,8.6%,det,"the(60.6%), a(24.2%), this(7.2%), an(2.4%), some(1.2%)"
2023,7.0%,case,"of(16.7%), in(14.2%), for(14.0%), to(8.3%), with(8.1%)"
2011,6.9%,root,"good(2.5%), had(2.4%), go(2.0%), place(1.9%), food(1.7%)"
1812,6.3%,nmod,"food(2.7%), place(2.2%), restaurant(1.9%), it(1.9%), restaurants(1.4%)"
1718,5.9%,advmod,"very(7.0%), so(5.3%), here(4.1%), just(4.0%), when(3.1%)"
1565,5.4%,amod,"great(8.1%), good(4.9%), best(2.8%), other(2.1%), indian(1.6%)"
1245,4.3%,cop,"is(36.1%), was(30.6%), are(8.8%), were(6.9%), be(5.7%)"


## Dependency Root Frequency

In [10]:
# Lower-cased forms of the words whose 'head' attribute is '0', skipping
# any word tagged 'pu'.
# NOTE(review): the punctuation tag visible in this notebook's POS output
# is '.', not 'pu', so the 'pu' test may never exclude anything -- confirm
# which tag set the corpus uses before relying on this filter.
freqlist = Counter(word_node.get('form').lower()
                   for word_node in reviews.iter('word')
                   if word_node.get('head') == '0' and
                      word_node.get('postag') != 'pu')

# Loop-invariant: the number of selected words is summed once, not per row.
total_roots = sum(freqlist.values())

# Header row followed by the 20 most frequent forms: absolute count, share
# of all selected words, and the form itself.
data = [['Freq', '% Freq', 'word']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_roots * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,word
51,2.55%,good
48,2.40%,had
41,2.05%,go
39,1.95%,place
34,1.70%,food
34,1.70%,went
31,1.55%,recommend
26,1.30%,great
25,1.25%,restaurant


## Named Entities Frequency

In [11]:
from operator import itemgetter

# ne_freq:  value of the 'ner' attribute -> number of words carrying it.
# freqlist: value of the 'ner' attribute -> {lower-cased form -> occurrences}.
ne_freq = {}
freqlist = {}

for word_node in reviews.iter('word'):
    form = word_node.get('form').lower()
    ne = word_node.get('ner')

    # Bump the running total for this entity label.
    ne_freq[ne] = 1 + ne_freq.get(ne, 0)

    # Fetch the per-label form table (created on first sight) and bump the
    # count of this form inside it.
    forms_for_ne = freqlist.setdefault(ne, {})
    forms_for_ne[form] = 1 + forms_for_ne.get(form, 0)
    
# One row per named-entity label, most frequent first: absolute count,
# share of all words, the label, and its five most frequent forms with
# their share inside that label.
data = [['FREQ', '% FREQ', 'NAMED ENTITY', 'EXAMPLES']]

# Loop-invariant grand total: summed once rather than once per row.
total_labelled = sum(ne_freq.values())

for ne, freq in sorted(ne_freq.items(), key=itemgetter(1), reverse=True):
    # Per-label total: summed once per label rather than once per example.
    ne_total = sum(freqlist[ne].values())
    top_forms = sorted(freqlist[ne].items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join(['{}({:.1f}%)'.format(w, f / ne_total * 100)
                          for w, f in top_forms])
    ratio = freq / total_labelled * 100
    data.append([freq, '{:.1f}%'.format(ratio), ne, examples])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,NAMED ENTITY,EXAMPLES
27677,95.6%,O,".(6.0%), the(5.4%), ,(3.7%), and(3.2%), i(2.3%)"
195,0.7%,LOCATION,"new(10.3%), york(9.2%), manhattan(6.2%), brooklyn(5.1%), ny(3.1%)"
183,0.6%,NUMBER,"one(36.1%), two(10.9%), four(6.0%), 2(5.5%), 5(3.8%)"
153,0.5%,DURATION,"minutes(10.5%), years(10.5%), day(7.2%), the(5.9%), hour(4.6%)"
139,0.5%,DATE,"once(14.4%), saturday(7.2%), now(7.2%), the(7.2%), last(5.0%)"
132,0.5%,PERSON,"suan(4.5%), mizu(3.8%), saul(3.0%), la(3.0%), dokebi(3.0%)"
132,0.5%,MISC,"thai(15.9%), indian(15.2%), japanese(9.8%), italian(6.8%), chinese(6.1%)"
98,0.3%,MONEY,"$(48.0%), 500(4.1%), 10(3.1%), 2(3.1%), 60(2.0%)"
84,0.3%,ORGANIZATION,"la(7.1%), femme(6.0%), casa(6.0%), flatbush(4.8%), yamato(2.4%)"


## Aspect Frequency

In [12]:
# Frequency of every lower-cased 'target' attribute over all Opinion nodes
# (no target value is filtered out here).
freqlist = Counter(node.get('target').lower() for node in reviews.iter('Opinion'))

# Loop-invariant: the number of opinions is summed once, not per row.
total_opinions = sum(freqlist.values())

# Header row followed by the 20 most frequent targets: absolute count,
# share of all opinions, and the target text.
data = [['freq', '%freq', 'aspect(OBJ)']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_opinions * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect(OBJ)
627,25.0%,
233,9.3%,food
148,5.9%,service
129,5.1%,place
49,2.0%,restaurant
40,1.6%,staff
31,1.2%,pizza
28,1.1%,atmosphere
26,1.0%,sushi


In [13]:
import re

# Every lower-cased word form of the corpus, joined into one
# space-separated string.
sentences = ' '.join([node.get('form').lower() for node in reviews.iter('word')])
# Padded copy so that the very first and very last tokens are delimited by
# a space on both sides as well; a plain ' token ' search would miss them.
padded_sentences = ' ' + sentences + ' '

# Frequency of every lower-cased opinion target, skipping opinions whose
# 'target' attribute is the literal string 'NULL'.
freqlist = Counter(node.get('target').lower()
                   for node in reviews.iter('Opinion')
                   if node.get('target') != 'NULL')

# One row per target chunk (20 most frequent): occurrences in the corpus
# text, occurrences as an opinion target, and the latter as a share of the
# former.
data = [['Total in Corpus', 'Total as target',  'Freq as target', 'aspect chunk']]
for token, freq in freqlist.most_common(20):
    # Lookbehind/lookahead require the delimiting spaces without consuming
    # them, so two adjacent occurrences that share one space are both
    # counted (str.count(' tok ') would count only the first of the pair).
    chunk_pattern = '(?<= )' + re.escape(token) + '(?= )'
    total = len(re.findall(chunk_pattern, padded_sentences))
    if total:
        ratio = freq / total * 100
        share = '{:.1f}%'.format(ratio)
    else:
        # The target text never matches the tokenized corpus string, so the
        # share is undefined; report that instead of dividing by zero.
        share = 'n/a'
    data.append([total, freq, share, token])

ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as target,Freq as target,aspect chunk
287,233,81.2%,food
161,148,91.9%,service
206,129,62.6%,place
118,49,41.5%,restaurant
49,40,81.6%,staff
58,31,53.4%,pizza
33,28,84.8%,atmosphere
43,26,60.5%,sushi
25,23,92.0%,decor
