# Statistics for the SemEval 2015 Restaurants Training Corpus Annotated with Stanford CoreNLP

In [1]:
from __future__ import print_function
from __future__ import division

In [2]:
from lxml.etree import ElementTree
# Load the annotated SemEval 2015 restaurants training corpus from disk.
# `reviews` is the object every later cell walks with `.iter(<tag name>)`.
# NOTE(review): lxml documents ElementTree.parse() as returning the root
# element, so `reviews` is presumably the root rather than the tree — confirm.
reviews = ElementTree().parse('../corpus/SemEvalABSA2015EnglishRestaurants_train.xml')

## Token Type Ratio

In [3]:
from collections import Counter

# Count every surface form exactly as stored in the `form` attribute
# (case-sensitive: nothing is lowercased in this cell).
freqlist = Counter(word_node.get('form') for word_node in reviews.iter('word'))

# Tokens = total occurrences; types = distinct forms.
total_tokens = sum(freqlist.values())
total_types = len(freqlist)
# With no <word> elements there are no types; report 0.0 instead of
# dividing by zero.
token_type_ratio = total_tokens / total_types if total_types else 0.0

print ('Total tokens: {}'.format(total_tokens))
print ('Total types:  {}'.format(total_types))
# The value is "tokens per type" (a plain ratio, not a percentage), so it is
# printed without a percent sign.
print ('Token/Type ratio:  {:.1f}'.format(token_type_ratio))

Total tokens: 18513
Total types:  2965
Token/Type ratio:  6.2%


## Lexical Frequency

In [4]:
import ipy_table

In [5]:
# Case-insensitive frequency list of the `form` attribute of every <word>.
freqlist = Counter(word_node.get('form').lower() for word_node in reviews.iter('word'))

# The denominator is the same for every row, so sum the counter once instead
# of re-summing it on each loop iteration.
total_tokens = sum(freqlist.values())

# Header row, then one row per token for the 20 most frequent tokens.
data = [['Freq', '% Freq', 'Token']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_tokens * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Token
1143,6.17%,.
991,5.35%,the
643,3.47%,","
590,3.19%,and
395,2.13%,i
395,2.13%,a
382,2.06%,is
322,1.74%,to
300,1.62%,was


## Lemma Frequency

In [6]:
# Case-insensitive frequency list of the `base` attribute of every <word>
# (reported under the 'Lemma' column below).
freqlist = Counter(word_node.get('base').lower() for word_node in reviews.iter('word'))

# The denominator is the same for every row, so sum the counter once instead
# of re-summing it on each loop iteration.
total_lemmas = sum(freqlist.values())

# Header row, then one row per lemma for the 20 most frequent lemmas.
data = [['Freq', '% Freq', 'Lemma']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_lemmas * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,Lemma
1143,6.17%,.
1085,5.86%,be
991,5.35%,the
643,3.47%,","
590,3.19%,and
429,2.32%,a
427,2.31%,i
322,1.74%,to
259,1.40%,it


## Part-of-Speech Frequency

In [7]:
from operator import itemgetter

# postag_freq: POS tag -> number of words carrying that tag.
# freqlist:    POS tag -> {lowercased form -> occurrences under that tag}.
postag_freq = {}
freqlist = {}

for token_node in reviews.iter('word'):
    surface = token_node.get('form').lower()
    tag = token_node.get('postag')

    # Bump the overall count for this tag.
    postag_freq[tag] = 1 + postag_freq.get(tag, 0)
    # Create the per-tag form table on first sight, then bump the form count.
    forms_for_tag = freqlist.setdefault(tag, {})
    forms_for_tag[surface] = 1 + forms_for_tag.get(surface, 0)

In [8]:
# One table row per POS tag, most frequent tag first, each with its five
# most frequent word forms as examples.
data = [['FREQ', '% FREQ', 'POSTAG', 'EXAMPLES']]

# Loop-invariant denominator: number of words counted across all tags.
total_words = sum(postag_freq.values())

for pos, freq in sorted(postag_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[pos]
    # Denominator for the per-tag example percentages; computed once per tag
    # rather than once per example.
    pos_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / pos_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), pos, examples])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,POSTAG,EXAMPLES
2624,14.2%,NN,"food(6.8%), place(5.1%), service(4.0%), restaurant(3.0%), time(1.5%)"
1816,9.8%,JJ,"great(7.4%), good(7.0%), delicious(2.2%), excellent(2.0%), nice(1.9%)"
1769,9.6%,DT,"the(55.9%), a(22.3%), this(9.4%), all(2.8%), an(1.9%)"
1459,7.9%,IN,"of(13.9%), in(13.4%), for(13.1%), on(8.0%), with(7.9%)"
1297,7.0%,RB,"not(9.6%), n't(7.9%), very(6.4%), so(3.9%), here(3.7%)"
1275,6.9%,.,".(89.6%), !(9.6%), ?(0.7%)"
1107,6.0%,PRP,"i(32.4%), it(23.4%), we(13.6%), you(13.1%), they(7.9%)"
789,4.3%,CC,"and(74.8%), but(17.4%), or(5.6%), both(0.6%), either(0.5%)"
771,4.2%,VBD,"was(38.9%), were(11.0%), had(9.5%), went(3.4%), did(3.0%)"


## Dependency Relation Frequency

In [9]:
from operator import itemgetter

# deprel_freq: relation label -> number of words carrying that label.
# freqlist:    relation label -> {lowercased form -> occurrences}.
deprel_freq = {}
freqlist = {}

for token_node in reviews.iter('word'):
    surface = token_node.get('form').lower()
    # Keep only the first whitespace-separated piece of the label. The
    # original note called this a workaround for a bug in PALAVRAS output.
    # NOTE(review): the notebook title says the corpus was annotated with
    # Stanford CoreNLP — confirm whether this workaround is still needed.
    relation = token_node.get('deprel').split()[0]

    # Bump the overall count for this relation.
    deprel_freq[relation] = 1 + deprel_freq.get(relation, 0)
    # Create the per-relation form table on first sight, then bump the form.
    forms_for_relation = freqlist.setdefault(relation, {})
    forms_for_relation[surface] = 1 + forms_for_relation.get(surface, 0)
    
# One table row per dependency relation, most frequent first, each with its
# five most frequent word forms as examples.
data = [['FREQ', '% FREQ', 'DEPREL', 'EXAMPLES']]

# Loop-invariant denominator: number of words counted across all relations.
total_words = sum(deprel_freq.values())

for deprel, freq in sorted(deprel_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[deprel]
    # Denominator for the per-relation example percentages; computed once per
    # relation rather than once per example.
    deprel_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / deprel_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), deprel, examples])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,DEPREL,EXAMPLES
2181,11.8%,punct,".(52.4%), ,(29.5%), !(5.6%), ...(2.8%), -rrb-(2.5%)"
1858,10.0%,nsubj,"i(19.5%), it(8.9%), we(7.6%), you(6.9%), food(5.3%)"
1621,8.8%,det,"the(60.7%), a(24.1%), this(7.5%), an(2.1%), some(1.2%)"
1323,7.1%,root,"good(2.6%), had(2.3%), place(2.1%), go(2.1%), went(1.7%)"
1280,6.9%,case,"of(15.2%), in(14.4%), for(13.5%), on(8.8%), with(8.7%)"
1130,6.1%,nmod,"food(2.2%), restaurant(2.1%), place(2.0%), it(1.6%), restaurants(1.5%)"
1116,6.0%,advmod,"very(7.4%), so(4.5%), here(4.2%), just(4.0%), too(3.6%)"
1041,5.6%,amod,"great(8.5%), good(5.6%), best(3.3%), little(1.8%), other(1.8%)"
875,4.7%,cop,"is(38.4%), was(28.9%), are(8.7%), were(6.9%), be(5.4%)"


## Dependency Root Frequency

In [10]:
# Lowercased forms of the words whose `head` attribute is '0' (the
# dependency roots this section reports), skipping words tagged 'pu'.
# NOTE(review): 'pu' does not appear among the tags listed in the POS table
# of this notebook (which shows tags such as NN, JJ and '.'), so this filter
# may never exclude anything — confirm the intended punctuation tag(s).
freqlist = Counter(word_node.get('form').lower()
                   for word_node in reviews.iter('word')
                   if word_node.get('head') == '0'
                   and word_node.get('postag') != 'pu')

# The denominator is the same for every row, so sum the counter once instead
# of re-summing it on each loop iteration.
total_roots = sum(freqlist.values())

# Header row, then one row per word for the 20 most frequent root words.
data = [['Freq', '% Freq', 'word']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_roots * 100
    data.append([freq, '{:.2f}%'.format(ratio), token])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
Freq,% Freq,word
34,2.59%,good
31,2.36%,had
28,2.13%,place
28,2.13%,go
23,1.75%,went
23,1.75%,recommend
18,1.37%,great
18,1.37%,restaurant
16,1.22%,food


## Named Entities Frequency

In [11]:
from operator import itemgetter

# ne_freq:  value of the `ner` attribute -> number of words carrying it.
# freqlist: value of the `ner` attribute -> {lowercased form -> occurrences}.
ne_freq = {}
freqlist = {}

for token_node in reviews.iter('word'):
    surface = token_node.get('form').lower()
    entity_label = token_node.get('ner')

    # Bump the overall count for this label.
    ne_freq[entity_label] = 1 + ne_freq.get(entity_label, 0)
    # Create the per-label form table on first sight, then bump the form.
    forms_for_label = freqlist.setdefault(entity_label, {})
    forms_for_label[surface] = 1 + forms_for_label.get(surface, 0)
    
# One table row per named-entity label, most frequent first, each with its
# five most frequent word forms as examples.
data = [['FREQ', '% FREQ', 'NAMED ENTITY', 'EXAMPLES']]

# Loop-invariant denominator: number of words counted across all labels.
total_words = sum(ne_freq.values())

for ne, freq in sorted(ne_freq.items(), key=itemgetter(1), reverse=True):
    forms = freqlist[ne]
    # Denominator for the per-label example percentages; computed once per
    # label rather than once per example.
    ne_total = sum(forms.values())
    top_forms = sorted(forms.items(), key=itemgetter(1), reverse=True)[:5]
    examples = ', '.join('{}({:.1f}%)'.format(w, f / ne_total * 100)
                         for w, f in top_forms)
    ratio = freq / total_words * 100
    data.append([freq, '{:.1f}%'.format(ratio), ne, examples])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
FREQ,% FREQ,NAMED ENTITY,EXAMPLES
17727,95.8%,O,".(6.4%), the(5.5%), ,(3.6%), and(3.3%), i(2.2%)"
143,0.8%,LOCATION,"new(9.8%), york(8.4%), manhattan(7.0%), ny(3.5%), chinatown(2.8%)"
99,0.5%,NUMBER,"one(41.4%), two(13.1%), three(5.1%), four(4.0%), 4(3.0%)"
98,0.5%,MISC,"thai(20.4%), japanese(10.2%), italian(9.2%), indian(9.2%), taiwanese(5.1%)"
92,0.5%,PERSON,"suan(6.5%), mizu(5.4%), saul(4.3%), rao(3.3%), murray(3.3%)"
89,0.5%,DURATION,"years(13.5%), day(9.0%), the(7.9%), minutes(4.5%), few(4.5%)"
80,0.4%,DATE,"once(10.0%), saturday(8.8%), now(6.2%), recently(6.2%), week(5.0%)"
56,0.3%,TIME,"night(35.7%), evening(14.3%), last(10.7%), late(7.1%), afternoon(7.1%)"
47,0.3%,MONEY,"$(44.7%), 1(4.3%), 20(4.3%), 10(4.3%), 2(4.3%)"


## Aspect Frequency

In [12]:
# Frequency of the lowercased `target` attribute over every <Opinion>.
# No target value is filtered out in this cell.
freqlist = Counter(node.get('target').lower() for node in reviews.iter('Opinion'))

# The denominator is the same for every row, so sum the counter once instead
# of re-summing it on each loop iteration.
total_opinions = sum(freqlist.values())

# Header row, then one row per target for the 20 most frequent targets.
data = [['freq', '%freq', 'aspect(OBJ)']]
for token, freq in freqlist.most_common(20):
    ratio = freq / total_opinions * 100
    data.append([freq, '{:.1f}%'.format(ratio), token])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2
freq,%freq,aspect(OBJ)
375,22.7%,
158,9.6%,food
117,7.1%,service
82,5.0%,place
29,1.8%,restaurant
27,1.6%,staff
26,1.6%,pizza
21,1.3%,atmosphere
20,1.2%,sushi


In [6]:
# Every corpus token, lowercased and joined by single spaces, so a target
# phrase can be located with a plain substring search.
sentences = ' '.join(node.get('form').lower() for node in reviews.iter('word'))
# Frequency of each lowercased opinion target, skipping opinions whose
# `target` attribute is the literal string 'NULL'.
freqlist = Counter(node.get('target').lower()
                   for node in reviews.iter('Opinion')
                   if node.get('target') != 'NULL')

# Pad both ends so a phrase at the very start or end of the corpus is also
# surrounded by spaces and therefore counted by the search below.
padded_sentences = ' ' + sentences + ' '

# For the 20 most frequent targets: occurrences in the token stream, times
# annotated as a target, and the share of occurrences that are targets.
data = [['Total in Corpus', 'Total as target',  'Freq as target', 'aspect chunk']]
for token, freq in freqlist.most_common(20):
    total = padded_sentences.count(' ' + token + ' ')
    # A target whose text never matches the space-joined tokens gives
    # total == 0; show 'n/a' instead of raising ZeroDivisionError.
    share = '{:.1f}%'.format(freq / total * 100) if total else 'n/a'
    data.append([total, freq, share, token])

# apply_theme() stays last so its return value is what the cell displays.
ipy_table.make_table(data)
ipy_table.apply_theme('basic')

0,1,2,3
Total in Corpus,Total as target,Freq as target,aspect chunk
190,158,83.2%,food
127,117,92.1%,service
135,82,60.7%,place
82,29,35.4%,restaurant
33,27,81.8%,staff
42,26,61.9%,pizza
26,21,80.8%,atmosphere
32,20,62.5%,sushi
19,16,84.2%,decor
