In [1]:
from bidict import bidict
from collections import defaultdict
import itertools
import os
import pandas as pd
import pickle
import lzma

from conllu import parse, parse_incr
import numpy as np
from cp_orth import orth_als
import sktensor

from decomp_pmi import VerbTensor

import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s [%(lineno)d] %(message)s')

# DepCC: salience

In [2]:
# Raw DepCC triple counts, read from a tab-separated file.
svo_count = pd.read_csv(
    '/mnt/store/home/makrai/project/verb-tensor/depCC-00.tsv',
    sep='\t',
)

In [3]:
# Five most frequent triples.
svo_count.sort_values(by='freq', ascending=False).head(n=5)

Unnamed: 0,nsubj,ROOT,dobj,freq
0,we,support,browser,14303
1,I,have,idea,12840
2,we,reserve,right,10651
3,I,love,it,9545
4,I,have,problem,8316


In [4]:
# Column names of the three tensor modes in the DepCC frame.
modes = ['nsubj', 'ROOT', 'dobj']
# Rebinds `svo_count` to the frame returned by `append_pmi` (the association
# columns displayed in later cells) and keeps the second return value.
svo_count, log_total = VerbTensor().append_pmi(
    svo_count=svo_count,
    modes=modes,
)

INFO     [33] Computing marginals..
INFO     [35] Computing 2-marginals..
INFO     [42] Computing Dice..
INFO     [59] Computing PMI variants..
INFO     [78] Computing salience..
INFO     [81] Writing..


In [20]:
# Persist the augmented frame as TSV, without the index and with floats
# written to five significant digits.
svo_count.to_csv(
    '/mnt/store/home/makrai/project/verb-tensor/depCC.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)

In [26]:
# Pickle the augmented frame together with `log_total`. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open('/mnt/store/home/makrai/project/verb-tensor/depCC.pkl', mode='wb') as pkl_file:
    pickle.dump((svo_count, log_total), pkl_file)

In [5]:
# Summary statistics of every column from position 10 onwards; with an empty
# percentile list only the median is reported between min and max.
# NOTE(review): in the recorded output `pmi` and `iact_info` have identical
# min and max (std ~1e-13), i.e. they are constant over all rows -- this looks
# like an upstream problem in `append_pmi`; confirm.
svo_count.iloc[:, 10:].describe(percentiles=[])

Unnamed: 0,dice,freq2,pmi,iact_info,salience,iact_sali
count,2168649.0,2168721.0,2168721.0,2168721.0,2168721.0,2168721.0
mean,0.002826417,-21.88612,44.43568,61.59019,-972.5244,-1347.97
std,0.03779198,0.8656357,1.49214e-13,1.421086e-14,38.46511,53.31467
min,1.453412e-06,-22.21784,44.43568,61.59019,-987.2646,-1368.401
50%,2.701072e-05,-22.21784,44.43568,61.59019,-987.2646,-1368.401
max,1.0,-8.413807,44.43568,61.59019,-373.8732,-518.208


In [6]:
# Highest-PMI triples among those seen more than once.
(
    svo_count
    .loc[svo_count['freq'] > 1]
    .sort_values(by='pmi', ascending=False)
    .head()
)

Unnamed: 0,nsubj,ROOT,dobj,freq,freq_nsubj,freq_ROOT,freq_dobj,"freq_('nsubj', 'ROOT')","freq_('nsubj', 'dobj')","freq_('ROOT', 'dobj')",dice,freq2,pmi,iact_info,salience,iact_sali
0,we,support,browser,14303,-3.124918,-7.563929,-8.382182,-8.276882,-8.412799,-8.412698,0.071567,-8.413807,44.435675,61.590194,-373.873216,-518.208032
267980,this,make,month,2,-5.49013,-5.457235,-10.836836,-9.676015,-16.068091,-18.410483,2.7e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267978,this,allow,replication,2,-5.49013,-9.1536,-16.545412,-10.642298,-18.758406,-21.217838,5.1e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267977,this,permit,synchronization,2,-5.49013,-12.640409,-16.89591,-13.80421,-19.047913,-21.217838,5.5e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267976,this,allow,engine,2,-5.49013,-9.1536,-11.716996,-10.642298,-16.758406,-20.632875,5.1e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731


In [7]:
# Highest `iact_info` triples among those seen more than once.
(
    svo_count
    .loc[svo_count['freq'] > 1]
    .sort_values(by='iact_info', ascending=False)
    .head()
)

Unnamed: 0,nsubj,ROOT,dobj,freq,freq_nsubj,freq_ROOT,freq_dobj,"freq_('nsubj', 'ROOT')","freq_('nsubj', 'dobj')","freq_('ROOT', 'dobj')",dice,freq2,pmi,iact_info,salience,iact_sali
0,we,support,browser,14303,-3.124918,-7.563929,-8.382182,-8.276882,-8.412799,-8.412698,0.071567,-8.413807,44.435675,61.590194,-373.873216,-518.208032
267980,this,make,month,2,-5.49013,-5.457235,-10.836836,-9.676015,-16.068091,-18.410483,2.7e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267978,this,allow,replication,2,-5.49013,-9.1536,-16.545412,-10.642298,-18.758406,-21.217838,5.1e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267977,this,permit,synchronization,2,-5.49013,-12.640409,-16.89591,-13.80421,-19.047913,-21.217838,5.5e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731
267976,this,allow,engine,2,-5.49013,-9.1536,-11.716996,-10.642298,-16.758406,-20.632875,5.1e-05,-21.217838,44.435675,61.590194,-942.828942,-1306.810731


In [8]:
# Highest-Dice triples among those seen more than ten times.
(
    svo_count
    .loc[svo_count['freq'] > 10]
    .sort_values(by='dice', ascending=False)
    .head()
)

Unnamed: 0,nsubj,ROOT,dobj,freq,freq_nsubj,freq_ROOT,freq_dobj,"freq_('nsubj', 'ROOT')","freq_('nsubj', 'dobj')","freq_('ROOT', 'dobj')",dice,freq2,pmi,iact_info,salience,iact_sali
2767,Rovartani,lapok,QL461,113,-15.397659,-15.397659,-15.397659,-15.397659,-15.397659,-15.397659,1.0,-15.397659,44.435675,61.590194,-684.20536,-948.344781
5712,Vul,a.u.b.,bericht,59,-16.335195,-16.335195,-16.335195,-16.335195,-16.335195,-16.335195,1.0,-16.335195,44.435675,61.590194,-725.865401,-1006.0878
12259,SOUNDBWOY,soundcolour,Soundmen,30,-17.310947,-17.310947,-17.310947,-17.310947,-17.310947,-17.310947,1.0,-17.310947,44.435675,61.590194,-769.22362,-1066.184582
1605,ISPE,s.r.l.,Palazzolo,177,-14.750232,-14.742104,-14.742104,-14.750232,-14.750232,-14.750232,0.996248,-14.750232,44.435675,61.590194,-655.436522,-908.469652
1537,bayou,goat,Mounts,183,-14.694276,-14.686456,-14.702138,-14.702138,-14.702138,-14.702138,0.994565,-14.702138,44.435675,61.590194,-653.29942,-905.507515


In [9]:
# Most salient triples among rows with a positive count.
(
    svo_count
    .loc[svo_count['freq'] > 0]
    .sort_values(by='salience', ascending=False)
    .head()
)

Unnamed: 0,nsubj,ROOT,dobj,freq,freq_nsubj,freq_ROOT,freq_dobj,"freq_('nsubj', 'ROOT')","freq_('nsubj', 'dobj')","freq_('ROOT', 'dobj')",dice,freq2,pmi,iact_info,salience,iact_sali
0,we,support,browser,14303,-3.124918,-7.563929,-8.382182,-8.276882,-8.412799,-8.412698,0.071567,-8.413807,44.435675,61.590194,-373.873216,-518.208032
1,I,have,idea,12840,-1.938978,-2.626869,-7.408019,-4.234087,-8.170288,-8.008384,0.018425,-8.56948,44.435675,61.590194,-380.790632,-527.795937
2,we,reserve,right,10651,-3.124918,-8.636284,-7.929044,-8.820564,-8.736543,-8.70066,0.054026,-8.839136,44.435675,61.590194,-392.772993,-544.404121
3,I,love,it,9545,-1.938978,-5.164219,-5.190958,-5.557062,-6.194451,-8.547956,0.018573,-8.997308,44.435675,61.590194,-399.801463,-554.145952
4,I,have,problem,8316,-1.938978,-2.626869,-7.986842,-4.234087,-8.911206,-8.60889,0.011988,-9.196164,44.435675,61.590194,-408.637738,-566.393497


In [10]:
# Highest `iact_sali` triples among rows with a positive count.
(
    svo_count
    .loc[svo_count['freq'] > 0]
    .sort_values(by='iact_sali', ascending=False)
    .head()
)

Unnamed: 0,nsubj,ROOT,dobj,freq,freq_nsubj,freq_ROOT,freq_dobj,"freq_('nsubj', 'ROOT')","freq_('nsubj', 'dobj')","freq_('ROOT', 'dobj')",dice,freq2,pmi,iact_info,salience,iact_sali
0,we,support,browser,14303,-3.124918,-7.563929,-8.382182,-8.276882,-8.412799,-8.412698,0.071567,-8.413807,44.435675,61.590194,-373.873216,-518.208032
1,I,have,idea,12840,-1.938978,-2.626869,-7.408019,-4.234087,-8.170288,-8.008384,0.018425,-8.56948,44.435675,61.590194,-380.790632,-527.795937
2,we,reserve,right,10651,-3.124918,-8.636284,-7.929044,-8.820564,-8.736543,-8.70066,0.054026,-8.839136,44.435675,61.590194,-392.772993,-544.404121
3,I,love,it,9545,-1.938978,-5.164219,-5.190958,-5.557062,-6.194451,-8.547956,0.018573,-8.997308,44.435675,61.590194,-399.801463,-554.145952
4,I,have,problem,8316,-1.938978,-2.626869,-7.986842,-4.234087,-8.911206,-8.60889,0.011988,-9.196164,44.435675,61.590194,-408.637738,-566.393497


# Get UMBC

In [None]:
def get_umbc_dict(umbc_dir='/mnt/store/home/makrai/data/language/english/corp/umbc_WebBase/English/'):
    """Count (subject, root, object) lemma triples over a directory of parsed files.

    Every entry of ``umbc_dir`` is opened with ``lzma.open`` in text mode and
    read sentence by sentence with ``parse_incr``. For each sentence only the
    direct children of the root are inspected: the first child whose
    ``deprel`` contains ``'subj'`` supplies the subject lemma and the first
    child whose ``deprel`` equals ``'obj'`` supplies the object lemma. Later
    subjects/objects in the same sentence are ignored.

    Parameters
    ----------
    umbc_dir : str
        Directory whose files are all read. Defaults to the corpus location
        that used to be hard-coded in the body.

    Returns
    -------
    collections.defaultdict
        Maps ``(subj_lemma, root_lemma, obj_lemma)`` to the number of
        sentences with that triple. A missing subject or object is ``''``;
        sentences lacking either are still counted.
    """
    freq = defaultdict(int)
    for filen in os.listdir(umbc_dir):
        logging.info(filen)
        # The context manager closes each compressed file as soon as it has
        # been consumed instead of leaking one handle per corpus file.
        with lzma.open(os.path.join(umbc_dir, filen), mode='rt',
                       encoding="utf-8") as conllu_file:
            for i, sentence in enumerate(parse_incr(conllu_file)):
                if not i % 100000:
                    logging.debug(i)
                root = sentence.to_tree()
                subj, obj = '', ''
                for child in root.children:
                    deprel = child.token['deprel']
                    if 'subj' in deprel:
                        # Keep only the first subject found.
                        if not subj:
                            subj = child.token['lemma']
                    elif deprel == 'obj':
                        # Keep only the first object found.
                        if not obj:
                            obj = child.token['lemma']
                freq[(subj, root.token['lemma'], obj)] += 1
    # NOTE(review): nothing here writes umbc_freq.pkl, the file get_umbc_df()
    # loads; the caller has to pickle the returned dict for that function.
    return freq

In [None]:
def get_umbc_df(freq_path='/mnt/store/home/makrai/project/verb-tensor/umbc_freq.pkl'):
    """Load pickled triple counts and return them as a flat DataFrame.

    Parameters
    ----------
    freq_path : str
        Pickle file holding a mapping from ``(subj, verb, obj)`` tuples to
        counts. Defaults to the path that used to be hard-coded in the body.

    Returns
    -------
    pandas.DataFrame
        One row per triple with columns ``freq``, ``subj``, ``verb``, ``obj``
        (in that order). An empty mapping yields an empty frame with the same
        columns rather than raising.
    """
    # SECURITY: unpickling executes code embedded in the file; only load
    # pickles produced by this project.
    with open(freq_path, mode='rb') as freq_file:
        freq = pickle.load(freq_file)
    # Unpack each key directly into its own column so that no temporary
    # tuple-valued column has to be created and deleted afterwards.
    rows = [(count, subj, verb, obj) for (subj, verb, obj), count in freq.items()]
    return pd.DataFrame(rows, columns=['freq', 'subj', 'verb', 'obj'])

# Mazsola DB

In [None]:
# Load the pickled Mazsola database; the context manager closes the file once
# unpickling is done instead of leaving the handle open.
# SECURITY: unpickling executes code embedded in the file; only load trusted pickles.
with open(
        '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.pkl',
        mode='rb') as mazsola_file:
    mazsola = pickle.load(mazsola_file)

In [None]:
# Mazsola triple frequencies; `keep_default_na=False` stops pandas from
# turning strings such as 'NULL' or '' into NaN.
mazsola_df = pd.read_csv(
    '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis_svo_freq.tsv',
    keep_default_na=False,
    sep='\t',
)

In [None]:
# A bare `append_pmi` is never imported or defined in this notebook (only
# `VerbTensor` is imported), so the call goes through the `VerbTensor` method,
# as the DepCC cell does.
# NOTE(review): confirm that the method accepts `compute_freq` and that its
# default `modes` match the Mazsola column names.
mazsola_df, log_total = VerbTensor().append_pmi(mazsola_df, compute_freq=False)

In [None]:
# Mazsola triples with the lowest `iact_info` (ascending sort).
mazsola_df.sort_values(by='iact_info', ascending=True).head(n=5)

# Top triples

In [None]:
# Five most frequent triples.
svo_count.sort_values(by='freq', ascending=False).head(n=5)

In [None]:
# Highest-PMI triples among those seen more than 100 times.
(
    svo_count
    .loc[svo_count['freq'] > 100]
    .sort_values(by='pmi', ascending=False)
    .head()
)

In [None]:
# Highest `iact_info` triples with freq > 100 whose `ACC` value is not the string 'NULL'.
# NOTE(review): the DepCC `svo_count` displayed earlier has the columns
# nsubj/ROOT/dobj and no `ACC` column, so this filter would fail on that frame;
# presumably it was written against the Mazsola data -- confirm which frame is meant.
svo_count[(svo_count.freq>100) & (svo_count.ACC != 'NULL')].sort_values('iact_info', ascending=False).head()

In [None]:
# Most salient triples whose `ACC` value is not the string 'NULL'.
# NOTE(review): the DepCC `svo_count` displayed earlier has the columns
# nsubj/ROOT/dobj and no `ACC` column, so this filter would fail on that frame;
# presumably it was written against the Mazsola data -- confirm which frame is meant.
svo_count[(svo_count.ACC != 'NULL')].sort_values('salience', ascending=False).head()

In [None]:
# Five triples with the highest `iact_sali`.
svo_count.sort_values(by='iact_sali', ascending=False).head(n=5)

# UMBC

In [None]:
# Neither `freq_df` nor a bare `append_pmi` is defined anywhere in this
# notebook, so build the frame with get_umbc_df() and call the method on the
# imported `VerbTensor`, as the DepCC cell does.
# This rebinds `svo_count` and `log_total`, replacing the DepCC results.
# NOTE(review): confirm that the method accepts `compute_freq`.
freq_df = get_umbc_df()
svo_count, log_total = VerbTensor().append_pmi(
    freq_df, modes=['subj', 'verb', 'obj'], compute_freq=False)  # , debug_index=2234759)

In [None]:
# Most frequent triples that have both a subject and an object.
(
    svo_count
    .loc[lambda df: (df['subj'] != '') & (df['obj'] != '')]
    .sort_values(by='freq', ascending=False)
    .head()
)

In [None]:
# Highest-PMI triples among those seen more than 100 times.
(
    svo_count
    .loc[svo_count['freq'] > 100]
    .sort_values(by='pmi', ascending=False)
    .head()
)

In [None]:
# Lowest `iact_info` (ascending sort) among triples seen more than 100 times
# that have both a subject and an object.
(
    svo_count
    .loc[lambda df: (df['freq'] > 100) & (df['subj'] != '') & (df['obj'] != '')]
    .sort_values(by='iact_info', ascending=True)
    .head()
)

In [None]:
# Most salient triples that have both a subject and an object.
(
    svo_count
    .loc[lambda df: (df['subj'] != '') & (df['obj'] != '')]
    .sort_values(by='salience', ascending=False)
    .head()
)

In [None]:
# Highest-Dice triples among those seen more than 100 times.
(
    svo_count
    .loc[svo_count['freq'] > 100]
    .sort_values(by='dice', ascending=False)
    .head()
)