In [7]:
from bidict import bidict
from collections import defaultdict
import itertools
import os
import pandas as pd
import pickle
import lzma

from conllu import parse, parse_incr
import numpy as np
from cp_orth import orth_als
import sktensor

from decomp_pmi import append_pmi

import logging
# Configure the root logger to emit DEBUG and above. Each record is rendered as the level
# name (left-aligned, padded to 8 characters), the line number of the logging call in
# square brackets, and then the message.
logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s [%(lineno)d] %(message)s')

# Get UMBC

In [2]:
def get_umbc_dict(
        umbc_dir='/mnt/store/home/makrai/data/language/english/corp/umbc_WebBase/English/'):
    """Count (subject, root, object) lemma triples over every file in `umbc_dir`.

    Each directory entry is opened with `lzma` in UTF-8 text mode and streamed through
    conllu's `parse_incr`. For every sentence only the direct children of the tree root
    are inspected:

    * the first child whose `deprel` contains ``'subj'`` supplies the subject lemma,
    * the first child whose `deprel` equals ``'obj'`` supplies the object lemma,
    * further subjects/objects of the same root are ignored,
    * a missing subject or object is recorded as the empty string ``''``.

    Parameters
    ----------
    umbc_dir : str, optional
        Directory holding the compressed corpus files. Defaults to the location that
        was previously hard-coded, so existing zero-argument calls behave as before.

    Returns
    -------
    collections.defaultdict
        Maps ``(subj_lemma, root_lemma, obj_lemma)`` to the number of sentences in
        which that triple was seen.
    """
    freq = defaultdict(int)
    for filen in os.listdir(umbc_dir):
        logging.info(filen)
        # Context manager: the compressed file is closed as soon as it has been consumed
        # (or if parsing raises), instead of leaking one open handle per corpus file.
        with lzma.open(os.path.join(umbc_dir, filen), mode='rt',
                       encoding="utf-8") as conllu_file:
            for i, sentence in enumerate(parse_incr(conllu_file)):
                if not i % 100000:
                    # Progress marker every 100k sentences (including sentence 0).
                    logging.debug(i)
                root = sentence.to_tree()
                subj, obj = '', ''
                for child in root.children:
                    deprel = child.token['deprel']
                    if 'subj' in deprel:
                        # Keep the first subject only; later ones are skipped.
                        if not subj:
                            subj = child.token['lemma']
                    elif deprel == 'obj':
                        # Keep the first object only; later ones are skipped.
                        if not obj:
                            obj = child.token['lemma']
                # Triples lacking a subject and/or an object are counted too.
                freq[(subj, root.token['lemma'], obj)] += 1
    return freq

In [3]:
def get_umbc_df(freq_path='/mnt/store/home/makrai/project/verb-tensor/umbc_freq.pkl'):
    """Load pickled triple counts and return them as a flat DataFrame.

    The pickle is expected to hold a mapping ``(subj, verb, obj) -> count``, i.e. the
    structure `get_umbc_dict` returns.

    Parameters
    ----------
    freq_path : str, optional
        Path of the pickle file. Defaults to the location that was previously
        hard-coded, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per triple with columns ``['freq', 'subj', 'verb', 'obj']`` (in that
        order) and a default integer index. An empty mapping yields an empty frame
        that still has these four columns.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    # The context manager closes the file handle once the data has been read.
    with open(freq_path, mode='rb') as freq_file:
        freq = pickle.load(freq_file)
    # Build the final layout in one step instead of creating an 'svo' tuple column and
    # splitting it afterwards; this also works when `freq` is empty.
    records = [(count, subj, verb, obj) for (subj, verb, obj), count in freq.items()]
    return pd.DataFrame(records, columns=['freq', 'subj', 'verb', 'obj'])

# Mazsola DB

In [15]:
# Load the pickled Mazsola database. The context manager closes the file handle once the
# object has been read (the previous form left it open).
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open(
        '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.pkl',
        mode='rb') as mazsola_file:
    mazsola = pickle.load(mazsola_file)

In [26]:
# Read the tab-separated triple/frequency table.
# keep_default_na=False: pandas' built-in NA markers are switched off, so no field is
# converted to NaN while parsing and every cell keeps the text found in the file.
mazsola_df = pd.read_csv(
    '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis_svo_freq.tsv',
    sep='\t',
    keep_default_na=False,
)

In [27]:
# Run append_pmi (from decomp_pmi) on the table; the first return value replaces
# mazsola_df and the second is kept as log_total.
# NOTE(review): this cell overwrites its own input, so running it a second time feeds the
# already-processed frame back into append_pmi — confirm that a re-run is harmless.
mazsola_df, log_total = append_pmi(mazsola_df, compute_freq=False)

INFO     [31] Computing marginals..
INFO     [33] Computing 2-marginals..
INFO     [40] Computing Dice..
INFO     [57] Computing PMI variants..


In [32]:
# Show the five rows with the smallest 'iact_info' (sort is ascending by default).
mazsola_df.sort_values('iact_info').head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience,iact_sali
28,vita,lezár,szakaszPOSS,981,-10.714631,-10.05804,-11.313181,-12.448836,-12.35337,-12.393314,0.274944,-12.467829,44.811876,17.577497,-558.706803,-219.153222
2,képviselő,megad,szó,2706,-8.235231,-8.374668,-7.735337,-10.981247,-10.939862,-9.193746,0.132508,-11.003992,44.811876,17.773611,-493.10953,-195.580672
0,ez,jelent,az,7074,-7.180994,-6.294171,-4.659988,-9.048661,-9.28797,-8.348793,0.064518,-9.617628,44.811876,18.167898,-430.983941,-174.732082
649,Bocsi,megun,nyomkodás,129,-14.791228,-13.12285,-15.394711,-15.394711,-15.394711,-15.394711,0.408228,-15.394711,44.811876,18.270055,-689.865881,-281.262208
141,foglaló,utasít,aláírásPOSS,381,-12.769314,-11.178322,-12.277944,-13.601807,-13.394711,-13.712451,0.265013,-13.832291,44.811876,18.315681,-619.850912,-253.347824


# Top triples

In [27]:
# Show the five rows with the highest 'freq'.
# NOTE(review): no cell above this one in the visible notebook assigns `svo_count`, so on
# a fresh kernel this line would raise NameError — confirm which frame is meant here.
svo_count.sort_values('freq', ascending=False).head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience,iact_sali
0,-7.0,vesz,rész,1,-21.701497,-6.410435,-8.304224,-21.701497,-21.701497,-9.200655,5.9e-05,-21.701497,43.402995,37.888992,-941.909967,-822.247849
2273858,1.0,cselekedik,bármi,1,-11.504281,-13.397717,-11.381825,-21.701497,-21.701497,-19.701497,0.001084,-21.701497,43.402995,48.522167,-941.909967,-1053.00367
2273574,1.0,összegyűjt,ők,1,-21.701497,-11.217681,-7.070491,-21.701497,-21.701497,-18.531572,0.000112,-21.701497,43.402995,43.646395,-941.909967,-947.192111
2273575,77.0,felhív,telefonszám,1,-21.701497,-9.341748,-14.806679,-21.701497,-21.701497,-18.379569,0.000558,-21.701497,43.402995,37.634136,-941.909967,-816.71711
2273576,0.0,mond,mind,1,-18.379569,-6.946662,-13.92671,-21.701497,-21.701497,-20.701497,0.000108,-21.701497,43.402995,46.553048,-941.909967,-1010.270838


In [69]:
# Among rows with freq above 100, show the five with the highest 'pmi'.
svo_count[svo_count.freq>100].sort_values('pmi', ascending=False).head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience
61368,Bocsi,megun,nyomkodás,129,-14.791228,-13.12285,-15.394711,-15.394711,-15.394711,-15.394711,0.408228,-15.394711,27.914078,18.270055,-429.729163
54520,Barabás,belelő,golyó,123,-14.350656,-15.039616,-13.566734,-15.394711,-15.111317,-15.339849,0.415073,-15.463424,27.493582,18.352295,-425.144912
430375,ajánlattevő,utasít,bank,151,-14.405938,-11.178322,-12.091921,-15.167533,-15.167533,-15.158011,0.115355,-15.167533,22.508648,22.984429,-341.400677
1285388,foglaló,utasít,aláírásPOSS,381,-12.769314,-11.178322,-12.277944,-13.601807,-13.394711,-13.712451,0.265013,-13.832291,22.393289,18.315681,-309.750484
1314300,forráspontPOSS,tartalmaz,szénhidrogén,144,-15.139152,-7.916529,-13.762082,-15.158011,-15.236013,-13.859044,0.01834,-15.236013,21.58175,22.671318,-328.819821


In [76]:
# Among rows with freq above 100 whose ACC is not the literal string 'NULL', show the
# five with the highest 'iact_info'.
svo_count[(svo_count.freq>100) & (svo_count.ACC != 'NULL')].sort_values('iact_info', ascending=False).head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience
799060,bíróság,ad,hely,109,-8.835015,-6.356643,-8.813248,-14.26128,-15.51112,-12.38357,0.003541,-15.637754,8.367151,33.788819,-130.843455
1810456,jövő,hoz,mi,114,-12.331797,-6.936169,-7.044268,-15.350656,-15.177119,-13.307906,0.003862,-15.573048,10.739186,33.096495,-167.241864
464861,aki,kísér,munkaPOSS,141,-5.426591,-9.721847,-9.338336,-14.009333,-13.569888,-14.733513,0.00293,-15.266387,9.220387,33.092347,-140.761996
2083281,képviselőtársPOSS,kap,javaslat,117,-9.803239,-6.273137,-8.306755,-12.107876,-14.719438,-14.996547,0.003671,-15.535573,8.847558,32.976303,-137.451884
1729734,isten,ad,mi,130,-9.110887,-6.356643,-7.044268,-12.647715,-13.634449,-13.664471,0.003251,-15.38357,7.128227,32.818407,-109.657589


In [10]:
# Among rows whose ACC is not the literal string 'NULL' (no frequency cut-off in this
# cell), show the five with the highest 'salience'.
svo_count[(svo_count.ACC != 'NULL')].sort_values('salience', ascending=False).head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience,iact_sali
2981091,aki,mond,ami,1,-5.426591,-5.175627,-6.533095,-10.997608,-17.762082,-10.979673,9e-06,-22.405938,-5.270625,45.009989,118.093303,-1008.49103
2973534,aki,tesz,ami,1,-5.426591,-5.756149,-6.533095,-11.159198,-17.762082,-12.045091,1e-05,-22.405938,-4.690103,45.656474,105.086166,-1022.976132
1588330,én,jelent,az,1,-6.985649,-6.294171,-4.659988,-19.820976,-9.75399,-8.348793,9e-06,-22.405938,-4.46613,42.389888,100.067839,-949.785209
2301391,ami,vesz,az,1,-7.093303,-6.190367,-4.659988,-15.451742,-10.109022,-12.31715,9e-06,-22.405938,-4.462279,42.340193,99.981557,-948.671749
2301037,ami,tart,az,1,-7.093303,-6.248769,-4.659988,-15.651051,-10.109022,-11.242288,9e-06,-22.405938,-4.403878,41.406239,98.673013,-927.745627


In [82]:
# Show the five rows with the highest 'iact_sali' over the whole (unfiltered) table.
svo_count.sort_values('iact_sali', ascending=False).head()

Unnamed: 0,NOM,stem,ACC,freq,freq_NOM,freq_stem,freq_ACC,"freq_('NOM', 'stem')","freq_('NOM', 'ACC')","freq_('stem', 'ACC')",dice,freq2,pmi,iact_info,salience,iact_sali
1167071,ez,jelent,az,7074,-7.180994,-6.294171,-4.659988,-9.048661,-9.28797,-8.348793,0.064518,-9.617628,8.517526,18.167898,-81.918391,-174.732082
2072245,képviselő,megad,szó,2706,-8.235231,-8.374668,-7.735337,-10.981247,-10.939862,-9.193746,0.132508,-11.003992,13.341244,17.773611,-146.806938,-195.580672
2731609,országgyűlés,elfogad,javaslat,1979,-9.387912,-7.191809,-8.306755,-10.735282,-11.334476,-10.184653,0.092975,-11.455382,13.431093,18.823318,-153.858306,-215.628299
4021570,úr,köszön,szó,2144,-7.504317,-8.807886,-7.735337,-10.47779,-10.962995,-10.384958,0.0931,-11.339849,12.707691,19.118052,-144.103296,-216.795821
648953,asszony,köszön,szó,1885,-8.909833,-8.807886,-7.735337,-10.950097,-11.461958,-10.384958,0.113025,-11.525589,13.927467,18.869547,-160.522264,-217.482648


# UMBC

In [84]:
# Load the UMBC triple counts inside this cell: no earlier cell in the visible notebook
# assigns `freq_df`, so without this line the cell fails with NameError on a fresh kernel.
freq_df = get_umbc_df()
# Run append_pmi with the UMBC column names as modes; the first return value becomes
# svo_count and the second is kept as log_total.
svo_count, log_total = append_pmi(freq_df, modes=['subj', 'verb', 'obj'], compute_freq=False)

INFO     [11] 
INFO     [14] 
INFO     [30] 


In [91]:
# Keep only rows where both subj and obj are non-empty strings, then show the five with
# the highest 'freq'.
svo_count[(svo_count.subj != '') & (svo_count.obj != '')].sort_values('freq', ascending=False).head()

Unnamed: 0,freq,subj,verb,obj,freq_subj,freq_verb,freq_obj,"freq_('subj', 'verb')","freq_('subj', 'obj')","freq_('verb', 'obj')",dice,freq2,pmi,iact_info,salience
7585,947,I,recommend,this,-4.327517,-10.210711,-9.423566,-11.590257,-10.984983,-13.002427,0.006598,-13.091054,10.87074,24.706926,-142.30945
307,879,you,think,what,-5.682829,-7.48061,-9.065385,-11.083457,-10.827893,-12.792161,0.011849,-13.198555,9.030269,25.673242,-119.186501
7644,801,I,have,problem,-4.327517,-5.63653,-10.895128,-8.261295,-12.900124,-12.133569,0.004129,-13.332616,7.526559,25.768428,-100.348725
686,787,I,have,idea,-4.327517,-5.63653,-11.029178,-8.261295,-12.488427,-12.313827,0.004059,-13.358055,7.63517,25.428379,-101.991023
2560,700,you,do,what,-5.682829,-8.03146,-9.065385,-11.40179,-10.827893,-11.952443,0.010103,-13.527064,9.25261,24.929516,-125.160648


In [93]:
# Among rows with freq above 100, show the five with the highest 'pmi'.
svo_count[svo_count.freq>100].sort_values('pmi', ascending=False).head()

Unnamed: 0,freq,subj,verb,obj,freq_subj,freq_verb,freq_obj,"freq_('subj', 'verb')","freq_('subj', 'obj')","freq_('verb', 'obj')",dice,freq2,pmi,iact_info,salience
1547531,253,papers,span,year,-13.417942,-13.318279,-11.442513,-14.683654,-14.98959,-14.270916,0.167439,-14.995281,23.183453,20.760707,-347.642396
124437,103,use,constitute,acceptance,-11.132001,-13.164494,-15.101758,-16.17092,-16.10791,-15.912186,0.064148,-16.291774,23.106478,25.084537,-376.445522
4433,203,value,price,+.,-11.773704,-9.803661,-15.196915,-15.277835,-15.312939,-15.277835,0.051497,-15.312939,21.461341,24.407268,-328.6362
8850,105,View,search,salary,-13.358055,-9.44799,-14.502541,-15.01249,-16.264029,-15.502541,0.024279,-16.264029,21.044557,25.734504,-342.269296
52164,127,user,browse,forum,-11.333517,-12.286531,-13.220052,-15.711488,-15.933881,-15.859334,0.066585,-15.98959,20.85051,26.654193,-333.391105


In [34]:
# Among rows with freq above 100 and both subj and obj non-empty, show the five with the
# LOWEST 'iact_info' (note ascending=True, unlike the neighbouring cells).
svo_count[(svo_count.freq>100) & (svo_count.subj != '') & (svo_count.obj != '')].sort_values(
    'iact_info', ascending=True).head()

Unnamed: 0,freq,subj,verb,obj,freq_subj,freq_verb,freq_obj,"freq_('subj', 'verb')","freq_('subj', 'obj')","freq_('verb', 'obj')",dice,freq2,pmi,iact_info,salience
1547531,253,papers,span,year,-13.417942,-13.318279,-11.442513,-14.683654,-14.98959,-14.270916,0.167439,-14.995281,23.183453,20.760707,-347.642396
2570,538,resource,send,link,-12.736292,-9.22026,-11.046799,-13.906812,-13.89879,-13.667662,0.085077,-13.906812,19.096538,22.376727,-265.571964
4433,203,value,price,+.,-11.773704,-9.803661,-15.196915,-15.277835,-15.312939,-15.277835,0.051497,-15.312939,21.461341,24.407268,-328.6362
15265,264,last,edit,time,-13.896126,-10.834892,-9.80788,-14.725609,-14.933881,-14.378362,0.055443,-14.933881,19.605017,24.432835,-292.77898
7585,947,I,recommend,this,-4.327517,-10.210711,-9.423566,-11.590257,-10.984983,-13.002427,0.006598,-13.091054,10.87074,24.706926,-142.30945


In [15]:
# Among rows with both subj and obj non-empty (no frequency cut-off in this cell), show
# the five with the highest 'salience'.
svo_count[(svo_count.subj != '') & (svo_count.obj != '')].sort_values('salience', ascending=False).head()

Unnamed: 0,freq,subj,verb,obj,freq_subj,freq_verb,freq_obj,"freq_('subj', 'verb')","freq_('subj', 'obj')","freq_('verb', 'obj')",dice,freq2,pmi,iact_info,salience
2208830,1,I,be,it,-4.327517,-6.027309,-7.681717,-13.072888,-9.70346,-16.423686,5e-06,-22.978275,-4.941731,44.141765,113.552463
146474,1,I,have,we,-4.327517,-5.63653,-9.253974,-8.261295,-16.408419,-16.848992,5e-06,-22.978275,-3.760253,45.278959,86.404129
127234,1,he,be,it,-5.740747,-6.027309,-7.681717,-14.665392,-12.574198,-16.423686,9e-06,-22.978275,-3.528502,47.191777,81.07888
452618,1,he,have,you,-5.740747,-5.63653,-8.300115,-10.476438,-14.983921,-16.047537,9e-06,-22.978275,-3.300883,44.808779,75.848586
2512849,1,I,be,#,-4.327517,-6.027309,-9.399195,-13.072888,-17.223387,-21.978275,5e-06,-22.978275,-3.224254,55.498804,74.087791


In [36]:
# Among rows with freq above 100, show the five with the highest 'dice'.
svo_count[(svo_count.freq>100)].sort_values('dice', ascending=False).head()

Unnamed: 0,freq,subj,verb,obj,freq_subj,freq_verb,freq_obj,"freq_('subj', 'verb')","freq_('subj', 'obj')","freq_('verb', 'obj')",dice,freq2,pmi,iact_info,salience
1547531,253,papers,span,year,-13.417942,-13.318279,-11.442513,-14.683654,-14.98959,-14.270916,0.167439,-14.995281,23.183453,20.760707,-347.642396
2570,538,resource,send,link,-12.736292,-9.22026,-11.046799,-13.906812,-13.89879,-13.667662,0.085077,-13.906812,19.096538,22.376727,-265.571964
52164,127,user,browse,forum,-11.333517,-12.286531,-13.220052,-15.711488,-15.933881,-15.859334,0.066585,-15.98959,20.85051,26.654193,-333.391105
124437,103,use,constitute,acceptance,-11.132001,-13.164494,-15.101758,-16.17092,-16.10791,-15.912186,0.064148,-16.291774,23.106478,25.084537,-376.445522
15265,264,last,edit,time,-13.896126,-10.834892,-9.80788,-14.725609,-14.933881,-14.378362,0.055443,-14.933881,19.605017,24.432835,-292.77898
