In [57]:
from collections import defaultdict
from copy import copy
from functools import reduce
import glob
import numpy as np
import operator
import os
import pickle

import pandas as pd
import tensorly as tl


import matplotlib.pyplot as plt
%pylab inline

from eval_tensor import VerbTensorEvaluator

import logging
logging.basicConfig(level=logging.INFO,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# 1 One-mode similarity

## 1.1 Data

### 1.1.1 [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [58]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', 
                      sep='\t', header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [59]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [60]:
simverb.describe(percentiles=[])

Unnamed: 0,sim
count,3500.0
mean,4.291554
std,2.652621
min,0.0
50%,4.32
max,9.96


In [61]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


### 1.1.2 SimLex-999

In [62]:
simlex = pd.read_csv('/mnt/permanent/Language/English/Data/SimLex-999/SimLex-999.txt', sep='\t')

In [63]:
simlex.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


* conc(w1): The concreteness rating of word1 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* conc(w2): The concreteness rating of word2 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* concQ: The quartile the pair occupies based on the two concreteness ratings. Used for some analyses in the above paper. 

* Assoc(USF): The strength of free association from word1 to word2. Values are taken from the University of South Florida Free Association Dataset. 

* SimAssoc333: Binary indicator of whether the pair is one of the 333 most associated in the dataset (according to Assoc(USF)). This subset of SimLex999 is often the hardest for computational models to capture because the noise from high association can confound the similarity rating. See the paper for more details. 

* SD(SimLex): The standard deviation of annotator scores when rating this pair. Low values indicate good agreement between the 15+ annotators on the similarity value SimLex999. Higher scores indicate less certainty. 


In [64]:
simlex.describe(percentiles=[])

Unnamed: 0,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,4.561572,3.657087,3.568629,2.501502,0.751512,0.333333,1.274505
std,2.614663,1.13105,1.159572,1.118145,1.344569,0.471641,0.366278
min,0.23,1.19,1.19,1.0,0.0,0.0,0.34
50%,4.67,3.83,3.66,3.0,0.25,0.0,1.31
max,9.8,5.0,5.0,4.0,8.85,1.0,2.18


In [65]:
simlex.groupby('POS').size()

POS
A    111
N    666
V    222
dtype: int64

## 1.2 Testing the verb tensor

In [66]:
evalor = VerbTensorEvaluator(include_empty=False)

In [67]:
# Names of the association weightings to evaluate.
# NOTE(review): the tensor file names shown in this notebook's outputs spell
# multi-word weights with hyphens (e.g. "pmi-sali"), whereas these use
# underscores -- confirm which spelling VerbTensorEvaluator expects.
weights = [
    "iact", "iact_sali",
    "ldice", "ldice_sali",
    "log_freq",
    "niact",
    "npmi",
    "pmi", "pmi_sali",
]

# Cutoff grid: 1, 2, 3 and 5 times every power of ten from 10**2 to 10**8,
# grouped by multiplier (all the 1x values first, then the 2x values, ...).
cutoffs = []
for i in (1, 2, 3, 5):
    cutoffs.extend(i * 10**np.arange(2, 9))

def eval_for_for(task_df0, mode_to_test, algo='tucker'):
    # normlz_vocb=True, lmbda=False, decomp_algo='tucker', weight_name='log_freq'
    data = []
    for non_negative in range(2):
        for include_empty in range(2):
            for cutoff in cutoffs:
                for exp in range(10):
                    rank = 2**exp
                    for weight in weights:
                        try:
                            evalor = VerbTensorEvaluator(
                                non_negative=non_negative, decomp_algo=algo, rank=rank, 
                                include_empty=include_empty, cutoff=cutoff, weight=weight, 
                                mode_to_test=mode_to_test)
                            score, known_word_ratio = evalor.test_sim(task_df0)
                            data.append((weight, non_negative, algo, rank, include_empty, cutoff, score, 
                                         known_word_ratio))
                        except FileNotFoundError:
                            pass
                        except ValueError as e:
                            logging.warning((cutoff, rank, e))
    df = pd.DataFrame(data, columns=['weight', 'non_negative', 'algo', 'rank', 'include_empty', 'cutoff', 
                                     'corr', 'known_word_ratio'])
    return df.sort_values('corr', ascending=False)

In [68]:
def eval_for_files(task_df0, mode_to_test):
    data = []
    for filen in glob.glob(f'/mnt/permanent/home/makrai/project/verb-tensor/nonempty/tensor/*_*_*_*_*_*.pkl'):
        pedigree = os.path.splitext(os.path.basename(filen))[0].split('_')
        non_negative, decomp_algo, weight, include_empty, cutoff, rank = pedigree
        evalor = VerbTensorEvaluator(
            mode_to_test=mode_to_test, non_negative=non_negative=='nonneg', decomp_algo=decomp_algo,
            weight=weight, include_empty=include_empty=='optional', cutoff=int(cutoff), rank=int(rank))
        score, known_word_ratio = evalor.test_sim(task_df0.reset_index())
        data.append((non_negative, decomp_algo, weight, include_empty, int(cutoff), int(rank), score, 
                     known_word_ratio))
    df = pd.DataFrame(data, columns=['non_negative', 'decomp_algo', 'weight', 'include_empty', 'cutoff', 'rank', 
                                     'corr', 'known_word_ratio'])
    return df.sort_values('corr', ascending=False)

In [69]:
df_s = eval_for_files(simlex, mode_to_test='nsubj')#, cutoff=500, rank=32)['tensor_sim']
df_s.head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
83,general,tucker,npmi,non-empty,10000,64,0.089196,0.700701
0,general,tucker,npmi,optional,30000,64,0.074334,0.563564
12,general,tucker,npmi,optional,100000,128,0.072204,0.349349
142,general,tucker,npmi,non-empty,100000,64,0.061739,0.349349
59,general,tucker,npmi,optional,100000,32,0.057109,0.349349


In [70]:
df_v = eval_for_files(simverb, mode_to_test='ROOT')#, cutoff=500, rank=32)['tensor_sim']
df_v.head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
0,general,tucker,npmi,optional,30000,64,0.196648,0.967714
27,general,parafac,npmi,optional,30000,256,0.186675,0.967714
3,general,parafac,npmi,optional,10000,256,0.186195,0.992857
12,general,tucker,npmi,optional,100000,128,0.182457,0.866
45,general,tucker,pmi-sali,optional,100000,64,0.167229,0.866


In [71]:
df_v[df_v.non_negative=='nonneg']

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
92,nonneg,parafac,npmi,optional,300000,64,0.053630,0.700286
57,nonneg,parafac,pmi,non-empty,10000,2,0.050742,0.991429
18,nonneg,parafac,pmi-sali,optional,300000,64,0.049137,0.700286
128,nonneg,parafac,npmi,non-empty,1000,2,0.038334,0.997143
146,nonneg,tucker,npmi,non-empty,300000,64,0.029212,0.700286
...,...,...,...,...,...,...,...,...
1,nonneg,tucker,iact,non-empty,1000000,64,-0.062046,0.486857
118,nonneg,tucker,iact-sali,non-empty,1000000,64,-0.063448,0.486857
102,nonneg,tucker,niact,non-empty,1000000,64,-0.065776,0.486857
78,nonneg,tucker,pmi-sali,non-empty,10000000,32,-0.067300,0.124571


In [72]:
df_o = eval_for_files(simlex, mode_to_test='dobj')

In [73]:
df_o.head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
12,general,tucker,npmi,optional,100000,128,0.098974,0.488488
0,general,tucker,npmi,optional,30000,64,0.082097,0.628629
61,general,tucker,ldice,optional,100000,64,0.070562,0.488488
45,general,tucker,pmi-sali,optional,100000,64,0.065858,0.488488
73,general,tucker,iact-sali,optional,100000,64,0.060944,0.488488


In [74]:
df_o[df_o.non_negative=='nonneg']

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
115,nonneg,parafac,ldice-sali,non-empty,10000000,2,0.004282,0.003003
78,nonneg,tucker,pmi-sali,non-empty,10000000,32,0.004272,0.003003
114,nonneg,parafac,pmi-sali,optional,3000000,64,-0.007703,0.034034
11,nonneg,parafac,npmi,optional,3000000,64,-0.007962,0.034034
93,nonneg,tucker,npmi,non-empty,3000000,32,-0.008035,0.034034
...,...,...,...,...,...,...,...,...
140,nonneg,tucker,niact,non-empty,300000,64,-0.107606,0.318318
44,nonneg,parafac,ldice,non-empty,100,2,-0.135797,0.936937
22,nonneg,parafac,ldice-sali,non-empty,100000000,2,,0.000000
126,nonneg,tucker,pmi-sali,non-empty,30000000,16,,0.000000


In [75]:
def plot_cutoff(df, col, rank=256):
    """Plot ``col`` against ``cutoff`` at a fixed rank, one line per weighting.

    Parameters
    ----------
    df : pandas.DataFrame
        Results with ``weight`` and ``cutoff`` columns and a rank column named
        either ``rank_`` or ``rank``.
    col : str
        Name of the column plotted on the y axis.
    rank : int, optional
        Only rows with this rank are plotted (default 256).
    """
    # The evaluation helpers in this notebook name the column 'rank'; 'rank_'
    # is still accepted.  Attribute access (df.rank) would return the
    # DataFrame.rank method, hence the bracket lookup.
    rank_col = 'rank_' if 'rank_' in df.columns else 'rank'
    weight_names = df.weight.unique()
    for weight in weight_names:
        # Result frames arrive sorted by score, so order by cutoff to get a
        # left-to-right line.
        df0 = df[(df.weight == weight) & (df[rank_col] == rank)].sort_values('cutoff')
        plt.plot(df0.cutoff, df0[col], label=weight)
    # One legend call is enough; skip it when nothing was drawn.
    if len(weight_names):
        plt.legend()

In [76]:
def plot_rank(df, col, cutoff=100):
    """Plot ``col`` against the rank at a fixed cutoff, one line per weighting.

    Parameters
    ----------
    df : pandas.DataFrame
        Results with ``weight`` and ``cutoff`` columns and a rank column named
        either ``rank_`` or ``rank``.
    col : str
        Name of the column plotted on the y axis.
    cutoff : int, optional
        Only rows with this cutoff are plotted (default 100).
    """
    # The evaluation helpers in this notebook name the column 'rank'; 'rank_'
    # is still accepted.  Attribute access (df.rank) would return the
    # DataFrame.rank method, hence the bracket lookup.
    rank_col = 'rank_' if 'rank_' in df.columns else 'rank'
    weight_names = df.weight.unique()
    for weight in weight_names:
        # Result frames arrive sorted by score, so order by rank to get a
        # left-to-right line.
        df0 = df[(df.weight == weight) & (df.cutoff == cutoff)].sort_values(rank_col)
        plt.plot(df0[rank_col], df0[col], label=weight)
    # One legend call is enough; skip it when nothing was drawn.
    if len(weight_names):
        plt.legend()

In [77]:
def plot_results(df0, col='sim', save_filen=''):
    """Plot ``col`` against ``rank_`` for the rows whose ``rank_`` is 256.

    One line is drawn per weighting; weightings are ordered by their best
    value of ``col`` (descending), and the legend lists them in that order.

    Parameters
    ----------
    df0 : pandas.DataFrame with ``rank_``, ``weight`` and ``col`` columns.
    col : name of the column plotted on the y axis.
    save_filen : if non-empty, the figure is also saved as
        ``<save_filen>.png`` in the Coling2020 image directory.
    """
    # NOTE(review): after this filter every remaining row has rank_ == 256, so
    # each line sits at a single x position -- confirm that this is intended.
    at_rank = df0[df0['rank_'] == 256]
    weights = pd.unique(at_rank.sort_values(col, ascending=False).weight)
    for weight in weights:
        per_weight = at_rank[at_rank['weight'] == weight].sort_values('rank_')
        plt.plot(per_weight['rank_'], per_weight[col])
    _ = plt.legend(weights)
    if not save_filen:
        return
    filen = '/home/makrai/repo/paper/Coling2020/verbtensor/img/{}.png'.format(save_filen)
    plt.savefig(filen)

In [78]:
#plot_results(simverb_res)#, save_filen='SimVerb')

In [79]:
#plot_results(simlex_subj, col='SimLex999')#, save_filen='simLex-subj')

In [80]:
#plot_results(simlex_obj, col='SimLex999')#, save_filen='simLex-obj')

# 2 SVO triples (_al et_ Sadrzadeh 2011--2014)

## 2.1 Datasets

  * [GS’11](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) provided by Grefenstette and Sadrzadeh (EMNLP 2011)
      * each verb pair takes the same subject and object
      * the task has an aspect of verb sense disambiguation
          * As discussed in previous work
            (Kartsaklis and Sadrzadeh, 2013; Milajevs+ 2014; Polajnar+ 2014), GS’11
      * For example, the transitive verb “run” is known as polysemous: operate/move
        * “run” and “operate” are similar when subj = “people” and obj = “company”
        * In the same context, “run” is not similar to “move”
  * ML’10 provided by Mitchell and Lapata (2010),
    * pairs of verb-object phrases and
  * KS’13 provided by Kartsaklis and Sadrzadeh (2013)
    * complements ML’10 by incorporating an appropriate subject for each VO
  * KS’14 provided by [Kartsaklis and Sadrzadeh (2014)](https://arxiv.org/abs/1405.2874)
    * reannotated version of KS’13 using a crowdsourcing service
  * the latter three require one to capture the topical similarity
    rather than the disambiguation aspect (Polajnar+ 2014)

In [81]:
verb_sim_data_dir = '/mnt/permanent/Language/English/Data/verb-similarity/Sadrzadeh/'

In [82]:
def read_sim_data(filen):
    """Read the space-separated file ``filen`` from ``verb_sim_data_dir``.

    Returns the parsed contents as a pandas DataFrame.
    """
    path = os.path.join(verb_sim_data_dir, filen)
    frame = pd.read_csv(path, sep=' ')
    return frame

### 2.1.1 Pairs of SVO triples with the same but ambiguous verb (GS11)

In [83]:
gs11 = read_sim_data('GS2011data.txt').groupby(['verb', 'subject', 'object', 'landmark', 'hilo']).mean()
print(gs11.shape)
gs11.head()

(200, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,input
verb,subject,object,landmark,hilo,Unnamed: 5_level_1
accept,government,proposal,bear,LOW,2.666667
accept,government,proposal,receive,HIGH,2.583333
accept,government,recommendation,bear,LOW,3.5
accept,government,recommendation,receive,HIGH,3.333333
accept,lawyer,conviction,bear,LOW,3.333333


In [84]:
gs11.groupby('verb').size().sort_values(ascending=False).head()

verb
write    20
try      20
show     20
say      20
run      20
dtype: int64

In [85]:
gs11.groupby('landmark').size().sort_values(ascending=False).head()

landmark
visit      10
test       10
attract    10
bear       10
bribe      10
dtype: int64

In [86]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gs12 = read_sim_data('GS2012data.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gs12.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,annotator_score
sentence_id,adj_subj,subj,landmark,verb,adj_obj,obj,Unnamed: 7_level_1
1,statistical,table,show,express,good,result,6.06
2,statistical,table,show,depict,good,result,5.9
3,recent,study,show,express,significant,correlation,5.72
4,recent,study,show,depict,significant,correlation,5.92
5,annual,figure,show,express,substantial,increase,5.74


In [87]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gsk13 = read_sim_data('pickering-judgements.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gsk13.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,annotator_score
sentence_id,adj_subj,subj,landmark,verb,adj_obj,obj,Unnamed: 7_level_1
1,private,company,file,register,annual,account,5.44186
2,private,company,file,smooth,annual,account,2.302326
3,young,woman,file,register,long,nail,2.069767
4,young,woman,file,smooth,long,nail,5.023256
5,local,government,file,register,criminal,charge,5.44186


In [88]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_mitchell = read_sim_data('emnlp2013_ml.txt').groupby(cols).mean()
print(ks13_mitchell.shape)
ks13_mitchell.head()

(108, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,score
subject1,verb1,object1,subject2,verb2,object2,Unnamed: 6_level_1
agent,sell,property,family,buy,home,2.777778
agent,sell,property,group,hold,meeting,1.666667
author,write,book,delegate,buy,land,1.5
author,write,book,man,hear,word,2.055556
author,write,book,writer,read,word,2.777778


In [89]:
def get_cols(i):
    """Return the subject/verb/object column names carrying the suffix ``i``.

    For example ``get_cols(1)`` gives ``['subject1', 'verb1', 'object1']`` and
    ``get_cols('')`` gives the unsuffixed names.
    """
    templates = ('subject{}', 'verb{}', 'object{}')
    return [template.format(i) for template in templates]

def get_one_sent_from_pair(i):
    """Extract the ``i``-th sentence of every pair in ``ks13_mitchell``.

    The index of ``ks13_mitchell`` is turned back into columns, the
    subject/verb/object columns with suffix ``i`` are selected, and the suffix
    is dropped from their names.
    """
    suffixed = get_cols(i)
    flat = ks13_mitchell.reset_index()
    renaming = dict(zip(suffixed, get_cols('')))
    return flat[suffixed].rename(columns=renaming)

ks13_long = pd.concat(get_one_sent_from_pair(i) for i in [1, 2])
ks13_long = ks13_long.drop_duplicates()

### 2.1.2 Kartsaklis and Sadrzadeh, Turk

In [90]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_turk = read_sim_data('emnlp2013_turk.txt').groupby(cols).mean().drop(columns=['annotator']).reset_index()
ks13_turk.head()

Unnamed: 0,subject1,verb1,object1,subject2,verb2,object2,score
0,agent,sell,property,family,buy,home,3.125
1,agent,sell,property,group,hold,meeting,1.166667
2,author,write,book,delegate,buy,land,1.130435
3,author,write,book,man,hear,word,1.64
4,author,write,book,writer,read,word,3.166667


In [91]:
ks13_turk.groupby('verb1').size().sort_values(ascending=False).head()

verb1
achieve    5
use        5
provide    4
win        4
attend     3
dtype: int64

In [92]:
ks13_turk.groupby('verb2').size().sort_values(ascending=False).head()

verb2
use        7
provide    5
leave      4
buy        4
cut        3
dtype: int64

### 2.1.3 [Verb prediction task by Jenatton+ (NIPS 2012)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12)

In [93]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    """Read ``svo-<pos>s.lst`` from ``dataset_dir`` into a lookup table.

    Each line is split on ``'_'``; the first two fields and the last one are
    discarded and the remaining fields are joined with single spaces.

    Returns a dict mapping the 1-based line number to that string.
    """
    lst_path = os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))
    index = {}
    with open(lst_path) as infile:
        for number, line in enumerate(infile, start=1):
            fields = line.strip().split('_')
            index[number] = ' '.join(fields[2:-1])
    return index

In [94]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [95]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [96]:
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [97]:
svo_df.sample(5)

Unnamed: 0,subject,verb,object
449742,wife,take,photograph
9310,pas,cross,line
617614,bidder,select,order
89173,regiment,kill,soldier
928508,draft,define,satellite


## 2.2 Similarity (KS Turk)

In [31]:
df = eval_for_files(ks13_turk, mode_to_test='svo')

In [32]:
df.sort_values('corr', ascending=False, inplace=True)

In [33]:
df.head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
95,nonneg,parafac,pmi-sali,optional,1000000,64,0.735944,0.583333
60,nonneg,parafac,npmi,optional,1000000,128,0.721517,0.583333
43,general,tucker,npmi,optional,100000,64,0.719182,0.861111
28,nonneg,parafac,pmi-sali,optional,1000000,128,0.709791,0.583333
45,general,tucker,pmi-sali,optional,100000,64,0.704931,0.861111


In [100]:
# Mean known_word_ratio per cutoff.  The column is selected before
# aggregating so the string-valued columns are never averaged, and the stray
# `.str.f` suffix is dropped: the `.str` accessor is not valid on a float
# Series.
df.groupby('cutoff')['known_word_ratio'].mean()

cutoff
100          1.000000
1000         1.000000
10000        0.972222
30000        0.972222
50000        0.944444
100000       0.861111
300000       0.768519
1000000      0.583333
3000000      0.240741
5000000      0.064815
10000000     0.018519
30000000     0.000000
50000000     0.000000
100000000    0.000000
Name: known_word_ratio, dtype: float64

### Algorithm

In [34]:
df[(df.non_negative=='general')&(df.decomp_algo=='tucker')].head(10)

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
43,general,tucker,npmi,optional,100000,64,0.719182,0.861111
45,general,tucker,pmi-sali,optional,100000,64,0.704931,0.861111
38,general,tucker,ldice,non-empty,1000000,64,0.694181,0.583333
10,general,tucker,log-freq,optional,100000,64,0.68834,0.861111
69,general,tucker,log-freq,non-empty,1000000,64,0.686887,0.583333
42,general,tucker,pmi,optional,100000,64,0.675942,0.861111
0,general,tucker,npmi,optional,30000,64,0.672978,0.972222
96,general,tucker,ldice-sali,non-empty,1000000,64,0.67122,0.583333
61,general,tucker,ldice,optional,100000,64,0.668523,0.861111
13,general,tucker,ldice-sali,optional,100000,64,0.666684,0.861111


In [35]:
param_is_same_l = [df.non_negative=='general', df.decomp_algo=='tucker', df.weight=='npmi', 
                   df.include_empty=='optional', df.cutoff==100000, df['rank']==64]
df[reduce(lambda ser1, ser2: ser1.astype(int) + ser2.astype(int), param_is_same_l)>=5]

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
43,general,tucker,npmi,optional,100000,64,0.719182,0.861111
45,general,tucker,pmi-sali,optional,100000,64,0.704931,0.861111
10,general,tucker,log-freq,optional,100000,64,0.68834,0.861111
42,general,tucker,pmi,optional,100000,64,0.675942,0.861111
0,general,tucker,npmi,optional,30000,64,0.672978,0.972222
61,general,tucker,ldice,optional,100000,64,0.668523,0.861111
13,general,tucker,ldice-sali,optional,100000,64,0.666684,0.861111
52,general,tucker,npmi,optional,300000,64,0.659828,0.768519
12,general,tucker,npmi,optional,100000,128,0.654005,0.861111
72,general,tucker,npmi,optional,1000000,64,0.64257,0.583333


In [36]:
df[(df.non_negative=='general')&(df.decomp_algo=='parafac')].head(7)

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
88,general,parafac,npmi,non-empty,300000,256,0.638353,0.768519
106,general,parafac,pmi-sali,non-empty,300000,256,0.616683,0.768519
29,general,parafac,pmi,non-empty,300000,256,0.581191,0.768519
111,general,parafac,npmi,non-empty,1000000,256,0.575416,0.583333
75,general,parafac,npmi,non-empty,100000,256,0.571304,0.861111
74,general,parafac,npmi,non-empty,300000,512,0.567759,0.768519
6,general,parafac,npmi,optional,100000,256,0.547347,0.861111


In [37]:
param_is_same_l = [df.non_negative=='general', df.decomp_algo=='parafac', df.weight=='npmi', 
                   df.include_empty=='non-empty', df.cutoff==300000, df['rank']==256]
df[reduce(lambda ser1, ser2: ser1.astype(int) + ser2.astype(int), param_is_same_l)>=5]#.to_latex())

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
88,general,parafac,npmi,non-empty,300000,256,0.638353,0.768519
106,general,parafac,pmi-sali,non-empty,300000,256,0.616683,0.768519
29,general,parafac,pmi,non-empty,300000,256,0.581191,0.768519
111,general,parafac,npmi,non-empty,1000000,256,0.575416,0.583333
75,general,parafac,npmi,non-empty,100000,256,0.571304,0.861111
74,general,parafac,npmi,non-empty,300000,512,0.567759,0.768519
39,general,parafac,npmi,non-empty,300000,128,0.529041,0.768519
101,general,parafac,npmi,non-empty,30000,256,0.523938,0.972222
15,general,parafac,npmi,optional,300000,256,0.507076,0.768519
62,general,parafac,log-freq,non-empty,300000,256,0.246579,0.768519


In [38]:
df.groupby(['non_negative', 'decomp_algo']).size()

non_negative  decomp_algo
general       parafac        46
              tucker         31
nonneg        parafac        31
              tucker         39
dtype: int64

In [39]:
df[(df.non_negative=='nonneg')&(df.decomp_algo=='tucker')].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
84,nonneg,tucker,npmi,non-empty,1000000,64,0.51863,0.583333
40,nonneg,tucker,npmi,non-empty,1000000,128,0.51023,0.583333
125,nonneg,tucker,pmi-sali,non-empty,1000000,128,0.498933,0.583333
146,nonneg,tucker,npmi,non-empty,300000,64,0.481441,0.768519
71,nonneg,tucker,pmi,non-empty,1000000,64,0.456374,0.583333


In [40]:
param_is_same_l = [df.non_negative=='nonneg', df.decomp_algo=='tucker', df.weight=='npmi', 
                   df.include_empty=='non-empty', df.cutoff==1000000, df['rank']==64]
df[reduce(lambda ser1, ser2: ser1.astype(int) + ser2.astype(int), param_is_same_l)>=5]#.to_latex())

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
84,nonneg,tucker,npmi,non-empty,1000000,64,0.51863,0.583333
40,nonneg,tucker,npmi,non-empty,1000000,128,0.51023,0.583333
87,nonneg,parafac,npmi,non-empty,1000000,64,0.500262,0.583333
146,nonneg,tucker,npmi,non-empty,300000,64,0.481441,0.768519
24,general,tucker,npmi,non-empty,1000000,64,0.479883,0.583333
71,nonneg,tucker,pmi,non-empty,1000000,64,0.456374,0.583333
100,nonneg,tucker,pmi-sali,non-empty,1000000,64,0.438707,0.583333
112,nonneg,tucker,npmi,non-empty,1000000,32,0.375336,0.583333
81,nonneg,tucker,npmi,non-empty,3000000,64,0.336619,0.240741
51,nonneg,tucker,npmi,optional,1000000,64,0.288937,0.583333


### Size (rank, empty fillers, cutoff)

In [41]:
df.head(3)

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
95,nonneg,parafac,pmi-sali,optional,1000000,64,0.735944,0.583333
60,nonneg,parafac,npmi,optional,1000000,128,0.721517,0.583333
43,general,tucker,npmi,optional,100000,64,0.719182,0.861111


In [42]:
df[df['include_empty']=='non-empty'].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
38,general,tucker,ldice,non-empty,1000000,64,0.694181,0.583333
69,general,tucker,log-freq,non-empty,1000000,64,0.686887,0.583333
96,general,tucker,ldice-sali,non-empty,1000000,64,0.67122,0.583333
88,general,parafac,npmi,non-empty,300000,256,0.638353,0.768519
106,general,parafac,pmi-sali,non-empty,300000,256,0.616683,0.768519


In [43]:
df[(df['include_empty']=='non-empty')&(df.non_negative=='nonneg')].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
79,nonneg,parafac,pmi-sali,non-empty,1000000,128,0.547566,0.583333
84,nonneg,tucker,npmi,non-empty,1000000,64,0.51863,0.583333
40,nonneg,tucker,npmi,non-empty,1000000,128,0.51023,0.583333
87,nonneg,parafac,npmi,non-empty,1000000,64,0.500262,0.583333
125,nonneg,tucker,pmi-sali,non-empty,1000000,128,0.498933,0.583333


In [44]:
df[df['rank']>128].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
88,general,parafac,npmi,non-empty,300000,256,0.638353,0.768519
106,general,parafac,pmi-sali,non-empty,300000,256,0.616683,0.768519
29,general,parafac,pmi,non-empty,300000,256,0.581191,0.768519
111,general,parafac,npmi,non-empty,1000000,256,0.575416,0.583333
75,general,parafac,npmi,non-empty,100000,256,0.571304,0.861111


In [45]:
df[df['cutoff']<100000].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
0,general,tucker,npmi,optional,30000,64,0.672978,0.972222
83,general,tucker,npmi,non-empty,10000,64,0.582577,0.972222
27,general,parafac,npmi,optional,30000,256,0.538018,0.972222
101,general,parafac,npmi,non-empty,30000,256,0.523938,0.972222
3,general,parafac,npmi,optional,10000,256,0.503049,0.972222


In [46]:
df[df['cutoff']<30000].head()

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
83,general,tucker,npmi,non-empty,10000,64,0.582577,0.972222
3,general,parafac,npmi,optional,10000,256,0.503049,0.972222
128,nonneg,parafac,npmi,non-empty,1000,2,0.133474,1.0
57,nonneg,parafac,pmi,non-empty,10000,2,0.099182,0.972222
44,nonneg,parafac,ldice,non-empty,100,2,0.013527,1.0


### Neighboring setting

In [47]:
param_is_same_l = [df.non_negative=='nonneg', df.decomp_algo=='parafac', df.weight=='pmi-sali', 
                   df.include_empty=='optional', df.cutoff==1000000, df['rank']==64]
df[reduce(lambda ser1, ser2: ser1.astype(int) + ser2.astype(int), param_is_same_l)>=5]

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
95,nonneg,parafac,pmi-sali,optional,1000000,64,0.735944,0.583333
28,nonneg,parafac,pmi-sali,optional,1000000,128,0.709791,0.583333
64,nonneg,parafac,pmi,optional,1000000,64,0.685776,0.583333
77,nonneg,parafac,pmi-sali,optional,1000000,32,0.677333,0.583333
18,nonneg,parafac,pmi-sali,optional,300000,64,0.663044,0.768519
17,nonneg,parafac,npmi,optional,1000000,64,0.660266,0.583333
34,nonneg,parafac,ldice-sali,optional,1000000,64,0.470907,0.583333
124,nonneg,parafac,pmi-sali,non-empty,1000000,64,0.457894,0.583333
99,general,parafac,pmi-sali,optional,1000000,64,0.456088,0.583333
80,nonneg,parafac,ldice,optional,1000000,64,0.440932,0.583333


In [48]:
df[reduce(lambda ser1, ser2: ser1.astype(int) + ser2.astype(int), param_is_same_l)==4]

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
60,nonneg,parafac,npmi,optional,1000000,128,0.721517,0.583333
92,nonneg,parafac,npmi,optional,300000,64,0.621376,0.768519
32,nonneg,parafac,npmi,optional,1000000,32,0.615816,0.583333
79,nonneg,parafac,pmi-sali,non-empty,1000000,128,0.547566,0.583333
98,general,parafac,npmi,optional,1000000,64,0.521703,0.583333
87,nonneg,parafac,npmi,non-empty,1000000,64,0.500262,0.583333
4,nonneg,parafac,pmi-sali,non-empty,300000,64,0.48441,0.768519
100,nonneg,tucker,pmi-sali,non-empty,1000000,64,0.438707,0.583333
11,nonneg,parafac,npmi,optional,3000000,64,0.421123,0.240741
51,nonneg,tucker,npmi,optional,1000000,64,0.288937,0.583333


In [49]:
n_same_param = (df.non_negative=='nonneg').astype(int) + (df.weight=='npmi').astype(int) + (df['rank'].astype(int)==64).astype(int) + (df.include_empty=='optional').astype(int) + (df.cutoff.astype(int)==1000000).astype(int)

In [50]:
df[(df.decomp_algo=='parafac')].head() # Experiments running on percy.

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
95,nonneg,parafac,pmi-sali,optional,1000000,64,0.735944,0.583333
60,nonneg,parafac,npmi,optional,1000000,128,0.721517,0.583333
28,nonneg,parafac,pmi-sali,optional,1000000,128,0.709791,0.583333
64,nonneg,parafac,pmi,optional,1000000,64,0.685776,0.583333
77,nonneg,parafac,pmi-sali,optional,1000000,32,0.677333,0.583333


In [51]:
n_same_param = (df.weight=='npmi').astype(int) + (df.cutoff.astype(int)==1000000).astype(int) + (df['rank'].astype(int)==64).astype(int) + (df.include_empty=='non-empty').astype(int)

In [52]:
df[(df.non_negative=='nonneg')&(df.decomp_algo=='parafac')].head() # Experiments running on store.

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
95,nonneg,parafac,pmi-sali,optional,1000000,64,0.735944,0.583333
60,nonneg,parafac,npmi,optional,1000000,128,0.721517,0.583333
28,nonneg,parafac,pmi-sali,optional,1000000,128,0.709791,0.583333
64,nonneg,parafac,pmi,optional,1000000,64,0.685776,0.583333
77,nonneg,parafac,pmi-sali,optional,1000000,32,0.677333,0.583333


In [53]:
df[df['rank'].astype(int)>128]#.head() # Experiment running on ron.

Unnamed: 0,non_negative,decomp_algo,weight,include_empty,cutoff,rank,corr,known_word_ratio
88,general,parafac,npmi,non-empty,300000,256,0.638353,0.768519
106,general,parafac,pmi-sali,non-empty,300000,256,0.616683,0.768519
29,general,parafac,pmi,non-empty,300000,256,0.581191,0.768519
111,general,parafac,npmi,non-empty,1000000,256,0.575416,0.583333
75,general,parafac,npmi,non-empty,100000,256,0.571304,0.861111
74,general,parafac,npmi,non-empty,300000,512,0.567759,0.768519
6,general,parafac,npmi,optional,100000,256,0.547347,0.861111
27,general,parafac,npmi,optional,30000,256,0.538018,0.972222
101,general,parafac,npmi,non-empty,30000,256,0.523938,0.972222
16,general,parafac,npmi,optional,100000,512,0.521994,0.861111


# Attic

## Verb prediction

In [54]:
gs11.shape

(200, 1)

In [55]:
gs_rank = 256

In [56]:
%time VerbTensorEvaluator(rank=gs_rank).predict_verb(gs11.reset_index())

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/permanent/home/makrai/project/verb-tensor/nonempty/tensor/nonneg_parafac_pmi-sali_optional_1000000_256.pkl'

In [None]:
for weight in weights:
    try:
        VerbTensorEvaluator(rank=gs_rank, weight=weight).predict_verb(gs11.reset_index())
    except FileNotFoundError:
        pass

In [None]:
VerbTensorEvaluator(rank=128).predict_verb(gs11.reset_index())

In [None]:
for cutoff in [50000,200000]:
    try:
        VerbTensorEvaluator(rank=gs_rank, cutoff=cutoff).predict_verb(gs11.reset_index())
    except FileNotFoundError:
        pass

In [None]:
#VerbTensorEvaluator().predict_verb(svo_df) # Jenatton

In [None]:
ks13_turk.shape

In [None]:
VerbTensorEvaluator().predict_verb(ks13_turk, cols_suff=1)

In [None]:
VerbTensorEvaluator().predict_verb(ks13_turk, cols_suff=2)

## Exploring GS11

In [None]:
svo_sim = gs11.reset_index()

In [None]:
# Encode the HIGH/LOW label as 1/0.  Only the `hilo` column is compared:
# comparing the whole frame to 'HIGH' yields a DataFrame, which cannot be
# stored in a single column.
svo_sim['hilo'] = (svo_sim['hilo'] == 'HIGH').astype(int)

In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo"]

In [None]:
svo_sim[cols_ordered].head()

In [None]:
svo_sim.groupby('landmark').size().sort_values(ascending=False)

In [None]:
svo_sim.groupby('verb').size().sort_values(ascending=False)

In [None]:
svo_sim.describe(percentiles=[])

In [None]:
svo_sim.corr(method='spearman')

In [None]:
def eval_for_weight(task_df0, mode_to_test='svo', cutoff=2000, rank=128):
    # normlz_vocb=True, lmbda=False, decomp_algo='tucker', weight_name='log_freq'
    data = []
    for weight_name in weights:
        try:
            score_d = evalor.test_sim(task_df0, mode_to_test=mode_to_test, 
                                      cutoff=cutoff, rank=rank, weight=weight_name)
            data.append((cutoff, rank, weight_name, score_d))
        except FileNotFoundError as e:
            logging.warning(weight_name)
        except ValueError as e:
            logging.warning((cutoff, rank, e))
    df = pd.DataFrame(data, columns=['cutoff', 'rank', 'weight', 'corr'])
    return df.sort_values('corr', ascending=False)