In [None]:
from collections import defaultdict
from copy import copy
import numpy as np
import operator
import os
import pandas as pd
import pickle

import matplotlib.pyplot as plt
%pylab inline

import logging
logging.basicConfig(level=logging.INFO,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

In [None]:
pylab.rcParams['figure.figsize'] = (20, 10)

In [None]:
matplotlib.rcParams.update({'font.size': 14})
from tikzplotlib import save as tikz_save

In [None]:
from eval_tensor import test_sim

# 1 One-mode similarity

## 1.1 Data

### 1.1.1 [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [None]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', sep='\t',
                      header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [None]:
simverb.head()

In [None]:
simverb.describe(percentiles=[])

In [None]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

### 1.1.2 SimLex-999

In [None]:
simlex = pd.read_csv('/mnt/permanent/Language/English/Data/SimLex-999/SimLex-999.txt', sep='\t')

In [None]:
simlex.head()

* conc(w1): The concreteness rating of word1 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* conc(w2): The concreteness rating of word2 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* concQ: The quartile the pair occupies based on the two concreteness ratings. Used for some analyses in the above paper. 

* Assoc(USF): The strength of free association from word1 to word2. Values are taken from the University of South Florida Free Association Dataset. 

* SimAssoc333: Binary indicator of whether the pair is one of the 333 most associated in the dataset (according to Assoc(USF)). This subset of SimLex999 is often the hardest for computational models to capture because the noise from high association can confound the similarity rating. See the paper for more details. 

* SD(SimLex): The standard deviation of annotator scores when rating this pair. Low values indicate good agreement between the 15+ annotators on the similarity value SimLex999. Higher scores indicate less certainty. 


In [None]:
simlex.describe(percentiles=[])

In [None]:
simlex.groupby('POS').size()

## 1.2 Testing the verb tensor

In [None]:
def df_columns_from_filen(sim_df, col='sim'):
    """Turn the labels of a result Series into tensor-parameter columns.

    The entry labelled ``col`` is removed from ``sim_df``; every remaining
    index label is split on ``'_'`` and the pieces are stored in the columns
    ``_t``, ``_s``, ``weight`` and ``rank_``.  Rows for which any of these
    pieces (or the value itself) is missing are discarded.

    Parameters
    ----------
    sim_df : pandas.Series
        Values indexed by underscore-separated labels.  Its name must equal
        ``col``, because the result is sorted by the column of that name.
    col : str
        Label of the entry to remove, and name of the value column.

    Returns
    -------
    pandas.DataFrame
        Columns ``col``, ``_t``, ``_s``, ``weight`` and ``rank_`` (integer),
        sorted by ``col`` in descending order.
    """
    labelled = sim_df.drop(col).to_frame().reset_index()
    # Ragged split results are padded with missing values by the DataFrame
    # constructor, so shorter labels end up with empty trailing columns.
    # NOTE(review): a label with more than four '_'-separated parts (e.g. a
    # weight name that itself contains '_') makes this assignment raise --
    # confirm the label format produced by test_sim.
    parts = pd.DataFrame(labelled['index'].str.rsplit('_').values.tolist())
    labelled[['_t', '_s', 'weight', 'rank_']] = parts
    # Drop incomplete rows *before* the integer conversion: a missing rank
    # cannot be cast to int and would otherwise abort the whole call.
    complete = labelled.drop(columns='index').dropna()
    # assign() builds a new frame, so no write goes through a filtered slice.
    complete = complete.assign(rank_=complete['rank_'].astype(int))
    return complete.sort_values(col, ascending=False)

In [None]:
simverb_res = test_sim(simverb, mode_to_test='ROOT')
simverb_res = df_columns_from_filen(simverb_res)

In [None]:
simlex_subj = test_sim(simlex, mode_to_test='nsubj')

In [None]:
simlex_subj = df_columns_from_filen(simlex_subj, col='SimLex999')

In [None]:
simlex_obj = test_sim(simlex, mode_to_test='dobj')
simlex_obj = df_columns_from_filen(simlex_obj, col='SimLex999')

In [None]:
simverb_res.sort_values('sim', ascending=False).head()

In [None]:
simlex_subj.sort_values('SimLex999', ascending=False).head()

In [None]:
simlex_obj.sort_values('SimLex999', ascending=False).head()

In [None]:
# weights = pd.unique(simverb_res.weight)

def plot_results(df0, col='sim', save_filen=''):
    """Plot ``col`` against ``rank_`` with one line per weight.

    The weights are taken in the order in which they first appear when
    ``df0`` is sorted by ``col`` in descending order; that order determines
    both the plotting order and the legend entries.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must provide the columns ``weight``, ``rank_`` and ``col``.
    col : str
        Name of the column plotted on the y axis.
    save_filen : str
        If non-empty, the figure is also exported with ``tikz_save`` under
        this base name (``.tikz`` is appended).
    """
    by_score = df0.sort_values(col, ascending=False)
    weights = pd.unique(by_score.weight)
    for weight in weights:
        curve = df0[df0.weight == weight].sort_values('rank_')
        plt.plot(curve.rank_, curve[col])
    plt.legend(weights)
    if not save_filen:
        return
    tikz_path = '/home/makrai/repo/paper/LREC20/verbtensor/img/{}.tikz'.format(save_filen)
    tikz_save(tikz_path, figurewidth='\\columnwidth')

In [None]:
plot_results(simverb_res)

In [None]:
plot_results(simlex_subj, col='SimLex999')

In [None]:
simlex_subj[simlex_subj.rank_==128]

In [None]:
plot_results(simlex_obj, col='SimLex999')

# 2 SVO triples (_al et_ Sadrzadeh 2011--2014)

## 2.1 Datasets

  * [GS’11](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) provided by Grefenstette and Sadrzadeh (EMNLP 2011)
      * each verb pair takes the same subject and object
      * the task has an aspect of verb sense disambiguation
          * As discussed in previous work
            (Kartsaklis and Sadrzadeh, 2013; Milajevs+ 2014; Polajnar+ 2014), GS’11
      * For example, the transitive verb “run” is known as polysemous: operate/move
        * “run” and “operate” are similar when subj = “people” and obj = “company”
        * In the same [context, not similar to] “move”
  * ML’10 provided by Mitchell and Lapata (2010),
    * pairs of verb-object phrases and
  * KS’13 provided by Kartsaklis and Sadrzadeh (2013)
    * complements ML’10 by incorporating an appropriate subject for each VO
  * KS’14 provided by [Kartsaklis and Sadrzadeh (2014)](https://arxiv.org/abs/1405.2874)
    * reannotated version of KS’13 using a crowdsourcing service
  * the latter three require one to capture the topical similarity
    rather than the disambiguation aspect (Polajnar+ 2014)

In [None]:
# NOTE(review): this cell held only the incomplete expression `np.st`
# (apparently an abandoned tab-completion).  numpy has no attribute `st`, so
# evaluating it raised AttributeError and halted "Restart & Run All"; the cell
# is intentionally left without executable code.

In [None]:
verb_sim_data_dir = '/mnt/permanent/Language/English/Data/verb-similarity/Sadrzadeh/'

In [None]:
def read_sim_data(filen):
    """Load one space-separated data file from ``verb_sim_data_dir``.

    Parameters
    ----------
    filen : str
        File name relative to the module-level ``verb_sim_data_dir``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first line of the file supplies the header.
    """
    full_path = os.path.join(verb_sim_data_dir, filen)
    table = pd.read_csv(full_path, sep=' ')
    return table

### 2.1.1 Pairs of SVO triples with the same but ambiguous verb (GS11)

In [None]:
# Average the judgements of each (verb, subject, object, landmark, hilo) item.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as a non-grouped column holds strings.
# NOTE(review): presumably the annotator/participant id column is such a
# column -- confirm against GS2011data.txt.
gs11 = (
    read_sim_data('GS2011data.txt')
    .groupby(['verb', 'subject', 'object', 'landmark', 'hilo'])
    .mean(numeric_only=True)
)
print(gs11.shape)
gs11.head()

In [None]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gs12 = read_sim_data('GS2012data.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gs12.head()

In [None]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gsk13 = read_sim_data('pickering-judgements.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gsk13.head()

In [None]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_mitchell = read_sim_data('emnlp2013_ml.txt').groupby(cols).mean()
print(ks13_mitchell.shape)
ks13_mitchell.head()

In [None]:
def get_cols(i):
    """Return the subject/verb/object column names carrying the suffix ``i``."""
    return ['{}{}'.format(role, i) for role in ('subject', 'verb', 'object')]


def get_one_sent_from_pair(i):
    """Extract the ``i``-th sentence of every pair in ``ks13_mitchell``.

    The index of ``ks13_mitchell`` is flattened into ordinary columns, the
    three columns suffixed with ``i`` are selected, and the suffix is removed
    so that the result has the columns ``subject``, ``verb`` and ``object``.
    """
    suffixed = get_cols(i)
    plain = get_cols('')
    flat = ks13_mitchell.reset_index()
    return flat[suffixed].rename(columns=dict(zip(suffixed, plain)))
ks13_long = pd.concat(get_one_sent_from_pair(i) for i in [1, 2])
ks13_long = ks13_long.drop_duplicates()

### 2.1.2 Kartsaklis and Sadrzadeh, Turk

In [None]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_turk = read_sim_data('emnlp2013_turk.txt').groupby(cols).mean().drop(columns=['annotator'])
ks13_turk.head()

### 2.1.3 [Verb prediction task by Jenatton+ (NIPS 2012)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12)

In [None]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    """Read ``svo-<pos>s.lst`` from ``dataset_dir`` into an id -> text mapping.

    Keys are 1-based line numbers.  Each value is built from the line's
    ``'_'``-separated fields with the first two and the last one removed,
    joined by single spaces.
    """
    lst_path = os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))
    id_to_text = {}
    with open(lst_path) as lst_file:
        for line_no, raw in enumerate(lst_file, start=1):
            fields = raw.strip().split('_')
            id_to_text[line_no] = ' '.join(fields[2:-1])
    return id_to_text

In [None]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [None]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [None]:
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [None]:
svo_df.head(10)

## 2.2 Similarity (KS Turk)

In [None]:
df = test_sim(ks13_turk, mode_to_test='svo')

## 2.3 Verb prediction

In [None]:
from eval_tensor import predict_verb

In [None]:
def for_weight_and_rank(max_exp_plus_one=7):
    """Call ``predict_verb`` on ``svo_df`` for every weight/rank combination.

    For each association weight the ranks ``2**3`` up to
    ``2**(max_exp_plus_one - 1)`` are tried in increasing order.  The weight
    and the rank are logged at INFO level; a ``FileNotFoundError`` raised by
    ``predict_verb`` is logged as a warning and the sweep continues.
    """
    weights = ('log_freq', 'pmi', 'iact_info', 'salience', 'iact_sali', 'log_dice', 'dice_sali')
    for weight in weights:
        logging.info(weight)
        for rank in (2 ** exponent for exponent in range(3, max_exp_plus_one)):
            logging.info('\t{}'.format(rank))
            try:
                predict_verb(svo_df, weight, rank)
            except FileNotFoundError as missing:
                logging.warning(missing)

In [None]:
%time for_weight_and_rank()

With the 44 k $\times$ 9 k $\times$ 39 k tensor:

|assoc measure|rank|verb|
|-|-|-|
|pmi|256|318|
|salience|128|318|
|log-Dice|128|270|

In [None]:
svo_sim.sum(numeric_only=True).sort_values(ascending=False).head()/svo_sim.shape[0]

|verb|`* 1`|`* lmbda`|
|----|--|------|
|unnorm|130|**272**|
|norm|0|24|

Majority baseline...

In [None]:
target = 'verb'
svo_sim.groupby(target).size().sort_values()/svo_sim.shape[0]

## Attic: Exploring GS11

In [None]:
svo_sim = gs11.reset_index()

In [None]:
# Recode the `hilo` column as 1 for 'HIGH' and 0 otherwise.  The comparison
# must be made on that column alone: comparing the whole frame with 'HIGH'
# yields a frame, which cannot be stored in a single column.
svo_sim.hilo = (svo_sim.hilo == 'HIGH').astype(int)

In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo"]

In [None]:
svo_sim[cols_ordered].head()

In [None]:
svo_sim.groupby('landmark').size().sort_values(ascending=False)

In [None]:
svo_sim.groupby('verb').size().sort_values(ascending=False)

In [None]:
svo_sim.describe(percentiles=[])

In [None]:
svo_sim.corr(method='spearman')