## Text similarity probings

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tqdm import tqdm
import numpy, pandas

import pathlib
import sys
import os

WD = str(pathlib.Path().absolute()) + '/'
PROJECT_FOLDER = WD + '../'
PROBINGS_FOLDER = PROJECT_FOLDER + 'probings/'
DATA_FOLDER = PROJECT_FOLDER + 'data/'

sys.path.append(PROBINGS_FOLDER)

from similarity import dataset_similarity


# utilities
def similarity_dataframe(inputs, similarity):
    """Flatten a pairwise similarity matrix into a long dataframe, one row per pair.

    Rows are ordered by ascending similarity score; pairs whose score is NaN
    are left out.

    Args:
        inputs: Sequence of texts, indexed like the rows and columns of
            `similarity`.
        similarity: 2-d array (or matrix) where `similarity[i, j]` is the
            score between `inputs[i]` and `inputs[j]`.

    Returns:
        A `pandas.DataFrame` with the columns `premise_1`, `premise_2` and
        `similarity_score`.
    """
    # Work on a plain ndarray so that ravel() is 1-d even for numpy.matrix input.
    scores = numpy.asarray(similarity)
    # Flat positions ordered by score, mapped back to (row, column) coordinates.
    order = numpy.argsort(scores.ravel())
    rows, cols = numpy.unravel_index(order, scores.shape)
    # Zipping the coordinate arrays (instead of dstack + squeeze) keeps a
    # single-cell matrix iterable as one (i, j) pair.
    pairs = [
        (inputs[i], inputs[j], scores[i, j])
        for i, j in zip(rows, cols)
        if not numpy.isnan(scores[i, j])
    ]
    df = pandas.DataFrame(pairs, columns=['premise_1', 'premise_2', 'similarity_score'])

    return df

In [3]:
# Folder holding one sub-folder per dataset.
DATA_FOLDER = WD + '../data/'

# Validation split of each dataset to probe.
input_texts = [
    DATA_FOLDER + relative_path
    for relative_path in (
        'rte/val.jsonl',
        'axb/val.jsonl',
        'axg/val.jsonl',
        'mnli/val.jsonl',
    )
]

### Similarity according to different metrics

By default, we use `cosine` similarity. `dot` and `euclidean` distances are also available.
We also provide two similarity modules, [SBERT](https://arxiv.org/pdf/1908.10084.pdf) and [InferSent](https://arxiv.org/pdf/1705.02364.pdf), which you can select through the `model` parameter of the `dataset_similarity` function.
To use SBERT, set `similarity_model` to the name of any of the pretrained models listed [here](https://www.sbert.net/docs/pretrained_models.html); to use InferSent, set it to `'infersent'`.
The default is `'stsb-distilbert-base'`.

In [5]:
import nltk

# Tokenizer data; presumably required by the similarity model - TODO confirm.
nltk.download('punkt')

similarity_model = 'infersent'
# Both dictionaries are keyed by dataset name: `inputs` holds the list of
# premises, `inputs_similarities` maps each distance to its pairwise matrix.
inputs, inputs_similarities = dict(), dict()

# Similarity matrices are cached in this folder. Create it up front so that a
# freshly computed matrix is not lost because the folder is missing.
similarities_folder = DATA_FOLDER + 'probings/'
os.makedirs(similarities_folder, exist_ok=True)

# NOTE: the slice restricts the run to the last dataset in `input_texts`.
for dataset in tqdm(input_texts[-1:]):
    # load data: only the premises are needed
    data = pandas.read_json(dataset, lines=True)
    data = data['premise'].values.tolist()

    # The dataset name is the folder holding the file ('<...>/mnli/val.jsonl' -> 'mnli').
    # Taking it from the path components is safe even when the working
    # directory itself contains a '/data/' segment.
    dataset_name = pathlib.Path(dataset).parent.name
    inputs[dataset_name] = data
    inputs_similarities[dataset_name] = dict()

    # NOTE: the slice restricts the run to the 'cosine' distance.
    for distance in ['euclidean', 'dot', 'cosine'][2:]:
        similarities_file = similarities_folder + similarity_model + '_' + dataset_name + '_pairwise_similarities_by_' + distance + '.dat'

        if os.path.isfile(similarities_file):
            # The cache is a pickle (written by `dump` below): only load files
            # produced by this notebook, never files from an untrusted source.
            inputs_similarities[dataset_name][distance] = numpy.load(similarities_file, allow_pickle=True)
        else:
            _, similarity_matrix = dataset_similarity(dataset, model=similarity_model, metric=distance)
            numpy.matrix.dump(similarity_matrix, similarities_file)
            inputs_similarities[dataset_name][distance] = similarity_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mattiasetzu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 1/1 [00:43<00:00, 43.03s/it]


## Visualization

In [None]:
import pandas

from bokeh.io import output_file, output_notebook, show, export_png
from bokeh.models import BasicTicker, ColorBar, ColumnDataSource, LinearColorMapper, PrintfTickFormatter
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data
from bokeh.transform import transform

# palettes
from bokeh.palettes import RdBu7


def similarity_heatmap(premises, data, dataset_name, colors=RdBu7, out_file=None):
    """Export a heatmap of the pairwise similarities between premises as a PNG.

    Only the pairs `(i, j)` with `i < j` are drawn. Colors are mapped linearly
    over the fixed range [-1, +1].

    Args:
        premises: Sequence of premises. Only its length is used: cells are
            placed according to the position of the premises in the sequence.
        data: Pairwise similarity matrix, indexed as `data[i, j]`.
        dataset_name: Name of the dataset, appended to the plot title.
        colors: Bokeh palette used to color the cells.
        out_file: Passed to bokeh's `export_png` as the `filename` argument.
    """
    mapper = LinearColorMapper(palette=colors, low=-1, high=+1)

    # One record per pair with i < j. The string coordinates feed the
    # categorical axes, the integer copies give a numeric sort order.
    n_premises = len(premises)
    vals = list()
    for i in range(n_premises):
        for j in range(i + 1, n_premises):
            vals.append((str(i), str(j), data[i, j], i, j))
    grid = pandas.DataFrame(vals, columns=['x', 'y', 'val', 'x_int', 'y_int']).sort_values(by=['x_int', 'y_int'])
    grid = grid.pivot(index='x', columns='y', values='val')
    grid.columns.name = 'y'
    df = pandas.DataFrame(grid.stack(), columns=['val']).reset_index()
    source = ColumnDataSource(df)

    # Axis categories in numeric (not lexicographic) order.
    x_index = [str(el) for el in sorted(int(x) for x in grid.index)]
    y_index = [str(el) for el in sorted(int(y) for y in grid.columns)]

    p = figure(plot_width=6000, plot_height=6000, title='Pairwise premise similarity on ' + dataset_name,
               x_range=x_index, y_range=y_index)
    p.rect('x', 'y', width=1, height=1, source=source, line_color=None, fill_color={'field': 'val', 'transform': mapper})

    color_bar = ColorBar(color_mapper=mapper, location=(0, 0))
    p.add_layout(color_bar, 'right')
    # Hide the tick labels of both axes.
    p.axis.major_label_text_font_size = "0px"

    export_png(p, filename=out_file)

## Similarity matrices

**Note: run the next cell only on a server with high RAM capacity**

In [None]:
#for inp, sim, dataset in zip(inputs, inputs_similarities, ['rte', 'axb', 'axg', 'mnli']):
#    similarity_heatmap(inp, sim, dataset, out_file=dataset + '_pairwise_similarities.png')

## Pairwise similarity

In [None]:
dfs = list()
# Iterate over the datasets and distances that were actually computed, rather
# than over a hard-coded list that raises KeyError for anything not loaded.
for dataset, inp in inputs.items():
    for distance, similarity in inputs_similarities[dataset].items():
        df = similarity_dataframe(inp, similarity)
        df['dataset'] = dataset
        df['distance'] = distance
        dfs.append(df)
pairwise_similarities = pandas.concat(dfs, axis='rows')
pairwise_similarities['model'] = similarity_model
pairwise_similarities = pairwise_similarities[['dataset', 'model', 'premise_1', 'premise_2', 'similarity_score', 'distance']]
# Name the file after the model in use, so that it matches the 'model' column.
pairwise_similarities.to_csv(DATA_FOLDER + 'probings/' + similarity_model + '_pairwise_similarities.csv', index=False)

## Percentiles by dataset

In [None]:
# Summary statistics of the scores for each (dataset, distance) pair,
# including the upper percentiles listed below.
upper_percentiles = [0.9, 0.95, 0.9975, 0.999]
by_dataset_groups = pairwise_similarities.groupby(['dataset', 'distance'], sort=False)
by_dataset_groups.describe(percentiles=upper_percentiles)

## Pairwise similarity percentiles by premise

In [None]:
# Display settings: show every row and column, and never truncate cell contents.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pandas.set_option(display_option, None)

# Score statistics for each (premise, distance) pair.
by_premise_groups = pairwise_similarities.groupby(['premise_1', 'distance'], sort=False)
score_summary = by_premise_groups.describe(percentiles=[.5, .75, .995, .9975, .999])['similarity_score']
# Keep only the moments and the upper percentiles.
by_premise_groups_percentiles = score_summary[['mean', 'std', 'min', '99.5%', '99.75%', '99.9%']]
by_premise_groups_percentiles

## Top k most similar instances

In [None]:
K = 5
# Order the rows of every (premise, distance) group from the highest to the lowest score.
descending_groups = by_premise_groups.apply(
    lambda group: group.sort_values('similarity_score', ascending=False)
)
# Drop the grouping columns before grouping again by the same names
# (presumably so that the names refer to the index levels only - TODO confirm).
descending_groups = descending_groups.drop(['premise_1', 'distance'], axis='columns')
# Keep the first K rows of every group.
top_rows = descending_groups.groupby(['premise_1', 'distance']).head(K)
sorted_groups = top_rows[['premise_2', 'similarity_score']]

In [None]:
# Display the first 100 rows.
sorted_groups.iloc[:100]

## Top k least similar instances

In [None]:
K = 5
# Order the rows of every (premise, distance) group from the lowest to the highest score.
ascending_groups = by_premise_groups.apply(
    lambda group: group.sort_values('similarity_score', ascending=True)
)
# Drop the grouping columns before grouping again by the same names
# (presumably so that the names refer to the index levels only - TODO confirm).
ascending_groups = ascending_groups.drop(['premise_1', 'distance'], axis='columns')
# Keep the first K rows of every group.
bottom_rows = ascending_groups.groupby(['premise_1', 'distance'], sort=False).head(K)
sorted_groups = bottom_rows[['premise_2', 'similarity_score']]
# Display the first 100 rows.
sorted_groups.iloc[:100]

## Scratchpad