In [1]:
import torch
torch.cuda.is_available()

True

# Model

In [2]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

device = "cuda"

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)

In [3]:
from tqdm.auto import tqdm
batch_size = 8
def encode(input_texts):
    with torch.no_grad():
        all_embeddings = []
        for i in tqdm(range(0, len(input_texts), batch_size)):
            batch_texts = input_texts[i:i+batch_size]
            # Tokenize the input texts
            batch_dict = tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)

            outputs = model(**batch_dict)
            embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

            # (Optionally) normalize embeddings
            embeddings = F.normalize(embeddings, p=2, dim=1)
            all_embeddings.append(embeddings.cpu())
            
    return torch.cat(all_embeddings, dim=0).numpy()

# JSTS

In [4]:
import json
import pandas as pd
from urllib.request import urlopen
jsts_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/datasets/jsts-v1.1/valid-v1.1.json"
df = pd.DataFrame([json.loads(line) for line in urlopen(jsts_url).readlines()])
df.head(1)

Unnamed: 0,sentence_pair_id,yjcaptions_id,sentence1,sentence2,label
0,0,100312_421853-104611-31624,レンガの建物の前を、乳母車を押した女性が歩いています。,厩舎で馬と女性とが寄り添っています。,0.0


In [5]:
df.shape

(1457, 5)

## Encode

In [6]:
sentence1_embs = encode(df["sentence1"].values.tolist())
sentence2_embs = encode(df["sentence2"].values.tolist())
sentence1_embs.shape, sentence2_embs.shape

  0%|          | 0/183 [00:00<?, ?it/s]

  0%|          | 0/183 [00:00<?, ?it/s]

((1457, 1024), (1457, 1024))

## Correlation Score

In [7]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr


df["similarity"] = [1 - cosine(s1, s2) for s1, s2 in zip(sentence1_embs, sentence2_embs)]
spearmanr(df["similarity"], df["label"])[0]

0.809871019433716

# JSICK

In [8]:
df = pd.read_csv("https://github.com/verypluming/JSICK/raw/main/jsick/test.tsv", sep="\t")
df.head(1)

Unnamed: 0,pair_ID,data,sentence_A_En,sentence_B_En,entailment_label_En,relatedness_score_En,corr_entailment_labelAB_En,corr_entailment_labelBA_En,sentence_A_Ja,sentence_B_Ja,entailment_label_Ja,relatedness_score_Ja,image_ID,original_caption,semtag_short,semtag_long
0,6,test,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,neutral,3.3,,,戸外で遊んでいる男の子は一人もおらず、微笑んでいる男性は一人もいない,子供たちのグループが庭で遊んでいて、後ろの方には年を取った男性が立っている,contradiction,2.3,3155657768_b83a7831e5.jpg,"The children are playing outdoors , while a ma...",Negation#Numerical,"Numerical;人;名詞,接尾,助数詞,*#Negation;ない;助動詞,*,*,*#..."


In [9]:
df.shape

(4927, 16)

## Encode

In [11]:
sentence1_embs = encode(df["sentence_A_Ja"].values.tolist())
sentence2_embs = encode(df["sentence_B_Ja"].values.tolist())
sentence1_embs.shape, sentence2_embs.shape

  0%|          | 0/616 [00:00<?, ?it/s]

  0%|          | 0/616 [00:00<?, ?it/s]

((4927, 1024), (4927, 1024))

## Correlation Score

In [13]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
df["similarity"] = [1 - cosine(s1, s2) for s1, s2 in zip(sentence1_embs, sentence2_embs)]
spearmanr(df["similarity"], df["relatedness_score_Ja"])[0]

0.7838394132798657