In [1]:
import torch

# Check that PyTorch can see a CUDA device before the model is loaded.
torch.cuda.is_available()

True

In [2]:
# JSTS v1.1 validation file from the JGLUE repository (parsed line by line as JSON).
jsts_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/datasets/jsts-v1.1/valid-v1.1.json"
# JSICK test file (tab-separated).
jsick_url = "https://github.com/verypluming/JSICK/raw/main/jsick/test.tsv"
# Number of mined hard negatives taken per query for the MIRACL candidate pool.
miracle_n_hard_negs = 300
# Top-k cutoff used when counting retrieved positives for the MIRACL recall figure.
miracle_n_recall = 30

In [3]:
# Parameters
# Identifier reported alongside the scores and used to name the output file.
model_id = "BAAI/bge-m3"
# Strings prepended to query-side and passage-side texts before encoding
# (empty string = no prefix).
query_prefix = ""
passage_prefix = ""

# Model

In [4]:
from FlagEmbedding import BGEM3FlagModel

# Load the checkpoint named by `model_id` (set in the parameters cell) so the
# model being evaluated always matches the id written out with the scores.
model = BGEM3FlagModel(model_id, use_fp16=True)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

loading existing colbert_linear and sparse_linear---------


# JSTS

In [5]:
import json
import pandas as pd
from urllib.request import urlopen

# The JSTS file holds one JSON object per line. Reading inside `with` makes
# sure the HTTP response is closed once every line has been parsed.
with urlopen(jsts_url) as response:
    df = pd.DataFrame([json.loads(line) for line in response.readlines()])
df.head(1)

Unnamed: 0,sentence_pair_id,yjcaptions_id,sentence1,sentence2,label
0,0,100312_421853-104611-31624,レンガの建物の前を、乳母車を押した女性が歩いています。,厩舎で馬と女性とが寄り添っています。,0.0


In [6]:
# Row and column counts of the JSTS frame loaded in the previous cell.
df.shape

(1457, 5)

## Encode

In [7]:
# Dense embeddings for each side of every JSTS pair. Both sides get
# `query_prefix`, since neither sentence plays the role of a passage here.
sentence1_embs, sentence2_embs = (
    model.encode(query_prefix + df[column], batch_size=32)["dense_vecs"]
    for column in ("sentence1", "sentence2")
)

sentence1_embs.shape, sentence2_embs.shape

encoding: 100%|██████████| 46/46 [00:26<00:00,  1.71it/s]
encoding: 100%|██████████| 46/46 [00:25<00:00,  1.80it/s]


((1457, 1024), (1457, 1024))

## Correlation Score

In [8]:
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import spearmanr

# scipy's `cosine` is a distance, so `1 - distance` gives the cosine similarity
# of each sentence pair.
df["similarity"] = [
    1 - distance for distance in map(cosine, sentence1_embs, sentence2_embs)
]
# Spearman rank correlation between model similarity and the human label;
# element 0 of the result is the correlation coefficient.
jsts_score = spearmanr(df["similarity"], df["label"])[0]
jsts_score

0.8023288886438221

# JSICK

In [9]:
# JSICK is distributed as tab-separated values; `read_table` splits on tabs
# by default.
df = pd.read_table(jsick_url)
df.head(1)

Unnamed: 0,pair_ID,data,sentence_A_En,sentence_B_En,entailment_label_En,relatedness_score_En,corr_entailment_labelAB_En,corr_entailment_labelBA_En,sentence_A_Ja,sentence_B_Ja,entailment_label_Ja,relatedness_score_Ja,image_ID,original_caption,semtag_short,semtag_long
0,6,test,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,neutral,3.3,,,戸外で遊んでいる男の子は一人もおらず、微笑んでいる男性は一人もいない,子供たちのグループが庭で遊んでいて、後ろの方には年を取った男性が立っている,contradiction,2.3,3155657768_b83a7831e5.jpg,"The children are playing outdoors , while a ma...",Negation#Numerical,"Numerical;人;名詞,接尾,助数詞,*#Negation;ない;助動詞,*,*,*#..."


In [10]:
# Row and column counts of the JSICK frame loaded in the previous cell.
df.shape

(4927, 16)

## Encode

In [11]:
# Dense embeddings for the Japanese A/B sentences of every JSICK pair. Both
# sides get `query_prefix`, as in the JSTS section.
sentence1_embs, sentence2_embs = (
    model.encode(query_prefix + df[column], batch_size=32)["dense_vecs"]
    for column in ("sentence_A_Ja", "sentence_B_Ja")
)

sentence1_embs.shape, sentence2_embs.shape

encoding: 100%|██████████| 154/154 [00:27<00:00,  5.54it/s]
encoding: 100%|██████████| 154/154 [00:28<00:00,  5.40it/s]


((4927, 1024), (4927, 1024))

## Correlation Score

In [12]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# scipy's `cosine` is a distance, so `1 - distance` gives the cosine similarity
# of each sentence pair.
df["similarity"] = [
    1 - distance for distance in map(cosine, sentence1_embs, sentence2_embs)
]
# Spearman rank correlation against the Japanese relatedness score;
# element 0 of the result is the correlation coefficient.
jsick_score = spearmanr(df["similarity"], df["relatedness_score_Ja"])[0]
jsick_score

0.7983146503052837

# MIRACL
* Requires a Hugging Face access token, read from the `HF_ACCESS_TOKEN` environment variable.

In [13]:
import os
import dotenv

# Load variables from the local `huggingface_access_token` env file into the
# process environment, replacing any values that are already set.
dotenv.load_dotenv(dotenv_path="huggingface_access_token", override=True)

True

In [14]:
import datasets

# Queries with their labelled positive/negative passages (Japanese dev split).
# NOTE(review): newer `datasets` releases take `token=` in place of
# `use_auth_token=` — confirm against the installed version before upgrading.
ds = datasets.load_dataset(
    path="miracl/miracl",
    name="ja",
    split="dev",
    use_auth_token=os.environ["HF_ACCESS_TOKEN"],
)
ds

Found cached dataset miracl (G:/cache/miracl___miracl/ja/1.0.0/f598b4ee332f2b16e82c6c83ab1ba82e1a7777ef82e7ce3c1416f6b20a142313)


Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
    num_rows: 860
})

In [15]:
# Full Japanese passage corpus; hard-negative texts are looked up from it.
corpus = datasets.load_dataset(path="miracl/miracl-corpus", name="ja")
corpus

Found cached dataset miracl-corpus (G:/cache/miracl___miracl-corpus/ja/1.0.0/16b566312c83a2e1f94d0813c8702b464b97f6b8959336adf062d289ce9b51fa)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['docid', 'title', 'text'],
        num_rows: 6953614
    })
})

In [16]:
# hard negatives
with open("./miracl_hard_negs_1000.json") as f:
    hn = json.loads(f.read())
len(hn), list(hn.keys())[:5], hn["0"].keys(), hn["0"]["docids"][:2], hn["0"]["indices"][
    :2
]

(860,
 ['0', '3', '4', '5', '7'],
 dict_keys(['docids', 'indices']),
 ['2681119#0', '2681119#1'],
 [1393435, 1393436])

In [17]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist


def get_text(corpus_item):
    """Return the item's title and text joined by a single space."""
    return " ".join((corpus_item["title"], corpus_item["text"]))


# docid -> "title text" for every corpus passage, so hard negatives can be
# resolved by id in the recall loop.
corpus_dict = dict((row["docid"], get_text(row)) for row in corpus["train"])

In [18]:
# Running totals over all dev queries: labelled positives, and positives that
# were ranked inside the top `miracle_n_recall` candidates.
n_total_pos = 0
n_total_tp = 0

for item in ds:
    # The query is embedded on its own as a one-element batch.
    query_emb = model.encode([query_prefix + item["query"]])["dense_vecs"]

    # Labelled positives always occupy the first slots of the candidate list.
    positives = item["positive_passages"]
    positive_docids = [passage["docid"] for passage in positives]
    positive_texts = list(map(get_text, positives))
    n_pos = len(positive_docids)

    # Take the first `miracle_n_hard_negs` mined negatives, then discard any
    # that are labelled positive, so fewer than that many may remain.
    positive_lookup = set(positive_docids)
    hn_docids = [
        docid
        for docid in hn[item["query_id"]]["docids"][:miracle_n_hard_negs]
        if docid not in positive_lookup
    ]

    # Candidate pool: positives first, surviving hard negatives after.
    target_docids = positive_docids + hn_docids
    target_texts = positive_texts + [corpus_dict[docid] for docid in hn_docids]

    target_embs = model.encode(
        [passage_prefix + text for text in target_texts], batch_size=32
    )["dense_vecs"]

    # Rank candidates by ascending cosine distance and keep the closest ones.
    ranking = np.argsort(cdist(query_emb, target_embs, metric="cosine"))[0]
    topk_indices = ranking[:miracle_n_recall]

    # Positives sit at indices [0, n_pos), so any top-k index below n_pos is a hit.
    n_tp = sum(1 for idx in topk_indices if idx < n_pos)

    n_total_pos += n_pos
    n_total_tp += n_tp

# Recall over the whole dev set: retrieved positives / all positives.
miracl_recall = n_total_tp / n_total_pos

n_total_pos, n_total_tp, miracl_recall

encoding: 100%|██████████| 1/1 [00:25<00:00, 25.56s/it]
encoding: 100%|██████████| 10/10 [00:29<00:00,  2.94s/it]
encoding: 100%|██████████| 1/1 [00:24<00:00, 24.07s/it]
encoding: 100%|██████████| 10/10 [00:27<00:00,  2.73s/it]
encoding: 100%|██████████| 1/1 [00:23<00:00, 23.80s/it]
encoding: 100%|██████████| 10/10 [00:25<00:00,  2.56s/it]
encoding: 100%|██████████| 1/1 [00:23<00:00, 23.49s/it]
encoding: 100%|██████████| 10/10 [00:29<00:00,  2.92s/it]
encoding: 100%|██████████| 1/1 [00:24<00:00, 24.88s/it]
encoding: 100%|██████████| 10/10 [00:28<00:00,  2.87s/it]
encoding: 100%|██████████| 1/1 [00:24<00:00, 24.34s/it]
encoding: 100%|██████████| 10/10 [00:28<00:00,  2.89s/it]
encoding: 100%|██████████| 1/1 [00:23<00:00, 23.83s/it]
encoding: 100%|██████████| 10/10 [00:28<00:00,  2.88s/it]
encoding: 100%|██████████| 1/1 [00:25<00:00, 25.08s/it]
encoding: 100%|██████████| 10/10 [00:28<00:00,  2.83s/it]
encoding: 100%|██████████| 1/1 [00:24<00:00, 24.36s/it]
encoding: 100%|██████████| 10/10

(1790, 1629, 0.9100558659217877)

# Output

In [19]:
# Model id followed by the JSTS, JSICK and MIRACL scores computed above.
model_id, jsts_score, jsick_score, miracl_recall

('BAAI/bge-m3', 0.8023288886438221, 0.7983146503052837, 0.9100558659217877)

In [21]:
import json
from pathlib import Path

# One JSON file per model; "/" in the model id is replaced with "_" so the id
# can be used as a file name.
scores_path = Path("./scores") / f'{model_id.replace("/", "_")}.txt'

# Create the output directory on first use; without this a fresh checkout
# fails with FileNotFoundError when the file is opened.
scores_path.parent.mkdir(parents=True, exist_ok=True)

with scores_path.open("w", encoding="utf-8") as f:
    json.dump(
        {
            "model_id": model_id,
            "jsts": jsts_score,
            "jsick": jsick_score,
            "miracl": miracl_recall,
        },
        f,
    )