In [None]:
%pip install --upgrade google-cloud-aiplatform


# Setup

In [6]:
from google.colab import auth

# GCP project id, reused below when initialising Vertex AI.
# Replace the placeholder with a real project id before running.
PROJECT_ID = "<PUT YOUR GCP PJ>"
# Authenticate the Colab user for that project.
auth.authenticate_user(project_id=PROJECT_ID)


In [29]:
import vertexai

# Initialise the Vertex AI SDK for the project configured above, in the us-central1 location.
vertexai.init(project=PROJECT_ID, location='us-central1')


# Model

In [30]:
from vertexai.language_models import TextEmbeddingModel
import numpy as np

# Reference for the available text-embedding model versions:
# https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings#model_versions
# Module-level embedding model; get_embeddings() below calls it directly.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")

def get_embeddings(sentences: list[str]) -> np.ndarray:
    """Embed ``sentences`` with the module-level ``model`` in a single request.

    Args:
        sentences: Texts to embed; passed to ``model.get_embeddings`` as-is.

    Returns:
        An array built from the ``values`` of each returned embedding, in the
        order the model returned them (presumably one row per input sentence
        -- TODO confirm against the SDK).
    """
    # auto_truncate=False is forwarded unchanged; presumably this makes
    # over-long inputs fail instead of being silently cut -- verify in SDK docs.
    response = model.get_embeddings(sentences, auto_truncate=False)

    vectors = []
    for item in response:
        vectors.append(item.values)
    return np.array(vectors)

def batch_process_embeddings(sentences: list[str], batch_size: int = 5) -> np.ndarray:
    """Embed ``sentences`` in consecutive batches and stack the results.

    Args:
        sentences: Texts to embed.
        batch_size: Maximum number of sentences sent to ``get_embeddings`` per
            call. Must be positive.

    Returns:
        The per-batch arrays from ``get_embeddings`` stacked vertically, in
        input order. For empty input an empty array of shape ``(0, 0)`` is
        returned (the embedding width is unknown without calling the model).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early with a clear message instead of an unrelated range()/vstack error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # np.vstack([]) raises ValueError, so handle "nothing to embed" explicitly
    # and avoid any model call.
    if len(sentences) == 0:
        return np.empty((0, 0))

    all_embeddings = [
        get_embeddings(sentences[start:start + batch_size])
        for start in range(0, len(sentences), batch_size)
    ]
    return np.vstack(all_embeddings)


# JSTS

In [None]:
import json
import pandas as pd
from urllib.request import urlopen

jsts_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/datasets/jsts-v1.1/valid-v1.1.json"

# Each line of the file is parsed as one JSON record.
# The context manager closes the HTTP response even if parsing fails;
# blank lines are skipped so they cannot break json.loads.
with urlopen(jsts_url) as response:
    records = [json.loads(line) for line in response if line.strip()]

df = pd.DataFrame(records)
df.head(1)


In [None]:
# Show the (rows, columns) dimensions of the JSTS frame loaded above.
df.shape


## Encode

In [None]:
import numpy as np

# Embed both sides of every JSTS pair; "sentence1" is encoded before "sentence2".
sentence1_embs, sentence2_embs = [
    batch_process_embeddings(df[column].tolist())
    for column in ("sentence1", "sentence2")
]
# Display both array shapes as a quick check.
sentence1_embs.shape, sentence2_embs.shape


## Correlation Score

In [32]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# scipy's `cosine` is a distance, so similarity is 1 - distance for each pair.
pair_similarities = []
for emb_a, emb_b in zip(sentence1_embs, sentence2_embs):
    pair_similarities.append(1 - cosine(emb_a, emb_b))
df["similarity"] = pair_similarities

# Spearman rank correlation between predicted similarity and the JSTS label;
# element 0 of the result is the correlation coefficient.
jsts_result = spearmanr(df["similarity"], df["label"])
jsts_result[0]


0.8006039095558688

# JSICK

In [None]:
# Load the JSICK test split (tab-separated) directly from GitHub.
# NOTE: this rebinds `df`, so the JSTS frame is no longer available after this cell.
df = pd.read_csv(
    "https://github.com/verypluming/JSICK/raw/main/jsick/test.tsv", sep="\t"
)
# Preview the first row.
df.head(1)


In [None]:
# Show the (rows, columns) dimensions of the JSICK frame loaded above.
df.shape


In [None]:
# Rows handled per outer chunk. Kept in one place so the loop stride and the
# slice bounds cannot drift apart.
CHUNK_SIZE = 2048

sentence1_embs = []
sentence2_embs = []

for chunk_start in range(0, len(df), CHUNK_SIZE):
    # Positional slice of the frame; the final chunk may be shorter than CHUNK_SIZE.
    chunk = df.iloc[chunk_start : chunk_start + CHUNK_SIZE]

    # Embed side A, then side B, for the rows in this chunk.
    cur_sentence1_embs = batch_process_embeddings(chunk["sentence_A_Ja"].tolist())
    cur_sentence2_embs = batch_process_embeddings(chunk["sentence_B_Ja"].tolist())

    # extend() appends the individual embedding rows of each chunk.
    sentence1_embs.extend(cur_sentence1_embs)
    sentence2_embs.extend(cur_sentence2_embs)

# Convert the accumulated rows into arrays for the correlation step.
sentence1_embs = np.array(sentence1_embs)
sentence2_embs = np.array(sentence2_embs)

sentence1_embs.shape, sentence2_embs.shape


## Correlation Score

In [40]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# scipy's `cosine` is a distance, so similarity is 1 - distance for each pair.
pair_similarities = []
for emb_a, emb_b in zip(sentence1_embs, sentence2_embs):
    pair_similarities.append(1 - cosine(emb_a, emb_b))
df["similarity"] = pair_similarities

# Spearman rank correlation between predicted similarity and the JSICK
# relatedness score; element 0 of the result is the correlation coefficient.
jsick_result = spearmanr(df["similarity"], df["relatedness_score_Ja"])
jsick_result[0]


0.803561121302977

# MIRACL

In [39]:
# Placeholder cell: no MIRACL evaluation is run in this notebook.
None


# Output

In [41]:
# Summary of this run. The two scores match the correlation outputs shown
# in the JSTS and JSICK sections.
model_id = "textembedding-gecko-multilingual@001"
jsts_score, jsick_score = 0.8006039095558688, 0.803561121302977
# MIRACL was skipped because of its cost, so no recall is recorded.
miracl_recall = None
(model_id, jsts_score, jsick_score, miracl_recall)


('textembedding-gecko-multilingual@001',
 0.8006039095558688,
 0.803561121302977,
 None)

In [None]:
import json

# Persist the scores as JSON under ./scores/, one file per model id.
from pathlib import Path

scores_dir = Path("./scores")
# A fresh runtime has no ./scores directory; create it so open() cannot fail
# with FileNotFoundError.
scores_dir.mkdir(parents=True, exist_ok=True)

# "/" in a model id would be treated as a path separator, so it is replaced.
scores_path = scores_dir / f'{model_id.replace("/", "_")}.txt'
with open(scores_path, "w", encoding="utf-8") as f:
    # json.dump streams the same text that f.write(json.dumps(...)) produced.
    json.dump(
        {
            "model_id": model_id,
            "jsts": jsts_score,
            "jsick": jsick_score,
            "miracl": miracl_recall,
        },
        f,
    )
