In [1]:
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
small = "stexpanded"
big = "memoryalpha"
embeddings = "dogtag_bgelarge"

In [3]:
mappings_file_small = "./_input/mappings/" + small + ".json"
mappings_file_big = "./_input/mappings/" + big + ".json"

dogtags_small_file = "./_input/dogtags/" + small + ".json"
dogtags_big_file = "./_input/dogtags/" + big + ".json"

exact_match_file = "./_input/exact_match/" + small + "-" + big + ".json"

output_file = "./_input/exact_match_similarity/" + small + "-" + big + ".txt"

In [4]:
with open(mappings_file_small) as file:
    mappings_small = {str(v): k for k, v in json.load(file).items()}
    mappings_small_reversed = {v: k for k, v in mappings_small.items()}

with open(mappings_file_big) as file:
    mappings_big = {str(v): k for k, v in json.load(file).items()}
    mappings_big_reversed = {v: k for k, v in mappings_big.items()}

with open(dogtags_small_file) as df:
    dogtags_small = json.load(df)

with open(dogtags_big_file) as df:
    dogtags_big = json.load(df)

with open(exact_match_file) as file:
    exact_match = json.load(file)

In [5]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [6]:
with open(output_file, "w") as file:

    # string_pairs = list()
    #
    # for item in exact_match:
    #     left_dogtag = dogtags_small[mappings_small[str(item[0])]]
    #     right_dogtag = dogtags_big[mappings_big[str(item[1])]]
    #     string_pairs.append((left_dogtag, right_dogtag))
    #     break

    string_pairs = [("asdasd", "qweqwe")]

    batch = tokenizer(
        [str(q) for q, d in string_pairs],
        [str(d) for q, d in string_pairs],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**batch)
        scores = torch.softmax(outputs.logits, dim=1)[:, 0]

    similarities = scores.tolist()

    for i, ((q, d), score) in enumerate(zip(string_pairs, similarities)):
        print(f"Pair {i+1}:")
        print(f"Q: {q}")
        print(f"D: {d}")
        print(f"Similarity Score: {score:.4f}\n")

Pair 1:
Q: asdasd
D: qweqwe
Similarity Score: 1.0000

