In [2]:
import os
import sys
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from collections import defaultdict
from transformers import AutoTokenizer, BasicTokenizer, AutoModelForMaskedLM
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval

In [3]:
def word_subword_ratio(corpus, tks):
    nume = 0
    denom = 0
    for cid, d_text in tqdm(corpus.items()):
        text = d_text["title"] + " " + d_text["text"]
        t_words = tks["space"].tokenize(text)
        t_subs_org = tks["splade"].tokenize(text)
        word_subword_splade = word_subword_alignment(t_subs_org, t_words)
        t_words = tks["space"].tokenize(text)
        t_subs_62783 = tks["splade-addv"].tokenize(text)
        word_subword_splade_62783 = word_subword_alignment(t_subs_62783, t_words)
        nume += len(word_subword_splade_62783)
        denom += len(word_subword_splade)
    print(nume, denom, (denom - nume) / denom)

In [4]:
def word_subword_alignment(t_subs, t_words):
    word_subword = defaultdict(list)
    if not t_subs:
        return word_subword
    t_sub = t_subs.pop(0)
    while len(t_subs) and len(t_words):
        t_word = t_words.pop(0)
        while t_sub.strip("#") in t_word and len(t_words) and len(t_subs):
            if t_sub != t_word:
                word_subword[t_word].append(t_sub)
            t_sub = t_subs.pop(0)
            
    del_key = []
    for word, subs in word_subword.items():
        if subs[0][0] != word[0]:
            del_key.append(word)
            
    for word in del_key:
        del word_subword[word]
    return word_subword

# science

In [5]:
tks = {"splade-addv": AutoTokenizer.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/tokenizer/raw/remove/62783/"),
       "splade": AutoTokenizer.from_pretrained("bert-base-uncased"),
      "space": BasicTokenizer()}

In [17]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/scidocs"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/25657 [00:00<?, ?it/s]

  0%|                                                                                                                          | 0/25657 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25657/25657 [01:59<00:00, 214.03it/s]

92804 262696 0.646724731248287





In [10]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/scifact"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|                                                                                                                           | 0/5183 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:29<00:00, 176.96it/s]

22911 105263 0.7823451735177602





# bio

In [18]:
tks = {"splade-addv": AutoTokenizer.from_pretrained("/home/gaia_data/iida.h/BEIR/model/pubmed_abst/bert-base-uncased/tokenizer/raw/remove/71694//"),
       "splade": AutoTokenizer.from_pretrained("bert-base-uncased"),
      "space": BasicTokenizer()}

In [19]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/nfcorpus"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|▍                                                                                                                | 16/3633 [00:00<00:23, 153.56it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3633/3633 [00:22<00:00, 160.60it/s]

13688 64267 0.7870135528342695





In [28]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/trec-covid"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/171332 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 171332/171332 [12:28<00:00, 229.00it/s]

1968846 2370710 0.16951208709627075





In [6]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/bioask"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/14914714 [00:00<?, ?it/s]

  0%|                                                                                                                         | 0/14914604 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 14914604/14914604 [24:50:43<00:00, 166.75it/s]

73236481 274168469 0.732877813166765





# finnance

In [21]:
tks = {"splade-addv": AutoTokenizer.from_pretrained("/home/gaia_data/iida.h/BEIR/model/TRC2/bert-base-uncased/tokenizer/raw/remove/56543///"),
       "splade": AutoTokenizer.from_pretrained("bert-base-uncased"),
      "space": BasicTokenizer()}

In [27]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/fiqa"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
word_subword_ratio(corpus, tks)

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|                                                                                                                          | 0/57638 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57638/57638 [02:55<00:00, 328.46it/s]

174061 275820 0.3689326372271771



