# Tokenization before adding to index

This notebook shows how to add already-tokenized documents to an index, which is especially useful when the tokenization happens outside of Baguetter.

In [1]:
from baguetter.indices import BM25SparseIndex, BMXSparseIndex
from baguetter.indices.sparse.text_preprocessor import TextPreprocessor
from baguetter.evaluation.datasets import HFDataset

## 1. Setting Up the Indices

In [None]:
# Quora dataset ("mteb/quora"), loaded via HFDataset
ds = HFDataset("mteb/quora")

# One instance of each sparse index, constructed with no arguments
bm25, bmx = BM25SparseIndex(), BMXSparseIndex()

# Standalone preprocessor with lowercasing and punctuation removal switched off;
# it is used to tokenize the corpus before anything is handed to the indices
preprocessor = TextPreprocessor(do_lowercasing=False, do_punctuation_removal=False)

## 2. Processing the Corpus

In [2]:
# Unpack the corpus into document ids and the corresponding documents
doc_ids, docs = ds.get_corpus()

# Tokenize every document up front with the standalone preprocessor
tokens = preprocessor.process_many(
    docs,
    show_progress=True,
    n_workers=8,
)

Tokenization: 100%|██████████| 522931/522931 [00:05<00:00, 90891.49it/s] 


## 3. Adding to Indices

In [3]:
# Add to BM25 index: `tokens` comes from the preprocessor call earlier in the notebook
# and is passed alongside the matching `doc_ids`
bm25.add_many(doc_ids, tokens, show_progress=True)

# Add to BMX index, reusing the same ids and tokens.
# NOTE: as the last expression of the cell, the value returned by this call is
# what the notebook displays below the progress bars.
bmx.add_many(doc_ids, tokens, show_progress=True)

Calculating Unique Tokens: 100%|██████████| 522931/522931 [00:00<00:00, 1300109.53it/s]
Converting tokens to token IDs: 100%|██████████| 522931/522931 [00:01<00:00, 315091.84it/s]
Counting Tokens: 100%|██████████| 522931/522931 [00:00<00:00, 660249.67it/s]
Computing IDF: 100%|██████████| 165517/165517 [00:00<00:00, 2012781.93it/s]
Computing BM25 Scores: 100%|██████████| 522931/522931 [00:06<00:00, 80429.92it/s]
Building TDF matrix: 100%|██████████| 522931/522931 [00:01<00:00, 327959.37it/s]
Building inverted index: 100%|██████████| 165517/165517 [00:26<00:00, 6273.12it/s]


<baguetter.indices.sparse.bmx.BMXSparseIndex at 0x15534aeaf9b0>

In [4]:
# Run the same query against both indices (BM25 first, then BMX) and print each result
for index in (bm25, bmx):
    print(index.search("Can I recover my email if I forgot the password?"))

SearchResults(keys=['17231', '44565', '209', '10382', '71880', '12017', '7551', '142823', '69513', '23798', '110098', '49132', '481825', '197604', '83187', '55451', '71879', '506563', '161919', '351727', '154537', '33072', '160488', '223589', '179669', '368945', '268427', '442705', '354266', '40270', '118', '192710', '220445', '183131', '266929', '70187', '82890', '26729', '138441', '277622', '228976', '199885', '140080', '6261', '268428', '433845', '310314', '54879', '210', '140883', '229747', '37866', '286905', '472157', '26334', '66653', '506725', '398335', '391587', '142824', '330675', '286951', '308683', '24288', '317824', '242895', '249474', '53587', '24289', '188604', '531846', '218402', '260595', '381065', '427350', '272355', '98948', '42709', '8432', '135229', '58673', '296680', '451829', '339911', '310315', '37865', '17928', '103498', '178704', '15320', '221187', '244833', '304570', '32118', '281261', '523810', '248545', '8387', '397810', '90174'], scores=array([9.671901 , 9.