# Uploading indices to Hugging Face

In [1]:
from baguetter.indices import BMXSparseIndex
from baguetter.evaluation import HFDataset

## 1. Create index and load dataset

In [2]:
# Start from an empty BMX sparse index; documents are added to it in the next step.
index = BMXSparseIndex()

# Load the SciDocs dataset and pull out the corpus (ids + texts) and the queries.
ds = HFDataset("mteb/scidocs", "corpus")
doc_ids, docs = ds.get_corpus()
# Bind the first element to a real name rather than `_`: in IPython, assigning `_`
# shadows the "last output" variable and stops it from tracking cell results.
# NOTE(review): presumably these are the query ids — confirm against HFDataset.get_queries.
query_ids, queries = ds.get_queries()

## 2. Add documents to index

In [3]:
# Index the whole corpus; show_progress=True is what prints the progress bars below.
# The call returns a BMXSparseIndex (presumably the index itself — verify), and because
# it is the last expression in the cell its repr is echoed as the cell output.
index.add_many(doc_ids, docs, show_progress=True)

Tokenization: 100%|██████████| 25657/25657 [00:08<00:00, 2902.54it/s]
Building doc-term matrix: 100%|██████████| 25657/25657 [00:00<00:00, 47972.01it/s]
Building inverted index: 100%|██████████| 61627/61627 [00:04<00:00, 14397.29it/s]


<baguetter.indices.sparse.bmx.BMXSparseIndex at 0x31737e650>

## 3. Save index to Hugging Face

In [4]:
# Upload the built index to the "mixedbread-ai/baguetter" repo under the name "bmx_scidocs".
# The returned string (shown as the cell output) is the location it was stored at.
# NOTE(review): presumably requires Hugging Face credentials with write access to that
# repo — substitute a repo you own when following along.
index.push_to_hub("mixedbread-ai/baguetter", "bmx_scidocs")

No files have been modified since last commit. Skipping to prevent empty commit.


'datasets/mixedbread-ai/baguetter/bmx_scidocs'

## 4. Load index from Hugging Face

In [5]:
# Restore the saved index from the Hub. The loader is called on the class rather than
# on the already-populated `index`, which makes it clear that no pre-built index is
# needed to load one (e.g. in a fresh session that skipped steps 2-3).
idx = BMXSparseIndex.load_from_hub("mixedbread-ai/baguetter", "bmx_scidocs")

## 5. Use index

In [6]:
# Run the first dataset query against the index loaded from the Hub.
# The result is a SearchResults object whose `keys` are the matching document ids;
# the echoed output is long because every returned key is printed.
idx.search(queries[0])

SearchResults(keys=['86e87db2dab958f1bd5877dc7d5b8105d6e31e46', 'cd31ecb3b58d1ec0d8b6e196bddb71dd6a921b6d', '2a43d3905699927ace64e880fe9ba8a730e14be1', 'eef39364df06eb9933d2fc41a0f13eea17113c58', '19c90b3c0c0d94e8235731a057cc6377c46482ee', '768b18d745639fcfb157fe16cbd957ca60ebfc2e', 'f2ab0a2aa4177dd267c3c6cc37c7ad0e33c2cdbf', 'd504a72e40ecee5c2e721629e7368a959b18c681', 'd1d120bc98e536dd33e37c876aaba57e584d252e', 'e2890afe42e64b910609e7554130d6a81427e02a', '829033fd070c6ed30d28a21187e0def25a3e809f', '0948365ef39ef153e61e9569ade541cf881c7c2a', '4a4cea4421ff0be7bcc06e92179cd2d5f1102ff8', '745b88eb437eb59e2a58fe378d287702a6b0d985', '1f009366a901c403a8aad65c94ec2fecf3428081', '26880494f79ae1e35ffee7f055cb0ad5693060c2', '432143ab67c05f42c918c4ed6fd9412d26e659be', '53f3edfeb22de82c7a4b4a02209d296526eee38c', 'a16dc6af67ef9746068c63a56a580cb3b2a83e9c', '2eafdb47aa9b5b510f7fcb113b22e6ab7c79d143', '0a202f1dfc6991a6a204eaa5e6b46d6223a4d98a', '6307f94aefdc7268c27e3af8fc04f090bc1b18bb', 'e90dd4a2750