In [1]:
## Basic indexing and search

# pip install ragatouille
# pip install rich
# Source: https://github.com/bclavie/RAGatouille/blob/main/examples/01-basic_indexing_and_search.ipynb

In [31]:
from ragatouille import RAGPretrainedModel, RAGTrainer
from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter
from rich import inspect as rinspect

# Name of the pretrained ColBERT checkpoint handed to from_pretrained below.
model_name = "colbert-ir/colbertv2.0"
# RAG is the notebook-wide model handle used by the indexing and search cells.
RAG = RAGPretrainedModel.from_pretrained(model_name) # load a ColBERT pretrained model from a checkpoint



In [11]:
# Fetch a Wikipedia page by its title using the requests module and return the page's full text content as a raw string
import requests

def get_wikipedia_page(title, timeout=10):
    """Fetch the plain-text content of an English Wikipedia page.

    Sends a ``query`` / ``extracts`` request (with ``explaintext``) to the
    English Wikipedia API for ``title`` and returns the extract of the first
    page found in the response.

    Args:
        title: Title of the Wikipedia page, e.g. ``"Hayao_Miyazaki"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The page text as a string, or ``""`` when the response contains no
        extract (page without an extract, empty page map, or a payload that
        has no ``query`` section).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = dict(action="query", prop="extracts", format="json", titles=title, explaintext=True)
    headers = {"User-Agent": "RAG example (manisnesan@users.noreply.github.com)"}
    r = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures here instead of as an obscure JSON/Key error below.
    r.raise_for_status()
    data = r.json()
    # Error payloads carry no "query" section; treat them like "no extract".
    pages = data.get("query", {}).get("pages", {})
    # The page map is keyed by page id; only one title is requested, so take the first entry.
    page = next(iter(pages.values()), {})
    return page.get("extract", "")

# Page title to fetch; the same value is reused further down as the index name.
title = "Hayao_Miyazaki"
# Download the article text as a single string.
full_doc = get_wikipedia_page(title)
# Sanity check: report the character count of the fetched text.
print(f"Length of the doc : {len(full_doc)}")

Length of the doc : 45093


In [18]:
# Index the document as a one-item collection, using the page title as the index name.
# max_document_length=180 asks for the document to be split into chunks of 180 tokens;
# the indexing log below reports 81 passages encoded from this single document.
# NOTE(review): splitting is described as RAG's default behaviour; whether 180 is also the default chunk length is not shown here — confirm against the RAGatouille docs.
RAG.index(collection=[full_doc], index_name=title, max_document_length=180) 



[Jan 15, 20:14:44] #> Creating directory .ragatouille/colbert/indexes/Hayao_Miyazaki 


[Jan 15, 20:14:45] [0] 		 #> Encoding 81 passages..


100%|██████████| 2/2 [00:38<00:00, 19.43s/it]

[Jan 15, 20:15:24] [0] 		 avg_doclen_est = 129.9629669189453 	 len(local_sample) = 81
[Jan 15, 20:15:24] [0] 		 Creating 1,024 partitions.
[Jan 15, 20:15:24] [0] 		 *Estimated* 10,527 embeddings.
[Jan 15, 20:15:24] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/Hayao_Miyazaki/plan.json ..





Clustering 10001 points in 128D to 1024 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.00 s
  Iteration 19 (2.47 s, search 2.38 s): objective=2113.62 imbalance=1.408 nsplit=0       
[0.037, 0.039, 0.038, 0.033, 0.033, 0.036, 0.034, 0.036, 0.034, 0.032, 0.03, 0.036, 0.034, 0.037, 0.034, 0.034, 0.033, 0.032, 0.032, 0.036, 0.035, 0.033, 0.035, 0.037, 0.034, 0.033, 0.037, 0.034, 0.035, 0.037, 0.034, 0.034, 0.039, 0.034, 0.033, 0.033, 0.034, 0.034, 0.035, 0.04, 0.036, 0.034, 0.032, 0.034, 0.036, 0.032, 0.035, 0.036, 0.037, 0.033, 0.033, 0.034, 0.033, 0.038, 0.035, 0.035, 0.037, 0.036, 0.038, 0.03, 0.032, 0.034, 0.033, 0.034, 0.038, 0.035, 0.034, 0.035, 0.032, 0.032, 0.036, 0.034, 0.033, 0.033, 0.035, 0.034, 0.034, 0.037, 0.034, 0.034, 0.035, 0.039, 0.032, 0.039, 0.029, 0.034, 0.037, 0.036, 0.034, 0.041, 0.035, 0.035, 0.034, 0.036, 0.034, 0.036, 0.038, 0.034, 0.036, 0.034, 0.039, 0.037, 0.035, 0.034, 0.038, 0.034, 0.037, 0.032, 0.034, 0.034, 0.035, 0.035, 0.036, 0.031, 0.034, 0.0

0it [00:00, ?it/s]

[Jan 15, 20:15:27] [0] 		 #> Encoding 81 passages..


100%|██████████| 2/2 [00:37<00:00, 18.97s/it]
1it [00:38, 38.21s/it]
100%|██████████| 1/1 [00:00<00:00, 688.15it/s]

[Jan 15, 20:16:05] #> Optimizing IVF to store map from centroids to list of pids..
[Jan 15, 20:16:05] #> Building the emb2pid mapping..
[Jan 15, 20:16:05] len(emb2pid) = 10527



100%|██████████| 1024/1024 [00:00<00:00, 29982.53it/s]


[Jan 15, 20:16:05] #> Saved optimized IVF to .ragatouille/colbert/indexes/Hayao_Miyazaki/ivf.pid.pt
Done indexing!


In [23]:
# Number of top-ranked passages to request; reused by the later search cells.
k = 3
# Query the index by name; each hit in the output is a dict with 'content', 'score' and 'rank'.
results = RAG.search(query="What animation studio did Miyazaki found?", k=k, index_name=title)
# Bare last expression so the notebook displays the hits.
results

New index_name received! Updating current index_name (Hayao_Miyazaki) to Hayao_Miyazaki
Loading searcher for index Hayao_Miyazaki for the first time... This may take a few seconds
[Jan 15, 20:22:09] #> Loading codec...
[Jan 15, 20:22:09] #> Loading IVF...
[Jan 15, 20:22:10] Loading segmented_lookup_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Jan 15, 20:23:02] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1249.79it/s]

[Jan 15, 20:23:02] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 113.34it/s]

[Jan 15, 20:23:02] Loading filter_pids_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...





[Jan 15, 20:23:51] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What animation studio did Miyazaki found?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  7284,  2996,  2106,  2771,  3148, 18637,  2179,
         1029,   102,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])





[{'content': 'In April 1984, Miyazaki opened his own office in Suginami Ward, naming it Nibariki.\n\n\n=== Studio Ghibli ===\n\n\n==== Early films (1985–1996) ====\nIn June 1985, Miyazaki, Takahata, Tokuma and Suzuki founded the animation production company Studio Ghibli, with funding from Tokuma Shoten. Studio Ghibli\'s first film, Laputa: Castle in the Sky (1986), employed the same production crew of Nausicaä. Miyazaki\'s designs for the film\'s setting were inspired by Greek architecture and "European urbanistic templates".',
  'score': 25.90499496459961,
  'rank': 1},
 {'content': 'Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A co-founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.\nBorn in Tokyo City in the E

In [25]:
# Same index and k as the previous search, with a broader biographical query; overwrites `results`.
results = RAG.search(query="Who is Miyazaki?", k=k, index_name=title)
# Bare last expression so the notebook displays the hits.
results



[{'content': 'Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A co-founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.\nBorn in Tokyo City in the Empire of Japan, Miyazaki expressed interest in manga and animation from an early age, and he joined Toei Animation in 1963. During his early years at Toei Animation he worked as an in-between artist and later collaborated with director Isao Takahata.',
  'score': 28.410186767578125,
  'rank': 1},
 {'content': 'Paste\'s Toussaint Egan described Miyazaki as "one of anime\'s great auteurs", whose "stories of such singular thematic vision and unmistakable aesthetic" captured viewers otherwise unfamiliar with anime. Miyazaki became the subject of an exhibit at the Academy Museu

In [27]:
# Same index and k again, this time asking about contributions; overwrites `results`.
results = RAG.search(query="What are Miyazaki contributions?", k=k, index_name=title)
# Bare last expression so the notebook displays the hits.
results




[{'content': "The protagonists of his films are often strong girls or young women, and several of his films present morally ambiguous antagonists with redeeming qualities. Miyazaki's works have been highly praised and awarded; he was named a Person of Cultural Merit for outstanding cultural contributions in November 2012, and received the Academy Honorary Award for his impact on animation and cinema in November 2014. Miyazaki has frequently been cited as an inspiration for numerous animators, directors, and writers.\n\n\n== Early life ==\nHayao Miyazaki was born on January 5, 1941, in Tokyo City, Empire of Japan, the second of four sons.",
  'score': 24.584861755371094,
  'rank': 1},
 {'content': "During his early years at Toei Animation he worked as an in-between artist and later collaborated with director Isao Takahata. Notable films to which Miyazaki contributed at Toei include Doggie March and Gulliver's Travels Beyond the Moon. He provided key animation to other films at Toei, suc

In [29]:
# load an already created index
path_to_index=f".ragatouille/colbert/indexes/{title}/"
RAG = RAGPretrainedModel.from_index(path_to_index)



In [22]:
# Exploration aid: inspect RAG.search with rich's inspect (imported as rinspect).
rinspect(RAG.search) 

In [17]:
# Exploration aid: inspect RAG.index with rich's inspect (imported as rinspect).
rinspect(RAG.index)

In [14]:
# Exploration aid: inspect the RAG object itself, with methods, docs and private members requested via the keyword flags.
rinspect(RAG, methods=True, docs=True, private=True)