## The MVP (minimum viable) pipeline for RAG
 - This is a *very* simple pipeline built from simple components.
 - Uses Hugging Face models.
 - Uses `sentence-transformers` for embedding.

In [1]:
from sentence_transformers import SentenceTransformer
from wikipediaapi import Wikipedia
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the sentence-embedding model that the rest of the notebook uses to
# encode both the article paragraphs and the query.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally — confirm the repository is trusted.
EMBED_MODEL_NAME = "Alibaba-NLP/gte-base-en-v1.5"
model = SentenceTransformer(EMBED_MODEL_NAME, trust_remote_code=True)

In [3]:
# Fetch the plain text of the Wikipedia article that serves as the corpus.
# Arguments to Wikipedia(): the User-Agent string, then the language edition.
wiki = Wikipedia('RAGBot/0.0', 'en')
page_title = "The Laws of Cricket"
page = wiki.page(page_title)

# Fail loudly on a missing or mistyped title rather than letting empty text
# flow into the splitting and embedding steps below.
if not page.exists():
    raise ValueError(f"Wikipedia page not found: {page_title!r}")

doc = page.text

In [4]:
# Break the article into paragraph-sized chunks on blank lines. Chunks that
# are empty or whitespace-only are dropped so they are never embedded and can
# never be returned as a retrieval result.
paragraphs = [chunk for chunk in doc.split("\n\n") if chunk.strip()]
print(len(paragraphs))

25


In [5]:
# The same normalize_embeddings=True setting is applied to the paragraphs and
# to the query, so both are encoded under identical options.
encode_options = {"normalize_embeddings": True}

# One embedding per paragraph of the article.
docs_embed = model.encode(paragraphs, **encode_options)

# The question to answer, embedded with the same model and options.
query = "What are the most prominent laws of cricket?"
query_embed = model.encode(query, **encode_options)

In [6]:
# Score every paragraph against the query with a dot product of embeddings.
similarities = docs_embed.dot(query_embed.T)
print(similarities.shape)

# Sorting the negated scores puts the highest-scoring paragraphs first;
# keep the first three positions as a plain Python list of indices.
ranking = np.argsort(-similarities)
top3_index = ranking[:3].tolist()

# Look up the paragraph text for each selected index, best match first.
most_similar_documents = [paragraphs[position] for position in top3_index]

(25,)


In [7]:
# Display the three retrieved paragraphs, highest similarity first.
most_similar_documents

['The Laws of Cricket is a code that specifies the rules of the game of cricket worldwide. The earliest known code was drafted in 1744. Since 1788, the code has been owned and maintained by the private Marylebone Cricket Club (MCC) in Lord\'s Cricket Ground, London. There are currently 42 Laws (always written with a capital "L"), which describe all aspects of how the game is to be played. MCC has re-coded the Laws six times, each with interim revisions that produce more than one edition. The most recent code, the seventh, was released in October 2017; its 3rd edition came into force on 1 October 2022. \nFormerly cricket\'s official governing body, the MCC has handed that role to the International Cricket Council (ICC). But MCC retains copyright of the Laws and remains the only body that may change them, although usually this is only done after close consultation with the ICC and other interested parties such as the Association of Cricket Umpires and Scorers. \nCricket is one of the few