In [None]:
!pip install ragatouille faiss-gpu langchain_community

In [None]:
from ragatouille import RAGPretrainedModel
# Load the pretrained "colbert-ir/colbertv2.0" checkpoint through RAGatouille.
# The resulting RAG object is reused by the later cells for indexing and for
# building the LangChain retrievers.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

In [None]:
import requests

def get_wikipedia_page(title: str, timeout: float = 10.0) -> "str | None":
    """
    Retrieve the plain-text content of an English Wikipedia page.

    Sends one ``action=query`` / ``prop=extracts`` request to the MediaWiki API
    and returns the extract of the first page found in the response.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the HTTP request before giving up.
    :return: str | None - Page text as a raw string, or None when the response
        holds no page entry or the page entry has no "extract" field.
    :raises requests.HTTPError: if the API answers with an error status code.
    :raises requests.RequestException: on connection failures or timeouts.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures here instead of trying to parse an error body as a page.
    response.raise_for_status()
    data = response.json()

    # An API-level error payload has no "query"/"pages" keys; treat that the same
    # as "no content" rather than raising KeyError / StopIteration.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

full_document = get_wikipedia_page("Hayao_Miyazaki")

# get_wikipedia_page returns None when the response carries no extract. Stop here
# with a clear message rather than passing an unusable document to the
# indexing and text-splitting cells.
if not full_document:
    raise ValueError("No text was retrieved for the Wikipedia page 'Hayao_Miyazaki'")

In [None]:
# Show only the first 1,000 characters so the cell output stays small;
# a missing (None) or empty document is displayed unchanged.
full_document[:1000] if full_document else full_document

In [None]:
# Index the whole article as a single-item collection under the name "test",
# asking RAGatouille to split it into passages itself.
# NOTE(review): max_document_length is presumably a token count - confirm
# against the RAGatouille documentation.
RAG.index(
    index_name="test",
    collection=[full_document],
    max_document_length=100,
    split_documents=True,
)

# Expose the model as a LangChain retriever, requesting k=5 results per query.
retriever_advanced = RAG.as_langchain_retriever(k=5)

In [None]:
# Ask a question through the retriever created after building the "test" index
# and display what comes back.
# NOTE(review): the question keeps the underscore from the Wikipedia page title
# ("Hayao_Miyazaki") - confirm that this is intended for a natural-language query.
results = retriever_advanced.invoke("Who was Hayao_Miyazaki?")
results

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split the article manually this time: chunk sizes are measured with `len`
# (i.e. in characters), with at most 500 per chunk and a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=500,
)

# List of text chunks produced from the full article.
chunks = text_splitter.split_text(full_document)

In [None]:
# Inspect only the first few chunks instead of dumping the whole list into the output.
chunks[:5]

In [None]:
# Build a second index, "new_test2", from the chunks produced by the text splitter.
# The collection is already split, so RAGatouille's own splitting is switched off.
RAG.index(
    index_name="new_test2",
    collection=chunks,
    split_documents=False,
)

# Rebind retriever_advanced to a retriever created after the new index was built
# (again requesting k=5 results per query).
retriever_advanced = RAG.as_langchain_retriever(k=5)

In [None]:
# Run the same question through the retriever created after indexing the
# pre-split chunks ("new_test2") and display the results - presumably to compare
# them with the results from the "test" index.
results = retriever_advanced.invoke("Who was Hayao_Miyazaki?")
results