In [1]:
import dspy
import fitz
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")

In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_file:
        for page_num in range(len(pdf_file)):
            page = pdf_file.load_page(page_num)
            text += page.get_text()

    return text

In [4]:
def chunk_text(text, chunk_size=512, overlap=30):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks

In [5]:
text1 = extract_text_from_pdf("1.pdf")
text2 = extract_text_from_pdf("2.pdf")
text3 = extract_text_from_pdf("3.pdf")

In [6]:
corpus = text1 + text2 + text3
chunked_docs = chunk_text(corpus)

In [7]:
class InMemoryRetrievalModel:
    def __init__(self, model_name, documents):
        """
        Initialize the retrieval model by creating and populating the FAISS index.

        :param model_name: Name of the pre-trained model to use for embeddings.
        :param documents: List of documents to index.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.index = self._create_and_populate_index(documents)
    
    def _get_embeddings(self, texts):
        """
        Generate embeddings for a list of texts.

        :param texts: List of strings.
        :return: NumPy array of embeddings.
        """
        # Tokenize the texts and pad & truncate to the same length
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        return embeddings
    
    def _create_and_populate_index(self, documents):
        """
        Create a FAISS index and populate it with document embeddings.

        :param documents: List of documents to index.
        :return: Populated FAISS index.
        """
        # Generate embeddings for the documents
        embeddings = self._get_embeddings(documents)
        # Create a FAISS index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        # Add document embeddings to the index
        index.add(embeddings)
        return index
    
    def retrieve_documents(self, query, k=5):
        """
        Retrieve top-k documents relevant to the query.

        :param query: Query string.
        :param k: Number of top documents to retrieve.
        :return: Indices of the top-k relevant documents.
        """
        query_embedding = self._get_embeddings([query])
        _, indices = self.index.search(query_embedding, k)
        return indices[0]

In [8]:
retrieval_model = InMemoryRetrievalModel("BAAI/bge-base-en-v1.5", chunked_docs)

In [9]:
import dspy
from huggingface_hub import login

In [17]:
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

hf_token = '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'
login(token=hf_token)
llm = dspy.HFModel(model = 'google/gemma-2b')

dspy.settings.configure(lm=llm, rm=retrieval_model)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/joudy/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
class RAG(dspy.Module):
    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [12]:
example_query = "what is the most important thing about matrin heidegger?"

response = llm(example_query)
print(response)



KeyboardInterrupt: 

In [13]:
example_question1 = "Has there been Kaggle competition about cute animals and if yes, which one?"
example_answer1 = "You might be interested in the PetFinder.my Adoption Prediction competition."

example_question2 = "I'm interested in autonomous driving. What Kaggle competition can you recommend?"
example_answer2 ="You might be interested in the Lyft Motion Prediction for Autonomous Vehicles or Lyft 3D Object Detection for Autonomous Vehicles competition,"

example_question3 = "What Kaggle competitions should I look at to learn more about predicting the stock market?"
example_answer3 = "You might be interested in the JPX Tokyo Stock Exchange Prediction or Jane Street Market Prediction or Ubiquant Market Prediction competition."

In [14]:
# Small training set with question and answer pairs
trainset = [dspy.Example(question=example_question1, 
                         answer=example_answer1).with_inputs('question'),
            dspy.Example(question=example_question2, 
                         answer=example_answer2).with_inputs('question'),
           dspy.Example(question=example_question3, 
                         answer=example_answer3).with_inputs('question'),]

In [15]:
from dspy.teleprompt import BootstrapFewShot

# The teleprompter will bootstrap missing labels: reasoning chains and retrieval contexts
teleprompter = BootstrapFewShot(metric=dspy.evaluate.answer_exact_match)
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 3/3 [00:00<00:00, 8278.23it/s]

Failed to run or to evaluate example Example({'question': 'Has there been Kaggle competition about cute animals and if yes, which one?', 'answer': 'You might be interested in the PetFinder.my Adoption Prediction competition.'}) (input_keys={'question'}) with <function answer_exact_match at 0x7977b553f0d0> due to 'InMemoryRetrievalModel' object is not callable.
Failed to run or to evaluate example Example({'question': "I'm interested in autonomous driving. What Kaggle competition can you recommend?", 'answer': 'You might be interested in the Lyft Motion Prediction for Autonomous Vehicles or Lyft 3D Object Detection for Autonomous Vehicles competition,'}) (input_keys={'question'}) with <function answer_exact_match at 0x7977b553f0d0> due to 'InMemoryRetrievalModel' object is not callable.
Failed to run or to evaluate example Example({'question': 'What Kaggle competitions should I look at to learn more about predicting the stock market?', 'answer': 'You might be interested in the JPX Tokyo




In [18]:
response = compiled_rag(example_query)

print(response.answer)

TypeError: 'InMemoryRetrievalModel' object is not callable