In [1]:
import os

import dspy
import fitz
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, in page order, joined
        without any separator. An empty document yields ``""``.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [
            document.load_page(page_index).get_text()
            for page_index in range(len(document))
        ]
    return "".join(page_texts)

In [3]:
def chunk_text(text, chunk_size=512, overlap=30):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order. Empty input yields ``[]``. The
        final chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``. With ``overlap >= chunk_size`` the
            window would never advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got "
            f"overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat a suffix of this chunk, so stop here.
        if end >= len(text):
            break
        start += step
    return chunks

In [4]:
# Single source of truth for the embedding checkpoint, so the tokenizer and
# the encoder are always loaded from the same model id.
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [5]:
def get_embeddings(texts):
    """Embed one string or a list of strings with the module-level encoder.

    Uses the notebook-level ``tokenizer`` and ``model``. Inputs are padded to
    the longest item in the batch and truncated to 512 tokens.

    Args:
        texts: A string or list of strings accepted by ``tokenizer``.

    Returns:
        A NumPy array with one row per input, obtained by averaging the
        final hidden states over the real (non-padding) tokens of each input.
    """
    tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)

    hidden_states = outputs.last_hidden_state
    # Weight every position by its attention mask so padding tokens, which
    # appear whenever a batch mixes lengths, do not dilute the average.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.cpu().numpy()

In [6]:
def get_embeddings(text):
    """Embed one string or a list of strings and return a torch tensor.

    This definition appears after another ``get_embeddings`` in the notebook
    and therefore replaces it once both cells have run. Unlike that earlier
    variant, which returns a NumPy array, this one returns a ``torch.Tensor``;
    callers that need NumPy must convert the result themselves.

    Args:
        text: A string or list of strings accepted by the module-level
            ``tokenizer``.

    Returns:
        A detached tensor with one row per input, obtained by averaging the
        final hidden states over the real (non-padding) tokens of each input.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Mask-weighted mean pooling so padding tokens do not dilute the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.detach()

In [7]:
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing ``embeddings``.

    Args:
        embeddings: 2-D array-like with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension equals the number of columns
        and which already holds every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce once here so
    # float64 arrays or array-likes do not fail inside index.add.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim), got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])  # exact search, L2 distance
    index.add(vectors)
    return index

def retrieve_documents(query_embedding, k=5):
    """Return the ids of the ``k`` nearest vectors in the module-level ``index``.

    Args:
        query_embedding: A single query vector, given either as a torch
            tensor or as a NumPy array / array-like. A 1-D vector is treated
            as a batch of one.
        k: Number of neighbours to request.

    Returns:
        A NumPy array of row ids into the indexed vectors for the first
        query, nearest first. It can hold fewer than ``k`` entries when the
        index contains fewer than ``k`` vectors.
    """
    # Accept tensors as well as arrays: both get_embeddings variants in this
    # notebook are possible inputs, and only one of them has .numpy().
    if hasattr(query_embedding, "detach"):
        query_embedding = query_embedding.detach().cpu().numpy()
    query = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if query.ndim == 1:
        query = query.reshape(1, -1)

    D, I = index.search(query, k)
    hits = I[0]
    # FAISS pads missing results with -1; those are not document ids.
    return hits[hits >= 0]

In [16]:
# Extract the raw text of the three source PDFs; the paths are relative to
# the notebook's working directory.
text1, text2, text3 = [
    extract_text_from_pdf(pdf_name) for pdf_name in ("1.pdf", "2.pdf", "3.pdf")
]

In [17]:
# Concatenate the three documents back to back (no separator is inserted
# between them) and split the result with chunk_text's default settings.
corpus = "".join([text1, text2, text3])
chunked_docs = chunk_text(corpus)

In [19]:
# First ten chunks only; not referenced by any other cell shown here.
chunked_docs_1 = chunked_docs[0:10]

In [30]:
# Embed the chunks one at a time and stack the per-chunk vectors row-wise
# into a single matrix with one row per chunk.
embeddings = np.vstack(list(map(get_embeddings, chunked_docs)))

In [None]:
# Embedding dimensionality, kept as its own name for later inspection.
d = embeddings.shape[1]
# Reuse the helper defined above instead of repeating its body inline, so
# there is a single place that decides how the FAISS index is built.
index = create_faiss_index(embeddings)

In [None]:
def search_similar_texts(query_text, k=4):
    """Find the chunks whose embeddings are closest to ``query_text``.

    Relies on the module-level ``get_embeddings``, ``index`` and
    ``chunked_docs``.

    Args:
        query_text: The natural-language query to embed.
        k: Number of neighbours to request from the index.

    Returns:
        A ``(texts, distances)`` pair: the matching entries of
        ``chunked_docs``, nearest first, and the corresponding distances
        reported by the index. Both can hold fewer than ``k`` items when the
        index contains fewer than ``k`` vectors.
    """
    query_embedding = get_embeddings([query_text])
    # get_embeddings is defined twice in this notebook, once returning NumPy
    # and once returning a torch tensor; normalise either to what FAISS needs.
    if hasattr(query_embedding, "detach"):
        query_embedding = query_embedding.detach().cpu().numpy()
    query_vectors = np.ascontiguousarray(query_embedding, dtype=np.float32)

    D, I = index.search(query_vectors, k)
    # FAISS pads missing results with id -1; indexing chunked_docs with -1
    # would silently return the last chunk, so drop those slots.
    found = I[0] >= 0
    return [chunked_docs[i] for i in I[0][found]], D[0][found]

In [None]:
# Example retrieval: fetch the four chunks closest to a sample question.
query = "what is the impact of heidegger's philosophy on contemporary art?"
similar_texts, distances = search_similar_texts(query_text=query, k=4)

In [32]:
# Rebinds `index` to a fresh flat L2 index built from the same `embeddings`
# matrix via the helper.
index = create_faiss_index(embeddings=embeddings)

In [31]:
import dspy
from huggingface_hub import login

In [None]:
# OpenAI-backed LM handle; it is created here but not passed to
# dspy.settings.configure below.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')

# Credentials must not live in the notebook: read the Hugging Face token from
# the HF_TOKEN environment variable. When it is unset, `login` receives None
# and falls back to its own interactive prompt.
# SECURITY: a literal token used to be embedded on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)
llm = dspy.HFModel(model = 'google/gemma-2b')

# NOTE(review): `colbertv2_wiki17_abstracts` is not defined in any cell shown
# here; it has to be created before this line runs or this raises NameError.
dspy.settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)