<a href="https://colab.research.google.com/github/cloudpendyala/pendyala_ai/blob/main/SentenceLevelChunking_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
!pip install PyMuPDF sentence-transformers faiss-cpu transformers




In [29]:
import fitz  # PyMuPDF

def chunk_pdf_paragraphs(pdf_path):
    document = fitz.open(pdf_path)
    paragraphs = []

    for page_num in range(len(document)):
        page = document[page_num]
        text = page.get_text("text")
        paragraphs.extend(text.split('\n\n'))  # Splitting by double newline for paragraphs

    # Clean empty paragraphs
    paragraphs = [p.strip() for p in paragraphs if p.strip()]
    return paragraphs

pdf_path = "/content/drive/My Drive/Colab/ramayana_kids.pdf"  # Change to your PDF path

paragraphs = chunk_pdf_paragraphs(pdf_path)



In [30]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-small-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-small-qg-hl")

def generate_qa(paragraphs):
    qa_pairs = []
    for paragraph in paragraphs:
        input_text = "generate questions: " + paragraph
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        output_ids = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
        questions = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Assuming the model returns questions separated by '<sep>'
        questions = questions.split('<sep>')

        # Generate answers using a simple approach for now
        for question in questions:
            if question.strip():
                qa_pairs.append((paragraph, question.strip(), paragraph))  # Using the paragraph as the answer
    return qa_pairs

qa_pairs = generate_qa(paragraphs)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors


In [31]:
print(qa_pairs)



In [32]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')  # Using a more robust model

def generate_embeddings(texts):
    embeddings = model.encode(texts)
    return embeddings

paragraphs = [qa[0] for qa in qa_pairs]
answers = [qa[2] for qa in qa_pairs]

paragraph_embeddings = generate_embeddings(paragraphs)
answer_embeddings = generate_embeddings(answers)


In [33]:
import faiss
import numpy as np

def store_embeddings_faiss(paragraphs, questions, answers, paragraph_embeddings, answer_embeddings):
    dimension = paragraph_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Use inner product (cosine similarity)

    combined_embeddings = np.concatenate((paragraph_embeddings, answer_embeddings), axis=0)
    faiss.normalize_L2(combined_embeddings)
    index.add(combined_embeddings)

    combined_data = [{'paragraph': paragraphs[i], 'question': questions[i], 'answer': answers[i]} for i in range(len(paragraphs))]
    combined_data += [{'paragraph': '', 'question': '', 'answer': answers[i]} for i in range(len(answers))]

    return index, combined_data

questions = [qa[1] for qa in qa_pairs]
index, stored_data = store_embeddings_faiss(paragraphs, questions, answers, paragraph_embeddings, answer_embeddings)


In [36]:
def search_paragraphs(query, index, stored_data, k=5):
    query_embedding = model.encode([query])
    query_embedding = np.array(query_embedding).reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k)

    results = []
    for i in range(k):
        match_percentage = distances[0][i] * 100  # Convert cosine similarity to percentage
        if match_percentage >= 30:
            result = stored_data[indices[0][i]]
            result['match_percentage'] = match_percentage
            results.append(result)

    results = sorted(results, key=lambda x: x['match_percentage'], reverse=True)  # Ensure sorting
    return results

query = "who wrote this epic?"
results = search_paragraphs(query, index, stored_data)

for i, result in enumerate(results):
    print(f"Result {i + 1}:")
    print(f"Paragraph: {result['paragraph']}")
    print(f"Question: {result['question']}")
    print(f"Answer: {result['answer']}")
    print(f"Match Percentage: {result['match_percentage']:.2f}%\n")


Result 1:
Paragraph: 
Question: 
Answer: 44 
 
Vashishta blessed Rama and with their consent, Rama agreed to 
become the king. 
Soon Rama was crowned the king of Ayodhya. The coronation was 
conducted on a grand scale, Rama and Sita were seated on the 
throne, Lakshmana, Bharatha and Shatrugna stood behind them 
Hanuman sat at Rama’s feet. 
People rejoiced about this happy event. The festivities continued for 
a week and Rama made generous gifts to everyone.  
Sri Rama ruled over Ayodhya for many years. People lived a happy, 
contented life in the kingdom and this glorious reign was hailed as 
Rama Rajya. 
 
This is the story of Ramayana .This epic was written by Maharishi 
Valmiki. Reading Ramayana will help us to follow Rama’s ideals, his 
devotion towards his parents, his values and his truthfulness. Let us 
all strive to be better human beings.
Match Percentage: 42.96%

Result 2:
Paragraph: 44 
 
Vashishta blessed Rama and with their consent, Rama agreed to 
become the king. 
Soon 