<a href="https://colab.research.google.com/github/cloudpendyala/pendyala_ai/blob/main/Split_at_Paragraph_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyMuPDF sentence-transformers faiss-cpu


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [63]:
import fitz  # PyMuPDF

def chunk_pdf_paragraphs(pdf_path):
    document = fitz.open(pdf_path)
    paragraphs = []

    for page_num in range(len(document)):
        page = document[page_num]
        text = page.get_text("text")
        paragraphs.extend(text.split('\n\n'))  # Splitting by double newline for paragraphs

    # Clean empty paragraphs
    paragraphs = [p.strip() for p in paragraphs if p.strip()]
    return paragraphs

paragraphs = chunk_pdf_paragraphs(pdf_path)


pdf_path = "/content/drive/My Drive/Colab/presidents.pdf"  # Change to your PDF path
paragraphs = chunk_pdf_paragraphs(pdf_path)


President Lyndon B. Johnson signs
the 1964 Civil Rights Act at the
White House on July 2, 1964, as
Martin Luther King Jr. and others
look on.
presidency has become too powerful,[64][65] unchecked, unbalanced,[66] and "monarchist" in nature.[67] In 2008
professor Dana D. Nelson expressed belief that presidents over the previous thirty years worked towards
"undivided presidential control of the executive branch and its agencies".[68] She criticized proponents of the
Unitary executive theory for expanding "the many existing uncheckable executive powers—such as executive
orders, decrees, memorandums, proclamations, national security directives and legislative signing statements—
that already allow presidents to enact a good deal of foreign and domestic policy without aid, interference or
consent from Congress".[68] Bill Wilson, board member of Americans for Limited Government, opined that the
expanded presidency was "the greatest threat ever to individual freedom and democratic rule".[69]


In [56]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')  # Using a more robust model

def generate_embeddings(paragraphs):
    embeddings = model.encode(paragraphs)
    print(paragraphs, embeddings)
    return embeddings

embeddings = generate_embeddings(paragraphs)




['President of the United\nStates\nPresidential seal\nPresidential flag\nIncumbent\nJoe Biden\nsince\xa0January 20, 2021\nExecutive branch of the U.S.\ngovernment\nExecutive Office of the President\nStyle\nMr. President[1][2]\n(informal)\nThe Honorable[3]\n(formal)\nHis Excellency[4][5]\n(diplomatic)\nType\nHead of state\nPresident of the United States\nThe president of the United States (POTUS)[B] is the head of state and\nhead of government of the United States of America. The president directs the\nexecutive branch of the federal government and is the commander-in-chief of\nthe United States Armed Forces.\nThe power of the presidency has grown substantially[12] since the first\npresident, George Washington, took office in 1789.[6] While presidential power\nhas ebbed and flowed over time, the presidency has played an increasingly\nsignificant role in American political life since the beginning of the 20th\ncentury, carrying over into the 21st century with notable expansions during th

In [37]:
import faiss
import numpy as np

def store_embeddings_faiss(paragraphs, embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Use inner product (cosine similarity)
    embeddings = np.array(embeddings)  # Convert to numpy array
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    return index, paragraphs

index, stored_paragraphs = store_embeddings_faiss(paragraphs, embeddings)


In [53]:
def search_paragraphs(query, index, stored_paragraphs, k=5):
    query_embedding = model.encode([query])
    query_embedding = np.array(query_embedding).reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k)

    results = []
    for i in range(k):
        match_percentage = distances[0][i] * 100  # Convert cosine similarity to percentage
        if match_percentage >= 50:
            result = {
                'paragraph': stored_paragraphs[indices[0][i]],
                'match_percentage': match_percentage
            }
            results.append(result)

    results = sorted(results, key=lambda x: x['match_percentage'], reverse=True)  # Ensure sorting
    return results

query = "who is the president now?"
results = search_paragraphs(query, index, stored_paragraphs)


In [54]:
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def summarize_results(results, max_length=150):
    text_to_summarize = " ".join([result['paragraph'] for result in results])

    # Split text into chunks to fit model's max length
    max_chunk_length = 1024  # The max length the model can handle
    chunks = [text_to_summarize[i:i+max_chunk_length] for i in range(0, len(text_to_summarize), max_chunk_length)]

    summarized_text = ""
    for chunk in chunks:
        summary = summarizer(chunk, max_length=max_length, min_length=30, do_sample=False)
        summarized_text += summary[0]['summary_text'] + " "

    return summarized_text.strip()

summary = summarize_results(results)
print("Summary of top results:")
print(summary)


KeyboardInterrupt: 