In [7]:
import fitz
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import os



In [15]:
from fpdf import FPDF
import unicodedata

# Sample text that is repeated to build a multi-page test document.
paragraph = (
    "Artificial Intelligence (AI) is transforming industries with its capabilities in "
    "natural language processing, computer vision, and decision-making. From healthcare "
    "diagnostics to financial forecasting and personalized education, AI applications are "
    "expanding rapidly. The integration of large language models such as GPT-4 has revolutionized "
    "the way machines understand and generate human language.\n\n"
    "In healthcare, AI is aiding in early diagnosis of diseases, robotic surgeries, and personalized "
    "treatment plans. In the financial sector, it is enabling better fraud detection, credit scoring, "
    "and algorithmic trading. In education, AI tools are personalizing content to suit individual learning "
    "styles, making learning more effective.\n\n"
    "Ethical concerns remain, including data privacy, algorithmic bias, and the potential displacement "
    "of jobs. Responsible AI development and policy regulations are crucial to mitigate these risks.\n\n"
    "AI’s future includes even more sophisticated models that can think, reason, and interact with the world "
    "in human-like ways. With continued research and innovation, AI will continue to redefine what's possible "
    "across all domains of life.\n\n"
)

# The text is reduced to Latin-1 before it is written to the PDF. Typographic
# punctuation such as U+2019 (the apostrophe in "AI’s") has no NFKD
# decomposition and is not a Latin-1 character, so encode(..., "ignore") would
# silently delete it ("AIs"). Map those characters to ASCII equivalents first.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
    "\u2013": "-",
    "\u2014": "-",
})

# Sanitise once: the result does not change between loop iterations.
clean_para = (
    unicodedata.normalize("NFKD", paragraph.translate(_ASCII_PUNCTUATION))
    .encode("latin-1", "ignore")
    .decode("latin-1")
)

# Create the PDF
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

# Repeat the paragraph so the document is long enough to yield several chunks.
for _ in range(25):
    pdf.multi_cell(0, 10, clean_para)

# Save the file
pdf.output("sample.pdf")
print("PDF created: sample.pdf")


PDF created: sample.pdf


In [None]:

# Load and chunk the document
def load_and_chunk_pdf(filepath, chunk_size=500, overlap=100):
    """Extract the text of a PDF and split it into overlapping word chunks.

    Args:
        filepath: Path of the PDF, passed to ``fitz.open``.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of strings in document order, each holding at most
        ``chunk_size`` words joined by single spaces. Empty when the
        document contains no text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            in the range ``0 <= overlap < chunk_size``.
    """
    # Validate up front: a zero step makes range() raise a cryptic error, a
    # negative step silently yields no chunks, and a negative overlap would
    # skip words between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    doc = fitz.open(filepath)
    try:
        text = "\n".join(page.get_text() for page in doc)
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further window would
        # lie entirely inside it and only add a redundant duplicate.
        if start + chunk_size >= len(words):
            break
    return chunks

# Embed the chunks and store them in a FAISS index
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2', model=None):
    """Encode text chunks and add the vectors to a new ``faiss.IndexFlatL2``.

    Args:
        chunks: Non-empty sequence of strings to embed.
        model_name: Name passed to ``SentenceTransformer`` when ``model`` is
            not supplied.
        model: Optional, already-constructed encoder exposing ``encode``.
            Passing one avoids loading the model a second time.

    Returns:
        A tuple ``(index, embeddings, chunks)``: the populated index, the
        float32 embedding matrix that was added to it, and the input chunks.

    Raises:
        ValueError: If ``chunks`` is empty (there is nothing to size the
            index from).
    """
    # Fail fast with a clear message, before paying for a model load.
    if len(chunks) == 0:
        raise ValueError("cannot build an index from an empty list of chunks")

    if model is None:
        model = SentenceTransformer(model_name)

    # FAISS expects C-contiguous float32 input; coerce explicitly rather than
    # relying on the encoder's default output dtype.
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings, chunks

# Retrieve the top-k chunks
def retrieve_top_k(query, model, index, chunks, k=5):
    """Return the chunks whose vectors the index ranks closest to ``query``.

    Args:
        query: Query string to embed.
        model: Encoder exposing ``encode(list_of_str)``.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``.
        chunks: Sequence of chunks; position ``i`` corresponds to label ``i``
            in the index.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks in the order returned by the index. Empty when
        ``chunks`` is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(k, len(chunks))
    if k <= 0:
        return []

    query_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k)

    # FAISS pads missing results with -1; indexing chunks[-1] would silently
    # return the last chunk, so drop any label outside the valid range.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Summarize using an LLM
def summarize_chunks(chunks, max_chars=3000):
    """Summarize the concatenated chunks with ``facebook/bart-large-cnn``.

    Args:
        chunks: Iterable of strings; they are joined with blank lines.
        max_chars: Character cap applied to the joined text before it is
            passed to the summarizer.

    Returns:
        The generated summary text, or ``""`` when there is no non-blank
        text to summarize.
    """
    input_text = "\n\n".join(chunks)[:max_chars]  # Limit input length

    # Nothing to summarize: skip loading the model entirely.
    if not input_text.strip():
        return ""

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(
        input_text,
        max_length=200,
        min_length=50,
        do_sample=False,
        # The character cap above is only a rough proxy for token count; let
        # the tokenizer truncate if the input is still too long for the model.
        truncation=True,
    )
    return summary[0]['summary_text']

def main(filepath="sample.pdf", query="Summarize this document",
         model_name='all-MiniLM-L6-v2'):
    """Run the pipeline end to end: chunk a PDF, index it, retrieve, summarize.

    Args:
        filepath: PDF to summarize.
        query: Text used to pick the chunks that are fed to the summarizer.
        model_name: Sentence-embedding model used for both the index and the
            query, so that the two live in the same vector space.

    Prints progress messages and the final summary; returns ``None``.
    """
    print("[+] Loading and chunking document...")
    chunks = load_and_chunk_pdf(filepath)
    if not chunks:
        # Without text there is nothing to embed or summarize.
        print("[!] No extractable text found in " + str(filepath))
        return

    print("[+] Embedding chunks and building FAISS index...")
    # This encoder is used for the query. embed_chunks is given the same
    # model name so the chunk vectors and the query vector are comparable.
    model = SentenceTransformer(model_name)
    index, _, chunks = embed_chunks(chunks, model_name=model_name)

    print("[+] Retrieving top chunks for summary query...")
    top_chunks = retrieve_top_k(query, model, index, chunks)

    print("[+] Generating summary with LLM...")
    summary = summarize_chunks(top_chunks)

    print("\n===== SUMMARY =====")
    print(summary)

# Run the pipeline when this cell or script is executed directly.
if __name__ == '__main__':
    main()


[+] Loading and chunking document...
[+] Embedding chunks and building FAISS index...
[+] Retrieving top chunks for summary query...
[+] Generating summary with LLM...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu



===== SUMMARY =====
Artificial Intelligence (AI) is transforming industries with its capabilities in natural language processing, computer vision, and decision-making. From healthcare diagnostics to financial forecasting and personalized education, AI applications are expanding rapidly. Ethical concerns remain, including data privacy, algorithmic bias, and the potential displacement of jobs.
