<a href="https://colab.research.google.com/github/mayur2829/GenAI_Document_Summarization_Integration/blob/main/Copy_of_Niveus_GenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install langchain chromadb pypdf openpyxl pandas unstructured -q

In [None]:
from google.colab import files
uploaded = files.upload()  # Select up to 50 files manually

In [None]:
# Load and extract data from Excel and PDF

!pip install PyPDF2 -q

In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader

In [None]:
# Extract pdf file text

def extract_pdf_text(file_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace removed. Pages for which ``extract_text()``
        returns nothing contribute no text.
    """
    with open(file_path, 'rb') as f:
        reader = PdfReader(f)
        # Extract while the file is still open; ``or ''`` covers a falsy result.
        pages = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page (which would corrupt the embedded text).
    return '\n'.join(page_text for page_text in pages if page_text).strip()

In [None]:
# Extract Excel file text

!pip install openpyxl xlrd -q

def extract_excel_text(file_path):
    """Return the cell contents of every sheet of an Excel workbook as text.

    Args:
        file_path: Path of the workbook. Files ending in ``.xlsx`` are read
            with the ``openpyxl`` engine, anything else with ``xlrd``.

    Returns:
        All data cells converted to strings and joined with single spaces,
        row by row and sheet by sheet, with surrounding whitespace removed.
        Empty sheets contribute nothing.
    """
    ext = file_path.split('.')[-1].lower()
    engine = 'openpyxl' if ext == 'xlsx' else 'xlrd'
    # sheet_name=None loads every worksheet as {sheet name: DataFrame};
    # the default would silently ignore all sheets after the first.
    sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)
    row_texts = []
    for df in sheets.values():
        # Iterating the string-converted values also works for an empty
        # sheet, where a row-wise apply followed by .str would raise.
        row_texts.extend(' '.join(row) for row in df.astype(str).to_numpy())
    return ' '.join(row_texts).strip()

In [None]:
# Create a list of documents with content and metadata

# Map each supported extension to the function that extracts its text.
extractors = {
    'pdf': extract_pdf_text,
    'xls': extract_excel_text,
    'xlsx': extract_excel_text,
}

documents = []

for file in uploaded:
    extract = extractors.get(file.rsplit('.', 1)[-1].lower())
    if extract is None:
        # Unsupported file type: leave it out of the corpus.
        continue
    try:
        documents.append({'filename': file, 'text': extract(file)})
    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error processing {file}: {e}")

In [None]:
len(documents)

In [None]:
# Embedding in ChromaDB

!pip install -U langchain-community

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import uuid


In [None]:
# Using HuggingFace embeddings

from getpass import getpass
import os

# Read the token interactively (input is not echoed) and expose it through
# the environment in the same step.
os.environ["HUGGING_FACE_HUB_TOKEN"] = huggingfacehub_api_key = getpass(
    "Enter your Hugging Face API token: "
)

# Sentence-transformers checkpoint used to embed every document and query.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
embedding_function

In [None]:
# Initialize ChromaDB
db = Chroma(embedding_function=embedding_function, persist_directory="./chroma_db")

In [None]:
db

In [None]:
# Convert documents to LangChain format
docs = [Document(page_content=d["text"], metadata={"source": d["filename"]}) for d in documents]
# Use the filename as the record id. With auto-generated ids, re-running this
# cell against the persisted store adds every document a second time, and the
# filename-based db.delete([...]) calls later in the notebook match nothing.
db.add_documents(docs, ids=[d["filename"] for d in documents])

In [None]:
db

In [None]:
# Query documents: similarity search
# Search from PDF

query = "Genai is it future"
results = db.similarity_search(query, k=3)

# Print rank, source file and a 500-character preview for each hit.
separator = '-' * 50
for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:500]
    print(f"Result {rank} - {hit.metadata['source']}\n{preview}\n{separator}\n")

In [None]:
# Search from excel

query = "nls exper every year"
results = db.similarity_search(query, k=3)

# Print rank, source file and a 500-character preview for each hit.
separator = '-' * 50
for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:500]
    print(f"Result {rank} - {hit.metadata['source']}\n{preview}\n{separator}\n")

In [None]:
# Check filename
# List every source filename: the update/delete/replace calls need the exact
# name, and a reversed every-fifth slice would hide most of them.
all_filenames = [d.metadata["source"] for d in docs]
print("Available documents:", all_filenames)

In [None]:
#  Update a Document's Embedding

def update_document(db, filename, new_text):
    """Replace the stored content for ``filename`` with ``new_text``.

    Args:
        db: Vector store exposing ``get(where=...)``, ``delete(ids=...)`` and
            ``add_documents(docs, ids=...)`` (e.g. a LangChain Chroma store).
        filename: Value of the ``source`` metadata field that identifies the
            document; it is also used as the id of the new record.
        new_text: Text to embed and store for ``filename``.

    Failures are reported with ``print`` instead of being raised.
    """
    try:
        # The store deletes by record id, not by filename, so first look up
        # the ids of every record whose source metadata matches.
        stale_ids = db.get(where={"source": filename})["ids"]
        if stale_ids:
            db.delete(ids=stale_ids)
        doc = Document(page_content=new_text, metadata={"source": filename})
        # A stable id keeps exactly one record per filename.
        db.add_documents([doc], ids=[filename])
        print(f"Updated: {filename}")
    except Exception as e:
        print(f"Update failed for {filename}: {e}")

In [None]:
new_text = "This is the UPDATED version of report1.pdf with revised financials."
update_document(db, 'SpecialIssueCFP (1).pdf', new_text)

In [None]:
# Delete a Document

def delete_document(db, filename):
    """Remove every stored record whose ``source`` metadata equals ``filename``.

    Args:
        db: Vector store exposing ``get(where=...)`` and ``delete(ids=...)``
            (e.g. a LangChain Chroma store).
        filename: Value of the ``source`` metadata field to remove.

    Failures are reported with ``print`` instead of being raised; a filename
    with no stored record is reported as not found.
    """
    try:
        # The store deletes by record id, not by filename, so resolve the
        # filename to the ids of the matching records first.
        matching_ids = db.get(where={"source": filename})["ids"]
        if not matching_ids:
            # Do not claim success when nothing was removed.
            print(f"No stored document found for: {filename}")
            return
        db.delete(ids=matching_ids)
        print(f"Deleted: {filename}")
    except Exception as e:
        print(f"Delete failed for {filename}: {e}")

In [None]:
delete_document(db, 'rice.xls')

In [None]:
# Replace one document's embedding with content taken from another document

def replace_document(db, target_filename, source_text):
    """Store ``source_text`` as the content of ``target_filename``.

    Args:
        db: Vector store exposing ``get(where=...)``, ``delete(ids=...)`` and
            ``add_documents(docs, ids=...)`` (e.g. a LangChain Chroma store).
        target_filename: Value of the ``source`` metadata field whose records
            are replaced; it is also used as the id of the new record.
        source_text: Text to embed and store under ``target_filename``.

    Failures are reported with ``print`` instead of being raised.
    """
    try:
        # The store deletes by record id, not by filename, so first look up
        # the ids of every record whose source metadata matches the target.
        stale_ids = db.get(where={"source": target_filename})["ids"]
        if stale_ids:
            db.delete(ids=stale_ids)
        doc = Document(page_content=source_text, metadata={"source": target_filename})
        # A stable id keeps exactly one record per filename.
        db.add_documents([doc], ids=[target_filename])
        print(f"Replaced content in: {target_filename}")
    except Exception as e:
        print(f"Replace failed for {target_filename}: {e}")

In [None]:
source_text = "Content originally from another report, now replacing summary.pdf"
replace_document(db, 'Make Your LLM Fully Utilize the Context (1).pdf', source_text)

In [None]:
# LangChain Summarization of a document

!pip install transformers langchain huggingface_hub

from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.chains.summarize import load_summarize_chain

In [None]:
# Model and tokenizer are loaded from the same checkpoint.
summarization_checkpoint = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline(
    "summarization",
    model=summarization_checkpoint,
    tokenizer=summarization_checkpoint,
)
# Wrap the transformers pipeline so it can be handed to a LangChain chain.
llm = HuggingFacePipeline(pipeline=summarizer)

In [None]:
# Load summarization chain
chain = load_summarize_chain(llm, chain_type="stuff")


In [None]:
# Collect the source filename recorded in each document's metadata.
all_filenames = [document.metadata["source"] for document in docs]
print("Available documents:", all_filenames)

In [None]:
# Pick a document from vector store
print(docs[0])
doc_to_summarize = docs[-1]  #summarizing last document 'SpecialIssueCFP (1).pdf'


In [None]:
# Take the page content, converting it to a string only if it is not one already
raw_content = doc_to_summarize.page_content
doc_content = raw_content if isinstance(raw_content, str) else str(raw_content)

# Keep at most the first max_length characters; slicing leaves shorter text unchanged
max_length = 512 # Example maximum length
doc_content = doc_content[:max_length]

In [None]:
# Rebuild the document around the truncated text, keeping its original metadata
doc_to_summarize = Document(
    page_content=doc_content,
    metadata=doc_to_summarize.metadata,
)

# Feed the single document to the summarization chain and show the result
summary = chain.run([doc_to_summarize])
print("Summary:\n", summary)