In [None]:
!pip install transformers PyPDF2 python-docx


In [None]:
pip install -U sentence-transformers


In [None]:
from transformers import pipeline
import PyPDF2
import textwrap
import docx
from difflib import get_close_matches
import numpy as np
from sentence_transformers import SentenceTransformer
import re

# Load the question-answering, summarization, and sentence-embedding models
# once at module level so the helper functions below can share them.
# NOTE(review): constructing these presumably loads (and on first run
# downloads) the model weights, which can be slow — confirm before importing
# this file from other code.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # used by answer_question
summarizer = pipeline("summarization", model="google/pegasus-xsum")  # used by summarize_document
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # For semantic search (used by semantic_search)

# Cleaning function to preprocess text
def clean_text(text):
    """Normalize raw extracted document text into a single clean ASCII line.

    Steps, in order:
      1. Drop every character that is neither printable ASCII nor whitespace.
         Whitespace (tabs, carriage returns, non-breaking spaces, ...) is kept
         at this stage so that words separated only by such characters are
         not glued together.
      2. Collapse each whitespace run into a single space.
      3. Remove boilerplate: "Page N of M", "header text", "footer text"
         (case-insensitive).
      4. Collapse immediately repeated words ("the the" -> "the"),
         case-insensitive.
      5. Re-collapse whitespace and strip, because step 3 can leave double,
         leading, or trailing spaces behind.

    NOTE: step 1 is lossy for non-ASCII text (accented letters are removed)
    and step 4 also collapses legitimate repetitions such as "had had".

    Args:
        text: Raw text from a document extractor. ``None`` or an empty
            string yields an empty string.

    Returns:
        The cleaned text as a single-line string.
    """
    if not text:
        return ""
    # Keep whitespace here; it is turned into plain spaces in the next step.
    text = re.sub(r'[^\x20-\x7E\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+ of \d+|\bheader text\b|\bfooter text\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(\b\w+\b)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)
    # Boilerplate removal may have left gaps; normalize once more.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Extract text from TXT
def extract_text_txt(file_path):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as one string.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Extract text from PDF
def extract_text_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; an empty string if the PDF has no
        pages or no extractable text.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` is defensive: a page with no extractable text must not
        # break the join.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Extract text from DOCX
def extract_text_docx(file_path):
    """Return the text of all paragraphs of a DOCX file, joined by spaces.

    Only ``doc.paragraphs`` is read.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The paragraph texts joined with single spaces.
    """
    document = docx.Document(file_path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return " ".join(paragraph_texts)

# Unified function for processing
def process_document(file_path, file_type):
    """Extract a document's text with the matching reader and clean it.

    Args:
        file_path: Path of the document to load.
        file_type: One of "txt", "pdf" or "docx" (compared exactly).

    Returns:
        The extracted text after passing through ``clean_text``.

    Raises:
        ValueError: If ``file_type`` is not one of the supported types.
    """
    if file_type == "txt":
        return clean_text(extract_text_txt(file_path))
    if file_type == "pdf":
        return clean_text(extract_text_pdf(file_path))
    if file_type == "docx":
        return clean_text(extract_text_docx(file_path))
    raise ValueError("Unsupported file type")

# Answer a question
def answer_question(question, context):
    """Run the module-level QA pipeline on a question and a context.

    Args:
        question: The question text.
        context: The text in which to look for the answer.

    Returns:
        A ``(answer, score)`` tuple taken from the pipeline result's
        ``'answer'`` and ``'score'`` entries.
    """
    prediction = qa_pipeline(question=question, context=context)
    best_answer = prediction['answer']
    certainty = prediction['score']
    return best_answer, certainty

# Semantic Search for Question Answering across multiple documents
def semantic_search(query, documents):
    """Return the text of the document most similar to ``query``.

    The query and all document texts are embedded with the module-level
    ``embedding_model``; documents are ranked by the dot product between
    their embedding and the query embedding.

    NOTE(review): each whole document is embedded as a single input, so text
    beyond the embedding model's maximum sequence length presumably does not
    influence the ranking — confirm against the model's documentation.

    Args:
        query: The user's question.
        documents: Mapping of document name -> document text.

    Returns:
        The text (not the name) of the best-matching document.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    if not documents:
        raise ValueError("No documents available to search.")
    texts = list(documents.values())
    query_embedding = np.asarray(embedding_model.encode([query]))[0]
    # Encode all documents in one batched call instead of one call each.
    document_embeddings = np.asarray(embedding_model.encode(texts))
    similarities = document_embeddings @ query_embedding
    return texts[int(np.argmax(similarities))]

# Handle typos and unclear inputs
def handle_typos(user_input, valid_commands):
    """Suggest the valid command closest to a possibly misspelled input.

    Args:
        user_input: The text the user typed.
        valid_commands: Sequence of accepted command strings.

    Returns:
        The best match with a similarity of at least 0.8, or ``None`` when
        no command is close enough.
    """
    candidates = get_close_matches(user_input, valid_commands, n=1, cutoff=0.8)
    return next(iter(candidates), None)

# Improved summarize function (handles long documents in chunks)
def summarize_document(text, max_length=150, min_length=50, summary_type="abstract", num_sentences=5):
    """Summarize ``text`` abstractively (model) or extractively (lead sentences).

    Args:
        text: The text to summarize.
        max_length: Maximum generated length per chunk, forwarded to the
            summarization pipeline (abstract mode only).
        min_length: Minimum generated length per chunk, forwarded to the
            summarization pipeline (abstract mode only).
        summary_type: ``"abstract"`` summarizes the text chunk by chunk with
            the module-level ``summarizer`` and joins the partial summaries;
            ``"extractive"`` returns the first ``num_sentences`` sentences.
        num_sentences: Number of leading sentences kept in extractive mode.

    Returns:
        The summary string; an empty string when ``text`` is empty.

    Raises:
        ValueError: If ``summary_type`` is not supported.
    """
    # Split long documents into chunks of at most `max_chunk_size` characters
    # (on word boundaries) so each piece can be summarized separately.
    def chunk_text(text, max_chunk_size=1024):
        return textwrap.wrap(text, max_chunk_size, break_long_words=False)

    if summary_type == "abstract":
        summaries = [
            # truncation=True guards against a chunk that still exceeds the
            # model's input limit (e.g. one very long unbroken "word").
            summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            for chunk in chunk_text(text)
        ]
        return " ".join(summaries)
    if summary_type == "extractive":
        # Split after sentence-ending punctuation that is followed by
        # whitespace, so decimals such as "3.14" stay intact and each
        # sentence keeps its own punctuation. Abbreviations ("Dr. Smith")
        # are still split; this is a heuristic, not a full tokenizer.
        pieces = re.split(r'(?<=[.!?])\s+', text.strip())
        sentences = [piece.strip() for piece in pieces if piece.strip()]
        return " ".join(sentences[:max(num_sentences, 0)])
    raise ValueError("Unsupported summary type")

# Interactive chatbot with multiple document support
# Answers whose pipeline score falls below this threshold are reported as
# "not confident". The QA score is a probability in [0, 1] per the pipeline
# documentation, so a threshold of 0.0 could never trigger.
MIN_CONFIDENCE = 0.1
VALID_COMMANDS = ["upload", "view", "switch", "start", "exit"]

documents = {}       # document name -> cleaned document text
current_doc = None   # name of the document selected via "switch", if any
print("Upload and load your documents first.")
while True:
    # Strip so that stray surrounding whitespace does not turn a valid
    # command into an "invalid" one.
    command = input("Command (upload/view/switch/start/exit): ").strip().lower()
    if command == "exit":
        print("Goodbye!")
        break
    elif command == "upload":
        file_path = input("Enter the file path: ").strip()
        file_type = input("Enter the file type (txt/pdf/docx): ").strip().lower()
        try:
            doc_name = input("Enter a name for this document: ").strip()
            documents[doc_name] = process_document(file_path, file_type)
            print(f"Document '{doc_name}' uploaded and cleaned successfully.")
        except Exception as e:
            # Top-level boundary of the interactive session: report the
            # problem and keep the loop alive.
            print(f"Error: {e}")
    elif command == "view":
        print("Available documents:")
        for name in documents:
            print(f"- {name}")
    elif command == "switch":
        doc_name = input("Enter the document name to switch to: ").strip()
        if doc_name in documents:
            current_doc = doc_name
            context = documents[doc_name]
            print(f"Switched to document '{doc_name}'.")
        else:
            print("Document not found.")
    elif command == "start":
        if not documents:
            print("No documents uploaded. Please upload a document first.")
            continue
        # Honor an earlier "switch"; fall back to the first uploaded
        # document only when nothing (valid) has been selected.
        if current_doc not in documents:
            current_doc = next(iter(documents))
        context = documents[current_doc]
        print("Chatbot is ready! You can ask questions or request summaries.")
        while True:
            user_input = input("Enter your question or command (type 'exit' to go back): ").strip()
            lowered = user_input.lower()
            if lowered == "exit":
                print("Exiting chatbot interaction.")
                break
            if not user_input:
                # Nothing was typed; an empty question would only make the
                # QA pipeline fail.
                continue
            try:
                if "summarize" in lowered:
                    summary_type = "abstract" if "abstract" in lowered else "extractive"
                    section = input("Enter section to summarize or leave blank for the whole document: ").strip().lower()
                    if section:
                        # Summarize from the first occurrence of the section
                        # text to the end of the active document.
                        start = context.lower().find(section)
                        if start != -1:
                            summary = summarize_document(context[start:], summary_type=summary_type)
                            print(f"Summary of the section '{section}':")
                            print(summary)
                        else:
                            print(f"Section '{section}' not found in the document.")
                    else:
                        summary = summarize_document(context, summary_type=summary_type)
                        print("Summary of the document:")
                        print(summary)
                else:
                    # Questions are answered from whichever uploaded document
                    # is semantically closest to the question.
                    relevant_document = semantic_search(user_input, documents)
                    answer, confidence = answer_question(user_input, relevant_document)
                    if not answer.strip() or confidence < MIN_CONFIDENCE:
                        print("I'm not confident about the answer. Can you clarify your question?")
                    else:
                        print(f"Answer: {answer}\nConfidence: {confidence:.2f}\n")
            except Exception as e:
                # Same policy as for uploads: a failing model call must not
                # terminate the whole interactive session.
                print(f"Error: {e}")
    else:
        suggested_command = handle_typos(command, VALID_COMMANDS)
        if suggested_command:
            print(f"Did you mean '{suggested_command}'?")
        else:
            print("Invalid command. Please try again.")
