In [None]:
import faiss
import numpy as np
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import textwrap
from typing import List, Dict
import re
import logging
from tqdm import tqdm

# Configure the root logger once (timestamp - level - message) and create the
# module-level logger used throughout this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class ImprovedRAGChatbot:
    """
    Retrieval-augmented chatbot over one loaded document.

    Loaded text is cleaned, split into overlapping word windows, embedded with
    a SentenceTransformer model and stored in a FAISS ``IndexFlatL2``. A
    question is answered by retrieving the nearest chunks and prompting a
    causal language model with them. Loading a new document or text replaces
    the previously indexed content.
    """

    # Chunks with fewer words than this are never indexed.
    MIN_CHUNK_WORDS = 20
    # Punctuation removed from both ends of a token before testing whether it
    # is alphabetic, so that "containers." or "(code," still count as words.
    _EDGE_PUNCT = '.,!?;:-()"\''
    # Phrases whose repeated occurrence marks a generated answer as degenerate.
    # NOTE(review): this list is specific to the Docker demo content.
    _REPETITIVE_PHRASES = ("it is used for", "it can be used", "docker is", "docker can")
    # Context window assumed when neither the model config nor the tokenizer
    # reports a usable limit.
    _DEFAULT_CONTEXT_WINDOW = 1024
    # Spare tokens kept free because decoding and re-encoding truncated text
    # does not always reproduce exactly the same token count.
    _PROMPT_TOKEN_MARGIN = 16

    def __init__(self, 
                 embedding_model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
                 llm_model_name: str = 'gpt2-medium',  # Better model for coherent responses
                 chunk_size: int = 250,
                 chunk_overlap: int = 40):
        """
        Initialize the improved RAG chatbot with better response quality.

        Args:
            embedding_model_name: SentenceTransformer model used for embeddings.
            llm_model_name: Causal language model used to generate answers.
            chunk_size: Number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not in ``[0, chunk_size)``.
        """
        # Validate chunk geometry before the (slow) model loading so that a
        # bad configuration fails immediately instead of during chunking.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

        # Detect device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Load embedding model
        logger.info(f"Loading embedding model: {embedding_model_name}")
        self.embedder = SentenceTransformer(embedding_model_name, device=self.device)

        # Load language model with better configuration
        logger.info(f"Loading LLM: {llm_model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

        # Some tokenizers define no pad token; reuse EOS so padding works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Half precision only on GPU; CPU inference stays in float32.
        self.model = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            low_cpu_mem_usage=True
        )

        if self.device == "cuda":
            self.model = self.model.to(self.device)

        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device == "cuda" else -1,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        )

        # RAG parameters
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.chunks = []      # chunk texts; position i matches row i of the index
        self.index = None     # FAISS index, None until something is loaded
        self.metadata = {}    # chunk position -> {"source", "chunk_id", "word_count"}

        # Conversation history: dicts with "query", "response" and "sources"
        self.history = []

    def load_document(self, file_path: str) -> None:
        """
        Load a .txt, .pdf or .docx/.doc file and index its text.

        Problems reading PDF/Word files and unsupported extensions are logged
        and leave the current index untouched. Errors opening a .txt file
        propagate to the caller.

        Args:
            file_path: Path of the document to load.
        """
        logger.info(f"Loading document: {file_path}")

        # Simple extension-based handling
        ext = os.path.splitext(file_path)[1].lower()

        if ext == '.txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
        elif ext == '.pdf':
            try:
                import PyPDF2
                page_texts = []
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        page_text = page.extract_text()
                        # Pages without extractable text are skipped.
                        if page_text:
                            page_texts.append(page_text + "\n")
                text = "".join(page_texts)
            except ImportError:
                logger.error("PyPDF2 not installed. Install it using 'pip install PyPDF2'")
                return
            except Exception as e:
                logger.error(f"Error reading PDF: {e}")
                return
        elif ext in ['.docx', '.doc']:
            try:
                import docx
                doc = docx.Document(file_path)
                text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
            except ImportError:
                logger.error("python-docx not installed. Install it using 'pip install python-docx'")
                return
            except Exception as e:
                logger.error(f"Error reading Word document: {e}")
                return
        else:
            logger.error(f"Unsupported file format: {ext}. Supported formats: .txt, .pdf, .docx")
            return

        if not text.strip():
            logger.error("No text content found in the document")
            return

        # Process the text
        self._process_text(text, source=file_path)

    def load_text(self, text: str, source: str = "custom_text") -> None:
        """
        Index a text passed directly as a string.

        Args:
            text: Raw text to index; empty or whitespace-only text is rejected.
            source: Label stored in each chunk's metadata.
        """
        if not text.strip():
            logger.error("Empty text provided")
            return

        logger.info(f"Loading text from: {source}")
        self._process_text(text, source)

    def _process_text(self, text: str, source: str) -> None:
        """
        Clean, chunk and embed ``text`` and build the vector index.

        The instance state (``chunks``, ``metadata``, ``index``) is replaced
        only after the new index has been built, so a failed load leaves the
        previously loaded content usable.

        Args:
            text: Raw text to process.
            source: Label stored in each chunk's metadata.
        """
        text = self._clean_text(text)

        if len(text.split()) < 10:
            logger.error("Text is too short to process effectively")
            return

        # Chunk the text with overlap
        chunks_with_metadata = self._chunk_text(text, source)

        if not chunks_with_metadata:
            logger.error("No valid chunks created from the text")
            return

        # Embed chunk by chunk so one failing chunk does not abort the load.
        # Only chunks that were embedded are kept: the FAISS row number is
        # used as the chunk position, so both lists must stay aligned.
        logger.info("Creating embeddings...")
        embedded_items = []
        embeddings = []
        for item in tqdm(chunks_with_metadata, desc="Embedding chunks"):
            try:
                embedding = self.embedder.encode(
                    item["text"],
                    convert_to_tensor=False,
                    show_progress_bar=False,  # the tqdm bar above already reports progress
                )
            except Exception as e:
                logger.warning(f"Failed to embed chunk: {e}")
                continue
            embedded_items.append(item)
            embeddings.append(embedding)

        if not embeddings:
            logger.error("No embeddings created")
            return

        # Build FAISS index; pass the vectors as a float32 matrix.
        logger.info("Building FAISS index...")
        matrix = np.asarray(embeddings, dtype=np.float32)
        index = faiss.IndexFlatL2(matrix.shape[1])
        index.add(matrix)

        # Commit the new state in one go.
        self.chunks = [item["text"] for item in embedded_items]
        self.metadata = {
            i: {
                "source": item["source"],
                "chunk_id": i,
                "word_count": len(item["text"].split()),
            }
            for i, item in enumerate(embedded_items)
        }
        self.index = index

        logger.info(f"Successfully processed {len(self.chunks)} chunks")

    def _clean_text(self, text: str) -> str:
        """
        Normalize raw text before chunking.

        Lines of three characters or fewer are dropped, characters other than
        word characters, whitespace and ``. , ! ? ; : - ( )`` are replaced by
        spaces, and all whitespace runs are collapsed to single spaces.

        Args:
            text: Raw text.

        Returns:
            The cleaned text on a single line.
        """
        # Drop very short lines (likely OCR artifacts). This has to happen
        # while the line breaks still exist, i.e. before whitespace is
        # collapsed.
        stripped_lines = (line.strip() for line in text.splitlines())
        text = ' '.join(line for line in stripped_lines if len(line) > 3)

        # Remove special characters that might interfere
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)

        # NOTE(review): because of the leading \b this only separates a
        # single lowercase letter at the start of a word from a following
        # capital ("aB" -> "a B"); it does not split general camelCase.
        text = re.sub(r'\b([a-z])([A-Z])', r'\1 \2', text)

        # Collapse whitespace last so the replacements above leave no runs.
        return re.sub(r'\s+', ' ', text).strip()

    def _chunk_text(self, text: str, source: str) -> List[Dict]:
        """
        Split ``text`` into overlapping word windows.

        Windows hold ``chunk_size`` words and start every
        ``chunk_size - chunk_overlap`` words. Windows with fewer than
        ``MIN_CHUNK_WORDS`` words or failing ``_is_quality_chunk`` are
        dropped. Chunking stops once a window reaches the end of the text,
        because any later window would only repeat words already covered.

        Args:
            text: Cleaned text to split.
            source: Label copied into every chunk.

        Returns:
            A list of ``{"text", "source"}`` dicts (possibly empty).

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``.
        """
        words = text.split()
        total = len(words)

        if total < self.chunk_size:
            # Text shorter than one window: keep it whole if it is long enough.
            if total < self.MIN_CHUNK_WORDS:
                return []
            return [{"text": text, "source": source}]

        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        chunks_with_metadata = []
        for start in range(0, total, step):
            window = words[start:start + self.chunk_size]

            # Later windows can only be shorter, so stop at the first small one.
            if len(window) < self.MIN_CHUNK_WORDS:
                break

            chunk_text = ' '.join(window)

            # Quality filter: skip chunks that are mostly numbers or symbols
            if self._is_quality_chunk(chunk_text):
                chunks_with_metadata.append({
                    "text": chunk_text,
                    "source": source,
                })

            # This window already contains the last word of the text.
            if start + self.chunk_size >= total:
                break

        return chunks_with_metadata

    def _is_quality_chunk(self, text: str) -> bool:
        """
        Check if a chunk contains meaningful content.

        A chunk qualifies when it has at least 10 tokens and more than half
        of them are alphabetic once surrounding punctuation is removed.

        Args:
            text: Chunk text.

        Returns:
            True if the chunk should be indexed.
        """
        tokens = text.split()
        if len(tokens) < 10:
            return False

        # Strip edge punctuation first; otherwise ordinary prose tokens such
        # as "images," or "daemon." would be counted as non-words.
        alphabetic = sum(1 for token in tokens if token.strip(self._EDGE_PUNCT).isalpha())
        return alphabetic / len(tokens) > 0.5  # At least 50% actual words

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3,
                                 max_distance: float = 2.0) -> List[Dict]:
        """
        Retrieve the chunks closest to ``query``.

        Args:
            query: Question text to embed and search for.
            top_k: Maximum number of chunks to return.
            max_distance: Hits whose index distance is not below this value
                are discarded (lower distance = higher similarity).

        Returns:
            A list of ``{"text", "metadata", "score"}`` dicts in the order
            returned by the index; ``score`` is the index distance. Empty when
            nothing is loaded, ``top_k`` is not positive, the query cannot be
            embedded, or no hit is close enough.
        """
        # Compare against None explicitly rather than relying on the
        # truthiness of the index object.
        if self.index is None:
            logger.error("No index available. Load a document first.")
            return []

        neighbours = min(top_k, len(self.chunks))
        if neighbours <= 0:
            return []

        # Get embeddings for the query
        try:
            query_embedding = self.embedder.encode(
                [query], convert_to_tensor=False, show_progress_bar=False
            )
        except Exception as e:
            logger.error(f"Failed to embed query: {e}")
            return []

        # Search the index
        distances, indices = self.index.search(
            np.asarray(query_embedding, dtype=np.float32), neighbours
        )

        results = []
        for raw_idx, distance in zip(indices[0], distances[0]):
            idx = int(raw_idx)
            # Skip positions that do not refer to a stored chunk.
            if idx < 0 or idx >= len(self.chunks):
                continue

            if distance < max_distance:
                results.append({
                    "text": self.chunks[idx],
                    "metadata": self.metadata[idx],
                    "score": float(distance)
                })

        return results

    def _detect_repetitive_response(self, text: str) -> bool:
        """
        Detect if the response is repetitive or low quality.

        Args:
            text: Generated answer.

        Returns:
            True when the answer is shorter than 10 characters or 5 words,
            has fewer than 30% unique words, or contains more than three
            occurrences of the known filler phrases.
        """
        if not text or len(text.strip()) < 10:
            return True

        words = text.split()
        if len(words) < 5:
            return True

        # Share of distinct words; a low value means heavy repetition.
        if len(set(words)) / len(words) < 0.3:
            return True

        lowered = text.lower()
        phrase_hits = sum(lowered.count(phrase) for phrase in self._REPETITIVE_PHRASES)
        return phrase_hits > 3

    def _context_window(self) -> int:
        """
        Return the number of token positions the language model accepts.

        Uses the smallest limit reported by the model config and the
        tokenizer, falling back to ``_DEFAULT_CONTEXT_WINDOW``.
        """
        limits = []
        config = getattr(self.model, "config", None)
        config_limit = getattr(config, "max_position_embeddings", None)
        if isinstance(config_limit, int) and config_limit > 0:
            limits.append(config_limit)

        # Ignore implausibly large values: a tokenizer without a declared
        # limit may report a huge placeholder number.
        tokenizer_limit = getattr(self.tokenizer, "model_max_length", None)
        if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit <= 1_000_000:
            limits.append(tokenizer_limit)

        return min(limits) if limits else self._DEFAULT_CONTEXT_WINDOW

    def _count_tokens(self, text: str) -> int:
        """Return the number of tokenizer tokens in ``text``."""
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def _truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Return ``text`` cut down to at most ``max_tokens`` tokenizer tokens."""
        if max_tokens <= 0:
            return ""
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        if len(token_ids) <= max_tokens:
            return text
        return self.tokenizer.decode(token_ids[:max_tokens], skip_special_tokens=True).strip()

    def _format_prompt(self, context_text: str, query: str) -> str:
        """Fill the fixed prompt template with the context and the question."""
        return f"""Based on the following information, provide a clear and concise answer to the question.

Information:
{context_text}

Question: {query}

Please provide a focused answer based only on the information above. If the information doesn't fully answer the question, say so.

Answer:"""

    def _build_prompt(self, query: str, context_chunks: List[Dict], max_new_tokens: int) -> str:
        """
        Build a prompt that leaves room for ``max_new_tokens`` generated tokens.

        The token budget left after the fixed template, the question and the
        generation allowance is shared equally between the chunks, and each
        chunk text is truncated to its share.
        """
        labels = [f"Reference {i}: " for i, _ in enumerate(context_chunks, 1)]

        # Cost of everything except the chunk texts themselves.
        skeleton = self._format_prompt("\n\n".join(labels), query)
        budget = (self._context_window() - max_new_tokens
                  - self._PROMPT_TOKEN_MARGIN - self._count_tokens(skeleton))
        per_chunk = max(budget // len(context_chunks), 0)

        context_parts = [
            label + self._truncate_to_tokens(chunk['text'], per_chunk)
            for label, chunk in zip(labels, context_chunks)
        ]
        return self._format_prompt("\n\n".join(context_parts), query)

    def generate_response(self, query: str, context_chunks: List[Dict]) -> str:
        """
        Generate an answer to ``query`` from the retrieved context.

        Only the first two chunks are used, truncated so that the prompt plus
        the generated tokens fit into the model's context window. If
        generation fails or the answer looks degenerate, an excerpt of the
        best chunk is returned instead. Every answer, including fallbacks, is
        appended to ``self.history`` together with the chunks it was based on.

        Args:
            query: The user's question.
            context_chunks: Retrieval results as returned by
                ``retrieve_relevant_chunks``.

        Returns:
            The answer text.
        """
        if not context_chunks:
            return "I couldn't find relevant information in the document to answer your question."

        used_chunks = context_chunks[:2]  # Use top 2 chunks to avoid overwhelming
        best_chunk = context_chunks[0]
        max_new_tokens = 150

        try:
            prompt = self._build_prompt(query, used_chunks, max_new_tokens)

            response = self.generator(
                prompt,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.3,  # Lower temperature for more focused responses
                top_p=0.85,
                repetition_penalty=1.2,  # Reduce repetition
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id,
                # Return only the continuation, so the answer does not have
                # to be cut out of the prompt by searching for "Answer:".
                return_full_text=False,
            )

            answer = self._post_process_answer(response[0]['generated_text'].strip())

            # Fall back to quoting the best chunk when the answer is unusable.
            if self._detect_repetitive_response(answer):
                answer = f"Based on the document, here's what I found about your question: {best_chunk['text'][:300]}..."

        except Exception as e:
            logger.error(f"Error generating response: {e}")
            # Fallback to context-based response
            answer = f"I found relevant information: {best_chunk['text'][:200]}..."

        # Track conversation history for every outcome so that the chat's
        # 'sources' command always refers to the most recent question.
        self.history.append({"query": query, "response": answer, "sources": used_chunks})

        return answer

    def _post_process_answer(self, answer: str) -> str:
        """
        Clean the generated answer.

        Keeps the complete sentences of ``answer`` (those ending in ``.``,
        ``!`` or ``?``, longer than 10 characters and starting with an
        uppercase letter) with their original punctuation. When no sentence
        qualifies, the first 300 characters of the answer are used instead.

        Args:
            answer: Raw generated text.

        Returns:
            The cleaned answer, or a fixed message when ``answer`` is empty.
        """
        if not answer or not answer.strip():
            return "I couldn't generate a clear answer based on the available information."

        # Split after a terminator that is followed by whitespace. This keeps
        # the terminator with its sentence and does not break "1.5" apart.
        pieces = re.split(r'(?<=[.!?])\s+', answer.strip())

        complete_sentences = []
        for piece in pieces:
            # A piece without a terminator is an unfinished trailing sentence.
            if not piece or piece[-1] not in '.!?':
                continue
            body = piece.rstrip('.!?').strip()
            if len(body) > 10 and body[0].isupper():
                complete_sentences.append(piece)

        if complete_sentences:
            result = ' '.join(complete_sentences)
        else:
            # Fallback to original answer, but truncate at reasonable length
            result = answer[:300]

        # Clean up formatting
        return re.sub(r'\s+', ' ', result).strip()

    def ask(self, query: str, top_k: int = 3) -> str:
        """
        Answer a question end to end: retrieve context, then generate.

        Args:
            query: The user's question.
            top_k: Maximum number of chunks to retrieve.

        Returns:
            The answer, or an explanatory message when nothing is loaded, the
            query is blank, or no relevant chunk is found.
        """
        if self.index is None:
            return "Please load a document first using the load_document() method."

        if not query.strip():
            return "Please provide a question to answer."

        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(query, top_k)

        if not relevant_chunks:
            return "I couldn't find relevant information in the document to answer your question. Try rephrasing your question or check if the document contains information about this topic."

        # Generate and return response
        return self.generate_response(query, relevant_chunks)

    def chat(self) -> None:
        """
        Run an interactive question/answer loop on the console.

        Supports the commands 'exit', 'help', 'sources' and 'stats'. The loop
        ends on 'exit', Ctrl+C, or when the input stream is closed.
        """
        if self.index is None:
            print("❌ No document loaded. Please load a document first.")
            return

        print("\n📚 Enhanced RAG Chatbot ready!")
        print(f"📊 Loaded {len(self.chunks)} chunks from your document(s)")
        print("💬 Ask questions about the content. Type 'exit' to quit, 'help' for commands.\n")

        while True:
            try:
                query = input("🧑 You: ").strip()
                command = query.lower()

                if command == 'exit':
                    print("👋 Goodbye!")
                    break

                if command == 'help':
                    print("\n📋 Available commands:")
                    print("  'exit' - Exit the chat")
                    print("  'help' - Show this help message")
                    print("  'sources' - Show sources for the last answer")
                    print("  'stats' - Show document statistics")
                    continue

                if command == 'sources':
                    if not self.history:
                        print("📝 No previous answers to show sources for.")
                    else:
                        print("\n📚 Sources for the last answer:")
                        last_entry = self.history[-1]
                        # Prefer the chunks recorded with the answer; re-run
                        # retrieval only for entries that carry none.
                        sources = last_entry.get("sources")
                        if not sources:
                            sources = self.retrieve_relevant_chunks(last_entry["query"], 3)
                        for i, source in enumerate(sources, 1):
                            print(f"\n📄 Source {i}: {source['metadata']['source']}")
                            print(f"🎯 Relevance score: {source['score']:.3f}")
                            print(f"📖 Preview: {textwrap.shorten(source['text'], width=150)}")
                    continue

                if command == 'stats':
                    total_words = sum(meta['word_count'] for meta in self.metadata.values())
                    # max(..., 1) guards the division if no chunks are stored.
                    average_words = total_words // max(len(self.chunks), 1)
                    sources = set(meta['source'] for meta in self.metadata.values())
                    print(f"\n📊 Document Statistics:")
                    print(f"  📄 Total chunks: {len(self.chunks)}")
                    print(f"  📝 Total words: {total_words:,}")
                    print(f"  🔢 Average words per chunk: {average_words}")
                    print(f"  📚 Source documents: {len(sources)}")
                    continue

                if not query:
                    continue

                # Get and display answer
                print("\n🤖 Analyzing... ", end="", flush=True)
                answer = self.ask(query)
                print("Done!")

                # Format and display the answer
                print(f"\n🤖 Bot: {answer}")

            except (KeyboardInterrupt, EOFError):
                # EOFError: input stream closed. Without leaving the loop
                # here, input() would fail again on every iteration.
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                logger.error(f"Chat error: {e}")
                print(f"\n🤖 Bot: I encountered an error. Please try again.")


# Example usage and testing
if __name__ == "__main__":
    # Script entry point: build the chatbot, then either index a document
    # chosen by the user or fall back to a built-in Docker sample text, and
    # start the interactive chat when indexing succeeded.
    try:
        # Create the improved chatbot
        print("🚀 Initializing Enhanced RAG Chatbot...")
        chatbot = ImprovedRAGChatbot()

        # Check if document path is provided
        raw_path = input("📁 Enter the path to your document (or press Enter for demo): ")
        # Pasted paths are often wrapped in quotes; remove them so the file
        # lookup below does not fail on an otherwise valid path.
        document_path = raw_path.strip().strip('"\'').strip()

        if document_path:
            # isfile (not exists): a directory cannot be loaded as a document.
            if os.path.isfile(document_path):
                chatbot.load_document(document_path)
                if chatbot.index is not None:
                    # Start interactive chat
                    chatbot.chat()
                else:
                    print("❌ Failed to process the document. Please check the file format and content.")
            else:
                print(f"❌ File not found: {document_path}")
        else:
            # Demo mode with sample text
            sample_text = """
            Docker is a containerization platform that enables developers to package applications and their dependencies into lightweight, portable containers. 
            
            These containers include everything needed to run an application: code, runtime, system tools, libraries, and settings. Docker containers are isolated from each other and the host system, making them secure and consistent across different environments.
            
            Key benefits of Docker include:
            - Portability: Containers run consistently across development, testing, and production environments
            - Efficiency: Containers share the host OS kernel, making them more resource-efficient than virtual machines
            - Scalability: Easy to scale applications horizontally by spinning up multiple container instances
            - Version control: Docker images can be versioned and tracked
            - Microservices architecture: Perfect for breaking down monolithic applications into smaller, manageable services
            
            Docker uses a client-server architecture with the Docker daemon managing containers, images, networks, and volumes. The Docker CLI provides commands to interact with the daemon.
            
            Common Docker commands include:
            - docker run: Create and start a new container
            - docker build: Build an image from a Dockerfile
            - docker pull: Download an image from a registry
            - docker ps: List running containers
            - docker stop: Stop a running container
            """

            print("🎯 Loading demo content about Docker...")
            chatbot.load_text(sample_text, "Docker Documentation Demo")

            if chatbot.index is not None:
                print("✅ Demo loaded successfully!")
                chatbot.chat()
            else:
                print("❌ Failed to process demo content.")

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"Startup error: {e}")
        print(f"❌ Error starting chatbot: {e}")

2025-05-21 22:46:15,290 - INFO - Using device: cpu
2025-05-21 22:46:15,291 - INFO - Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
2025-05-21 22:46:15,295 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


🚀 Initializing Enhanced RAG Chatbot...


2025-05-21 22:46:20,136 - INFO - Loading LLM: gpt2-medium
Device set to use cpu
2025-05-21 22:47:21,864 - INFO - Loading document: C:\Users\acer\Desktop\advanced learning\Docker+for+Beginners-Mumshad+Mannambeth.pdf
2025-05-21 22:47:24,592 - INFO - Creating embeddings...
Embedding chunks:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:   7%|▋         | 1/14 [00:00<00:07,  1.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  14%|█▍        | 2/14 [00:00<00:05,  2.29it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  21%|██▏       | 3/14 [00:01<00:03,  2.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  29%|██▊       | 4/14 [00:01<00:03,  3.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  36%|███▌      | 5/14 [00:01<00:02,  3.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  43%|████▎     | 6/14 [00:01<00:02,  3.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  50%|█████     | 7/14 [00:02<00:02,  3.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  57%|█████▋    | 8/14 [00:02<00:01,  3.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  64%|██████▍   | 9/14 [00:02<00:01,  3.19it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  71%|███████▏  | 10/14 [00:03<00:01,  3.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  79%|███████▊  | 11/14 [00:03<00:00,  3.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  86%|████████▌ | 12/14 [00:03<00:00,  3.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks:  93%|█████████▎| 13/14 [00:04<00:00,  3.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding chunks: 100%|██████████| 14/14 [00:04<00:00,  3.08it/s]
2025-05-21 22:47:29,143 - INFO - Building FAISS index...
2025-05-21 22:47:29,146 - INFO - Successfully processed 14 chunks



📚 Enhanced RAG Chatbot ready!
📊 Loaded 14 chunks from your document(s)
💬 Ask questions about the content. Type 'exit' to quit, 'help' for commands.


🤖 Analyzing... 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done!

🤖 Bot: "A virtual machine running as an application that runs inside of another process."

🤖 Analyzing... 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
2025-05-21 22:48:41,711 - ERROR - Error generating response: index out of range in self


Done!

🤖 Bot: I found relevant information: docker f o r b e g i n n e r s MUMSHAD MANNAMBETH w w w . k o d e k l o u d . c o m Objectives What are Containers? What is Docker? Why do you need it? What can it do? Run Docker Containers Create a D...

🤖 Analyzing... 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done!

🤖 Bot: It's very powerful as an application orchestrator that allows us not just to build applications but also manage them through our infrastructure like we would any other system running software such services etc. The fact of this is because there isn´t one single solution out here yet which will allow all these different scenarios together without having too much overhead when they come into play at once – especially if your workload has many components involved including networking/storage systems… And even though some people might argue about how "unnecessary" virtual machines really are compared against physical ones, I think most users who use their computers regularly already have enough computing power available within those devices anyway due both hardware resources used by computer itself plus network bandwidth usage.
