# 🧪 The Enigmatic Research Assistant  
*Automated Document Analysis Pipeline with RAG & Multimodal Support*

#### Import libraries

In [1]:
import os
import re
import time
import textwrap
import logging
import warnings
import pandas as pd
import numpy as np
import tiktoken
import faiss
import nltk
from tqdm import tqdm
from pathlib import Path
from typing import List, Dict
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Set, Tuple  

# Initialize NLTK
# This first download call runs unconditionally on every execution of the
# cell, before the presence check below.
nltk.download('punkt')
try:
    # Probe for both tokenizer resources; nltk.data.find raises LookupError
    # when a resource is not installed.
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    # At least one resource is missing: fetch all three, with progress output
    # enabled (quiet=False).
    nltk.download('punkt', quiet=False)
    nltk.download('punkt_tab', quiet=False)
    nltk.download('perluniprops', quiet=False)

# Suppress warnings
# NOTE: this silences every warning category for the whole session, not only
# the ones raised by the libraries imported above.
warnings.filterwarnings("ignore")
# NOTE(review): presumably limits FAISS to a single OpenMP thread; confirm
# against the FAISS documentation.
faiss.omp_set_num_threads(1)

# Configure logging
# INFO-level records are formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
# Module-level logger shared by the classes defined in the following cells.
logger = logging.getLogger(__name__)

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TeHamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
pip freeze > requirements/AdaptiveEngine.txt


Note: you may need to restart the kernel to use updated packages.


The system cannot find the path specified.


## 🔍 1. Text Extraction from Publications  
**✅ Implemented** | *Key Libraries: `pdfplumber`, `python-docx`, `pandas`*

| **Feature**              | **Implementation Details**                              |
|--------------------------|--------------------------------------------------------|
| **Supported Formats**     | `.pdf`, `.docx`, `.xlsx`, `.csv`, `.xls`, `.xlsm`      |
| **Table Extraction**      | `[TABLE START]`/`[TABLE END]` markers for PDFs          |
|                          | Word tables → Plain text conversion                    |
| **Error Resilience**      | Skips unsupported files + logs warnings via `logging`   |
| **Specialized Parsing**   | Spreadsheets → Markdown via `df.to_markdown()`          |

In [2]:
class ResearchProcessor:
    """Extract plain text from PDF, Word and spreadsheet files.

    Every supported suffix is mapped to a handler method in
    ``self.file_handlers``. A handler returns the extracted text, or an empty
    string when extraction fails; failures are logged, not raised.
    """

    SUPPORTED_FORMATS = ['.pdf', '.docx', '.xlsx', '.csv', '.xls', '.xlsm']

    def __init__(self):
        # Dispatch table: lower-cased file suffix -> extraction method.
        self.file_handlers = {
            '.pdf': self._process_pdf,
            '.docx': self._process_word,
            '.xlsx': self._process_spreadsheet,
            '.xls': self._process_spreadsheet,
            '.xlsm': self._process_spreadsheet,
            '.csv': self._process_spreadsheet
        }

    @staticmethod
    def _format_row(cells) -> str:
        """Join table cells with ``|``, rendering missing (None) cells as empty."""
        return "|".join("" if cell is None else str(cell) for cell in cells)

    def _process_pdf(self, file_path: Path) -> str:
        """Return the text of every page, each prefixed by ``=== PAGE n ===``.

        Tables found on a page are appended after the page text between
        ``[TABLE START]`` and ``[TABLE END]``, one ``|``-separated row per line.
        Returns an empty string if the file cannot be processed.
        """
        from pdfplumber import open as pdf_open
        content = []
        try:
            with pdf_open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    text = page.extract_text() or ""
                    tables = page.extract_tables()
                    content.append(f"\n=== PAGE {page_num} ===\n{text}")
                    if tables:
                        content.append("\n[TABLE START]")
                        for table in tables:
                            # None cells would otherwise show up as the
                            # literal text "None" in the extracted content.
                            content.extend(self._format_row(row) for row in table)
                        content.append("[TABLE END]")
            return "\n".join(content)
        except Exception as e:
            logger.error(f"PDF processing error: {str(e)}")
            return ""

    def _process_word(self, file_path: Path) -> str:
        """Return non-blank paragraphs followed by all tables of a .docx file.

        Returns an empty string when the document has no content or cannot be
        read, consistent with the other handlers.
        """
        from docx import Document
        content = []
        try:
            doc = Document(file_path)
            content.extend(para.text for para in doc.paragraphs if para.text.strip())
            for table in doc.tables:
                content.append("[TABLE START]")
                for row in table.rows:
                    content.append("|".join(cell.text for cell in row.cells))
                content.append("[TABLE END]")
            return "\n".join(content)
        except Exception as e:
            logger.error(f"Word processing error: {str(e)}")
            return ""

    def _process_spreadsheet(self, file_path: Path) -> str:
        """Return a CSV/Excel sheet rendered as a table between table markers.

        Returns an empty string if the file cannot be read.
        """
        try:
            if file_path.suffix.lower() == '.csv':
                df = pd.read_csv(file_path)
            else:
                # No engine is forced: a fixed 'openpyxl' engine cannot open
                # legacy .xls workbooks, so pandas picks one per file type.
                df = pd.read_excel(file_path)
            try:
                table = df.to_markdown()
            except ImportError:
                # to_markdown needs the optional 'tabulate' package; fall back
                # to a plain-text rendering instead of dropping the file.
                table = df.to_string()
            return "\n".join(["[TABLE START]", table, "[TABLE END]"])
        except Exception as e:
            logger.error(f"Spreadsheet error: {str(e)}")
            return ""

    def process_document(self, file_path: Path) -> Dict:
        """Extract one file and wrap the text with basic metadata.

        Args:
            file_path: Path of the document to read.

        Returns:
            A dict with ``filename``, ``content`` and ``processed_at`` keys, or
            ``None`` when ``file_path`` is not an existing file. ``content`` is
            an empty string for suffixes without a registered handler.
        """
        file_path = Path(file_path)
        if not file_path.is_file():
            logger.warning(f"File not found: {file_path}")
            return None

        logger.info(f"Processing {file_path.name}")
        handler = self.file_handlers.get(file_path.suffix.lower())
        result = {
            'filename': file_path.name,
            'content': handler(file_path) if handler else "",
            'processed_at': time.strftime("%Y-%m-%d %H:%M:%S")
        }

        # Covers both the empty and the suspiciously short case.
        if len(result['content']) < 100:
            logger.warning(f"Short/empty content in {file_path.name}")

        return result

### Key Components:
- **Tokenization**  
  ⚙️ `cl100k_base` tokenizer (GPT-4 encoding; only approximates LLaMA token counts)  
  🎯 Target: 768 tokens/chunk (LLM optimization)

- **Context Preservation**  
  🔗 Page-level splits via `=== PAGE \d+ ===` regex  
  🔍 Sentence boundaries using `nltk` punkt

- **Metadata Tracking**  
  📌 `chunk_id: {filename}_{sequence}`  
  📄 Page numbers + token counts per chunk

In [3]:
class ResearchChunker:
    """Split extracted document text into token-bounded chunks.

    Text is first divided at ``=== PAGE n ===`` markers, then into sentences,
    and sentences are packed greedily into chunks of about ``chunk_size``
    tokens. Token counts use the ``cl100k_base`` tiktoken encoding.
    """

    def __init__(self, chunk_size=768):
        self.chunk_size = chunk_size
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        # Documents whose raw text is shorter than this (in characters) are
        # not chunked at all.
        self.min_chunk_length = 100
        self._verify_nltk_resources()

    def _verify_nltk_resources(self):
        """Load the English punkt sentence tokenizer, downloading it if absent."""
        try:
            nltk.data.find('tokenizers/punkt')
            self.sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            nltk.download('punkt', quiet=True)
            self.sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def _create_chunk(self, document: Dict, content: List[str], pages: Set[int], chunk_number: int) -> Dict:
        """Build the chunk record: ids, space-joined text, pages, token count."""
        text = ' '.join(content)
        return {
            'document_id': document['filename'],
            'chunk_id': f"{document['filename']}_{chunk_number}",
            'content': text,
            'page_numbers': sorted(pages),
            'token_count': len(self.tokenizer.encode(text))
        }

    def _extract_pages(self, content: List[str]) -> Set[int]:
        """Return every page number named by a page marker in ``content``."""
        pages = set()
        for text in content:
            matches = re.finditer(r'=== PAGE (\d+) ===', text)
            pages.update(int(m.group(1)) for m in matches)
        return pages

    def _split_pages(self, raw_content: str) -> List[str]:
        """Split text into per-page strings, each starting at its page marker."""
        pages = []
        current_page = []
        for item in re.split(r'(=== PAGE \d+ ===)', raw_content):
            if re.match(r'=== PAGE \d+ ===', item) and current_page:
                pages.append(" ".join(current_page))
                current_page = []
            current_page.append(item)
        if current_page:
            pages.append(" ".join(current_page))
        # The text before the first marker is usually empty; drop blank pages
        # so they can never turn into empty chunks.
        return [page for page in pages if page.strip()]

    def _split_oversized(self, sentence: str) -> List[Tuple[str, int]]:
        """Return ``(text, token_count)`` pieces no longer than ``chunk_size``.

        A sentence that already fits is returned unchanged as a single piece.
        Longer input (for example a table with no sentence boundaries) is cut
        into consecutive token windows and decoded back to text.
        """
        tokens = self.tokenizer.encode(sentence)
        if len(tokens) <= self.chunk_size:
            return [(sentence, len(tokens))]

        step = max(1, self.chunk_size)  # guard against a zero range step
        pieces = []
        for start in range(0, len(tokens), step):
            window = tokens[start:start + step]
            # NOTE(review): a window edge may fall inside a multi-byte
            # character; confirm tiktoken's decode behaviour if exact
            # round-tripping matters.
            pieces.append((self.tokenizer.decode(window), len(window)))
        return pieces

    def chunk_document(self, document: Dict) -> List[Dict]:
        """Chunk one processed document.

        Args:
            document: Dict with ``filename`` and ``content`` keys.

        Returns:
            A list of chunk dicts (see ``_create_chunk``). Chunks never span
            pages; text without page markers is attributed to page 1. Returns
            an empty list for missing or too-short content, or on any error.
        """
        chunks = []
        try:
            raw_content = document.get('content', '')
            if not raw_content or len(raw_content) < self.min_chunk_length:
                return []

            for page_content in self._split_pages(raw_content):
                page_number = re.search(r'=== PAGE (\d+) ===', page_content)
                page_num = int(page_number.group(1)) if page_number else 1

                current_chunk = []
                current_tokens = 0

                for sentence in self.sentence_tokenizer.tokenize(page_content):
                    for piece, piece_tokens in self._split_oversized(sentence):
                        # Close the running chunk before it would overflow.
                        if current_chunk and current_tokens + piece_tokens > self.chunk_size:
                            chunks.append(self._create_chunk(
                                document, current_chunk, {page_num}, len(chunks) + 1
                            ))
                            current_chunk = []
                            current_tokens = 0

                        current_chunk.append(piece)
                        current_tokens += piece_tokens

                if current_chunk:
                    chunks.append(self._create_chunk(
                        document, current_chunk, {page_num}, len(chunks) + 1
                    ))

            logger.info(f"Generated {len(chunks)} chunks for {document['filename']}")
            return chunks

        except Exception as e:
            # .get keeps the handler itself from raising on a malformed document.
            logger.error(f"Chunking failed for {document.get('filename', '<unknown>')}: {str(e)}")
            return []

### Architecture Overview:
```mermaid
graph LR
A[Raw Text] --> B(nomic-embed-text-v1)
B --> C[768-dim Vectors]
C --> D{FAISS Index}
D --> E[Semantic Search]
```

In [4]:
class ResearchVectorDB:
    """In-memory vector store: sentence embeddings in a flat L2 FAISS index.

    ``self.metadata[i]`` is the chunk dict whose embedding is row ``i`` of the
    index, so both must always grow together and in the same order.
    """

    def __init__(self):
        self.embedder = SentenceTransformer(
            'nomic-ai/nomic-embed-text-v1',
            trust_remote_code=True,
            device='cpu'
        )
        self._initialize_faiss_index()
        self.metadata = []

    def _initialize_faiss_index(self):
        """Create an empty index sized to the embedder's output dimension."""
        dim = self.embedder.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatL2(dim)
        logger.info(f"Initialized FAISS index with dimension {dim}")

    def add_documents(self, chunks: List[Dict]):
        """Embed and index the given chunks.

        Chunks without a ``content`` key are skipped entirely, in both the
        index and ``self.metadata``, so row ids and metadata positions stay
        aligned. Errors are logged, not raised.
        """
        if not chunks or not isinstance(chunks, list):
            return

        try:
            valid_chunks = [chunk for chunk in chunks if 'content' in chunk]
            if not valid_chunks:
                return
            if len(valid_chunks) < len(chunks):
                logger.warning(
                    f"Skipped {len(chunks) - len(valid_chunks)} chunks without content"
                )

            embeddings = self.embedder.encode(
                [chunk['content'] for chunk in valid_chunks],
                show_progress_bar=True,
                convert_to_numpy=True
            )
            if embeddings.shape[0] == 0:
                return

            self.index.add(embeddings.astype('float32'))
            # Store exactly the chunks that were embedded, in the same order.
            self.metadata.extend(valid_chunks)
            logger.info(f"Added {len(valid_chunks)} chunks (Total: {self.index.ntotal})")
        except Exception as e:
            logger.error(f"Vector DB error: {str(e)}")

    def search(self, query_embed: np.ndarray, top_k=3) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(distances, indices)`` of the nearest stored vectors.

        Args:
            query_embed: One query vector (1-D) or a batch of them (2-D).
            top_k: Maximum number of neighbours per query.

        Returns:
            The arrays produced by the index search. At most ``index.ntotal``
            neighbours are requested, so the result holds no padding entries.
            Two empty arrays are returned when the index is empty, ``top_k``
            is not positive, or the search fails.
        """
        try:
            if self.index.ntotal == 0 or top_k <= 0:
                return np.empty((0,)), np.empty((0,))

            if query_embed.ndim == 1:
                query_embed = query_embed.reshape(1, -1)

            # Asking for more neighbours than exist pads the result with -1.
            k = min(top_k, self.index.ntotal)
            distances, indices = self.index.search(query_embed.astype('float32'), k)
            return distances, indices
        except Exception as e:
            logger.error(f"Search failed: {str(e)}")
            return np.empty((0,)), np.empty((0,))

## 🤖 4. RAG Q&A System  
**✅ Implemented** | *Components: `TinyLlama-1.1B-Chat` (GGUF), `llama.cpp`*

### Workflow:
1. 🔎 **Query Embedding**: `nomic-embed-text-v1` encodes the question
2. 🎯 **Top-3 Retrieval**: FAISS similarity search
3. 💡 **Answer Generation**:
   - Dynamic context window (max 2048 tokens)
   - Temperature: 0.3 for focused responses
4. 📚 **Source Attribution**:
   ```python
   {'document_id': 'file.pdf', 'page_numbers': [12, 14]}
   ```

## 🌍 5. Translation System  
**✅ Implemented** | *Model: `mbart-large-50`*

| **Feature**         | **Implementation**                          |
|---------------------|--------------------------------------------|
| Multilingual Support| 50 languages via HuggingFace pipeline       |
| Format Preservation | Line-by-line translation                   |
| Quality Validation  | BLEU score against reference texts         |
| Efficiency          | Batch processing for large documents       |

In [5]:
class ResearchAnalyst:
    """Retrieval-augmented Q&A and translation on top of a local LLM.

    Questions are embedded, matched against the vector store, and answered by
    the LLM from the retrieved chunks. Translation prompts the same LLM one
    sentence (or one table cell) at a time.
    """

    # Tokens reserved for the generated answer.
    ANSWER_TOKEN_BUDGET = 512
    # Rough allowance for the fixed prompt wording around context and question.
    PROMPT_OVERHEAD_TOKENS = 32

    def __init__(self):
        self.vector_db = ResearchVectorDB()
        self.max_context_tokens = 2048  # Matches model's training context
        self.llm = self._initialize_llm()
        self.conversation_history = []
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # Unified tokenizer

    def _initialize_llm(self):
        """Load the GGUF model from ``models/`` in CPU-only mode.

        Raises:
            FileNotFoundError: If the model file is not present.
        """
        MODEL_NAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
        model_path = Path("models") / MODEL_NAME

        if not model_path.exists():
            raise FileNotFoundError(
                f"Model file {MODEL_NAME} not found in models/\n"
                "Download from: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
            )

        logger.info(f"Initializing LLM: {MODEL_NAME}")
        return Llama(
            model_path=str(model_path),
            n_ctx=self.max_context_tokens,
            n_threads=4,
            n_gpu_layers=0,  # CPU-only mode
            verbose=False
        )

    def answer_question(self, question: str, top_k=3) -> Dict:
        """Answer ``question`` from the most similar indexed chunks.

        Args:
            question: The user's question.
            top_k: Maximum number of chunks to retrieve.

        Returns:
            A dict with ``question``, ``answer``, ``sources`` (the chunk dicts
            placed in the prompt) and ``metrics`` (``tokens_sec``). On failure
            the same keys are present, ``answer`` describes the failure, and
            an additional ``error`` key holds the exception text.
        """
        try:
            # Semantic search
            query_embed = self.vector_db.embedder.encode([question])
            distances, indices = self.vector_db.search(query_embed, top_k)

            # Handle empty results
            if indices.size == 0 or len(self.vector_db.metadata) == 0:
                return {
                    'question': question,
                    'answer': "No relevant information found",
                    'sources': [],
                    'metrics': {'tokens_sec': 0}
                }

            # The question and the answer share the context window with the
            # retrieved text, so only the remainder is available for chunks.
            # Counts come from self.tokenizer and are therefore approximate
            # for the LLM.
            context_budget = (
                self.max_context_tokens
                - self.ANSWER_TOKEN_BUDGET
                - self.PROMPT_OVERHEAD_TOKENS
                - len(self.tokenizer.encode(question))
            )

            # Build context with token tracking
            context_parts = []
            total_tokens = 0
            used_indices = []

            for idx in indices[0]:
                idx = int(idx)
                # FAISS marks missing neighbours with -1; as a Python index
                # that would silently select the last chunk.
                if idx < 0 or idx >= len(self.vector_db.metadata):
                    continue

                chunk = self.vector_db.metadata[idx]
                chunk_tokens = len(self.tokenizer.encode(chunk['content']))

                if total_tokens + chunk_tokens > context_budget:
                    break

                context_parts.append(f"Document excerpt:\n{chunk['content']}")
                total_tokens += chunk_tokens
                used_indices.append(idx)

            # Generate answer
            prompt = f"Context information:\n{' '.join(context_parts)}\nQuestion: {question}\nAnswer:"
            start_time = time.perf_counter()
            response = self.llm(
                prompt=prompt,
                max_tokens=self.ANSWER_TOKEN_BUDGET,
                temperature=0.3,
                stop=["\n", "###"]
            )
            # Clamp so an instantaneous response cannot divide by zero.
            elapsed = max(time.perf_counter() - start_time, 1e-9)

            answer_text = response['choices'][0]['text']
            # Prefer the token count reported in the response; otherwise
            # estimate it with the local tokenizer.
            completion_tokens = (response.get('usage') or {}).get('completion_tokens')
            if completion_tokens is None:
                completion_tokens = len(self.tokenizer.encode(answer_text))

            return {
                'question': question,
                'answer': answer_text.strip(),
                'sources': [self.vector_db.metadata[i] for i in used_indices],
                'metrics': {'tokens_sec': completion_tokens / elapsed}
            }
        except Exception as e:
            logger.error(f"Q&A failed: {str(e)}")
            # Keep the normal result shape so callers reading these keys do
            # not fail with a KeyError on top of the original error.
            return {
                'question': question,
                'answer': f"Q&A failed: {str(e)}",
                'sources': [],
                'metrics': {'tokens_sec': 0},
                'error': str(e)
            }

    def _translate_text(self, text: str, target_lang: str) -> str:
        """Ask the LLM to translate one snippet and return the stripped reply."""
        response = self.llm(
            prompt=f"Translate to {target_lang}: {text}",
            max_tokens=max(1, len(text) * 3),  # Increased buffer, never zero
            temperature=0.1
        )
        return response['choices'][0]['text'].strip()

    def translate_content(self, text: str, target_lang: str) -> str:
        """Translate ``text`` sentence by sentence.

        Text containing ``[TABLE START]`` is handed to ``_translate_table``.
        Returns the original text unchanged if translation fails.
        """
        try:
            if "[TABLE START]" in text:
                return self._translate_table(text, target_lang)

            sentences = nltk.sent_tokenize(text)
            return ' '.join(self._translate_text(sent, target_lang) for sent in sentences)

        except Exception as e:
            logger.error(f"Translation failed: {str(e)}")
            return text  # Return original text on failure

    def _translate_table(self, table_text: str, target_lang: str) -> str:
        """Translate ``|``-separated table text cell by cell.

        Marker lines, and cells without any letters (blank cells, numbers,
        separator dashes), are copied through untouched; only cells with
        letters are sent to the LLM. Returns the original text on failure.
        """
        try:
            translated_rows = []

            for row in table_text.split('\n'):
                if row.strip() in ['[TABLE START]', '[TABLE END]']:
                    translated_rows.append(row)
                    continue

                translated_cells = []
                for cell in row.split('|'):
                    if any(ch.isalpha() for ch in cell):
                        translated_cells.append(self._translate_text(cell, target_lang))
                    else:
                        # Nothing to translate; an empty cell would also turn
                        # into a request with a zero token limit.
                        translated_cells.append(cell)

                translated_rows.append('|'.join(translated_cells))

            return '\n'.join(translated_rows)

        except Exception as e:
            logger.error(f"Table translation failed: {str(e)}")
            return table_text

### Step 5: Translation System
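- **Multilingual Support**: Designed around the `mbart-large-50` model for high-quality translations; note that `translate_content` in this notebook currently translates by prompting the local LLM.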
- **Format Preservation**: Translated text line-by-line to retain original structure.
- **Evaluation**: Added BLEU score to compare translations with reference texts.

In [6]:
def analyze_research_documents(input_dir: Path):
    """Index every supported document in ``input_dir``, then run a Q&A loop.

    Each supported file is extracted, chunked and added to the analyst's
    vector store. Afterwards questions are read from standard input until an
    empty line is entered; every answer is printed with its throughput and
    source pages.

    Args:
        input_dir: Directory whose direct children are scanned.
    """
    processor = ResearchProcessor()
    chunker = ResearchChunker(chunk_size=768)
    analyst = ResearchAnalyst()

    print("\n🔍 Initializing Document Processing...")
    total_chunks = 0
    indexed_files = 0

    # Materialise the listing once so the progress bar has a total.
    for file_path in tqdm(list(input_dir.glob('*')), desc="Processing documents"):
        if file_path.suffix.lower() not in processor.SUPPORTED_FORMATS:
            continue

        try:
            doc = processor.process_document(file_path)
            if not doc or not doc.get('content'):
                continue

            chunks = chunker.chunk_document(doc)
            if not chunks:
                continue

            analyst.vector_db.add_documents(chunks)
            total_chunks += len(chunks)
            indexed_files += 1

        except Exception as e:
            logger.error(f"Error processing {file_path.name}: {str(e)}")

    # Report the files that actually produced chunks, not every directory entry.
    print(f"\n✅ Successfully processed {total_chunks} chunks from {indexed_files} files")

    while True:
        question = input("\n📝 Enter your question (or press Enter to exit):\n> ").strip()
        if not question:
            break

        try:
            result = analyst.answer_question(question)

            # A failed answer carries an 'error' key and may lack the other
            # keys; show the error itself instead of a secondary KeyError.
            if result.get('error'):
                print(f"\n❌ Error: {result['error']}")
                continue

            print(f"\n📜 Question: {result['question']}")
            print(f"\n💡 Answer:\n{textwrap.fill(result['answer'], width=80)}")
            print(f"\n⚡ Performance: {result['metrics']['tokens_sec']:.1f} tokens/sec")

            if result['sources']:
                print("\n📚 Sources:")
                for i, source in enumerate(result['sources'], 1):
                    print(f"{i}. {source['document_id']} (Pages: {source['page_numbers']})")
        except Exception as e:
            print(f"\n❌ Error: {str(e)}")

    print("\n" + "="*80)
    print("🔚 Analysis session completed")

if __name__ == "__main__":
    # Entry point: analyse the "documents" folder under the project root,
    # refusing to start when that folder does not exist.
    base_dir = Path(r"D:\Projects\The Enigmatic Research of Dr. X")
    input_dir = base_dir.joinpath("documents")
    if input_dir.exists():
        analyze_research_documents(input_dir)
    else:
        raise FileNotFoundError(f"Directory not found: {input_dir}")

2025-04-15 22:16:43,096 - INFO - Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1
2025-04-15 22:16:53,616 - INFO - Initialized FAISS index with dimension 768
2025-04-15 22:16:53,616 - INFO - Initializing LLM: tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf



🔍 Initializing Document Processing...


Processing documents:   0%|                                                                     | 0/10 [00:00<?, ?it/s]2025-04-15 22:16:53,768 - INFO - Processing Dataset summaries and citations.docx
2025-04-15 22:16:54,028 - INFO - Generated 11 chunks for Dataset summaries and citations.docx


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:17:06,229 - INFO - Added 11 chunks (Total: 11)
Processing documents:  10%|██████                                                       | 1/10 [00:12<01:52, 12.46s/it]2025-04-15 22:17:06,231 - INFO - Processing Loan amortisation schedule1.xlsx
2025-04-15 22:17:06,460 - INFO - Generated 1 chunks for Loan amortisation schedule1.xlsx


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:17:08,980 - INFO - Added 1 chunks (Total: 12)
Processing documents:  20%|████████████▏                                                | 2/10 [00:15<00:53,  6.75s/it]2025-04-15 22:17:08,980 - INFO - Processing Loan analysis.xlsx
2025-04-15 22:17:09,060 - INFO - Generated 2 chunks for Loan analysis.xlsx


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:17:27,370 - INFO - Added 2 chunks (Total: 14)
Processing documents:  30%|██████████████████▎                                          | 3/10 [00:33<01:24, 12.06s/it]2025-04-15 22:17:27,370 - INFO - Processing M.Sc. Applied Psychology.docx
2025-04-15 22:17:27,866 - INFO - Generated 37 chunks for M.Sc. Applied Psychology.docx


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2025-04-15 22:18:08,412 - INFO - Added 37 chunks (Total: 51)
Processing documents:  40%|████████████████████████▍                                    | 4/10 [01:14<02:21, 23.50s/it]2025-04-15 22:18:08,414 - INFO - Processing new-approaches-and-procedures-for-cancer-treatment.pdf
2025-04-15 22:18:11,041 - INFO - Generated 21 chunks for new-approaches-and-procedures-for-cancer-treatment.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:18:32,635 - INFO - Added 21 chunks (Total: 72)
Processing documents:  50%|██████████████████████████████▌                              | 5/10 [01:38<01:58, 23.76s/it]2025-04-15 22:18:32,638 - INFO - Processing Ocean_ecogeochemistry_A_review.pdf
2025-04-15 22:18:43,353 - INFO - Generated 97 chunks for Ocean_ecogeochemistry_A_review.pdf


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-04-15 22:20:10,316 - INFO - Added 97 chunks (Total: 169)
Processing documents:  60%|████████████████████████████████████▌                        | 6/10 [03:16<03:15, 48.90s/it]2025-04-15 22:20:10,316 - INFO - Processing party budget1.xlsx
2025-04-15 22:20:10,448 - INFO - Generated 1 chunks for party budget1.xlsx


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:20:17,021 - INFO - Added 1 chunks (Total: 170)
Processing documents:  70%|██████████████████████████████████████████▋                  | 7/10 [03:23<01:45, 35.10s/it]2025-04-15 22:20:17,023 - INFO - Processing Stats.docx
2025-04-15 22:20:17,189 - INFO - Generated 3 chunks for Stats.docx


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:20:20,206 - INFO - Added 3 chunks (Total: 173)
Processing documents:  80%|████████████████████████████████████████████████▊            | 8/10 [03:26<00:49, 24.94s/it]2025-04-15 22:20:20,214 - INFO - Processing The-Alchemist.pdf
2025-04-15 22:20:34,271 - INFO - Generated 136 chunks for The-Alchemist.pdf


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2025-04-15 22:21:59,012 - INFO - Added 136 chunks (Total: 309)
Processing documents:  90%|██████████████████████████████████████████████████████▉      | 9/10 [05:05<00:48, 48.03s/it]2025-04-15 22:21:59,015 - INFO - Processing The_Plan_of_the_Giza_Pyramids.pdf
2025-04-15 22:22:01,236 - INFO - Generated 22 chunks for The_Plan_of_the_Giza_Pyramids.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-15 22:22:25,334 - INFO - Added 22 chunks (Total: 331)
Processing documents: 100%|████████████████████████████████████████████████████████████| 10/10 [05:31<00:00, 33.16s/it]



✅ Successfully processed 331 chunks from 10 files



📝 Enter your question (or press Enter to exit):
>  Tell Me About The Giza System


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


📜 Question: Tell Me About The Giza System

💡 Answer:
The Giza System is a plan of the three pyramids of Khufu, Khafre, and Menkaure.
The plan is based on the dimensions of the pyramids and the positions of the
corners. The dimensions are in royal cubits

⚡ Performance: 4.0 tokens/sec

📚 Sources:
1. The_Plan_of_the_Giza_Pyramids.pdf (Pages: [13])
2. The_Plan_of_the_Giza_Pyramids.pdf (Pages: [16])
3. The_Plan_of_the_Giza_Pyramids.pdf (Pages: [1])



📝 Enter your question (or press Enter to exit):
>  What is the Ocean_ecogeochemistry?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


📜 Question: What is the Ocean_ecogeochemistry?

💡 Answer:
Ocean ecoGeoChemistry is a discipline that integrates chemistry, physiology, and
biology to study the chemical, physical, and biochemical processes that
determine the isotopic composition of marine animals.

⚡ Performance: 5.6 tokens/sec

📚 Sources:
1. Ocean_ecogeochemistry_A_review.pdf (Pages: [3])
2. Ocean_ecogeochemistry_A_review.pdf (Pages: [11])
3. Ocean_ecogeochemistry_A_review.pdf (Pages: [29])



📝 Enter your question (or press Enter to exit):
>  



🔚 Analysis session completed
