From 0c4353669bef73fded2f14112e4942d842575e4c Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 20:35:52 +0200 Subject: [PATCH 01/16] feat: added more supported models --- agentic_rag/gradio_app.py | 186 ++++++++++++++++++++++++++------- agentic_rag/local_rag_agent.py | 15 ++- 2 files changed, 158 insertions(+), 43 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 50dd22e..67f099d 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -142,13 +142,7 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, model_type = "Local (Mistral)" elif "Ollama" in agent_type: model_type = "Ollama" - # Extract model name from agent_type and use correct Ollama model names - if "llama3" in agent_type.lower(): - model_name = "ollama:llama3" - elif "phi-3" in agent_type.lower(): - model_name = "ollama:phi3" - elif "qwen2" in agent_type.lower(): - model_name = "ollama:qwen2" + # Model name will be extracted later else: model_type = agent_type @@ -161,28 +155,26 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif model_type == "Ollama": + elif "Ollama" in model_type: # For Ollama models - if model_name: - try: - agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) - except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - response_text = "Ollama model not specified correctly." + # Extract model name directly from the model_type + model_name = model_type.replace("Ollama - ", "").strip() + + try: + agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + except Exception as e: + response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." print(f"Error: {response_text}") - return history + [[message, response_text]] + # Fall back to Mistral if Ollama fails + if hf_token: + agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, + skip_analysis=skip_analysis) + else: + return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] else: if not openai_key: - response_text = "OpenAI agent not available. Please check your OpenAI API key configuration." + response_text = "OpenAI key not found. Please check your config." 
print(f"Error: {response_text}") return history + [[message, response_text]] agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, @@ -316,7 +308,32 @@ def create_interface(): model_choices.extend([ "Ollama - llama3", "Ollama - phi-3", - "Ollama - qwen2" + "Ollama - qwen2", + # New Ollama models + "Ollama - gemma3:1b", + "Ollama - gemma3", + "Ollama - gemma3:12b", + "Ollama - gemma3:27b", + "Ollama - qwq", + "Ollama - deepseek-r1", + "Ollama - deepseek-r1:671b", + "Ollama - llama3.3", + "Ollama - llama3.2", + "Ollama - llama3.2:1b", + "Ollama - llama3.2-vision", + "Ollama - llama3.2-vision:90b", + "Ollama - llama3.1", + "Ollama - llama3.1:405b", + "Ollama - phi4", + "Ollama - phi4-mini", + "Ollama - mistral", + "Ollama - moondream", + "Ollama - neural-chat", + "Ollama - starling-lm", + "Ollama - codellama", + "Ollama - llama2-uncensored", + "Ollama - llava", + "Ollama - granite3.2" ]) if openai_key: model_choices.append("OpenAI") @@ -390,8 +407,88 @@ def create_interface(): - Size: ~7GB - VRAM Required: ~6GB - Balance between quality and memory usage + + For a complete list of supported models and specifications, see the **Model FAQ** tab. """) + # Model FAQ Tab + with gr.Tab("Model FAQ"): + gr.Markdown(""" + ## Model Information & Technical Requirements + + This page provides detailed information about all supported models, including size, parameter count, and hardware requirements. + + ### Memory Requirements + + As a general guideline: + - You should have at least 8 GB of RAM available to run 7B parameter models + - You should have at least 16 GB of RAM available to run 13B parameter models + - You should have at least 32 GB of RAM available to run 33B+ parameter models + - For vision models, additional memory is required for image processing + + ### Ollama Models + + | Model | Parameters | Size | Download Command | + |-------|------------|------|-----------------| + | Gemma 3 | 1B | 815MB | ollama run gemma3:1b | + | Gemma 3 | 4B | 3.3GB | ollama run gemma3 | + | Gemma 3 | 12B | 8.1GB | ollama run gemma3:12b | + | Gemma 3 | 27B | 17GB | ollama run gemma3:27b | + | QwQ | 32B | 20GB | ollama run qwq | + | DeepSeek-R1 | 7B | 4.7GB | ollama run deepseek-r1 | + | DeepSeek-R1 | 671B | 404GB | ollama run deepseek-r1:671b | + | Llama 3.3 | 70B | 43GB | ollama run llama3.3 | + | Llama 3.2 | 3B | 2.0GB | ollama run llama3.2 | + | Llama 3.2 | 1B | 1.3GB | ollama run llama3.2:1b | + | Llama 3.2 Vision | 11B | 7.9GB | ollama run llama3.2-vision | + | Llama 3.2 Vision | 90B | 55GB | ollama run llama3.2-vision:90b | + | Llama 3.1 | 8B | 4.7GB | ollama run llama3.1 | + | Llama 3.1 | 405B | 231GB | ollama run llama3.1:405b | + | Phi 4 | 14B | 9.1GB | ollama run phi4 | + | Phi 4 Mini | 3.8B | 2.5GB | ollama run phi4-mini | + | Mistral | 7B | 4.1GB | ollama run mistral | + | Moondream 2 | 1.4B | 829MB | ollama run moondream | + | Neural Chat | 7B | 4.1GB | ollama run neural-chat | + | Starling | 7B | 4.1GB | ollama run starling-lm | + | Code Llama | 7B | 3.8GB | ollama run codellama | + | Llama 2 Uncensored | 7B | 3.8GB | ollama run llama2-uncensored | + | LLaVA | 7B | 4.5GB | ollama run llava | + | Granite-3.2 | 8B | 4.9GB | ollama run granite3.2 | + | Llama 3 | 8B | 4.7GB | ollama run llama3 | + | Phi 3 | 4B | 4.0GB | ollama run phi3 | + | Qwen 2 | 7B | 4.1GB | ollama run qwen2 | + + ### HuggingFace Models + + | Model | Parameters | Size | Quantization | VRAM Required | + |-------|------------|------|--------------|---------------| + | Mistral | 7B | 14GB | None | 8GB | + | Mistral 
| 7B | 4GB | 4-bit | 4GB | + | Mistral | 7B | 7GB | 8-bit | 6GB | + + ### Recommended Models + + **Best Overall Performance**: + - Ollama - llama3 + - Ollama - llama3.2-vision (for image processing) + - Ollama - phi4 + + **Best for Limited Hardware (8GB RAM)**: + - Ollama - llama3.2:1b + - Ollama - gemma3:1b + - Ollama - phi4-mini + - Ollama - moondream + + **Best for Code Tasks**: + - Ollama - codellama + - Ollama - deepseek-r1 + + **Best for Enterprise Use**: + - Ollama - qwen2 + - Ollama - granite3.2 + - Ollama - neural-chat + """) + # Document Processing Tab with gr.Tab("Document Processing"): with gr.Row(): @@ -580,13 +677,30 @@ def main(): try: import ollama try: - # Check if Ollama is running and qwen2 is available + # Check if Ollama is running and list available models models = ollama.list().models available_models = [model.model for model in models] - if "qwen2" not in available_models and "qwen2:latest" not in available_models: - print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.") - except Exception: - print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.") + + # Check if any default models are available + if "qwen2" not in available_models and "qwen2:latest" not in available_models and \ + "llama3" not in available_models and "llama3:latest" not in available_models and \ + "phi3" not in available_models and "phi3:latest" not in available_models: + print("⚠️ Warning: Ollama is running but no default models (qwen2, llama3, phi3) are available.") + print("Please download a model through the Model Management tab or run:") + print(" ollama pull qwen2") + print(" ollama pull llama3") + print(" ollama pull phi3") + else: + available_default_models = [] + for model in ["qwen2", "llama3", "phi3"]: + if model in available_models or f"{model}:latest" in available_models: + available_default_models.append(model) + + print(f"✅ Ollama is running with available default models: {', '.join(available_default_models)}") + print(f"All available models: {', '.join(available_models)}") + except Exception as e: + print(f"⚠️ Warning: Ollama is installed but not running or encountered an error: {str(e)}") + print("Please start Ollama before using the interface.") except ImportError: print("⚠️ Warning: Ollama package not installed. 
Please install with: pip install ollama") @@ -677,14 +791,8 @@ def download_model(model_type: str) -> str: elif "Ollama" in model_type: # Extract model name from model_type - if "llama3" in model_type.lower(): - model_name = "llama3" - elif "phi-3" in model_type.lower(): - model_name = "phi3" - elif "qwen2" in model_type.lower(): - model_name = "qwen2" - else: - return "❌ Error: Unknown Ollama model type" + # Remove the 'Ollama - ' prefix and any leading/trailing whitespace + model_name = model_type.replace("Ollama - ", "").strip() # Use Ollama to pull the model try: diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index c26f99e..3b2b8ca 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,8 +57,8 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Remove the 'ollama:' prefix if present - self.model_name = model_name.replace("ollama:", "") if model_name.startswith("ollama:") else model_name + # Use the model name directly without any transformation + self.model_name = model_name self._check_ollama_running() def _check_ollama_running(self): @@ -165,11 +165,18 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") + self.is_ollama = model_name.startswith("ollama:") or "ollama" in model_name.lower() if self.is_ollama: # Extract the actual model name from the prefix - ollama_model_name = model_name.replace("ollama:", "") + # If model_name contains 'ollama:' prefix, remove it + # If model_name is from gradio interface (e.g., "Ollama - llama3"), extract just the model name + if model_name.startswith("ollama:"): + ollama_model_name = model_name.replace("ollama:", "") + elif "Ollama - " in model_name: + ollama_model_name = model_name.replace("Ollama - ", "") + else: + ollama_model_name = model_name # Load Ollama model print("\nLoading Ollama model...") From dc353456a2a9a9681475b1b4db33480c3a0a79f5 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 20:46:50 +0200 Subject: [PATCH 02/16] fix: improve Ollama model detection in LocalRAGAgent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Changed model detection logic to only treat models as Ollama models when they start with 'ollama:' or contain 'Ollama - ' - Previously, any model name containing 'ollama' (case-insensitive) was incorrectly treated as an Ollama model - This fixes the issue where model names like 'deepseek-r1' were being incorrectly identified as Ollama models --- agentic_rag/gradio_app.py | 54 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 2 +- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 67f099d..f18b5df 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -430,33 +430,33 @@ def create_interface(): | Model | Parameters | Size | Download Command | |-------|------------|------|-----------------| - | Gemma 3 | 1B | 815MB | ollama run gemma3:1b | - | Gemma 3 | 4B | 3.3GB | ollama run gemma3 | - | Gemma 3 | 12B | 8.1GB | ollama run gemma3:12b | - | Gemma 3 | 27B | 17GB | ollama run gemma3:27b | - | QwQ | 32B | 20GB | ollama run qwq | - | DeepSeek-R1 | 7B | 4.7GB | ollama run deepseek-r1 | - | DeepSeek-R1 
| 671B | 404GB | ollama run deepseek-r1:671b | - | Llama 3.3 | 70B | 43GB | ollama run llama3.3 | - | Llama 3.2 | 3B | 2.0GB | ollama run llama3.2 | - | Llama 3.2 | 1B | 1.3GB | ollama run llama3.2:1b | - | Llama 3.2 Vision | 11B | 7.9GB | ollama run llama3.2-vision | - | Llama 3.2 Vision | 90B | 55GB | ollama run llama3.2-vision:90b | - | Llama 3.1 | 8B | 4.7GB | ollama run llama3.1 | - | Llama 3.1 | 405B | 231GB | ollama run llama3.1:405b | - | Phi 4 | 14B | 9.1GB | ollama run phi4 | - | Phi 4 Mini | 3.8B | 2.5GB | ollama run phi4-mini | - | Mistral | 7B | 4.1GB | ollama run mistral | - | Moondream 2 | 1.4B | 829MB | ollama run moondream | - | Neural Chat | 7B | 4.1GB | ollama run neural-chat | - | Starling | 7B | 4.1GB | ollama run starling-lm | - | Code Llama | 7B | 3.8GB | ollama run codellama | - | Llama 2 Uncensored | 7B | 3.8GB | ollama run llama2-uncensored | - | LLaVA | 7B | 4.5GB | ollama run llava | - | Granite-3.2 | 8B | 4.9GB | ollama run granite3.2 | - | Llama 3 | 8B | 4.7GB | ollama run llama3 | - | Phi 3 | 4B | 4.0GB | ollama run phi3 | - | Qwen 2 | 7B | 4.1GB | ollama run qwen2 | + | Gemma 3 | 1B | 815MB | gemma3:1b | + | Gemma 3 | 4B | 3.3GB | gemma3 | + | Gemma 3 | 12B | 8.1GB | gemma3:12b | + | Gemma 3 | 27B | 17GB | gemma3:27b | + | QwQ | 32B | 20GB | qwq | + | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | + | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | + | Llama 3.3 | 70B | 43GB | llama3.3 | + | Llama 3.2 | 3B | 2.0GB | llama3.2 | + | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | + | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | + | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | + | Llama 3.1 | 8B | 4.7GB | llama3.1 | + | Llama 3.1 | 405B | 231GB | llama3.1:405b | + | Phi 4 | 14B | 9.1GB | phi4 | + | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | + | Mistral | 7B | 4.1GB | mistral | + | Moondream 2 | 1.4B | 829MB | moondream | + | Neural Chat | 7B | 4.1GB | neural-chat | + | Starling | 7B | 4.1GB | starling-lm | + | Code Llama | 7B | 3.8GB | codellama | + | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | + | LLaVA | 7B | 4.5GB | llava | + | Granite-3.2 | 8B | 4.9GB | granite3.2 | + | Llama 3 | 8B | 4.7GB | llama3 | + | Phi 3 | 4B | 4.0GB | phi3 | + | Qwen 2 | 7B | 4.1GB | qwen2 | ### HuggingFace Models diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3b2b8ca..128d1da 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -165,7 +165,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") or "ollama" in model_name.lower() + self.is_ollama = model_name.startswith("ollama:") or "Ollama - " in model_name if self.is_ollama: # Extract the actual model name from the prefix From 6e18eb34a6be276fc5d4800bf3e9b6c6393f787c Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 21:34:41 +0200 Subject: [PATCH 03/16] fix: bugfix --- agentic_rag/local_rag_agent.py | 147 +++++++++++++-------------------- 1 file changed, 58 insertions(+), 89 deletions(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 128d1da..8de8b64 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,7 +57,10 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Use the model name directly without any transformation + # 
Ensure model name has :latest suffix + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" + self.model_name = model_name self._check_ollama_running() @@ -74,13 +77,11 @@ def _check_ollama_running(self): # Check if the requested model is available if self.model_name not in available_models: - # Try with :latest suffix - if f"{self.model_name}:latest" in available_models: - self.model_name = f"{self.model_name}:latest" - print(f"Using model with :latest suffix: {self.model_name}") - else: - print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") - print(f"You can pull it with: ollama pull {self.model_name}") + print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") + print(f"You can pull it with: ollama pull {self.model_name}") + raise ValueError(f"Model '{self.model_name}' not found in Ollama") + else: + print(f"Using Ollama model: {self.model_name}") except Exception as e: raise ConnectionError(f"Failed to connect to Ollama. Please make sure Ollama is running. Error: {str(e)}") @@ -92,6 +93,9 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw try: import ollama + print(f"\nGenerating response with Ollama model: {self.model_name}") + print(f"Prompt: {prompt[:100]}...") # Print first 100 chars of prompt + # Generate text response = ollama.generate( model=self.model_name, @@ -103,6 +107,8 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw } ) + print(f"Response generated successfully with {self.model_name}") + # Format result to match transformers pipeline output formatted_result = [{ "generated_text": response["response"] @@ -114,7 +120,7 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw raise Exception(f"Failed to generate text with Ollama: {str(e)}") class LocalRAGAgent: - def __init__(self, vector_store: VectorStore = None, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", + def __init__(self, vector_store: VectorStore = None, model_name: str = None, use_cot: bool = False, collection: str = None, skip_analysis: bool = False, quantization: str = None, use_oracle_db: bool = True): """Initialize local RAG agent with vector store and local LLM @@ -165,7 +171,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") or "Ollama - " in model_name + self.is_ollama = model_name and (model_name.startswith("ollama:") or "Ollama - " in model_name) if self.is_ollama: # Extract the actual model name from the prefix @@ -178,6 +184,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala else: ollama_model_name = model_name + # Add :latest suffix if not present + if not ollama_model_name.endswith(":latest"): + ollama_model_name = f"{ollama_model_name}:latest" + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {ollama_model_name}") @@ -188,87 +198,46 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # Create pipeline-like interface self.pipeline = self.ollama_handler - + print(f"Using Ollama model: {ollama_model_name}") else: - # Load HuggingFace token from config - try: - with open('config.yaml', 'r') as f: - config = yaml.safe_load(f) - token = config.get('HUGGING_FACE_HUB_TOKEN') - if not token: - raise 
ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") - except Exception as e: - raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") - - # Load model and tokenizer - print("\nLoading model and tokenizer...") - print(f"Model: {model_name}") - if quantization: - print(f"Quantization: {quantization}") - print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") - print("Subsequent queries will be faster but may still take 30-60 seconds per response.") - - # Check if CUDA is available and set appropriate dtype - if torch.cuda.is_available(): - print("CUDA is available. Using GPU acceleration.") - dtype = torch.float16 + # Only initialize Mistral if no model is specified + if not model_name: + print("\nLoading default model and tokenizer...") + print("Model: mistralai/Mistral-7B-Instruct-v0.2") + self.model_name = "mistralai/Mistral-7B-Instruct-v0.2" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using default model: {self.model_name}") else: - print("CUDA is not available. Using CPU only (this will be slow).") - dtype = torch.float32 - - # Set up model loading parameters - model_kwargs = { - "torch_dtype": dtype, - "device_map": "auto", - "token": token, - "low_cpu_mem_usage": True, - "offload_folder": "offload" - } - - # Apply quantization if specified - if quantization == '4bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4" - ) - model_kwargs["quantization_config"] = quantization_config - print("Using 4-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. Falling back to standard loading.") - print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes") - elif quantization == '8bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - model_kwargs["quantization_config"] = quantization_config - print("Using 8-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. 
Falling back to standard loading.") - print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes") - - # Load model with appropriate settings - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) - - # Create text generation pipeline with optimized settings - self.pipeline = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - max_new_tokens=512, - do_sample=True, - temperature=0.1, - top_p=0.95, - device_map="auto" - ) - print("✓ Model loaded successfully") + print(f"\nUsing specified model: {model_name}") + self.model_name = model_name + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using specified model: {self.model_name}") # Create LLM wrapper self.llm = LocalLLM(self.pipeline) From 7e840522c39cca9510b70d70e69f36484f4efd52 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:15:02 +0200 Subject: [PATCH 04/16] feat: model names fix --- agentic_rag/gradio_app.py | 58 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 2 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index f18b5df..06c98c7 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -428,35 +428,35 @@ def create_interface(): ### Ollama Models - | Model | Parameters | Size | Download Command | - |-------|------------|------|-----------------| - | Gemma 3 | 1B | 815MB | gemma3:1b | - | Gemma 3 | 4B | 3.3GB | gemma3 | - | Gemma 3 | 12B | 8.1GB | gemma3:12b | - | Gemma 3 | 27B | 17GB | gemma3:27b | - | QwQ | 32B | 20GB | qwq | - | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | - | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | - | Llama 3.3 | 70B | 43GB | llama3.3 | - | Llama 3.2 | 3B | 2.0GB | llama3.2 | - | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | - | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | - | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | - | Llama 3.1 | 8B | 4.7GB | llama3.1 | - | Llama 3.1 | 405B | 231GB | llama3.1:405b | - | Phi 4 | 14B | 9.1GB | phi4 | - | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | - | Mistral | 7B | 4.1GB | mistral | - | Moondream 2 | 1.4B | 829MB | moondream | - | Neural Chat | 7B | 4.1GB | neural-chat | - | Starling | 7B | 4.1GB | starling-lm | - | Code Llama | 7B | 3.8GB | codellama | - | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | - | LLaVA | 7B | 4.5GB | llava | - | Granite-3.2 | 8B | 4.9GB | granite3.2 | - | Llama 3 | 8B | 4.7GB | llama3 | - | Phi 3 | 4B | 4.0GB | phi3 | - | Qwen 2 | 7B | 4.1GB | qwen2 | + | Model | Parameters | Size | Download Command | Description | Pulls | Tags | Last Updated | + |-------|------------|------|-----------------|-------------|-------|------|--------------| + | Gemma 3 | 1B | 815MB | gemma3:1b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | Gemma 3 | 4B | 3.3GB | gemma3 | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | Gemma 3 | 12B | 8.1GB | gemma3:12b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 
2 weeks ago | + | Gemma 3 | 27B | 17GB | gemma3:27b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | QwQ | 32B | 20GB | qwq | QwQ is the reasoning model of the Qwen series | 1.2M | 8 | 4 weeks ago | + | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | + | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | + | Llama 3.3 | 70B | 43GB | llama3.3 | New state of the art 70B model. Llama 3.3 70B offers similar performance compared to the Llama 3.1 405B model | 1.7M | 14 | 4 months ago | + | Llama 3.2 | 3B | 2.0GB | llama3.2 | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | + | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | + | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | + | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | + | Llama 3.1 | 8B | 4.7GB | llama3.1 | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | + | Llama 3.1 | 405B | 231GB | llama3.1:405b | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | + | Phi 4 | 14B | 9.1GB | phi4 | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | + | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | + | Mistral | 7B | 4.1GB | mistral | The 7B model released by Mistral AI, updated to version 0.3 | 11.6M | 84 | 8 months ago | + | Moondream 2 | 1.4B | 829MB | moondream | A series of multimodal LLMs (MLLMs) designed for vision-language understanding | 946.6K | 17 | 4 months ago | + | Neural Chat | 7B | 4.1GB | neural-chat | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | + | Starling | 7B | 4.1GB | starling-lm | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | + | Code Llama | 7B | 3.8GB | codellama | A large language model that can use text prompts to generate and discuss code | 1.9M | 199 | 8 months ago | + | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | Uncensored Llama 2 model by George Sung and Jarrad Hope | 913.2K | 34 | 17 months ago | + | LLaVA | 7B | 4.5GB | llava | LLaVA is a novel end-to-end trained large multimodal model for visual and language understanding | 4.8M | 98 | 14 months ago | + | Granite-3.2 | 8B | 4.9GB | granite3.2 | A high-performing and efficient model | 3.9M | 94 | 8 months ago | + | Llama 3 | 8B | 4.7GB | llama3 | Meta Llama 3: The most capable openly available LLM to date | 7.8M | 68 | 10 months ago | + | Phi 3 | 4B | 4.0GB | phi3 | Phi-3 is a family of lightweight 3B (Mini) and 14B (Medium) state-of-the-art open models | 3M | 72 | 8 months ago | + | Qwen 2 | 7B | 4.1GB | qwen2 | Qwen2 is a new series of large language models from Alibaba group | 4.2M | 97 | 7 months ago | ### HuggingFace Models diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 8de8b64..6ac2f8e 100644 --- a/agentic_rag/local_rag_agent.py +++ 
b/agentic_rag/local_rag_agent.py @@ -180,7 +180,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, if model_name.startswith("ollama:"): ollama_model_name = model_name.replace("ollama:", "") elif "Ollama - " in model_name: - ollama_model_name = model_name.replace("Ollama - ", "") + ollama_model_name = model_name.replace("Ollama - ", "").strip() else: ollama_model_name = model_name From 64b0dd8c74a85043d8c709224ae6ccd35462f40f Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:20:52 +0200 Subject: [PATCH 05/16] feat: fix model name initializations! --- agentic_rag/gradio_app.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 06c98c7..90461d9 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -141,8 +141,9 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, quantization = "8bit" model_type = "Local (Mistral)" elif "Ollama" in agent_type: + # Extract model name from agent_type (e.g., "Ollama - deepseek-r1" -> "deepseek-r1") + model_name = agent_type.replace("Ollama - ", "").strip() model_type = "Ollama" - # Model name will be extracted later else: model_type = agent_type @@ -156,10 +157,7 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) elif "Ollama" in model_type: - # For Ollama models - # Extract model name directly from the model_type - model_name = model_type.replace("Ollama - ", "").strip() - + # For Ollama models, use the extracted model_name directly try: agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis) From 8c3a6b6843c55575c591ea8b2fd5b053e2c2e2ea Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:56:13 +0200 Subject: [PATCH 06/16] fix: harmonizing of model names and selectors --- agentic_rag/gradio_app.py | 66 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 24 ++++--------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 90461d9..25100a7 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -298,46 +298,46 @@ def create_interface(): # HF models first if token is available if hf_token: model_choices.extend([ - "Local (Mistral)", - "Local (Mistral) - 4-bit Quantized", - "Local (Mistral) - 8-bit Quantized", + "mistral", + "mistral-4bit", + "mistral-8bit", ]) # Then Ollama models (don't require HF token) model_choices.extend([ - "Ollama - llama3", - "Ollama - phi-3", - "Ollama - qwen2", + "llama3", + "phi-3", + "qwen2", # New Ollama models - "Ollama - gemma3:1b", - "Ollama - gemma3", - "Ollama - gemma3:12b", - "Ollama - gemma3:27b", - "Ollama - qwq", - "Ollama - deepseek-r1", - "Ollama - deepseek-r1:671b", - "Ollama - llama3.3", - "Ollama - llama3.2", - "Ollama - llama3.2:1b", - "Ollama - llama3.2-vision", - "Ollama - llama3.2-vision:90b", - "Ollama - llama3.1", - "Ollama - llama3.1:405b", - "Ollama - phi4", - "Ollama - phi4-mini", - "Ollama - mistral", - "Ollama - moondream", - "Ollama - neural-chat", - "Ollama - starling-lm", - "Ollama - codellama", - "Ollama - llama2-uncensored", - "Ollama - llava", - "Ollama - granite3.2" + "gemma3:1b", + "gemma3", + "gemma3:12b", + "gemma3:27b", + "qwq", + 
"deepseek-r1", + "deepseek-r1:671b", + "llama3.3", + "llama3.2", + "llama3.2:1b", + "llama3.2-vision", + "llama3.2-vision:90b", + "llama3.1", + "llama3.1:405b", + "phi4", + "phi4-mini", + "mistral", + "moondream", + "neural-chat", + "starling-lm", + "codellama", + "llama2-uncensored", + "llava", + "granite3.2" ]) if openai_key: - model_choices.append("OpenAI") + model_choices.append("openai") - # Set default model to Ollama - qwen2 - default_model = "Ollama - qwen2" + # Set default model to qwen2 + default_model = "qwen2" # Model Management Tab (First Tab) with gr.Tab("Model Management"): diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 6ac2f8e..249d944 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -170,35 +170,25 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.model_name = model_name # skip_analysis parameter kept for backward compatibility but no longer used - # Check if this is an Ollama model - self.is_ollama = model_name and (model_name.startswith("ollama:") or "Ollama - " in model_name) + # Check if this is an Ollama model (anything not Mistral is considered Ollama) + self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Extract the actual model name from the prefix - # If model_name contains 'ollama:' prefix, remove it - # If model_name is from gradio interface (e.g., "Ollama - llama3"), extract just the model name - if model_name.startswith("ollama:"): - ollama_model_name = model_name.replace("ollama:", "") - elif "Ollama - " in model_name: - ollama_model_name = model_name.replace("Ollama - ", "").strip() - else: - ollama_model_name = model_name - # Add :latest suffix if not present - if not ollama_model_name.endswith(":latest"): - ollama_model_name = f"{ollama_model_name}:latest" + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" # Load Ollama model print("\nLoading Ollama model...") - print(f"Model: {ollama_model_name}") + print(f"Model: {model_name}") print("Note: Make sure Ollama is running on your system.") # Initialize Ollama model handler - self.ollama_handler = OllamaModelHandler(ollama_model_name) + self.ollama_handler = OllamaModelHandler(model_name) # Create pipeline-like interface self.pipeline = self.ollama_handler - print(f"Using Ollama model: {ollama_model_name}") + print(f"Using Ollama model: {model_name}") else: # Only initialize Mistral if no model is specified if not model_name: From f1fb91f8d3a9f1d32a3dca75f4d9c7c99fe6e6f1 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:58:07 +0200 Subject: [PATCH 07/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 249d944..3adf336 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,10 +57,6 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Ensure model name has :latest suffix - if not model_name.endswith(":latest"): - model_name = f"{model_name}:latest" - self.model_name = model_name self._check_ollama_running() @@ -174,10 +170,6 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Add :latest suffix if not present - if not model_name.endswith(":latest"): - model_name = 
f"{model_name}:latest" - # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From 1b46a3b7e858ac82437528a1679f63eed1f742b8 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:00:11 +0200 Subject: [PATCH 08/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3adf336..daf9408 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,6 +57,10 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + self.model_name = model_name self._check_ollama_running() @@ -170,6 +174,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From 752ab954b133e62372899aa0cf2ffa41c11df6c8 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:43:45 +0200 Subject: [PATCH 09/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index daf9408..ab85d7f 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -481,7 +481,7 @@ def main(): parser = argparse.ArgumentParser(description="Query documents using local LLM") parser.add_argument("--query", required=True, help="Query to search for") parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use") - parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)") + parser.add_argument("--model", default="qwen2", help="Model to use (default: qwen2)") parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)") parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning") parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store") From 09c597cfc451ebbd59d8a3044bb028552aff02d6 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:46:59 +0200 Subject: [PATCH 10/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index ab85d7f..b0ca35c 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -168,6 +168,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.collection = collection self.quantization = quantization self.model_name = model_name + print('Model Name pre-check:', model_name) # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model (anything not Mistral is considered Ollama) From 8ce9da8b30ed77bf25e01b0505bab291d6e5a8de Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:19:08 +0200 Subject: [PATCH 11/16] feat: added 
minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/k8s/MINIKUBE.md | 210 +++++++++++++++++++++++++ agentic_rag/k8s/OKE_DEPLOYMENT.md | 246 ++++++++++++++++++++++++++++++ agentic_rag/k8s/README_k8s.md | 95 ++++++++++++ agentic_rag/local_rag_agent.py | 11 +- 4 files changed, 561 insertions(+), 1 deletion(-) create mode 100644 agentic_rag/k8s/MINIKUBE.md create mode 100644 agentic_rag/k8s/OKE_DEPLOYMENT.md create mode 100644 agentic_rag/k8s/README_k8s.md diff --git a/agentic_rag/k8s/MINIKUBE.md b/agentic_rag/k8s/MINIKUBE.md new file mode 100644 index 0000000..cd48157 --- /dev/null +++ b/agentic_rag/k8s/MINIKUBE.md @@ -0,0 +1,210 @@ +# Quick Start with Minikube + +This guide provides instructions for deploying the Agentic RAG system on Minikube for local testing. + +## Prerequisites + +1. [Minikube](https://minikube.sigs.k8s.io/docs/start/) installed +2. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) installed +3. Docker or another container runtime installed +4. NVIDIA GPU with appropriate drivers installed +5. [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed + +## Step 1: Start Minikube with GPU Support + +Start Minikube with sufficient resources and GPU support: + +```bash +# For Linux +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=kvm2 --gpu + +# For Windows +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperv --gpu + +# For macOS (Note: GPU passthrough is limited on macOS) +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperkit +``` + +Verify that Minikube is running: + +```bash +minikube status +``` + +## Step 2: Install NVIDIA Device Plugin + +Install the NVIDIA device plugin to enable GPU support in Kubernetes: + +```bash +# Apply the NVIDIA device plugin +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml +``` + +Verify that the GPU is available in the cluster: + +```bash +kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +``` + +## Step 3: Clone the Repository + +Clone the repository containing the Kubernetes manifests: + +```bash +git clone https://github.com/devrel/devrel-labs.git +cd devrel-labs/agentic_rag/k8s +``` + +## Step 4: Deploy the Application + +The deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +### Option 1: Deploy without a Hugging Face token (Ollama models only) + +```bash +# Create a namespace +kubectl create namespace agentic-rag + +# Create an empty ConfigMap +cat <`. + +## Troubleshooting + +### Pod Stuck in Pending State + +If the pod is stuck in Pending state, check the events: + +```bash +kubectl describe pod -l app=agentic-rag -n agentic-rag +``` + +Common issues include: + +1. **Insufficient resources**: Ensure your node pool has enough resources +2. **GPU not available**: Ensure your node pool has GPU-enabled nodes +3. **Image pull issues**: Check if the image can be pulled from the registry + +### GPU-Related Issues + +If you encounter GPU-related issues: + +1. **Check GPU availability in OKE**: + ```bash + kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" + ``` + +2. **Verify NVIDIA device plugin is running**: + ```bash + kubectl get pods -n kube-system | grep nvidia-device-plugin + ``` + +3. 
**Check if GPU is available to the pod**: + ```bash + kubectl describe pod -l app=agentic-rag -n agentic-rag | grep -A5 'Allocated resources' + ``` + +4. **Check NVIDIA driver installation on the node**: + ```bash + # Get the node name + NODE_NAME=$(kubectl get pod -l app=agentic-rag -n agentic-rag -o jsonpath='{.items[0].spec.nodeName}') + + # Create a debug pod on the node + kubectl debug node/$NODE_NAME -it --image=ubuntu + + # Inside the debug pod + chroot /host + nvidia-smi + ``` + +### Load Balancer Issues + +If the load balancer is not provisioning or not accessible: + +1. Check the service status: + ```bash + kubectl get service agentic-rag -n agentic-rag + ``` + +2. Check OCI Console for load balancer status and configuration + +3. Ensure your VCN security lists allow traffic to the load balancer + +## Scaling + +To scale the deployment: + +```bash +kubectl scale deployment agentic-rag -n agentic-rag --replicas=2 +``` + +Note: Each replica will require its own GPU. + +## Cleanup + +To remove all resources: + +```bash +kubectl delete namespace agentic-rag +``` + +To delete the OCI Load Balancer (if it's not automatically deleted): + +1. Navigate to the Load Balancers page in the OCI Console +2. Find the load balancer created for your service +3. Click "Delete" and confirm \ No newline at end of file diff --git a/agentic_rag/k8s/README_k8s.md b/agentic_rag/k8s/README_k8s.md new file mode 100644 index 0000000..a6bc5fd --- /dev/null +++ b/agentic_rag/k8s/README_k8s.md @@ -0,0 +1,95 @@ +# Kubernetes Deployment for Agentic RAG + +This directory contains Kubernetes manifests for deploying the Agentic RAG system. + +## Prerequisites + +- Kubernetes cluster (e.g., Oracle Kubernetes Engine, Minikube, or any other Kubernetes cluster) +- `kubectl` configured to access your cluster +- At least 8GB of RAM and 4 CPU cores available for the deployment + +## Deployment + +This deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +1. **Update the ConfigMap with your Hugging Face token** (optional but recommended): + + ```bash + # Edit the configmap.yaml file + nano local-deployment/configmap.yaml + + # Replace "your-huggingface-token" with your actual token + ``` + +2. **Deploy the application**: + + ```bash + kubectl apply -f local-deployment/configmap.yaml + kubectl apply -f local-deployment/deployment.yaml + kubectl apply -f local-deployment/service.yaml + ``` + +3. **Access the application**: + + If using LoadBalancer: + ```bash + kubectl get service agentic-rag + ``` + + If using NodePort: + ```bash + # Get the NodePort + kubectl get service agentic-rag + + # Access the application at http://: + ``` + +## Model Selection + +The deployment includes both Hugging Face models and Ollama models: + +- **Hugging Face Models**: Mistral-7B models (requires token in config.yaml) +- **Ollama Models**: llama3, phi3, and qwen2 (automatically downloaded during deployment) + +You can select which model to use from the Gradio interface after deployment. + +## Monitoring and Troubleshooting + +### Check pod status: + +```bash +kubectl get pods +``` + +### View logs: + +```bash +kubectl logs -f deployment/agentic-rag +``` + +### Shell into the pod: + +```bash +kubectl exec -it deployment/agentic-rag -- /bin/bash +``` + +## Scaling + +For production deployments, consider: + +1. Using persistent volumes for data storage +2. Adjusting resource requests and limits based on your workload +3. 
Setting up proper monitoring and logging +4. Implementing horizontal pod autoscaling + +## Cleanup + +To remove the deployment: + +```bash +kubectl delete -f local-deployment/ +``` + +## Future Work + +A distributed system deployment that separates the LLM inference system into its own service is planned for future releases. This will allow for better resource allocation and scaling in production environments. \ No newline at end of file diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index b0ca35c..e6558d3 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -134,6 +134,13 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, quantization: Quantization method to use (None, '4bit', '8bit') use_oracle_db: Whether to use Oracle DB for vector storage (if False, uses ChromaDB) """ + print(f"LocalRAGAgent init - model_name: {model_name}") + + # Set default model if none provided + if model_name is None: + model_name = "qwen2" + print(f"Using default model: {model_name}") + # Initialize vector store if not provided self.use_oracle_db = use_oracle_db and ORACLE_DB_AVAILABLE @@ -168,7 +175,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.collection = collection self.quantization = quantization self.model_name = model_name - print('Model Name pre-check:', model_name) + print('Model Name after assignment:', self.model_name) # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model (anything not Mistral is considered Ollama) @@ -501,6 +508,7 @@ def main(): print("\nInitializing RAG agent...") print("=" * 50) + print(f"Using model: {args.model}") try: # Determine which vector store to use based on args.embeddings @@ -527,6 +535,7 @@ def main(): # Set use_oracle_db based on the actual store type use_oracle_db = args.embeddings == "oracle" and isinstance(store, OraDBVectorStore) + print(f"Creating LocalRAGAgent with model: {args.model}") agent = LocalRAGAgent( store, model_name=args.model, From 7a9dd4419e5f03f5cce771cb786629f9d66b2219 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:37:46 +0200 Subject: [PATCH 12/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 199 +++++---------------------------- agentic_rag/local_rag_agent.py | 4 + 2 files changed, 32 insertions(+), 171 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 25100a7..a011d63 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -295,196 +295,53 @@ def create_interface(): # Create model choices list for reuse model_choices = [] - # HF models first if token is available - if hf_token: - model_choices.extend([ - "mistral", - "mistral-4bit", - "mistral-8bit", - ]) - # Then Ollama models (don't require HF token) + # Only Ollama models (no more local Mistral deployments) model_choices.extend([ - "llama3", - "phi-3", - "qwen2", - # New Ollama models - "gemma3:1b", - "gemma3", - "gemma3:12b", - "gemma3:27b", "qwq", - "deepseek-r1", - "deepseek-r1:671b", + "gemma3", "llama3.3", - "llama3.2", - "llama3.2:1b", - "llama3.2-vision", - "llama3.2-vision:90b", - "llama3.1", - "llama3.1:405b", "phi4", - "phi4-mini", "mistral", - "moondream", - "neural-chat", - "starling-lm", - "codellama", - "llama2-uncensored", "llava", - "granite3.2" + "phi3", + "deepseek-r1" ]) if openai_key: 
model_choices.append("openai") - # Set default model to qwen2 - default_model = "qwen2" + # Set default model to qwq + default_model = "qwq" # Model Management Tab (First Tab) with gr.Tab("Model Management"): gr.Markdown(""" - ## Model Management - - Download models in advance to prepare them for use in the chat interface. - - ### Hugging Face Models - - For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. - - ### Ollama Models (Default) - - Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client. - Make sure Ollama is installed and running on your system. - You can download Ollama from [ollama.com/download](https://ollama.com/download) + ## Model Selection + Choose your preferred model for the conversation. """) - with gr.Row(): - with gr.Column(): - model_dropdown = gr.Dropdown( - choices=model_choices, - value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, - label="Select Model to Download", - interactive=True - ) - download_button = gr.Button("Download Selected Model") - model_status = gr.Textbox( - label="Download Status", - placeholder="Select a model and click Download to begin...", - interactive=False - ) - - with gr.Column(): - gr.Markdown(""" - ### Model Information - - **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - High-quality model with good performance - - **Ollama - llama3**: Meta's Llama 3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Excellent performance and quality - - **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Efficient small model with good performance - - **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model. - - Size: ~14GB - - VRAM Required: ~8GB - - Good balance of quality and speed - - **Local (Mistral) - 4-bit Quantized**: 4-bit quantized version of Mistral-7B. - - Size: ~4GB - - VRAM Required: ~4GB - - Faster inference with minimal quality loss - - **Local (Mistral) - 8-bit Quantized**: 8-bit quantized version of Mistral-7B. - - Size: ~7GB - - VRAM Required: ~6GB - - Balance between quality and memory usage - - For a complete list of supported models and specifications, see the **Model FAQ** tab. - """) - - # Model FAQ Tab - with gr.Tab("Model FAQ"): - gr.Markdown(""" - ## Model Information & Technical Requirements - - This page provides detailed information about all supported models, including size, parameter count, and hardware requirements. 
- - ### Memory Requirements + model_dropdown = gr.Dropdown( + choices=model_choices, + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" + ) - As a general guideline: - - You should have at least 8 GB of RAM available to run 7B parameter models - - You should have at least 16 GB of RAM available to run 13B parameter models - - You should have at least 32 GB of RAM available to run 33B+ parameter models - - For vision models, additional memory is required for image processing - - ### Ollama Models - - | Model | Parameters | Size | Download Command | Description | Pulls | Tags | Last Updated | - |-------|------------|------|-----------------|-------------|-------|------|--------------| - | Gemma 3 | 1B | 815MB | gemma3:1b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 4B | 3.3GB | gemma3 | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 12B | 8.1GB | gemma3:12b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 27B | 17GB | gemma3:27b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | QwQ | 32B | 20GB | qwq | QwQ is the reasoning model of the Qwen series | 1.2M | 8 | 4 weeks ago | - | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | - | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | - | Llama 3.3 | 70B | 43GB | llama3.3 | New state of the art 70B model. Llama 3.3 70B offers similar performance compared to the Llama 3.1 405B model | 1.7M | 14 | 4 months ago | - | Llama 3.2 | 3B | 2.0GB | llama3.2 | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | - | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | - | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | - | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | - | Llama 3.1 | 8B | 4.7GB | llama3.1 | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | - | Llama 3.1 | 405B | 231GB | llama3.1:405b | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | - | Phi 4 | 14B | 9.1GB | phi4 | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | - | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | - | Mistral | 7B | 4.1GB | mistral | The 7B model released by Mistral AI, updated to version 0.3 | 11.6M | 84 | 8 months ago | - | Moondream 2 | 1.4B | 829MB | moondream | A series of multimodal LLMs (MLLMs) designed for vision-language understanding | 946.6K | 17 | 4 months ago | - | Neural Chat | 7B | 4.1GB | neural-chat | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | - | Starling | 7B | 4.1GB | starling-lm | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | - | Code Llama | 7B | 3.8GB | codellama | A large 
language model that can use text prompts to generate and discuss code | 1.9M | 199 | 8 months ago | - | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | Uncensored Llama 2 model by George Sung and Jarrad Hope | 913.2K | 34 | 17 months ago | - | LLaVA | 7B | 4.5GB | llava | LLaVA is a novel end-to-end trained large multimodal model for visual and language understanding | 4.8M | 98 | 14 months ago | - | Granite-3.2 | 8B | 4.9GB | granite3.2 | A high-performing and efficient model | 3.9M | 94 | 8 months ago | - | Llama 3 | 8B | 4.7GB | llama3 | Meta Llama 3: The most capable openly available LLM to date | 7.8M | 68 | 10 months ago | - | Phi 3 | 4B | 4.0GB | phi3 | Phi-3 is a family of lightweight 3B (Mini) and 14B (Medium) state-of-the-art open models | 3M | 72 | 8 months ago | - | Qwen 2 | 7B | 4.1GB | qwen2 | Qwen2 is a new series of large language models from Alibaba group | 4.2M | 97 | 7 months ago | - - ### HuggingFace Models - - | Model | Parameters | Size | Quantization | VRAM Required | - |-------|------------|------|--------------|---------------| - | Mistral | 7B | 14GB | None | 8GB | - | Mistral | 7B | 4GB | 4-bit | 4GB | - | Mistral | 7B | 7GB | 8-bit | 6GB | - - ### Recommended Models - - **Best Overall Performance**: - - Ollama - llama3 - - Ollama - llama3.2-vision (for image processing) - - Ollama - phi4 - - **Best for Limited Hardware (8GB RAM)**: - - Ollama - llama3.2:1b - - Ollama - gemma3:1b - - Ollama - phi4-mini - - Ollama - moondream + # Add model FAQ section + gr.Markdown(""" + ## Model FAQ - **Best for Code Tasks**: - - Ollama - codellama - - Ollama - deepseek-r1 + | Model | Parameters | Size | Download Command | + |-------|------------|------|------------------| + | qwq | 7B | 4.1GB | qwq:latest | + | gemma3 | 7B | 4.1GB | gemma3:latest | + | llama3.3 | 7B | 4.1GB | llama3.3:latest | + | phi4 | 7B | 4.1GB | phi4:latest | + | mistral | 7B | 4.1GB | mistral:latest | + | llava | 7B | 4.1GB | llava:latest | + | phi3 | 7B | 4.1GB | phi3:latest | + | deepseek-r1 | 7B | 4.1GB | deepseek-r1:latest | - **Best for Enterprise Use**: - - Ollama - qwen2 - - Ollama - granite3.2 - - Ollama - neural-chat + Note: All models are available through Ollama. Make sure Ollama is running on your system. 
""") # Document Processing Tab diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index e6558d3..a64f4c7 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -186,6 +186,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, if model_name and model_name.startswith("ollama:"): model_name = model_name.replace("ollama:", "") + # Always append :latest to Ollama model names + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From ac1a470c5869135767fc500b1c01de0f68e53e60 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:49:04 +0200 Subject: [PATCH 13/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index a011d63..295f093 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -434,9 +434,6 @@ def create_interface(): url_button.click(process_url, inputs=[url_input], outputs=[url_output]) repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output]) - # Model download event handler - download_button.click(download_model, inputs=[model_dropdown], outputs=[model_status]) - # Standard chat handlers standard_msg.submit( chat, From a71753ef5409445bb036e4e1cd3c912afc6ce5fa Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:58:19 +0200 Subject: [PATCH 14/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 295f093..4d9455d 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -319,12 +319,20 @@ def create_interface(): Choose your preferred model for the conversation. 
""") - model_dropdown = gr.Dropdown( - choices=model_choices, - value=default_model, - label="Select Model", - info="Choose the model to use for the conversation" - ) + with gr.Row(): + with gr.Column(): + model_dropdown = gr.Dropdown( + choices=model_choices, + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" + ) + download_button = gr.Button("Download Selected Model") + model_status = gr.Textbox( + label="Download Status", + placeholder="Select a model and click Download to begin...", + interactive=False + ) # Add model FAQ section gr.Markdown(""" @@ -332,14 +340,14 @@ def create_interface(): | Model | Parameters | Size | Download Command | |-------|------------|------|------------------| - | qwq | 7B | 4.1GB | qwq:latest | - | gemma3 | 7B | 4.1GB | gemma3:latest | - | llama3.3 | 7B | 4.1GB | llama3.3:latest | - | phi4 | 7B | 4.1GB | phi4:latest | + | qwq | 32B | 20GB | qwq:latest | + | gemma3 | 4B | 3.3GB | gemma3:latest | + | llama3.3 | 70B | 43GB | llama3.3:latest | + | phi4 | 14B | 9.1GB | phi4:latest | | mistral | 7B | 4.1GB | mistral:latest | - | llava | 7B | 4.1GB | llava:latest | - | phi3 | 7B | 4.1GB | phi3:latest | - | deepseek-r1 | 7B | 4.1GB | deepseek-r1:latest | + | llava | 7B | 4.5GB | llava:latest | + | phi3 | 4B | 4.0GB | phi3:latest | + | deepseek-r1 | 7B | 4.7GB | deepseek-r1:latest | Note: All models are available through Ollama. Make sure Ollama is running on your system. """) @@ -434,6 +442,9 @@ def create_interface(): url_button.click(process_url, inputs=[url_input], outputs=[url_output]) repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output]) + # Model download event handler + download_button.click(download_model, inputs=[model_dropdown], outputs=[model_status]) + # Standard chat handlers standard_msg.submit( chat, From 6e5055221f2b7edf108b152c4459c7d9f5c5843d Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 02:00:53 +0200 Subject: [PATCH 15/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 4d9455d..228b958 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -651,8 +651,8 @@ def download_model(model_type: str) -> str: except Exception as e: return f"❌ Error downloading model: {str(e)}" - - elif "Ollama" in model_type: + # all ollama models + else: # Extract model name from model_type # Remove the 'Ollama - ' prefix and any leading/trailing whitespace model_name = model_type.replace("Ollama - ", "").strip() @@ -703,8 +703,6 @@ def download_model(model_type: str) -> str: return "❌ Error: Could not connect to Ollama. Please make sure Ollama is installed and running." 
except Exception as e: return f"❌ Error pulling Ollama model: {str(e)}" - else: - return "❌ Error: Unknown model type" except Exception as e: return f"❌ Error: {str(e)}" From 5033bc1edbf0ced4f96e7075fd3fd085f78fc10e Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 02:19:19 +0200 Subject: [PATCH 16/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 228b958..7c39376 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -140,15 +140,22 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, elif "8-bit" in agent_type: quantization = "8bit" model_type = "Local (Mistral)" - elif "Ollama" in agent_type: - # Extract model name from agent_type (e.g., "Ollama - deepseek-r1" -> "deepseek-r1") - model_name = agent_type.replace("Ollama - ", "").strip() - model_type = "Ollama" + elif agent_type == "openai": + model_type = "OpenAI" else: - model_type = agent_type + # All other models are treated as Ollama models + model_type = "Ollama" + model_name = agent_type # Select appropriate agent and reinitialize with correct settings - if "Local" in model_type: + if model_type == "OpenAI": + if not openai_key: + response_text = "OpenAI key not found. Please check your config." + print(f"Error: {response_text}") + return history + [[message, response_text]] + agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + elif model_type == "Local (Mistral)": # For HF models, we need the token if not hf_token: response_text = "Local agent not available. Please check your HuggingFace token configuration." @@ -156,27 +163,14 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif "Ollama" in model_type: - # For Ollama models, use the extracted model_name directly + else: # Ollama models try: agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis) except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - if not openai_key: - response_text = "OpenAI key not found. Please check your config." + response_text = f"Error initializing Ollama model: {str(e)}" print(f"Error: {response_text}") return history + [[message, response_text]] - agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) # Process query and get response print("Processing query...")
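Patch 16 collapses model selection in `chat()` to three routes: OpenAI, local Mistral (with optional quantization), and everything else as an Ollama model name. A sketch of that routing as a standalone helper, assuming a case-insensitive OpenAI check since the dropdown values shown earlier use the capitalised label "OpenAI":

```python
def resolve_model(agent_type: str):
    """Sketch: map a dropdown value to (model_type, model_name, quantization)."""
    if "4-bit" in agent_type:
        return "Local (Mistral)", None, "4bit"
    if "8-bit" in agent_type:
        return "Local (Mistral)", None, "8bit"
    if agent_type.startswith("Local (Mistral)"):
        return "Local (Mistral)", None, None
    if agent_type.lower() == "openai":
        # Patch 16 compares against the lowercase literal "openai"; lowering the
        # input keeps the capitalised dropdown label "OpenAI" on this route.
        return "OpenAI", None, None
    # Everything else falls through to Ollama, as in patch 16.
    return "Ollama", agent_type, None

assert resolve_model("OpenAI")[0] == "OpenAI"
assert resolve_model("Ollama - mistral")[0] == "Ollama"
```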