From 0c4353669bef73fded2f14112e4942d842575e4c Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 20:35:52 +0200 Subject: [PATCH 01/16] feat: added more supported models --- agentic_rag/gradio_app.py | 186 ++++++++++++++++++++++++++------- agentic_rag/local_rag_agent.py | 15 ++- 2 files changed, 158 insertions(+), 43 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 50dd22e..67f099d 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -142,13 +142,7 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, model_type = "Local (Mistral)" elif "Ollama" in agent_type: model_type = "Ollama" - # Extract model name from agent_type and use correct Ollama model names - if "llama3" in agent_type.lower(): - model_name = "ollama:llama3" - elif "phi-3" in agent_type.lower(): - model_name = "ollama:phi3" - elif "qwen2" in agent_type.lower(): - model_name = "ollama:qwen2" + # Model name will be extracted later else: model_type = agent_type @@ -161,28 +155,26 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif model_type == "Ollama": + elif "Ollama" in model_type: # For Ollama models - if model_name: - try: - agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) - except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - response_text = "Ollama model not specified correctly." + # Extract model name directly from the model_type + model_name = model_type.replace("Ollama - ", "").strip() + + try: + agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + except Exception as e: + response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." print(f"Error: {response_text}") - return history + [[message, response_text]] + # Fall back to Mistral if Ollama fails + if hf_token: + agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, + skip_analysis=skip_analysis) + else: + return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] else: if not openai_key: - response_text = "OpenAI agent not available. Please check your OpenAI API key configuration." + response_text = "OpenAI key not found. Please check your config." 
print(f"Error: {response_text}") return history + [[message, response_text]] agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, @@ -316,7 +308,32 @@ def create_interface(): model_choices.extend([ "Ollama - llama3", "Ollama - phi-3", - "Ollama - qwen2" + "Ollama - qwen2", + # New Ollama models + "Ollama - gemma3:1b", + "Ollama - gemma3", + "Ollama - gemma3:12b", + "Ollama - gemma3:27b", + "Ollama - qwq", + "Ollama - deepseek-r1", + "Ollama - deepseek-r1:671b", + "Ollama - llama3.3", + "Ollama - llama3.2", + "Ollama - llama3.2:1b", + "Ollama - llama3.2-vision", + "Ollama - llama3.2-vision:90b", + "Ollama - llama3.1", + "Ollama - llama3.1:405b", + "Ollama - phi4", + "Ollama - phi4-mini", + "Ollama - mistral", + "Ollama - moondream", + "Ollama - neural-chat", + "Ollama - starling-lm", + "Ollama - codellama", + "Ollama - llama2-uncensored", + "Ollama - llava", + "Ollama - granite3.2" ]) if openai_key: model_choices.append("OpenAI") @@ -390,8 +407,88 @@ def create_interface(): - Size: ~7GB - VRAM Required: ~6GB - Balance between quality and memory usage + + For a complete list of supported models and specifications, see the **Model FAQ** tab. """) + # Model FAQ Tab + with gr.Tab("Model FAQ"): + gr.Markdown(""" + ## Model Information & Technical Requirements + + This page provides detailed information about all supported models, including size, parameter count, and hardware requirements. + + ### Memory Requirements + + As a general guideline: + - You should have at least 8 GB of RAM available to run 7B parameter models + - You should have at least 16 GB of RAM available to run 13B parameter models + - You should have at least 32 GB of RAM available to run 33B+ parameter models + - For vision models, additional memory is required for image processing + + ### Ollama Models + + | Model | Parameters | Size | Download Command | + |-------|------------|------|-----------------| + | Gemma 3 | 1B | 815MB | ollama run gemma3:1b | + | Gemma 3 | 4B | 3.3GB | ollama run gemma3 | + | Gemma 3 | 12B | 8.1GB | ollama run gemma3:12b | + | Gemma 3 | 27B | 17GB | ollama run gemma3:27b | + | QwQ | 32B | 20GB | ollama run qwq | + | DeepSeek-R1 | 7B | 4.7GB | ollama run deepseek-r1 | + | DeepSeek-R1 | 671B | 404GB | ollama run deepseek-r1:671b | + | Llama 3.3 | 70B | 43GB | ollama run llama3.3 | + | Llama 3.2 | 3B | 2.0GB | ollama run llama3.2 | + | Llama 3.2 | 1B | 1.3GB | ollama run llama3.2:1b | + | Llama 3.2 Vision | 11B | 7.9GB | ollama run llama3.2-vision | + | Llama 3.2 Vision | 90B | 55GB | ollama run llama3.2-vision:90b | + | Llama 3.1 | 8B | 4.7GB | ollama run llama3.1 | + | Llama 3.1 | 405B | 231GB | ollama run llama3.1:405b | + | Phi 4 | 14B | 9.1GB | ollama run phi4 | + | Phi 4 Mini | 3.8B | 2.5GB | ollama run phi4-mini | + | Mistral | 7B | 4.1GB | ollama run mistral | + | Moondream 2 | 1.4B | 829MB | ollama run moondream | + | Neural Chat | 7B | 4.1GB | ollama run neural-chat | + | Starling | 7B | 4.1GB | ollama run starling-lm | + | Code Llama | 7B | 3.8GB | ollama run codellama | + | Llama 2 Uncensored | 7B | 3.8GB | ollama run llama2-uncensored | + | LLaVA | 7B | 4.5GB | ollama run llava | + | Granite-3.2 | 8B | 4.9GB | ollama run granite3.2 | + | Llama 3 | 8B | 4.7GB | ollama run llama3 | + | Phi 3 | 4B | 4.0GB | ollama run phi3 | + | Qwen 2 | 7B | 4.1GB | ollama run qwen2 | + + ### HuggingFace Models + + | Model | Parameters | Size | Quantization | VRAM Required | + |-------|------------|------|--------------|---------------| + | Mistral | 7B | 14GB | None | 8GB | + | Mistral 
| 7B | 4GB | 4-bit | 4GB | + | Mistral | 7B | 7GB | 8-bit | 6GB | + + ### Recommended Models + + **Best Overall Performance**: + - Ollama - llama3 + - Ollama - llama3.2-vision (for image processing) + - Ollama - phi4 + + **Best for Limited Hardware (8GB RAM)**: + - Ollama - llama3.2:1b + - Ollama - gemma3:1b + - Ollama - phi4-mini + - Ollama - moondream + + **Best for Code Tasks**: + - Ollama - codellama + - Ollama - deepseek-r1 + + **Best for Enterprise Use**: + - Ollama - qwen2 + - Ollama - granite3.2 + - Ollama - neural-chat + """) + # Document Processing Tab with gr.Tab("Document Processing"): with gr.Row(): @@ -580,13 +677,30 @@ def main(): try: import ollama try: - # Check if Ollama is running and qwen2 is available + # Check if Ollama is running and list available models models = ollama.list().models available_models = [model.model for model in models] - if "qwen2" not in available_models and "qwen2:latest" not in available_models: - print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.") - except Exception: - print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.") + + # Check if any default models are available + if "qwen2" not in available_models and "qwen2:latest" not in available_models and \ + "llama3" not in available_models and "llama3:latest" not in available_models and \ + "phi3" not in available_models and "phi3:latest" not in available_models: + print("⚠️ Warning: Ollama is running but no default models (qwen2, llama3, phi3) are available.") + print("Please download a model through the Model Management tab or run:") + print(" ollama pull qwen2") + print(" ollama pull llama3") + print(" ollama pull phi3") + else: + available_default_models = [] + for model in ["qwen2", "llama3", "phi3"]: + if model in available_models or f"{model}:latest" in available_models: + available_default_models.append(model) + + print(f"✅ Ollama is running with available default models: {', '.join(available_default_models)}") + print(f"All available models: {', '.join(available_models)}") + except Exception as e: + print(f"⚠️ Warning: Ollama is installed but not running or encountered an error: {str(e)}") + print("Please start Ollama before using the interface.") except ImportError: print("⚠️ Warning: Ollama package not installed. 
Please install with: pip install ollama") @@ -677,14 +791,8 @@ def download_model(model_type: str) -> str: elif "Ollama" in model_type: # Extract model name from model_type - if "llama3" in model_type.lower(): - model_name = "llama3" - elif "phi-3" in model_type.lower(): - model_name = "phi3" - elif "qwen2" in model_type.lower(): - model_name = "qwen2" - else: - return "❌ Error: Unknown Ollama model type" + # Remove the 'Ollama - ' prefix and any leading/trailing whitespace + model_name = model_type.replace("Ollama - ", "").strip() # Use Ollama to pull the model try: diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index c26f99e..3b2b8ca 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,8 +57,8 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Remove the 'ollama:' prefix if present - self.model_name = model_name.replace("ollama:", "") if model_name.startswith("ollama:") else model_name + # Use the model name directly without any transformation + self.model_name = model_name self._check_ollama_running() def _check_ollama_running(self): @@ -165,11 +165,18 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") + self.is_ollama = model_name.startswith("ollama:") or "ollama" in model_name.lower() if self.is_ollama: # Extract the actual model name from the prefix - ollama_model_name = model_name.replace("ollama:", "") + # If model_name contains 'ollama:' prefix, remove it + # If model_name is from gradio interface (e.g., "Ollama - llama3"), extract just the model name + if model_name.startswith("ollama:"): + ollama_model_name = model_name.replace("ollama:", "") + elif "Ollama - " in model_name: + ollama_model_name = model_name.replace("Ollama - ", "") + else: + ollama_model_name = model_name # Load Ollama model print("\nLoading Ollama model...") From dc353456a2a9a9681475b1b4db33480c3a0a79f5 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 20:46:50 +0200 Subject: [PATCH 02/16] fix: improve Ollama model detection in LocalRAGAgent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Changed model detection logic to only treat models as Ollama models when they start with 'ollama:' or contain 'Ollama - ' - Previously, any model name containing 'ollama' (case-insensitive) was incorrectly treated as an Ollama model - This fixes the issue where model names like 'deepseek-r1' were being incorrectly identified as Ollama models --- agentic_rag/gradio_app.py | 54 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 2 +- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 67f099d..f18b5df 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -430,33 +430,33 @@ def create_interface(): | Model | Parameters | Size | Download Command | |-------|------------|------|-----------------| - | Gemma 3 | 1B | 815MB | ollama run gemma3:1b | - | Gemma 3 | 4B | 3.3GB | ollama run gemma3 | - | Gemma 3 | 12B | 8.1GB | ollama run gemma3:12b | - | Gemma 3 | 27B | 17GB | ollama run gemma3:27b | - | QwQ | 32B | 20GB | ollama run qwq | - | DeepSeek-R1 | 7B | 4.7GB | ollama run deepseek-r1 | - | DeepSeek-R1 
| 671B | 404GB | ollama run deepseek-r1:671b | - | Llama 3.3 | 70B | 43GB | ollama run llama3.3 | - | Llama 3.2 | 3B | 2.0GB | ollama run llama3.2 | - | Llama 3.2 | 1B | 1.3GB | ollama run llama3.2:1b | - | Llama 3.2 Vision | 11B | 7.9GB | ollama run llama3.2-vision | - | Llama 3.2 Vision | 90B | 55GB | ollama run llama3.2-vision:90b | - | Llama 3.1 | 8B | 4.7GB | ollama run llama3.1 | - | Llama 3.1 | 405B | 231GB | ollama run llama3.1:405b | - | Phi 4 | 14B | 9.1GB | ollama run phi4 | - | Phi 4 Mini | 3.8B | 2.5GB | ollama run phi4-mini | - | Mistral | 7B | 4.1GB | ollama run mistral | - | Moondream 2 | 1.4B | 829MB | ollama run moondream | - | Neural Chat | 7B | 4.1GB | ollama run neural-chat | - | Starling | 7B | 4.1GB | ollama run starling-lm | - | Code Llama | 7B | 3.8GB | ollama run codellama | - | Llama 2 Uncensored | 7B | 3.8GB | ollama run llama2-uncensored | - | LLaVA | 7B | 4.5GB | ollama run llava | - | Granite-3.2 | 8B | 4.9GB | ollama run granite3.2 | - | Llama 3 | 8B | 4.7GB | ollama run llama3 | - | Phi 3 | 4B | 4.0GB | ollama run phi3 | - | Qwen 2 | 7B | 4.1GB | ollama run qwen2 | + | Gemma 3 | 1B | 815MB | gemma3:1b | + | Gemma 3 | 4B | 3.3GB | gemma3 | + | Gemma 3 | 12B | 8.1GB | gemma3:12b | + | Gemma 3 | 27B | 17GB | gemma3:27b | + | QwQ | 32B | 20GB | qwq | + | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | + | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | + | Llama 3.3 | 70B | 43GB | llama3.3 | + | Llama 3.2 | 3B | 2.0GB | llama3.2 | + | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | + | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | + | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | + | Llama 3.1 | 8B | 4.7GB | llama3.1 | + | Llama 3.1 | 405B | 231GB | llama3.1:405b | + | Phi 4 | 14B | 9.1GB | phi4 | + | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | + | Mistral | 7B | 4.1GB | mistral | + | Moondream 2 | 1.4B | 829MB | moondream | + | Neural Chat | 7B | 4.1GB | neural-chat | + | Starling | 7B | 4.1GB | starling-lm | + | Code Llama | 7B | 3.8GB | codellama | + | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | + | LLaVA | 7B | 4.5GB | llava | + | Granite-3.2 | 8B | 4.9GB | granite3.2 | + | Llama 3 | 8B | 4.7GB | llama3 | + | Phi 3 | 4B | 4.0GB | phi3 | + | Qwen 2 | 7B | 4.1GB | qwen2 | ### HuggingFace Models diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3b2b8ca..128d1da 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -165,7 +165,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") or "ollama" in model_name.lower() + self.is_ollama = model_name.startswith("ollama:") or "Ollama - " in model_name if self.is_ollama: # Extract the actual model name from the prefix From 6e18eb34a6be276fc5d4800bf3e9b6c6393f787c Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 21:34:41 +0200 Subject: [PATCH 03/16] fix: bugfix --- agentic_rag/local_rag_agent.py | 147 +++++++++++++-------------------- 1 file changed, 58 insertions(+), 89 deletions(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 128d1da..8de8b64 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,7 +57,10 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Use the model name directly without any transformation + # 
Ensure model name has :latest suffix + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" + self.model_name = model_name self._check_ollama_running() @@ -74,13 +77,11 @@ def _check_ollama_running(self): # Check if the requested model is available if self.model_name not in available_models: - # Try with :latest suffix - if f"{self.model_name}:latest" in available_models: - self.model_name = f"{self.model_name}:latest" - print(f"Using model with :latest suffix: {self.model_name}") - else: - print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") - print(f"You can pull it with: ollama pull {self.model_name}") + print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") + print(f"You can pull it with: ollama pull {self.model_name}") + raise ValueError(f"Model '{self.model_name}' not found in Ollama") + else: + print(f"Using Ollama model: {self.model_name}") except Exception as e: raise ConnectionError(f"Failed to connect to Ollama. Please make sure Ollama is running. Error: {str(e)}") @@ -92,6 +93,9 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw try: import ollama + print(f"\nGenerating response with Ollama model: {self.model_name}") + print(f"Prompt: {prompt[:100]}...") # Print first 100 chars of prompt + # Generate text response = ollama.generate( model=self.model_name, @@ -103,6 +107,8 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw } ) + print(f"Response generated successfully with {self.model_name}") + # Format result to match transformers pipeline output formatted_result = [{ "generated_text": response["response"] @@ -114,7 +120,7 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw raise Exception(f"Failed to generate text with Ollama: {str(e)}") class LocalRAGAgent: - def __init__(self, vector_store: VectorStore = None, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", + def __init__(self, vector_store: VectorStore = None, model_name: str = None, use_cot: bool = False, collection: str = None, skip_analysis: bool = False, quantization: str = None, use_oracle_db: bool = True): """Initialize local RAG agent with vector store and local LLM @@ -165,7 +171,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") or "Ollama - " in model_name + self.is_ollama = model_name and (model_name.startswith("ollama:") or "Ollama - " in model_name) if self.is_ollama: # Extract the actual model name from the prefix @@ -178,6 +184,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala else: ollama_model_name = model_name + # Add :latest suffix if not present + if not ollama_model_name.endswith(":latest"): + ollama_model_name = f"{ollama_model_name}:latest" + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {ollama_model_name}") @@ -188,87 +198,46 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala # Create pipeline-like interface self.pipeline = self.ollama_handler - + print(f"Using Ollama model: {ollama_model_name}") else: - # Load HuggingFace token from config - try: - with open('config.yaml', 'r') as f: - config = yaml.safe_load(f) - token = config.get('HUGGING_FACE_HUB_TOKEN') - if not token: - raise 
ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") - except Exception as e: - raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") - - # Load model and tokenizer - print("\nLoading model and tokenizer...") - print(f"Model: {model_name}") - if quantization: - print(f"Quantization: {quantization}") - print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") - print("Subsequent queries will be faster but may still take 30-60 seconds per response.") - - # Check if CUDA is available and set appropriate dtype - if torch.cuda.is_available(): - print("CUDA is available. Using GPU acceleration.") - dtype = torch.float16 + # Only initialize Mistral if no model is specified + if not model_name: + print("\nLoading default model and tokenizer...") + print("Model: mistralai/Mistral-7B-Instruct-v0.2") + self.model_name = "mistralai/Mistral-7B-Instruct-v0.2" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using default model: {self.model_name}") else: - print("CUDA is not available. Using CPU only (this will be slow).") - dtype = torch.float32 - - # Set up model loading parameters - model_kwargs = { - "torch_dtype": dtype, - "device_map": "auto", - "token": token, - "low_cpu_mem_usage": True, - "offload_folder": "offload" - } - - # Apply quantization if specified - if quantization == '4bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4" - ) - model_kwargs["quantization_config"] = quantization_config - print("Using 4-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. Falling back to standard loading.") - print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes") - elif quantization == '8bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - model_kwargs["quantization_config"] = quantization_config - print("Using 8-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. 
Falling back to standard loading.") - print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes") - - # Load model with appropriate settings - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) - - # Create text generation pipeline with optimized settings - self.pipeline = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - max_new_tokens=512, - do_sample=True, - temperature=0.1, - top_p=0.95, - device_map="auto" - ) - print("✓ Model loaded successfully") + print(f"\nUsing specified model: {model_name}") + self.model_name = model_name + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using specified model: {self.model_name}") # Create LLM wrapper self.llm = LocalLLM(self.pipeline) From 7e840522c39cca9510b70d70e69f36484f4efd52 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:15:02 +0200 Subject: [PATCH 04/16] feat: model names fix --- agentic_rag/gradio_app.py | 58 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 2 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index f18b5df..06c98c7 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -428,35 +428,35 @@ def create_interface(): ### Ollama Models - | Model | Parameters | Size | Download Command | - |-------|------------|------|-----------------| - | Gemma 3 | 1B | 815MB | gemma3:1b | - | Gemma 3 | 4B | 3.3GB | gemma3 | - | Gemma 3 | 12B | 8.1GB | gemma3:12b | - | Gemma 3 | 27B | 17GB | gemma3:27b | - | QwQ | 32B | 20GB | qwq | - | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | - | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | - | Llama 3.3 | 70B | 43GB | llama3.3 | - | Llama 3.2 | 3B | 2.0GB | llama3.2 | - | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | - | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | - | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | - | Llama 3.1 | 8B | 4.7GB | llama3.1 | - | Llama 3.1 | 405B | 231GB | llama3.1:405b | - | Phi 4 | 14B | 9.1GB | phi4 | - | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | - | Mistral | 7B | 4.1GB | mistral | - | Moondream 2 | 1.4B | 829MB | moondream | - | Neural Chat | 7B | 4.1GB | neural-chat | - | Starling | 7B | 4.1GB | starling-lm | - | Code Llama | 7B | 3.8GB | codellama | - | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | - | LLaVA | 7B | 4.5GB | llava | - | Granite-3.2 | 8B | 4.9GB | granite3.2 | - | Llama 3 | 8B | 4.7GB | llama3 | - | Phi 3 | 4B | 4.0GB | phi3 | - | Qwen 2 | 7B | 4.1GB | qwen2 | + | Model | Parameters | Size | Download Command | Description | Pulls | Tags | Last Updated | + |-------|------------|------|-----------------|-------------|-------|------|--------------| + | Gemma 3 | 1B | 815MB | gemma3:1b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | Gemma 3 | 4B | 3.3GB | gemma3 | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | Gemma 3 | 12B | 8.1GB | gemma3:12b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 
2 weeks ago | + | Gemma 3 | 27B | 17GB | gemma3:27b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | + | QwQ | 32B | 20GB | qwq | QwQ is the reasoning model of the Qwen series | 1.2M | 8 | 4 weeks ago | + | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | + | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | + | Llama 3.3 | 70B | 43GB | llama3.3 | New state of the art 70B model. Llama 3.3 70B offers similar performance compared to the Llama 3.1 405B model | 1.7M | 14 | 4 months ago | + | Llama 3.2 | 3B | 2.0GB | llama3.2 | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | + | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | + | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | + | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | + | Llama 3.1 | 8B | 4.7GB | llama3.1 | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | + | Llama 3.1 | 405B | 231GB | llama3.1:405b | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | + | Phi 4 | 14B | 9.1GB | phi4 | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | + | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | + | Mistral | 7B | 4.1GB | mistral | The 7B model released by Mistral AI, updated to version 0.3 | 11.6M | 84 | 8 months ago | + | Moondream 2 | 1.4B | 829MB | moondream | A series of multimodal LLMs (MLLMs) designed for vision-language understanding | 946.6K | 17 | 4 months ago | + | Neural Chat | 7B | 4.1GB | neural-chat | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | + | Starling | 7B | 4.1GB | starling-lm | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | + | Code Llama | 7B | 3.8GB | codellama | A large language model that can use text prompts to generate and discuss code | 1.9M | 199 | 8 months ago | + | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | Uncensored Llama 2 model by George Sung and Jarrad Hope | 913.2K | 34 | 17 months ago | + | LLaVA | 7B | 4.5GB | llava | LLaVA is a novel end-to-end trained large multimodal model for visual and language understanding | 4.8M | 98 | 14 months ago | + | Granite-3.2 | 8B | 4.9GB | granite3.2 | A high-performing and efficient model | 3.9M | 94 | 8 months ago | + | Llama 3 | 8B | 4.7GB | llama3 | Meta Llama 3: The most capable openly available LLM to date | 7.8M | 68 | 10 months ago | + | Phi 3 | 4B | 4.0GB | phi3 | Phi-3 is a family of lightweight 3B (Mini) and 14B (Medium) state-of-the-art open models | 3M | 72 | 8 months ago | + | Qwen 2 | 7B | 4.1GB | qwen2 | Qwen2 is a new series of large language models from Alibaba group | 4.2M | 97 | 7 months ago | ### HuggingFace Models diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 8de8b64..6ac2f8e 100644 --- a/agentic_rag/local_rag_agent.py +++ 
b/agentic_rag/local_rag_agent.py @@ -180,7 +180,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, if model_name.startswith("ollama:"): ollama_model_name = model_name.replace("ollama:", "") elif "Ollama - " in model_name: - ollama_model_name = model_name.replace("Ollama - ", "") + ollama_model_name = model_name.replace("Ollama - ", "").strip() else: ollama_model_name = model_name From 64b0dd8c74a85043d8c709224ae6ccd35462f40f Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:20:52 +0200 Subject: [PATCH 05/16] feat: fix model name initializations! --- agentic_rag/gradio_app.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 06c98c7..90461d9 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -141,8 +141,9 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, quantization = "8bit" model_type = "Local (Mistral)" elif "Ollama" in agent_type: + # Extract model name from agent_type (e.g., "Ollama - deepseek-r1" -> "deepseek-r1") + model_name = agent_type.replace("Ollama - ", "").strip() model_type = "Ollama" - # Model name will be extracted later else: model_type = agent_type @@ -156,10 +157,7 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) elif "Ollama" in model_type: - # For Ollama models - # Extract model name directly from the model_type - model_name = model_type.replace("Ollama - ", "").strip() - + # For Ollama models, use the extracted model_name directly try: agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis) From 8c3a6b6843c55575c591ea8b2fd5b053e2c2e2ea Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:56:13 +0200 Subject: [PATCH 06/16] fix: harmonizing of model names and selectors --- agentic_rag/gradio_app.py | 66 +++++++++++++++++----------------- agentic_rag/local_rag_agent.py | 24 ++++--------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 90461d9..25100a7 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -298,46 +298,46 @@ def create_interface(): # HF models first if token is available if hf_token: model_choices.extend([ - "Local (Mistral)", - "Local (Mistral) - 4-bit Quantized", - "Local (Mistral) - 8-bit Quantized", + "mistral", + "mistral-4bit", + "mistral-8bit", ]) # Then Ollama models (don't require HF token) model_choices.extend([ - "Ollama - llama3", - "Ollama - phi-3", - "Ollama - qwen2", + "llama3", + "phi-3", + "qwen2", # New Ollama models - "Ollama - gemma3:1b", - "Ollama - gemma3", - "Ollama - gemma3:12b", - "Ollama - gemma3:27b", - "Ollama - qwq", - "Ollama - deepseek-r1", - "Ollama - deepseek-r1:671b", - "Ollama - llama3.3", - "Ollama - llama3.2", - "Ollama - llama3.2:1b", - "Ollama - llama3.2-vision", - "Ollama - llama3.2-vision:90b", - "Ollama - llama3.1", - "Ollama - llama3.1:405b", - "Ollama - phi4", - "Ollama - phi4-mini", - "Ollama - mistral", - "Ollama - moondream", - "Ollama - neural-chat", - "Ollama - starling-lm", - "Ollama - codellama", - "Ollama - llama2-uncensored", - "Ollama - llava", - "Ollama - granite3.2" + "gemma3:1b", + "gemma3", + "gemma3:12b", + "gemma3:27b", + "qwq", + 
"deepseek-r1", + "deepseek-r1:671b", + "llama3.3", + "llama3.2", + "llama3.2:1b", + "llama3.2-vision", + "llama3.2-vision:90b", + "llama3.1", + "llama3.1:405b", + "phi4", + "phi4-mini", + "mistral", + "moondream", + "neural-chat", + "starling-lm", + "codellama", + "llama2-uncensored", + "llava", + "granite3.2" ]) if openai_key: - model_choices.append("OpenAI") + model_choices.append("openai") - # Set default model to Ollama - qwen2 - default_model = "Ollama - qwen2" + # Set default model to qwen2 + default_model = "qwen2" # Model Management Tab (First Tab) with gr.Tab("Model Management"): diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 6ac2f8e..249d944 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -170,35 +170,25 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.model_name = model_name # skip_analysis parameter kept for backward compatibility but no longer used - # Check if this is an Ollama model - self.is_ollama = model_name and (model_name.startswith("ollama:") or "Ollama - " in model_name) + # Check if this is an Ollama model (anything not Mistral is considered Ollama) + self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Extract the actual model name from the prefix - # If model_name contains 'ollama:' prefix, remove it - # If model_name is from gradio interface (e.g., "Ollama - llama3"), extract just the model name - if model_name.startswith("ollama:"): - ollama_model_name = model_name.replace("ollama:", "") - elif "Ollama - " in model_name: - ollama_model_name = model_name.replace("Ollama - ", "").strip() - else: - ollama_model_name = model_name - # Add :latest suffix if not present - if not ollama_model_name.endswith(":latest"): - ollama_model_name = f"{ollama_model_name}:latest" + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" # Load Ollama model print("\nLoading Ollama model...") - print(f"Model: {ollama_model_name}") + print(f"Model: {model_name}") print("Note: Make sure Ollama is running on your system.") # Initialize Ollama model handler - self.ollama_handler = OllamaModelHandler(ollama_model_name) + self.ollama_handler = OllamaModelHandler(model_name) # Create pipeline-like interface self.pipeline = self.ollama_handler - print(f"Using Ollama model: {ollama_model_name}") + print(f"Using Ollama model: {model_name}") else: # Only initialize Mistral if no model is specified if not model_name: From f1fb91f8d3a9f1d32a3dca75f4d9c7c99fe6e6f1 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 23:58:07 +0200 Subject: [PATCH 07/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 249d944..3adf336 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,10 +57,6 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Ensure model name has :latest suffix - if not model_name.endswith(":latest"): - model_name = f"{model_name}:latest" - self.model_name = model_name self._check_ollama_running() @@ -174,10 +170,6 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Add :latest suffix if not present - if not model_name.endswith(":latest"): - model_name = 
f"{model_name}:latest" - # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From 1b46a3b7e858ac82437528a1679f63eed1f742b8 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:00:11 +0200 Subject: [PATCH 08/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3adf336..daf9408 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,6 +57,10 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + self.model_name = model_name self._check_ollama_running() @@ -170,6 +174,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From 752ab954b133e62372899aa0cf2ffa41c11df6c8 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:43:45 +0200 Subject: [PATCH 09/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index daf9408..ab85d7f 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -481,7 +481,7 @@ def main(): parser = argparse.ArgumentParser(description="Query documents using local LLM") parser.add_argument("--query", required=True, help="Query to search for") parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use") - parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)") + parser.add_argument("--model", default="qwen2", help="Model to use (default: qwen2)") parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)") parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning") parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store") From 09c597cfc451ebbd59d8a3044bb028552aff02d6 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 00:46:59 +0200 Subject: [PATCH 10/16] fix: bugname with suffixes --- agentic_rag/local_rag_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index ab85d7f..b0ca35c 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -168,6 +168,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.collection = collection self.quantization = quantization self.model_name = model_name + print('Model Name pre-check:', model_name) # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model (anything not Mistral is considered Ollama) From 8ce9da8b30ed77bf25e01b0505bab291d6e5a8de Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:19:08 +0200 Subject: [PATCH 11/16] feat: added 
minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/k8s/MINIKUBE.md | 210 +++++++++++++++++++++++++ agentic_rag/k8s/OKE_DEPLOYMENT.md | 246 ++++++++++++++++++++++++++++++ agentic_rag/k8s/README_k8s.md | 95 ++++++++++++ agentic_rag/local_rag_agent.py | 11 +- 4 files changed, 561 insertions(+), 1 deletion(-) create mode 100644 agentic_rag/k8s/MINIKUBE.md create mode 100644 agentic_rag/k8s/OKE_DEPLOYMENT.md create mode 100644 agentic_rag/k8s/README_k8s.md diff --git a/agentic_rag/k8s/MINIKUBE.md b/agentic_rag/k8s/MINIKUBE.md new file mode 100644 index 0000000..cd48157 --- /dev/null +++ b/agentic_rag/k8s/MINIKUBE.md @@ -0,0 +1,210 @@ +# Quick Start with Minikube + +This guide provides instructions for deploying the Agentic RAG system on Minikube for local testing. + +## Prerequisites + +1. [Minikube](https://minikube.sigs.k8s.io/docs/start/) installed +2. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) installed +3. Docker or another container runtime installed +4. NVIDIA GPU with appropriate drivers installed +5. [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed + +## Step 1: Start Minikube with GPU Support + +Start Minikube with sufficient resources and GPU support: + +```bash +# For Linux +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=kvm2 --gpu + +# For Windows +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperv --gpu + +# For macOS (Note: GPU passthrough is limited on macOS) +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperkit +``` + +Verify that Minikube is running: + +```bash +minikube status +``` + +## Step 2: Install NVIDIA Device Plugin + +Install the NVIDIA device plugin to enable GPU support in Kubernetes: + +```bash +# Apply the NVIDIA device plugin +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml +``` + +Verify that the GPU is available in the cluster: + +```bash +kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +``` + +## Step 3: Clone the Repository + +Clone the repository containing the Kubernetes manifests: + +```bash +git clone https://github.com/devrel/devrel-labs.git +cd devrel-labs/agentic_rag/k8s +``` + +## Step 4: Deploy the Application + +The deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +### Option 1: Deploy without a Hugging Face token (Ollama models only) + +```bash +# Create a namespace +kubectl create namespace agentic-rag + +# Create an empty ConfigMap +cat <`. + +## Troubleshooting + +### Pod Stuck in Pending State + +If the pod is stuck in Pending state, check the events: + +```bash +kubectl describe pod -l app=agentic-rag -n agentic-rag +``` + +Common issues include: + +1. **Insufficient resources**: Ensure your node pool has enough resources +2. **GPU not available**: Ensure your node pool has GPU-enabled nodes +3. **Image pull issues**: Check if the image can be pulled from the registry + +### GPU-Related Issues + +If you encounter GPU-related issues: + +1. **Check GPU availability in OKE**: + ```bash + kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" + ``` + +2. **Verify NVIDIA device plugin is running**: + ```bash + kubectl get pods -n kube-system | grep nvidia-device-plugin + ``` + +3. 
**Check if GPU is available to the pod**: + ```bash + kubectl describe pod -l app=agentic-rag -n agentic-rag | grep -A5 'Allocated resources' + ``` + +4. **Check NVIDIA driver installation on the node**: + ```bash + # Get the node name + NODE_NAME=$(kubectl get pod -l app=agentic-rag -n agentic-rag -o jsonpath='{.items[0].spec.nodeName}') + + # Create a debug pod on the node + kubectl debug node/$NODE_NAME -it --image=ubuntu + + # Inside the debug pod + chroot /host + nvidia-smi + ``` + +### Load Balancer Issues + +If the load balancer is not provisioning or not accessible: + +1. Check the service status: + ```bash + kubectl get service agentic-rag -n agentic-rag + ``` + +2. Check OCI Console for load balancer status and configuration + +3. Ensure your VCN security lists allow traffic to the load balancer + +## Scaling + +To scale the deployment: + +```bash +kubectl scale deployment agentic-rag -n agentic-rag --replicas=2 +``` + +Note: Each replica will require its own GPU. + +## Cleanup + +To remove all resources: + +```bash +kubectl delete namespace agentic-rag +``` + +To delete the OCI Load Balancer (if it's not automatically deleted): + +1. Navigate to the Load Balancers page in the OCI Console +2. Find the load balancer created for your service +3. Click "Delete" and confirm \ No newline at end of file diff --git a/agentic_rag/k8s/README_k8s.md b/agentic_rag/k8s/README_k8s.md new file mode 100644 index 0000000..a6bc5fd --- /dev/null +++ b/agentic_rag/k8s/README_k8s.md @@ -0,0 +1,95 @@ +# Kubernetes Deployment for Agentic RAG + +This directory contains Kubernetes manifests for deploying the Agentic RAG system. + +## Prerequisites + +- Kubernetes cluster (e.g., Oracle Kubernetes Engine, Minikube, or any other Kubernetes cluster) +- `kubectl` configured to access your cluster +- At least 8GB of RAM and 4 CPU cores available for the deployment + +## Deployment + +This deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +1. **Update the ConfigMap with your Hugging Face token** (optional but recommended): + + ```bash + # Edit the configmap.yaml file + nano local-deployment/configmap.yaml + + # Replace "your-huggingface-token" with your actual token + ``` + +2. **Deploy the application**: + + ```bash + kubectl apply -f local-deployment/configmap.yaml + kubectl apply -f local-deployment/deployment.yaml + kubectl apply -f local-deployment/service.yaml + ``` + +3. **Access the application**: + + If using LoadBalancer: + ```bash + kubectl get service agentic-rag + ``` + + If using NodePort: + ```bash + # Get the NodePort + kubectl get service agentic-rag + + # Access the application at http://: + ``` + +## Model Selection + +The deployment includes both Hugging Face models and Ollama models: + +- **Hugging Face Models**: Mistral-7B models (requires token in config.yaml) +- **Ollama Models**: llama3, phi3, and qwen2 (automatically downloaded during deployment) + +You can select which model to use from the Gradio interface after deployment. + +## Monitoring and Troubleshooting + +### Check pod status: + +```bash +kubectl get pods +``` + +### View logs: + +```bash +kubectl logs -f deployment/agentic-rag +``` + +### Shell into the pod: + +```bash +kubectl exec -it deployment/agentic-rag -- /bin/bash +``` + +## Scaling + +For production deployments, consider: + +1. Using persistent volumes for data storage +2. Adjusting resource requests and limits based on your workload +3. 
Setting up proper monitoring and logging +4. Implementing horizontal pod autoscaling + +## Cleanup + +To remove the deployment: + +```bash +kubectl delete -f local-deployment/ +``` + +## Future Work + +A distributed system deployment that separates the LLM inference system into its own service is planned for future releases. This will allow for better resource allocation and scaling in production environments. \ No newline at end of file diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index b0ca35c..e6558d3 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -134,6 +134,13 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, quantization: Quantization method to use (None, '4bit', '8bit') use_oracle_db: Whether to use Oracle DB for vector storage (if False, uses ChromaDB) """ + print(f"LocalRAGAgent init - model_name: {model_name}") + + # Set default model if none provided + if model_name is None: + model_name = "qwen2" + print(f"Using default model: {model_name}") + # Initialize vector store if not provided self.use_oracle_db = use_oracle_db and ORACLE_DB_AVAILABLE @@ -168,7 +175,7 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, self.collection = collection self.quantization = quantization self.model_name = model_name - print('Model Name pre-check:', model_name) + print('Model Name after assignment:', self.model_name) # skip_analysis parameter kept for backward compatibility but no longer used # Check if this is an Ollama model (anything not Mistral is considered Ollama) @@ -501,6 +508,7 @@ def main(): print("\nInitializing RAG agent...") print("=" * 50) + print(f"Using model: {args.model}") try: # Determine which vector store to use based on args.embeddings @@ -527,6 +535,7 @@ def main(): # Set use_oracle_db based on the actual store type use_oracle_db = args.embeddings == "oracle" and isinstance(store, OraDBVectorStore) + print(f"Creating LocalRAGAgent with model: {args.model}") agent = LocalRAGAgent( store, model_name=args.model, From 7a9dd4419e5f03f5cce771cb786629f9d66b2219 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:37:46 +0200 Subject: [PATCH 12/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 199 +++++---------------------------- agentic_rag/local_rag_agent.py | 4 + 2 files changed, 32 insertions(+), 171 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 25100a7..a011d63 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -295,196 +295,53 @@ def create_interface(): # Create model choices list for reuse model_choices = [] - # HF models first if token is available - if hf_token: - model_choices.extend([ - "mistral", - "mistral-4bit", - "mistral-8bit", - ]) - # Then Ollama models (don't require HF token) + # Only Ollama models (no more local Mistral deployments) model_choices.extend([ - "llama3", - "phi-3", - "qwen2", - # New Ollama models - "gemma3:1b", - "gemma3", - "gemma3:12b", - "gemma3:27b", "qwq", - "deepseek-r1", - "deepseek-r1:671b", + "gemma3", "llama3.3", - "llama3.2", - "llama3.2:1b", - "llama3.2-vision", - "llama3.2-vision:90b", - "llama3.1", - "llama3.1:405b", "phi4", - "phi4-mini", "mistral", - "moondream", - "neural-chat", - "starling-lm", - "codellama", - "llama2-uncensored", "llava", - "granite3.2" + "phi3", + "deepseek-r1" ]) if openai_key: 
model_choices.append("openai") - # Set default model to qwen2 - default_model = "qwen2" + # Set default model to qwq + default_model = "qwq" # Model Management Tab (First Tab) with gr.Tab("Model Management"): gr.Markdown(""" - ## Model Management - - Download models in advance to prepare them for use in the chat interface. - - ### Hugging Face Models - - For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. - - ### Ollama Models (Default) - - Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client. - Make sure Ollama is installed and running on your system. - You can download Ollama from [ollama.com/download](https://ollama.com/download) + ## Model Selection + Choose your preferred model for the conversation. """) - with gr.Row(): - with gr.Column(): - model_dropdown = gr.Dropdown( - choices=model_choices, - value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, - label="Select Model to Download", - interactive=True - ) - download_button = gr.Button("Download Selected Model") - model_status = gr.Textbox( - label="Download Status", - placeholder="Select a model and click Download to begin...", - interactive=False - ) - - with gr.Column(): - gr.Markdown(""" - ### Model Information - - **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - High-quality model with good performance - - **Ollama - llama3**: Meta's Llama 3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Excellent performance and quality - - **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Efficient small model with good performance - - **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model. - - Size: ~14GB - - VRAM Required: ~8GB - - Good balance of quality and speed - - **Local (Mistral) - 4-bit Quantized**: 4-bit quantized version of Mistral-7B. - - Size: ~4GB - - VRAM Required: ~4GB - - Faster inference with minimal quality loss - - **Local (Mistral) - 8-bit Quantized**: 8-bit quantized version of Mistral-7B. - - Size: ~7GB - - VRAM Required: ~6GB - - Balance between quality and memory usage - - For a complete list of supported models and specifications, see the **Model FAQ** tab. - """) - - # Model FAQ Tab - with gr.Tab("Model FAQ"): - gr.Markdown(""" - ## Model Information & Technical Requirements - - This page provides detailed information about all supported models, including size, parameter count, and hardware requirements. 
- - ### Memory Requirements + model_dropdown = gr.Dropdown( + choices=model_choices, + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" + ) - As a general guideline: - - You should have at least 8 GB of RAM available to run 7B parameter models - - You should have at least 16 GB of RAM available to run 13B parameter models - - You should have at least 32 GB of RAM available to run 33B+ parameter models - - For vision models, additional memory is required for image processing - - ### Ollama Models - - | Model | Parameters | Size | Download Command | Description | Pulls | Tags | Last Updated | - |-------|------------|------|-----------------|-------------|-------|------|--------------| - | Gemma 3 | 1B | 815MB | gemma3:1b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 4B | 3.3GB | gemma3 | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 12B | 8.1GB | gemma3:12b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | Gemma 3 | 27B | 17GB | gemma3:27b | The current, most capable model that runs on a single GPU | 3.4M | 17 | 2 weeks ago | - | QwQ | 32B | 20GB | qwq | QwQ is the reasoning model of the Qwen series | 1.2M | 8 | 4 weeks ago | - | DeepSeek-R1 | 7B | 4.7GB | deepseek-r1 | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | - | DeepSeek-R1 | 671B | 404GB | deepseek-r1:671b | DeepSeek's first-generation of reasoning models with comparable performance to OpenAI-o1 | 35.5M | 29 | 2 months ago | - | Llama 3.3 | 70B | 43GB | llama3.3 | New state of the art 70B model. Llama 3.3 70B offers similar performance compared to the Llama 3.1 405B model | 1.7M | 14 | 4 months ago | - | Llama 3.2 | 3B | 2.0GB | llama3.2 | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | - | Llama 3.2 | 1B | 1.3GB | llama3.2:1b | Meta's Llama 3.2 goes small with 1B and 3B models | 12.8M | 63 | 6 months ago | - | Llama 3.2 Vision | 11B | 7.9GB | llama3.2-vision | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | - | Llama 3.2 Vision | 90B | 55GB | llama3.2-vision:90b | Llama 3.2 Vision is a collection of instruction-tuned image reasoning generative models | 1.8M | 9 | 5 months ago | - | Llama 3.1 | 8B | 4.7GB | llama3.1 | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | - | Llama 3.1 | 405B | 231GB | llama3.1:405b | Llama 3.1 is a new state-of-the-art model from Meta | 89.6M | 93 | 4 months ago | - | Phi 4 | 14B | 9.1GB | phi4 | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | - | Phi 4 Mini | 3.8B | 2.5GB | phi4-mini | Phi-4 is a 14B parameter, state-of-the-art open model from Microsoft | 1.5M | 5 | 3 months ago | - | Mistral | 7B | 4.1GB | mistral | The 7B model released by Mistral AI, updated to version 0.3 | 11.6M | 84 | 8 months ago | - | Moondream 2 | 1.4B | 829MB | moondream | A series of multimodal LLMs (MLLMs) designed for vision-language understanding | 946.6K | 17 | 4 months ago | - | Neural Chat | 7B | 4.1GB | neural-chat | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | - | Starling | 7B | 4.1GB | starling-lm | A state-of-the-art 12B model with 128k context length | 1.5M | 17 | 8 months ago | - | Code Llama | 7B | 3.8GB | codellama | A large 
language model that can use text prompts to generate and discuss code | 1.9M | 199 | 8 months ago | - | Llama 2 Uncensored | 7B | 3.8GB | llama2-uncensored | Uncensored Llama 2 model by George Sung and Jarrad Hope | 913.2K | 34 | 17 months ago | - | LLaVA | 7B | 4.5GB | llava | LLaVA is a novel end-to-end trained large multimodal model for visual and language understanding | 4.8M | 98 | 14 months ago | - | Granite-3.2 | 8B | 4.9GB | granite3.2 | A high-performing and efficient model | 3.9M | 94 | 8 months ago | - | Llama 3 | 8B | 4.7GB | llama3 | Meta Llama 3: The most capable openly available LLM to date | 7.8M | 68 | 10 months ago | - | Phi 3 | 4B | 4.0GB | phi3 | Phi-3 is a family of lightweight 3B (Mini) and 14B (Medium) state-of-the-art open models | 3M | 72 | 8 months ago | - | Qwen 2 | 7B | 4.1GB | qwen2 | Qwen2 is a new series of large language models from Alibaba group | 4.2M | 97 | 7 months ago | - - ### HuggingFace Models - - | Model | Parameters | Size | Quantization | VRAM Required | - |-------|------------|------|--------------|---------------| - | Mistral | 7B | 14GB | None | 8GB | - | Mistral | 7B | 4GB | 4-bit | 4GB | - | Mistral | 7B | 7GB | 8-bit | 6GB | - - ### Recommended Models - - **Best Overall Performance**: - - Ollama - llama3 - - Ollama - llama3.2-vision (for image processing) - - Ollama - phi4 - - **Best for Limited Hardware (8GB RAM)**: - - Ollama - llama3.2:1b - - Ollama - gemma3:1b - - Ollama - phi4-mini - - Ollama - moondream + # Add model FAQ section + gr.Markdown(""" + ## Model FAQ - **Best for Code Tasks**: - - Ollama - codellama - - Ollama - deepseek-r1 + | Model | Parameters | Size | Download Command | + |-------|------------|------|------------------| + | qwq | 7B | 4.1GB | qwq:latest | + | gemma3 | 7B | 4.1GB | gemma3:latest | + | llama3.3 | 7B | 4.1GB | llama3.3:latest | + | phi4 | 7B | 4.1GB | phi4:latest | + | mistral | 7B | 4.1GB | mistral:latest | + | llava | 7B | 4.1GB | llava:latest | + | phi3 | 7B | 4.1GB | phi3:latest | + | deepseek-r1 | 7B | 4.1GB | deepseek-r1:latest | - **Best for Enterprise Use**: - - Ollama - qwen2 - - Ollama - granite3.2 - - Ollama - neural-chat + Note: All models are available through Ollama. Make sure Ollama is running on your system. 
""") # Document Processing Tab diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index e6558d3..a64f4c7 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -186,6 +186,10 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = None, if model_name and model_name.startswith("ollama:"): model_name = model_name.replace("ollama:", "") + # Always append :latest to Ollama model names + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" + # Load Ollama model print("\nLoading Ollama model...") print(f"Model: {model_name}") From ac1a470c5869135767fc500b1c01de0f68e53e60 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:49:04 +0200 Subject: [PATCH 13/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index a011d63..295f093 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -434,9 +434,6 @@ def create_interface(): url_button.click(process_url, inputs=[url_input], outputs=[url_output]) repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output]) - # Model download event handler - download_button.click(download_model, inputs=[model_dropdown], outputs=[model_status]) - # Standard chat handlers standard_msg.submit( chat, From a71753ef5409445bb036e4e1cd3c912afc6ce5fa Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 01:58:19 +0200 Subject: [PATCH 14/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 295f093..4d9455d 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -319,12 +319,20 @@ def create_interface(): Choose your preferred model for the conversation. 
""") - model_dropdown = gr.Dropdown( - choices=model_choices, - value=default_model, - label="Select Model", - info="Choose the model to use for the conversation" - ) + with gr.Row(): + with gr.Column(): + model_dropdown = gr.Dropdown( + choices=model_choices, + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" + ) + download_button = gr.Button("Download Selected Model") + model_status = gr.Textbox( + label="Download Status", + placeholder="Select a model and click Download to begin...", + interactive=False + ) # Add model FAQ section gr.Markdown(""" @@ -332,14 +340,14 @@ def create_interface(): | Model | Parameters | Size | Download Command | |-------|------------|------|------------------| - | qwq | 7B | 4.1GB | qwq:latest | - | gemma3 | 7B | 4.1GB | gemma3:latest | - | llama3.3 | 7B | 4.1GB | llama3.3:latest | - | phi4 | 7B | 4.1GB | phi4:latest | + | qwq | 32B | 20GB | qwq:latest | + | gemma3 | 4B | 3.3GB | gemma3:latest | + | llama3.3 | 70B | 43GB | llama3.3:latest | + | phi4 | 14B | 9.1GB | phi4:latest | | mistral | 7B | 4.1GB | mistral:latest | - | llava | 7B | 4.1GB | llava:latest | - | phi3 | 7B | 4.1GB | phi3:latest | - | deepseek-r1 | 7B | 4.1GB | deepseek-r1:latest | + | llava | 7B | 4.5GB | llava:latest | + | phi3 | 4B | 4.0GB | phi3:latest | + | deepseek-r1 | 7B | 4.7GB | deepseek-r1:latest | Note: All models are available through Ollama. Make sure Ollama is running on your system. """) @@ -434,6 +442,9 @@ def create_interface(): url_button.click(process_url, inputs=[url_input], outputs=[url_output]) repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output]) + # Model download event handler + download_button.click(download_model, inputs=[model_dropdown], outputs=[model_status]) + # Standard chat handlers standard_msg.submit( chat, From 6e5055221f2b7edf108b152c4459c7d9f5c5843d Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 02:00:53 +0200 Subject: [PATCH 15/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 4d9455d..228b958 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -651,8 +651,8 @@ def download_model(model_type: str) -> str: except Exception as e: return f"❌ Error downloading model: {str(e)}" - - elif "Ollama" in model_type: + # all ollama models + else: # Extract model name from model_type # Remove the 'Ollama - ' prefix and any leading/trailing whitespace model_name = model_type.replace("Ollama - ", "").strip() @@ -703,8 +703,6 @@ def download_model(model_type: str) -> str: return "❌ Error: Could not connect to Ollama. Please make sure Ollama is installed and running." 
except Exception as e: return f"❌ Error pulling Ollama model: {str(e)}" - else: - return "❌ Error: Unknown model type" except Exception as e: return f"❌ Error: {str(e)}" From 5033bc1edbf0ced4f96e7075fd3fd085f78fc10e Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Sat, 12 Apr 2025 02:19:19 +0200 Subject: [PATCH 16/16] feat: added minikube, oke deployment, standard k8s readme, and updated local rag agent with more debug options --- agentic_rag/gradio_app.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 228b958..7c39376 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -140,15 +140,22 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, elif "8-bit" in agent_type: quantization = "8bit" model_type = "Local (Mistral)" - elif "Ollama" in agent_type: - # Extract model name from agent_type (e.g., "Ollama - deepseek-r1" -> "deepseek-r1") - model_name = agent_type.replace("Ollama - ", "").strip() - model_type = "Ollama" + elif agent_type == "openai": + model_type = "OpenAI" else: - model_type = agent_type + # All other models are treated as Ollama models + model_type = "Ollama" + model_name = agent_type # Select appropriate agent and reinitialize with correct settings - if "Local" in model_type: + if model_type == "OpenAI": + if not openai_key: + response_text = "OpenAI key not found. Please check your config." + print(f"Error: {response_text}") + return history + [[message, response_text]] + agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + elif model_type == "Local (Mistral)": # For HF models, we need the token if not hf_token: response_text = "Local agent not available. Please check your HuggingFace token configuration." @@ -156,27 +163,14 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif "Ollama" in model_type: - # For Ollama models, use the extracted model_name directly + else: # Ollama models try: agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis) except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - if not openai_key: - response_text = "OpenAI key not found. Please check your config." + response_text = f"Error initializing Ollama model: {str(e)}" print(f"Error: {response_text}") return history + [[message, response_text]] - agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) # Process query and get response print("Processing query...")
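Patch 16 collapses model selection in `chat()` to three routes: OpenAI, local Mistral (with optional quantization), and everything else as an Ollama model name. A sketch of that routing as a standalone helper, assuming a case-insensitive OpenAI check since the dropdown values shown earlier use the capitalised label "OpenAI":

```python
def resolve_model(agent_type: str):
    """Sketch: map a dropdown value to (model_type, model_name, quantization)."""
    if "4-bit" in agent_type:
        return "Local (Mistral)", None, "4bit"
    if "8-bit" in agent_type:
        return "Local (Mistral)", None, "8bit"
    if agent_type.startswith("Local (Mistral)"):
        return "Local (Mistral)", None, None
    if agent_type.lower() == "openai":
        # Patch 16 compares against the lowercase literal "openai"; lowering the
        # input keeps the capitalised dropdown label "OpenAI" on this route.
        return "OpenAI", None, None
    # Everything else falls through to Ollama, as in patch 16.
    return "Ollama", agent_type, None

assert resolve_model("OpenAI")[0] == "OpenAI"
assert resolve_model("Ollama - mistral")[0] == "Ollama"
```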