In [8]:
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from pydantic import Field
import torch
from typing import Optional, List, Any
import warnings
import logging

# Suppress warnings
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CustomVietnameseLLM(LLM):
    """
    Custom LangChain LLM wrapper cho Vietnamese model - s·ª≠ d·ª•ng HuggingFace Transformers thay v√¨ Unsloth
    """
    
    # Define Pydantic fields
    model_name: str = Field(default="vinhthuan/vietnamese-news-summarizer-v2")
    max_seq_length: int = Field(default=2048)
    device: str = Field(default="auto")
    model: Any = Field(default=None, exclude=True)
    tokenizer: Any = Field(default=None, exclude=True)
    
    def __init__(self, model_name: str = "vinhthuan/vietnamese-news-summarizer-v2", 
                 max_seq_length: int = 2048, device: str = "auto", **kwargs):
        super().__init__(
            model_name=model_name,
            max_seq_length=max_seq_length,
            device=device,
            **kwargs
        )
        self._load_model()
    
    def _load_model(self):
        """Load model v√† tokenizer using HuggingFace Transformers"""
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            
            logger.info(f"üì• Loading model: {self.model_name}")
            
            # Determine device
            if self.device == "auto":
                if torch.cuda.is_available():
                    device = "cuda"
                    logger.info("üöÄ Using CUDA")
                else:
                    device = "cpu"
                    logger.info("üíª Using CPU")
            else:
                device = self.device
            
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )
            
            # Add pad token if not exists
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with appropriate settings
            model_kwargs = {
                "trust_remote_code": True,
                "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
            }
            
            # Add quantization for GPU if available
            if device == "cuda":
                try:
                    from transformers import BitsAndBytesConfig
                    quantization_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4"
                    )
                    model_kwargs["quantization_config"] = quantization_config
                    logger.info("üîß Using 4-bit quantization")
                except ImportError:
                    logger.warning("‚ö†Ô∏è BitsAndBytesConfig not available, loading without quantization")
            
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )
            
            if device == "cuda" and "quantization_config" not in model_kwargs:
                self.model = self.model.to(device)
            
            # Set to evaluation mode
            self.model.eval()
            
            logger.info("‚úÖ Model loaded successfully!")
            
        except Exception as e:
            logger.error(f"‚ùå Failed to load model: {str(e)}")
            raise
    
    def _create_qa_prompt(self, context: str, question: str) -> str:
        """
        T·∫°o prompt cho Q&A task
        """
        return f"""<|im_start|>system
B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¥ng minh v√† h·ªØu √≠ch. Nhi·ªám v·ª• c·ªßa b·∫°n l√† tr·∫£ l·ªùi c√¢u h·ªèi d·ª±a tr√™n th√¥ng tin ƒë∆∞·ª£c cung c·∫•p. 
H√£y tr·∫£ l·ªùi ch√≠nh x√°c, ng·∫Øn g·ªçn v√† h·ªØu √≠ch. N·∫øu kh√¥ng c√≥ th√¥ng tin ƒë·ªß ƒë·ªÉ tr·∫£ l·ªùi, h√£y th√¥ng b√°o m·ªôt c√°ch l·ªãch s·ª±.
<|im_end|>
<|im_start|>user
D·ª±a v√†o th√¥ng tin sau ƒë√¢y:

{context}

H√£y tr·∫£ l·ªùi c√¢u h·ªèi: {question}
<|im_end|>
<|im_start|>assistant
"""
    
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Main method ƒë·ªÉ generate response"""
        try:
            # Tokenize input
            inputs = self.tokenizer(
                prompt, 
                return_tensors="pt", 
                truncation=True, 
                max_length=self.max_seq_length - 200  # Leave space for generation
            )
            
            # Move to device if needed
            device = next(self.model.parameters()).device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    min_new_tokens=10,
                    max_new_tokens=kwargs.get('max_new_tokens', 200),
                    do_sample=True,
                    temperature=kwargs.get('temperature', 0.7),
                    top_p=kwargs.get('top_p', 0.9),
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )
            
            # Decode response
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
            
            # Extract answer from response
            if "<|im_start|>assistant" in response:
                answer = response.split("<|im_start|>assistant")[-1]
                # Clean up the answer
                for token in ["</s>", "<|im_end|>", "<|endoftext|>"]:
                    if token in answer:
                        answer = answer.split(token)[0]
                answer = answer.strip()
                if answer:
                    return answer
            
            # Fallback: get text after the original prompt
            input_length = len(self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True))
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            if len(full_response) > input_length:
                answer = full_response[input_length:].strip()
                if answer:
                    return answer
            
            return "Xin l·ªói, t√¥i kh√¥ng th·ªÉ t·∫°o c√¢u tr·∫£ l·ªùi ph√π h·ª£p."
            
        except Exception as e:
            logger.error(f"‚ùå Error during generation: {str(e)}")
            return f"L·ªói khi t·∫°o c√¢u tr·∫£ l·ªùi: {str(e)}"
    
    @property
    def _llm_type(self) -> str:
        return "custom_vietnamese_llm"

# Alternative: Using HuggingFace Pipeline (simpler approach)
class HuggingFaceVietnameseLLM(LLM):
    """
    Simpler implementation using HuggingFace Pipeline
    """
    
    model_name: str = Field(default="vinhthuan/vietnamese-news-summarizer-v2")
    max_length: int = Field(default=512)
    pipeline: Any = Field(default=None, exclude=True)
    
    def __init__(self, model_name: str = "vinhthuan/vietnamese-news-summarizer-v2", 
                 max_length: int = 512, **kwargs):
        super().__init__(
            model_name=model_name,
            max_length=max_length,
            **kwargs
        )
        self._load_pipeline()
    
    def _load_pipeline(self):
        """Load HuggingFace pipeline"""
        try:
            from transformers import pipeline
            
            logger.info(f"üì• Loading pipeline for: {self.model_name}")
            
            # Determine device
            device = 0 if torch.cuda.is_available() else -1  # 0 for GPU, -1 for CPU
            
            self.pipeline = pipeline(
                "text-generation",
                model=self.model_name,
                tokenizer=self.model_name,
                device=device,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                trust_remote_code=True
            )
            
            logger.info("‚úÖ Pipeline loaded successfully!")
            
        except Exception as e:
            logger.error(f"‚ùå Failed to load pipeline: {str(e)}")
            raise
    
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate response using pipeline"""
        try:
            # Generate response
            result = self.pipeline(
                prompt,
                max_new_tokens=kwargs.get('max_new_tokens', 200),
                temperature=kwargs.get('temperature', 0.7),
                top_p=kwargs.get('top_p', 0.9),
                do_sample=True,
                return_full_text=False,  # Only return generated text
                pad_token_id=self.pipeline.tokenizer.eos_token_id,
            )
            
            if result and len(result) > 0:
                generated_text = result[0]['generated_text'].strip()
                
                # Clean up the response
                for token in ["</s>", "<|im_end|>", "<|endoftext|>"]:
                    if token in generated_text:
                        generated_text = generated_text.split(token)[0]
                
                return generated_text.strip() if generated_text else "Xin l·ªói, t√¥i kh√¥ng th·ªÉ t·∫°o c√¢u tr·∫£ l·ªùi ph√π h·ª£p."
            
            return "Xin l·ªói, t√¥i kh√¥ng th·ªÉ t·∫°o c√¢u tr·∫£ l·ªùi ph√π h·ª£p."
            
        except Exception as e:
            logger.error(f"‚ùå Error during generation: {str(e)}")
            return f"L·ªói khi t·∫°o c√¢u tr·∫£ l·ªùi: {str(e)}"
    
    @property
    def _llm_type(self) -> str:
        return "huggingface_vietnamese_llm"

# Fallback: Simple Vietnamese LLM using a more reliable model
class SimpleVietnameseLLM(LLM):
    """
    Fallback implementation using a simpler, more reliable Vietnamese model
    """
    
    model_name: str = Field(default="vinai/phobert-base")
    pipeline: Any = Field(default=None, exclude=True)
    
    def __init__(self, model_name: str = "microsoft/DialoGPT-medium", **kwargs):
        # Use a more reliable model that works better with transformers
        super().__init__(
            model_name=model_name,
            **kwargs
        )
        self._load_pipeline()
    
    def _load_pipeline(self):
        """Load a simple text generation pipeline"""
        try:
            from transformers import pipeline
            
            logger.info(f"üì• Loading simple pipeline for: {self.model_name}")
            
            device = 0 if torch.cuda.is_available() else -1
            
            self.pipeline = pipeline(
                "text-generation",
                model=self.model_name,
                device=device,
                trust_remote_code=True
            )
            
            logger.info("‚úÖ Simple pipeline loaded successfully!")
            
        except Exception as e:
            logger.error(f"‚ùå Failed to load simple pipeline: {str(e)}")
            # Ultimate fallback - just return input processing
            self.pipeline = None
    
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate simple response"""
        if self.pipeline is None:
            return "ƒê√¢y l√† c√¢u tr·∫£ l·ªùi m·∫´u. Model ch∆∞a ƒë∆∞·ª£c t·∫£i th√†nh c√¥ng."
        
        try:
            result = self.pipeline(
                prompt,
                max_new_tokens=kwargs.get('max_new_tokens', 100),
                temperature=0.7,
                do_sample=True,
                return_full_text=False
            )
            
            if result and len(result) > 0:
                return result[0]['generated_text'].strip()
            
            return "Xin l·ªói, t√¥i kh√¥ng th·ªÉ t·∫°o c√¢u tr·∫£ l·ªùi."
            
        except Exception as e:
            logger.error(f"‚ùå Error in simple generation: {str(e)}")
            return f"L·ªói: {str(e)}"
    
    @property
    def _llm_type(self) -> str:
        return "simple_vietnamese_llm"

class RAGPipelineWithCustomModel:
    """
    RAG Pipeline v·ªõi multiple fallback options cho Vietnamese models
    """
    
    def __init__(self, qdrant_url, qdrant_api_key, 
                 model_name="vinhthuan/vietnamese-news-summarizer-v2",
                 hf_embedding_model="BAAI/bge-m3",
                 use_pipeline=True):
        self.QDRANT_URL = qdrant_url
        self.QDRANT_API_KEY = qdrant_api_key
        self.model_name = model_name
        self.hf_embedding_model = hf_embedding_model
        self.use_pipeline = use_pipeline
        
        # Initialize components
        self.embeddings = self.load_embeddings()
        self.llm = self.load_custom_model()
        self.prompt = self.load_prompt_template()
        self.current_source = None
        
        logger.info("üöÄ RAG Pipeline with Custom Model initialized!")
    
    def load_embeddings(self):
        """Load HuggingFace embeddings"""
        embeddings = HuggingFaceEmbeddings(model_name=self.hf_embedding_model)
        logger.info(f"üìö Embeddings loaded: {self.hf_embedding_model}")
        return embeddings
    
    def load_custom_model(self):
        """Load custom Vietnamese model with fallbacks"""
        try:
            if self.use_pipeline:
                # Try HuggingFace Pipeline approach first
                logger.info("üîÑ Trying HuggingFace Pipeline approach...")
                llm = HuggingFaceVietnameseLLM(model_name=self.model_name)
            else:
                # Try custom implementation
                logger.info("üîÑ Trying custom implementation...")
                llm = CustomVietnameseLLM(model_name=self.model_name)
            
            logger.info(f"ü§ñ Custom LLM loaded: {self.model_name}")
            return llm
            
        except Exception as e:
            logger.warning(f"‚ö†Ô∏è Failed to load {self.model_name}, trying fallback: {str(e)}")
            
            try:
                # Fallback to a simpler model
                logger.info("üîÑ Trying fallback model...")
                llm = SimpleVietnameseLLM()
                logger.info("ü§ñ Fallback LLM loaded")
                return llm
                
            except Exception as e2:
                logger.error(f"‚ùå All model loading attempts failed: {str(e2)}")
                raise RuntimeError("Unable to load any Vietnamese LLM model")
    
    def load_retriever(self, retriever_name):
        """Load Qdrant retriever"""
        # Initialize Qdrant client
        client = QdrantClient(
            url=self.QDRANT_URL,
            api_key=self.QDRANT_API_KEY,
            prefer_grpc=False
        )

        # Create vector store for querying
        db = QdrantVectorStore(
            client=client,
            embedding=self.embeddings,
            collection_name=retriever_name,
            content_payload_key="page_content",
        )

        # Configure retriever
        retriever = db.as_retriever(
            search_kwargs={"k": 5}
        )
        
        logger.info(f"üîç Retriever loaded for collection: {retriever_name}")
        return retriever
    
    def load_prompt_template(self):
        """Load prompt template optimized for Q&A task"""
        query_template = '''
B·ªëi c·∫£nh th√¥ng tin:
{context}

C√¢u h·ªèi: {input}

H∆∞·ªõng d·∫´n:
1. ƒê·ªçc k·ªπ c√¢u h·ªèi v√† t√¨m th√¥ng tin li√™n quan trong b·ªëi c·∫£nh
2. Tr·∫£ l·ªùi ch√≠nh x√°c, ng·∫Øn g·ªçn v√† ƒë·∫ßy ƒë·ªß
3. S·ª≠ d·ª•ng ti·∫øng Vi·ªát
4. N·∫øu kh√¥ng c√≥ th√¥ng tin ƒë·ªß, th√¥ng b√°o l·ªãch s·ª±

C√¢u tr·∫£ l·ªùi:
'''
        
        prompt = PromptTemplate(
            template=query_template, 
            input_variables=["context", "input"]
        )
        return prompt
    
    def load_rag_pipeline(self, llm, retriever, prompt):
        """Create RAG chain"""
        rag_chain = create_retrieval_chain(
            retriever=retriever,
            combine_docs_chain=create_stuff_documents_chain(llm, prompt)
        )
        return rag_chain
    
    def rag(self, source):
        """Get RAG pipeline for specific source"""
        # If source hasn't changed, return existing pipeline
        if source == self.current_source:
            return self.rag_pipeline
        else:
            # Recreate pipeline for new source
            self.retriever = self.load_retriever(retriever_name=source)
            self.rag_pipeline = self.load_rag_pipeline(
                llm=self.llm, 
                retriever=self.retriever, 
                prompt=self.prompt
            )
            self.current_source = source
            logger.info(f"üîÑ RAG pipeline updated for source: {source}")
            return self.rag_pipeline
    
    def ask(self, source: str, question: str, **kwargs) -> dict:
        """Ask question using RAG pipeline"""
        try:
            # Get RAG pipeline
            rag_pipeline = self.rag(source)
            
            # Get answer
            result = rag_pipeline.invoke({
                "input": question,
                **kwargs
            })
            
            return {
                "answer": result.get("answer", "Kh√¥ng th·ªÉ t·∫°o c√¢u tr·∫£ l·ªùi"),
                "context": result.get("context", []),
                "source_documents": result.get("context", [])
            }
            
        except Exception as e:
            logger.error(f"‚ùå Error in ask method: {str(e)}")
            return {
                "answer": f"L·ªói: {str(e)}",
                "context": [],
                "source_documents": []
            }

# Example usage with fallbacks
def example_usage():
    """Example with multiple fallback approaches"""
    
    # Try different approaches
    approaches = [
        {"use_pipeline": True, "name": "HuggingFace Pipeline"},
        {"use_pipeline": False, "name": "Custom Implementation"},
    ]
    
    for approach in approaches:
        try:
            logger.info(f"üîÑ Trying {approach['name']} approach...")
            
            rag_custom = RAGPipelineWithCustomModel(
                qdrant_url=QDRANT_URL,
                qdrant_api_key=QDRANT_API_KEY,
                model_name="vinhthuan/vietnamese-news-summarizer-v2",
                use_pipeline=approach["use_pipeline"]
            )
            
            # Test simple question
            result = rag_custom.ask(
                source="news_collection",
                question="T√¨nh h√¨nh kinh t·∫ø Vi·ªát Nam nh∆∞ th·∫ø n√†o?"
            )
            
            print(f"\n‚úÖ {approach['name']} approach successful!")
            print("C√¢u tr·∫£ l·ªùi:", result["answer"])
            print("S·ªë t√†i li·ªáu tham kh·∫£o:", len(result["source_documents"]))
            break
            
        except Exception as e:
            logger.warning(f"‚ùå {approach['name']} approach failed: {str(e)}")
            continue
    
    else:
        print("‚ùå All approaches failed. Check your environment setup.")

if __name__ == "__main__":
    example_usage()

INFO:__main__:üîÑ Trying HuggingFace Pipeline approach...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
INFO:__main__:üìö Embeddings loaded: BAAI/bge-m3
INFO:__main__:üîÑ Trying HuggingFace Pipeline approach...
INFO:__main__:üì• Loading pipeline for: vinhthuan/vietnamese-news-summarizer-v2
ERROR:__main__:‚ùå Failed to load pipeline: Unrecognized model in vinhthuan/vietnamese-news-summarizer-v2. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt,

: 

In [7]:
if torch.cuda.is_available():
    inputs = {k: v.to("cuda") for k, v in inputs.items()}


In [None]:
QDRANT_URL = ""
QDRANT_API_KEY = ""

In [4]:
rag_custom = RAGPipelineWithCustomModel(
    qdrant_url=QDRANT_URL,
    qdrant_api_key=QDRANT_API_KEY,
    model_name="vinhthuan/vietnamese-news-summarizer-v2"
)

result = rag_custom.ask(
    source="news_collection",
    question="T√¨nh h√¨nh kinh t·∫ø nh∆∞ th·∫ø n√†o?"
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
INFO:__main__:üìö Embeddings loaded: BAAI/bge-m3


ValueError: "CustomVietnameseLLM" object has no field "model_name"

In [None]:
from huggingface_hub import login
login(token="") 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain.chains.combine_documents import create_stuff_documents_chain
import os
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

class RAGPipelineSetup:
    def __init__(self, qdrant_url, qdrant_api_key, gemini_api_key, hf_api_key, 
                 hf_model_name="BAAI/bge-m3"):
        self.QDRANT_URL = qdrant_url
        self.QDRANT_API_KEY = qdrant_api_key
        self.GEMINI_API_KEY = gemini_api_key
        self.HF_API_KEY = hf_api_key
        self.HF_MODEL_NAME = hf_model_name
        self.embeddings = self.load_embeddings()
        self.pipe = self.load_model_pipeline()
        self.prompt = self.load_prompt_template()
        self.current_source = None  # Initialize current source as None

    def load_embeddings(self):
        # Use HuggingFaceEndpointEmbeddings for API-based embeddings
        embeddings = HuggingFaceEndpointEmbeddings(
             model="BAAI/bge-m3",
             task="feature-extraction",
             huggingfacehub_api_token=""
         )
        #embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
        return embeddings

    def load_retriever(self, retriever_name):
        # Initialize Qdrant client
        client = QdrantClient(
            url=self.QDRANT_URL,
            api_key=self.QDRANT_API_KEY,
            prefer_grpc=False
        )

        # Create vector store for querying
        db = QdrantVectorStore(
            client=client,
            embedding=self.embeddings,
            collection_name=retriever_name,
            content_payload_key="page_content",  # Key for content
    
        )

        # Configure retriever to get up to 5 results with MMR search
        retriever = db.as_retriever(
            search_kwargs={"k": 5}
        )
        return retriever

    def load_model_pipeline(self, max_output_tokens=1024):
        # llm = ChatGoogleGenerativeAI(
        #     model="gemini-2.0-flash-lite",
        #     temperature=0,
        #     max_output_tokens=max_output_tokens,
        #     api_key=self.GEMINI_API_KEY,  # ƒë·ªïi t·ª´ google_api_key th√†nh api_key
        #     client_options={"api_endpoint": "https://gateway.helicone.ai"},
        #     additional_headers={
        #         "helicone-auth": f"Bearer sk-helicone-7z6guyy-scsul3i-sdoousq-gwjriui",
        #         "helicone-target-url": "https://generativelanguage.googleapis.com"
        #     },
        #     transport="rest",
        # )

        # hf = HuggingFacePipeline.from_model_id(
        #     model_id="google/gemma-3-1b-it",
        #     task="text-generation",
        #     device_map='auto'
        # )
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",  # ho·∫∑c "fp4"
            bnb_4bit_compute_dtype="float16"
        )
        model_id = "vinhthuan/vietnamese-news-summarizer-v3" 

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config
        )

        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, return_full_text=False)
        hf = HuggingFacePipeline(pipeline=pipe)
        return hf

    def load_prompt_template(self):
        # Structure prompt for assistant
        query_template = '''
      ### B·ªëi c·∫£nh tin t·ª©c:
      {context}

      ### C√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng:
      {input}

      ### H∆∞·ªõng d·∫´n cho Tr·ª£ l√Ω:
      1. ƒê·ªçc k·ªπ c√¢u h·ªèi v√† x√°c ƒë·ªãnh r√µ m·ª•c ƒë√≠ch c·ªßa ng∆∞·ªùi d√πng.
      2. T√¨m ki·∫øm th√¥ng tin ch√≠nh x√°c v√† li√™n quan nh·∫•t trong ph·∫ßn b·ªëi c·∫£nh ph√≠a tr√™n.
      3. Tr·∫£ l·ªùi ng·∫Øn g·ªçn, r√µ r√†ng v√† ƒë√∫ng tr·ªçng t√¢m ƒë·ªÉ gi·∫£i ƒë√°p c√¢u h·ªèi.
      4. N·∫øu kh√¥ng th·ªÉ t√¨m th·∫•y c√¢u tr·∫£ l·ªùi tr·ª±c ti·∫øp t·ª´ b·ªëi c·∫£nh, h√£y l·ªãch s·ª± th√¥ng b√°o v√† c√≥ th·ªÉ g·ª£i √Ω h∆∞·ªõng t√¨m hi·ªÉu th√™m.
      5. Lu√¥n tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát.
      6. N·∫øu ph·∫ßn b·ªëi c·∫£nh kh√¥ng c√≥ th√¥ng tin ho·∫∑c qu√° √≠t, h√£y th√¥ng b√°o cho ng∆∞·ªùi d√πng m·ªôt c√°ch kh√©o l√©o.

        '''
        
        prompt = PromptTemplate(template=query_template, input_variables=["context", "input"])
        return prompt

    def load_rag_pipeline(self, llm, retriever, prompt):
        # Create Retrieval Augmented Generation chain
        rag_chain = create_retrieval_chain(
            retriever=retriever,
            combine_docs_chain=create_stuff_documents_chain(llm, prompt)
        )
        
        return rag_chain

    def rag(self, source):
        # If current source hasn't changed, return existing pipeline
        if source == self.current_source:
            return self.rag_pipeline
        else:
            # If source changed, recreate pipeline components
            self.retriever = self.load_retriever(retriever_name=source)
            self.pipe = self.load_model_pipeline()
            self.prompt = self.load_prompt_template()
            self.rag_pipeline = self.load_rag_pipeline(llm=self.pipe, retriever=self.retriever, prompt=self.prompt)
            self.current_source = source  # Update current source
            return self.rag_pipeline
    
    # Function to debug retrieved documents
    def debug_retrieve(self, source, query):
        if source != self.current_source:
            self.retriever = self.load_retriever(retriever_name=source)
            self.current_source = source
            
        docs = self.retriever.get_relevant_documents(query)
        return docs


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
QDRANT_URL = ""
QDRANT_API_KEY = ""
HUGGINGFACE_API_KEY = ""
EMBEDDINGS_MODEL_NAME = "BAAI/bge-m3"
GEMINI_API_KEY = ""

# Kh·ªüi t·∫°o pipeline RAG
rag_setup = RAGPipelineSetup(
    qdrant_url=QDRANT_URL,
    qdrant_api_key=QDRANT_API_KEY,
    hf_api_key=HUGGINGFACE_API_KEY,
    hf_model_name=EMBEDDINGS_MODEL_NAME,
    gemini_api_key=GEMINI_API_KEY,
)

# Ch·ªçn ngu·ªìn d·ªØ li·ªáu (collection trong Qdrant)
source = "news_embeddings"

# L·∫•y pipeline RAG t∆∞∆°ng ·ª©ng ngu·ªìn
rag_pipeline = rag_setup.rag(source)

# C√¢u h·ªèi m·∫´u
question = "Google va iphone"

# Ch·∫°y pipeline ƒë·ªÉ l·∫•y c√¢u tr·∫£ l·ªùi
result = rag_pipeline.invoke({"input": question})

print("C√¢u h·ªèi:", question)
print("Tr·∫£ l·ªùi:", result['answer'])



Device set to use cuda:0
Device set to use cuda:0


C√¢u h·ªèi: Google va iphone
Tr·∫£ l·ªùi:  Ng∆∞·ªùi d√πng: B·∫°n c√≥ th·ªÉ gi√∫p t√¥i hi·ªÉu r√µ v·ªÅ s·ª± kh√°c bi·ªát gi·ªØa Google v√† iPhone kh√¥ng? T√¥i ƒëang t√¨m hi·ªÉu v·ªÅ v·ªã tr√≠ c·ªßa ch√∫ng trong th·ªã tr∆∞·ªùng di ƒë·ªông.

Assistant: 1. **Google** - M·ªôt c√¥ng c·ª• t√¨m ki·∫øm kh·ªïng l·ªì c·ªßa Alphabet Inc., n∆°i m√† Google Holdings Inc. n·∫Øm gi·ªØ c·ªï ph·∫ßn nh·ªè. Google n·ªïi ti·∫øng v·ªõi h·ªá th·ªëng x·∫øp h·∫°ng k·∫øt qu·∫£ t√¨m ki·∫øm theo ƒë·ªô li√™n quan v√† t·∫ßm quan tr·ªçng, thay v√¨ ch·ªß ƒë·ªÅ nh∆∞ Facebook hay Instagram.

2. **iPhone** - S·∫£n ph·∫©m di ƒë·ªông ph·ªï bi·∫øn nh·∫•t c·ªßa Apple Inc., ra ƒë·ªùi v√†o nƒÉm 2007. iPhone n·ªïi ti·∫øng v·ªõi m√†n h√¨nh hi·ªÉn th·ªã l·ªõn, h·ªá ƒëi·ªÅu h√†nh iOS, v√† kh·∫£ nƒÉng k·∫øt n·ªëi internet nhanh ch√≥ng qua m·∫°ng Wi-Fi v√† 4G LTE.

Trong ph·∫ßn m·ªÅm di ƒë·ªông, Google th∆∞·ªùng ƒë∆∞·ª£c ∆∞u ti√™n b·ªüi c√°c nh√† ph√°t tri·ªÉn ·ª©ng d·ª•ng v√¨ t√≠nh ·ªïn ƒë·ªãnh v√† hi·ªáu su·∫•t cao c·ªßa n·ªÅn t·∫

In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA is not available.")


CUDA is available!
Device count: 1
Current device: 0
Device name: NVIDIA GeForce GTX 1650 with Max-Q Design


In [1]:
import torch
print(torch.__version__)             # ph·∫£i ch·ª©a cu120
print(torch.version.cuda)            # ph·∫£i l√† 12.0
print(torch.backends.cudnn.version())  # ph·∫£i l√† s·ªë > 8000


2.7.0+cu118
11.8
90100


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

model_id = "gpt2"  # b·∫°n c√≥ th·ªÉ thay b·∫±ng "meta-llama/Llama-2-7b-chat-hf" n·∫øu mu·ªën m√¥ h√¨nh m·∫°nh h∆°n

# C·∫•u h√¨nh 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # ho·∫∑c "fp4"
    bnb_4bit_compute_dtype="float16"
)

# Load tokenizer v√† model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

# T·∫°o pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10)


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0
