In [1]:
# Environment setup: install the notebook's runtime dependencies via shell pip.
# NOTE(review): nothing here is version-pinned, so a fresh run may resolve
# different releases than the ones that produced the saved outputs. Further
# installs (langchain-community, accelerate/bitsandbytes, triton) are issued in
# later cells of this notebook instead of here.
# xformers is fetched from the PyTorch CUDA 12.1 wheel index.
!pip install -U xformers --index-url https://download.pytorch.org/whl/cu121
# Retrieval / serving stack used by the cells below.
!pip install langchain optimum qdrant-client wikipedia FastAPI uvicorn pyngrok
!pip install --upgrade pydantic
# vLLM backs the text-generation model loaded in LLMServe.load_model_pipeline.
!pip install vllm

Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
import os
import warnings

# --- Model / service identifiers (not secret) --------------------------------
#GENERATE_MODEL_NAME="phatjk/vietcuna-7b-v3-AWQ"
GENERATE_MODEL_NAME = "vilm/vietcuna-3b-v2"
EMBEDDINGS_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
QDRANT_URL = "https://d3966086-8c65-4b03-895a-6926e1f83994.us-east4-0.gcp.cloud.qdrant.io"
QDRANT_COLLECTION_NAME = "Luat_vectordb"
NGROK_STATIC_DOMAIN = "briefly-knowing-treefrog.ngrok-free.app"


def read_secret(name: str) -> str:
    """Return the credential stored in environment variable ``name``.

    Credentials must never be written into the notebook: anything saved in a
    cell ends up in version control and in every shared copy. Export the
    variables in the runtime before running this cell.

    Args:
        name: Name of the environment variable holding the credential.

    Returns:
        The variable's value, or an empty string (after emitting a warning)
        when it is not set, so the cell itself still runs and the failure
        surfaces where the credential is actually used.
    """
    value = os.environ.get(name, "")
    if not value:
        warnings.warn(
            f"Environment variable {name!r} is not set; "
            "calls that need this credential will fail to authenticate."
        )
    return value


# --- Credentials (read from the environment, never hardcoded) ----------------
# NOTE(review): the values that used to be written here in plain text must be
# treated as compromised and revoked/rotated with each provider.
NGROK_TOKEN = read_secret("NGROK_TOKEN")
HUGGINGFACE_API_KEY = read_secret("HUGGINGFACE_API_KEY")
QDRANT_API_KEY = read_secret("QDRANT_API_KEY")

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from optimum.bettertransformer import BetterTransformer
import torch
# Run the re-ranker on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-encoder checkpoint used to re-rank retrieved passages; the model and
# its tokenizer are loaded from the same Hugging Face repository.
_RERANK_CHECKPOINT = 'amberoad/bert-multilingual-passage-reranking-msmarco'
model_rerank = AutoModelForSequenceClassification.from_pretrained(_RERANK_CHECKPOINT)
model_rerank = model_rerank.to(device)
#model_rerank = BetterTransformer.transform(model_rerank)
tokenizer_rerank = AutoTokenizer.from_pretrained(_RERANK_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Installed separately from the first cell and without a version pin.
# NOTE(review): presumably required by the `langchain.vectorstores` /
# `langchain.llms` / `langchain.retrievers` imports used below — confirm, and
# consider moving this into the setup cell at the top.
!pip install -U langchain-community



In [5]:
from langchain.schema.document import Document
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.retrievers import WikipediaRetriever
from typing import List
def _rerank_top_k(query: str, docs: List[Document], top_k: int = 2) -> List[Document]:
    """Score ``docs`` against ``query`` with the cross-encoder and keep the best ones.

    Uses the module-level ``tokenizer_rerank`` / ``model_rerank`` / ``device``.

    Args:
        query: The user question.
        docs: Candidate documents returned by the first-stage retriever.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents ordered from most to least relevant. Returns
        an empty list when ``docs`` is empty, and fewer than ``top_k`` items
        when fewer candidates are available.
    """
    if not docs:
        # Nothing to score; tokenizing an empty batch would fail.
        return []

    candidates = [doc.page_content for doc in docs]
    features = tokenizer_rerank(
        [query] * len(candidates),
        candidates,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = model_rerank(**features).logits

    # Use the last logit column as the relevance score instead of summing all
    # class logits (the sum mixes the "relevant" and "not relevant" evidence).
    # NOTE(review): assumes the checkpoint's label index 1 means "relevant",
    # as in MS MARCO BERT re-rankers — confirm against the model card.
    relevance = logits[:, -1]

    # Highest score first; slicing caps the result at the number of candidates.
    best = relevance.argsort(descending=True)[:top_k].tolist()
    return [docs[i] for i in best]


class RerankRetriever(VectorStoreRetriever):
    """Retriever that re-ranks the hits of a wrapped vector-store retriever."""

    # First-stage retriever whose results are re-ranked.
    vectorstore: VectorStoreRetriever

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Fetch candidates for ``query`` and return the two most relevant ones."""
        docs = self.vectorstore.get_relevant_documents(query=query)
        return _rerank_top_k(query, docs)


class RerankWikiRetriever(VectorStoreRetriever):
    """Retriever that re-ranks the hits of a wrapped Wikipedia retriever."""

    # First-stage Wikipedia retriever whose results are re-ranked.
    vectorstore: WikipediaRetriever

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Fetch candidates for ``query`` and return the two most relevant ones."""
        docs = self.vectorstore.get_relevant_documents(query=query)
        return _rerank_top_k(query, docs)

  class RerankRetriever(VectorStoreRetriever):
  class RerankWikiRetriever(VectorStoreRetriever):


In [6]:
# Unpinned install issued mid-notebook.
# NOTE(review): neither package is imported directly by any cell visible in
# this notebook — confirm it is still needed (e.g. as an indirect dependency
# of model loading) and, if so, move it into the setup cell at the top.
!pip install accelerate bitsandbytes



In [7]:
from langchain.retrievers import WikipediaRetriever
from langchain.vectorstores import Qdrant
from langchain.llms import HuggingFacePipeline
from qdrant_client import QdrantClient
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.chains import RetrievalQA,MultiRetrievalQAChain
from langchain.llms import VLLM
from langchain.llms import HuggingFaceHub

class LLMServe:
    """Bundles the embeddings client, retriever, LLM and prompt into a RAG chain.

    The instance keeps one active chain at a time, keyed by ``current_source``
    ("wiki" selects the Wikipedia retriever, any other value the Qdrant one).
    """

    def __init__(self) -> None:
        """Build every component once, starting with the "wiki" source."""
        self.embeddings = self.load_embeddings()
        self.current_source = "wiki"
        self.retriever = self.load_retriever(
            retriever_name=self.current_source,
            embeddings=self.embeddings,
        )
        self.pipe = self.load_model_pipeline(max_new_tokens=300)
        self.prompt = self.load_prompt_template()
        self.rag_pipeline = self.load_rag_pipeline(
            llm=self.pipe,
            retriever=self.retriever,
            prompt=self.prompt,
        )

    def load_embeddings(self):
        """Return the Hugging Face Inference API embeddings client."""
        return HuggingFaceInferenceAPIEmbeddings(
            model_name=EMBEDDINGS_MODEL_NAME,
            api_key=HUGGINGFACE_API_KEY,
        )

    def load_retriever(self, retriever_name, embeddings):
        """Return the re-ranking retriever for the requested source.

        Args:
            retriever_name: "wiki" for Wikipedia; anything else uses Qdrant.
            embeddings: Embeddings object handed to the Qdrant vector store
                (unused for the "wiki" source).
        """
        if retriever_name == "wiki":
            # Vietnamese Wikipedia: 15 candidates, each cut to 800 characters.
            wiki_source = WikipediaRetriever(
                lang="vi",
                doc_content_chars_max=800,
                top_k_results=15,
            )
            return RerankWikiRetriever(vectorstore=wiki_source)

        qdrant_client = QdrantClient(
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            prefer_grpc=False,
        )
        vector_db = Qdrant(
            client=qdrant_client,
            embeddings=embeddings,
            collection_name=QDRANT_COLLECTION_NAME,
        )
        # 15 first-stage candidates are fetched before re-ranking.
        first_stage = vector_db.as_retriever(search_kwargs={"k": 15})
        return RerankRetriever(vectorstore=first_stage)

    def load_model_pipeline(self, max_new_tokens=100):
        """Return the vLLM-backed generator for ``GENERATE_MODEL_NAME``.

        Args:
            max_new_tokens: Generation length limit passed to ``VLLM``.
        """
        sampling_options = {"top_k": 10, "top_p": 0.95, "temperature": 0.4}
        return VLLM(
            model=GENERATE_MODEL_NAME,
            trust_remote_code=True,  # mandatory for hf models
            max_new_tokens=max_new_tokens,
            dtype="half",
            # vllm_kwargs={"quantization": "awq"}  # for the AWQ checkpoint
            **sampling_options,
        )

    def load_prompt_template(self):
        """Return the prompt with ``context`` and ``question`` placeholders."""
        query_template = (
            "Bạn là một chatbot thông minh trả lời câu hỏi dựa trên ngữ cảnh (context).\n\n"
            "### Context:{context} \n\n"
            "### Human: {question}\n\n"
            "### Assistant:"
        )
        return PromptTemplate(
            template=query_template,
            input_variables=["context", "question"],
        )

    def load_rag_pipeline(self, llm, retriever, prompt):
        """Return a "stuff" ``RetrievalQA`` chain that also returns its sources."""
        return RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=True,
        )

    def rag(self, source):
        """Return the chain for ``source``, rebuilding it when the source changed.

        The retriever and chain are replaced on the instance and
        ``current_source`` is updated only after both were built.
        """
        if source != self.current_source:
            self.retriever = self.load_retriever(
                retriever_name=source,
                embeddings=self.embeddings,
            )
            self.rag_pipeline = self.load_rag_pipeline(
                llm=self.pipe,
                retriever=self.retriever,
                prompt=self.prompt,
            )
            self.current_source = source
        return self.rag_pipeline

In [8]:
# Unpinned install issued mid-notebook, right before the model is loaded.
# NOTE(review): triton is not imported directly by any cell visible here;
# presumably a runtime requirement of the vLLM/xformers stack — confirm and
# move it into the setup cell at the top.
!pip install triton



In [9]:
# Build the RAG service once for the whole session: this runs
# LLMServe.__init__, which creates the embeddings client, the default "wiki"
# retriever, the vLLM generator and the RetrievalQA chain.
# The `/rag/{source}` route below calls `app.rag(...)` on this instance
# (the FastAPI application itself is the separate `app_api` object).
app = LLMServe()

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

INFO 11-27 15:47:13 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='vilm/vietcuna-3b-v2', speculative_config=None, tokenizer='vilm/vietcuna-3b-v2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=vilm/vietcuna-3b-v2, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

INFO 11-27 15:47:18 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-27 15:47:18 selector.py:144] Using XFormers backend.
INFO 11-27 15:47:19 model_runner.py:1072] Starting to load model vilm/vietcuna-3b-v2...
INFO 11-27 15:47:19 weight_utils.py:243] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

INFO 11-27 15:48:55 weight_utils.py:288] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-27 15:49:25 model_runner.py:1077] Loading model weights took 5.6083 GB
INFO 11-27 15:49:28 worker.py:232] Memory profiling results: total_gpu_memory=14.75GiB initial_memory_usage=6.43GiB peak_torch_memory=8.53GiB memory_usage_post_profile=6.46GiB non_torch_memory=0.22GiB kv_cache_size=4.52GiB gpu_memory_utilization=0.90
INFO 11-27 15:49:29 gpu_executor.py:113] # GPU blocks: 988, # CPU blocks: 873
INFO 11-27 15:49:29 gpu_executor.py:117] Maximum concurrency for 2048 tokens per request: 7.72x
INFO 11-27 15:49:35 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-27 15:49:35 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-27

In [10]:
from typing import Union
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.encoders import jsonable_encoder
from fastapi import FastAPI
# CORS: the API is open to every origin, method and header.
origins = ["*"]
app_api = FastAPI()
app_api.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentials (cookies / Authorization) must not be combined with wildcard
    # origins, methods or headers — FastAPI's CORS documentation requires
    # explicit values in that case. The routes of this API use neither cookies
    # nor auth headers, so credentialed cross-origin requests are disabled.
    # To re-enable them, list the allowed origins explicitly in `origins`.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# GET / — minimal liveness endpoint: always answers with the fixed string
# "API RAG". (Documented with comments rather than a docstring so the
# generated OpenAPI description of the route stays unchanged.)
@app_api.get("/")
def read_root():
    return "API RAG"

# GET /rag/{source}?q=<question>
# Runs the RAG chain selected by `source` on the question `q` and answers with
# a JSON object holding the generated "result" and the "source_documents"
# (each document reduced to the "kwargs" part of its `to_json()` form).
# When `q` is missing or empty the handler returns None.
@app_api.get("/rag/{source}")
async def read_item(source: str, q: str | None = None):
    if not q:
        return None

    # NOTE(review): the chain is called without `await`, so this async handler
    # does not yield until generation has finished.
    answer = app.rag(source=source)(q)
    cited = [document.to_json()["kwargs"] for document in answer["source_documents"]]
    payload = {
        "result" : answer["result"],
        "source_documents": cited,
    }
    return JSONResponse(content=jsonable_encoder(payload))


In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn
# Local port shared by the uvicorn server and the ngrok tunnel.
SERVER_PORT = 8000

# Expose the local port on the reserved ngrok domain.
ngrok.set_auth_token(NGROK_TOKEN)
ngrok_tunnel = ngrok.connect(SERVER_PORT, domain=NGROK_STATIC_DOMAIN)
print('Public URL:', ngrok_tunnel.public_url)

# Allow uvicorn to start its event loop inside the notebook's running loop.
nest_asyncio.apply()
try:
    # Blocks until the server is stopped (e.g. by interrupting the cell).
    uvicorn.run(app_api, port=SERVER_PORT)
finally:
    # Always release the tunnel; otherwise re-running this cell tries to bind
    # the same static domain while the previous tunnel is still open.
    ngrok.disconnect(ngrok_tunnel.public_url)

Public URL: https://briefly-knowing-treefrog.ngrok-free.app


INFO:     Started server process [8475]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     2a09:bac1:7a80:50::245:bd:0 - "GET / HTTP/1.1" 200 OK
INFO:     2a09:bac1:7a80:50::245:bd:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     2a09:bac1:7a80:50::245:bd:0 - "GET / HTTP/1.1" 200 OK
