In [1]:
!pip install -q einops==0.7.0 langchain==0.1.9 pymilvus==2.3.6 sentence-transformers==2.4.0 openai==1.13.3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import VLLMOpenAI
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Milvus

In [3]:
def connect_milvus_vector_store(embedding_model_name, vector_db_collection_name, device='cuda'):
    """Create the embedding model and return a Milvus vector store that uses it.

    The Milvus connection settings are read from the module-level names
    MILVUS_HOST, MILVUS_PORT, MILVUS_USERNAME and MILVUS_PASSWORD, which must
    be defined before this function is called.

    Args:
        embedding_model_name: Model identifier passed to HuggingFaceEmbeddings
            as ``model_name``.
        vector_db_collection_name: Name of the Milvus collection to query.
        device: Device string forwarded to the embedding model. Defaults to
            'cuda' (the value that used to be hard-coded); pass 'cpu' to run
            on a machine without a GPU.

    Returns:
        The Milvus vector store object.
    """
    # Create embedding
    model_kwargs = {
        'device': device,
        # Lets the model repository supply its own code; only use with
        # embedding models you trust.
        'trust_remote_code': True,
    }
    encode_kwargs = {
        'normalize_embeddings': False
    }
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        show_progress=True
    )

    # Connect to vector DB
    return Milvus(
        embedding_function=embeddings,
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
        collection_name=vector_db_collection_name,
        metadata_field="metadata",
        text_field="page_content",
        # Keep the collection as it is; do not drop it on connect.
        drop_old=False
    )

# Program starts here

In [4]:
# Base URL of the inference server; passed to VLLMOpenAI as openai_api_base.
INFERENCE_SERVER_URL = "https://qwen-25-7b-instruct-peter-lm.apps.rosa.peter-rosa-hcp.y0r0.p3.openshiftapps.com/v1"
# Model identifier sent to the inference server with each request.
MODEL_NAME = "qwen-25-7b-instruct"

In [5]:
# Generation settings handed to the language-model client.
MAX_TOKENS = 1024          # upper bound on generated tokens per answer
TOP_P = 0.95               # nucleus-sampling threshold
TEMPERATURE = 0.01         # sampling temperature
PRESENCE_PENALTY = 1.03    # presence penalty applied during sampling

In [6]:
# Milvus endpoint used by connect_milvus_vector_store().
MILVUS_HOST = "vectordb-milvus.milvus.svc.cluster.local"
MILVUS_PORT = 19530

# Credentials are taken from the environment so they never appear in the
# notebook; each value is None when the corresponding variable is not set.
MILVUS_USERNAME = os.environ.get('MILVUS_USERNAME')
MILVUS_PASSWORD = os.environ.get('MILVUS_PASSWORD')

### Apply Prompt Engineering, define prompt instructions

In [7]:
# Prompt used by the QA chain. It has two placeholders: {context} and
# {question}.
# NOTE(review): the <s>[INST] <<SYS>> ... [/INST] markers look like the Llama-2
# chat format, while MODEL_NAME points at a Qwen model -- confirm this is the
# intended prompt format for the served model.
chatbot_prompt_template = PromptTemplate.from_template("""
    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant named HatBot answering questions.
    You will be given a question you need to answer, and a context to provide you with information. You must answer the question based as much as possible on this context.
    Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
    
    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    <</SYS>>
    
    Context: 
    {context}
    
    Question: {question} [/INST]
""")

### Connect to the language model (vLLM) server

In [8]:
# Client for the inference server configured above.
language_model = VLLMOpenAI(
    # Where to send requests and which model to ask for.
    openai_api_base=INFERENCE_SERVER_URL,
    model_name=MODEL_NAME,
    # Placeholder key; presumably the endpoint does not check it -- confirm.
    openai_api_key="EMPTY",
    # Sampling settings from the configuration cell.
    temperature=TEMPERATURE,
    top_p=TOP_P,
    presence_penalty=PRESENCE_PENALTY,
    max_tokens=MAX_TOKENS,
    # Stream the answer and echo it to stdout through the callback handler.
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    verbose=False,
)

### Integrate the vector DB and the vLLM server so that every query first retrieves relevant context from the vector DB and then passes it to the model (RAG)

In [9]:
# Set this before the embedding model (and its tokenizer) is created so the
# setting is already in place when the tokenizer is first used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Embedding model + Milvus collection holding the indexed documentation.
vector_store = connect_milvus_vector_store(
    embedding_model_name="ibm-granite/granite-embedding-278m-multilingual",
    vector_db_collection_name="openshift_ai_2_16_en_document"
)

# Retrieval-augmented QA chain: the retriever asks the vector store for the
# 10 most similar entries (k=10) and the chain uses chatbot_prompt_template
# as its prompt.
qa_chain = RetrievalQA.from_chain_type(
    language_model,
    retriever=vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10}
    ),
    chain_type_kwargs={
        "prompt": chatbot_prompt_template
    },
    # Needed so result['source_documents'] is available after a query.
    return_source_documents=True
)

### Query example

In [10]:
# Run one query through the QA chain. The answer is streamed to stdout by the
# language model's callback; `result` is kept so the retrieved source
# documents can be inspected afterwards.
question = "How can I work with GPU and taints in OpenShift AI?"
result = qa_chain.invoke({"query": question})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

    To work with GPUs and taints in OpenShift AI, you need to follow these steps:

1. **Enable GPU Support**:
   - Ensure that the NVIDIA GPU Operator is installed on your OpenShift cluster.
   - If you are in a disconnected or airgapped environment, follow the instructions provided by NVIDIA to deploy the GPU Operators.

2. **Delete the `migration-gpu-status` ConfigMap**:
   - Switch to the Administrator perspective in the OpenShift web console.
   - Set the Project to All Projects or redhat-ods-applications.
   - Search for the `migration-gpu-status` ConfigMap.
   - Click the action menu (⋮) and select Delete ConfigMap from the list.
   - Confirm the deletion of the correct ConfigMap.

3. **Apply Taints to Nodes or MachineSets**:
   - Apply taints to your nodes or machine sets to restrict access to GPUs. For example:
     ```yaml
     apiVersion: machine.openshift.io/v1beta1
     kind: MachineSet
     metadata:
       ...
     spec:
       replicas: 1
       selector:
         ...
  

### Retrieve the sources used for the answer

In [11]:
def remove_duplicates(input_list):
    """Return the distinct ``metadata['source']`` values of the given documents.

    Args:
        input_list: Iterable of objects exposing a ``metadata`` mapping.

    Returns:
        A list of source values in order of first appearance, each listed
        once. Documents whose metadata has no 'source' entry (or whose source
        is None) are skipped instead of raising KeyError. Source values must
        be hashable (e.g. strings).
    """
    sources = (item.metadata.get('source') for item in input_list)
    # dict.fromkeys keeps first-seen order and deduplicates in a single pass.
    return list(dict.fromkeys(src for src in sources if src is not None))

# Collect the distinct sources of the documents returned with the answer
# and print one per line.
results = remove_duplicates(result['source_documents'])

for s in results:
    print(s)

https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/
https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/html-single/installing_and_uninstalling_openshift_ai_self-managed/index
https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/html-single/introduction_to_red_hat_openshift_ai/index
https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/html-single/release_notes/index
https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/html-single/installing_and_uninstalling_openshift_ai_self-managed_in_a_disconnected_environment/index
