In [1]:
!pip install -q einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests
import os
from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus
from pathlib import Path

In [3]:
def download_and_load_documentation_pdfs(product_name, product_version, sections, language, timeout=60):
    """Download Red Hat documentation PDFs and load them as documents.

    One PDF per entry in ``sections`` is fetched from docs.redhat.com and
    written to the local directory ``{product_name}-{product_version}-{language}``
    (created if missing). Every PDF found in that directory is then loaded with
    ``PyPDFDirectoryLoader``.

    Args:
        product_name: Product identifier used in the documentation URL.
        product_version: Product version used in the documentation URL.
        sections: Iterable of documentation section names to download.
        language: Language code used in the documentation URL.
        timeout: Seconds to wait for each HTTP request before skipping it.

    Returns:
        The list of loaded documents. For documents that belong to one of the
        requested sections, ``metadata["source"]`` is replaced by the URL of
        the matching ``html-single`` page; other documents found in the
        directory keep the source set by the loader.
    """
    base_url = f"https://docs.redhat.com/{language}/documentation/{product_name}/{product_version}"
    pdf_urls = [
        f"{base_url}/pdf/{section}/{product_name}-{product_version}-{section}-{language}.pdf"
        for section in sections
    ]
    # Keys are lower-cased because the lookup below lower-cases the file stem;
    # without this, any mixed-case argument would make the lookup fail.
    stem_to_html_url = {
        f"{product_name}-{product_version}-{section}-{language}".lower(): f"{base_url}/html-single/{section}/index"
        for section in sections
    }

    docs_dir = f"{product_name}-{product_version}-{language}"
    os.makedirs(docs_dir, exist_ok=True)

    for pdf_url in pdf_urls:
        try:
            # A timeout keeps one unresponsive request from hanging the notebook.
            response = requests.get(pdf_url, timeout=timeout)
        except requests.RequestException:
            # Best effort: network failures skip this PDF, but interrupts
            # (Ctrl-C / kernel stop) are no longer swallowed.
            print(f"Skipped {pdf_url}")
            continue
        if response.status_code != 200:
            print(f"Skipped {pdf_url}")
            continue
        with open(f"{docs_dir}/{pdf_url.split('/')[-1]}", 'wb') as f:
            f.write(response.content)

    pdf_docs = PyPDFDirectoryLoader(f"./{docs_dir}").load()

    # Inject document metadata so that we can find out the LLM answers' source later
    for doc in pdf_docs:
        html_url = stem_to_html_url.get(Path(doc.metadata["source"]).stem.lower())
        # PDFs already present in the directory that are not part of `sections`
        # have no mapping; leave their source untouched instead of raising.
        if html_url is not None:
            doc.metadata["source"] = html_url

    return pdf_docs

In [4]:
def download_and_load_website_text_contents(websites):
    """Load the given web pages with ``WebBaseLoader`` and return the resulting documents.

    Args:
        websites: The web page URL(s) handed to ``WebBaseLoader``.

    Returns:
        Whatever ``WebBaseLoader(websites).load()`` returns.
    """
    return WebBaseLoader(websites).load()

In [5]:
def combine_and_text_splitting(pdf_docs, website_docs, chunk_size=1024, chunk_overlap=128):
    """Merge PDF and website documents and split them into chunks.

    Args:
        pdf_docs: Documents loaded from PDFs; ``None`` is treated as empty.
        website_docs: Documents loaded from websites; ``None`` is treated as empty.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` on the merged list (PDF documents
        first, then website documents).
    """
    # Copy into lists so that None or any other iterable (e.g. a tuple) is
    # accepted, and the caller's inputs are never aliased by the merged list.
    merged_documents = list(pdf_docs or []) + list(website_docs or [])

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(merged_documents)

In [6]:
def inject_into_vector_db(document_splits, embedding_model_name, vector_db_collection_name,
                          device=None, drop_old=True):
    """Embed document chunks and insert them into a Milvus collection.

    The Milvus connection settings are read from the module-level names
    ``MILVUS_HOST``, ``MILVUS_PORT``, ``MILVUS_USERNAME`` and
    ``MILVUS_PASSWORD``, which must be defined before this function is called.

    Args:
        document_splits: Document chunks to embed and insert.
        embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.
        vector_db_collection_name: Name of the Milvus collection to write to.
        device: Device for the embedding model. When ``None``, ``'cuda'`` is
            used if PyTorch reports a CUDA device and ``'cpu'`` otherwise.
        drop_old: Passed through to ``Milvus``. The default ``True`` matches
            the previous hard-coded value; per the LangChain Milvus
            documentation this drops the existing collection, so pass
            ``False`` to keep existing data.

    Returns:
        The ``Milvus`` vector store the documents were added to.
    """
    if device is None:
        try:
            import torch
            # Fall back to CPU instead of failing on hosts without a GPU.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        except ImportError:
            # Cannot probe for a GPU; keep the previously hard-coded device.
            device = 'cuda'

    # Create embedding
    model_kwargs = {
        'device': device,
        # NOTE(security): allows code shipped with the model repository to be
        # executed locally; only use model names from a trusted source.
        'trust_remote_code': True,
    }
    encode_kwargs = {
        'normalize_embeddings': False
    }
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        show_progress=True
    )

    # Connect to vector DB
    db = Milvus(
        embedding_function=embeddings,
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
        collection_name=vector_db_collection_name,
        metadata_field="metadata",
        text_field="page_content",
        auto_id=True,
        drop_old=drop_old
    )

    # Insert into vector DB
    db.add_documents(document_splits)

    return db

# Program starts here

In [7]:
# Milvus host and port used by inject_into_vector_db.
MILVUS_HOST, MILVUS_PORT = "vectordb-milvus.milvus.svc.cluster.local", 19530
# Credentials are read from the environment; each one is None when unset.
MILVUS_USERNAME = os.environ.get('MILVUS_USERNAME')
MILVUS_PASSWORD = os.environ.get('MILVUS_PASSWORD')

### Download Red Hat product documentation PDFs first

In [8]:
# Fetch and load the listed OpenShift Container Platform 4.17 guides
# in the "zh-cn" language edition.
pdf_documents = download_and_load_documentation_pdfs(
    product_name="openshift_container_platform",
    product_version="4.17",
    language="zh-cn",
    sections=[
        "architecture", "postinstallation_configuration",
        "machine_management", "machine_configuration",
        "networking", "registry", "backup_and_restore",
    ],
)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


### Combine the documents and split them into chunks that can be saved into the Vector DB

In [9]:
# No website documents are supplied here, so only the PDF documents are split.
split_documents = combine_and_text_splitting(pdf_docs=pdf_documents, website_docs=[])

### Inject all split documents into the Vector DB

In [10]:
# Embed every chunk and store it in the collection for this document set.
vector_db = inject_into_vector_db(
    split_documents,
    "ibm-granite/granite-embedding-278m-multilingual",
    "openshift_container_platform_4_17_zh_cn_document",
)

Batches:   0%|          | 0/144 [00:00<?, ?it/s]

### Verify that the documents were injected into the Vector DB

In [11]:
# Sanity check: run a sample question ("How do I restrict network traffic
# between Pods?") against the collection and print each hit with its score.
query = "如何限制 Pod 與 Pod 之間的網絡流量?"
docs_with_score = vector_db.similarity_search_with_score(query)

for doc, score in docs_with_score:
    # One print call emits the same four lines as before:
    # rule, score, page text, rule.
    print("-" * 80, f"Score:  {score!s}", doc.page_content, "-" * 80, sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
Score:  0.4713785648345947
拒
拒
绝
绝
来自所有命名空
来自所有命名空
间
间
中的所有
中的所有
 pod 
的入口流量
的入口流量
           
这
这
是一个基本的策略，阻止配置其他网
是一个基本的策略，阻止配置其他网
络
络
策略所允
策略所允
许
许
的跨
的跨
 pod 
流量以外的所有跨
流量以外的所有跨
 pod
 
网
网
络
络
。
。
          
允
允
许
许
来自所有命名空
来自所有命名空
间
间
中的所有
中的所有
 pod 
的入口流量
的入口流量
           
          
允
允
许
许
从特定命名空
从特定命名空
间
间
中到一个
中到一个
 pod 
的入口流量
的入口流量
           
此策略允
此策略允
许
许
流量从在
流量从在
 
namespace-y
 
中
中
运
运
行的容器集到
行的容器集到
标记
标记
 
pod-a
 
的
的
 pod
。
。
          
kind:
 NetworkPolicy
apiVersion:
 networking.k8s.io/v1
metadata:
  name:
 deny-by-default
spec:
  podSelector:
 {}
  policyTypes:
  -
 Ingress
  ingress:
 []
kind:
 NetworkPolicy
apiVersion:
 networking.k8s.io/v1
metadata:
  name:
 allow-same-namespace
spec:
  podSelector:
  ingress:
  - from:
    - podSelector:
 {}
kind:
 NetworkPolicy
apiVersion:
 networking.k8s.io/v1
metadata:
  name:
 allow-traffic-pod
spec:
  podSelector:
   matchLabels:
      pod:
 po