In [None]:
%pip install chromadb

In [24]:
# 初始化智谱 embedding
from typing import Any, Dict, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, Field, model_validator
import os
from zhipuai import ZhipuAI

class ZhipuAIEmbeddings(BaseModel, Embeddings):
    """ZhipuAI embedding model integration.

    Setup:

        To use, you should have the ``zhipuai`` python package installed, and the
        environment variable ``ZHIPU_API_KEY`` set with your API KEY.

        More instructions about ZhipuAi Embeddings, you can get it
        from  https://open.bigmodel.cn/dev/api#vector

        .. code-block:: bash

            pip install -U zhipuai
            export ZHIPU_API_KEY="your-api-key"

    Key init args — completion params:
        model: Optional[str]
            Name of ZhipuAI model to use.
        api_key: str
            Automatically inferred from env var `ZHIPU_API_KEY` if not provided.

    See full list of supported init args and their descriptions in the params section.

    Instantiate:

        .. code-block:: python

            from langchain_community.embeddings import ZhipuAIEmbeddings

            embed = ZhipuAIEmbeddings(
                model="embedding-2",
                # api_key="...",
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            embed.embed_query(input_text)

        .. code-block:: python

            [-0.003832892, 0.049372625, -0.035413884, -0.019301128, 0.0068899863, 0.01248398, -0.022153955, 0.006623926, 0.00778216, 0.009558191, ...]


    Embed multiple text:
        .. code-block:: python

            input_texts = ["This is a test query1.", "This is a test query2."]
            embed.embed_documents(input_texts)

        .. code-block:: python

            [
                [0.0083934665, 0.037985895, -0.06684559, -0.039616987, 0.015481004, -0.023952313, ...],
                [-0.02713102, -0.005470169, 0.032321047, 0.042484466, 0.023290444, 0.02170547, ...]
            ]
    """  # noqa: E501

    client: Any = Field(default=None, exclude=True)  #: :meta private:
    model: str = Field(default="embedding-2")
    """Model name"""
    api_key: str
    """Automatically inferred from env var `ZHIPU_API_KEY` if not provided."""
    dimensions: Optional[int] = None
    """The number of dimensions the resulting output embeddings should have.

    Only supported in `embedding-3` and later models.
    """

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that auth token exists in environment."""
        values["api_key"] = get_from_dict_or_env(values, "api_key", "ZHIPUAI_API_KEY")
        try:
            from zhipuai import ZhipuAI

            values["client"] = ZhipuAI(api_key=values["api_key"])
        except ImportError:
            raise ImportError(
                "Could not import zhipuai python package."
                "Please install it with `pip install zhipuai`."
            )
        return values



    def embed_query(self, text: str) -> List[float]:
        """
        Embeds a text using the AutoVOT algorithm.

        Args:
            text: A text to embed.

        Returns:
            Input document's embedded list.
        """
        resp = self.embed_documents([text])
        return resp[0]




    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embeds a list of text documents using the AutoVOT algorithm.

        Args:
            texts: A list of text documents to embed.

        Returns:
            A list of embeddings for each document in the input list.
            Each embedding is represented as a list of float values.
        """
        if self.dimensions is not None:
            resp = self.client.embeddings.create(
                model=self.model,
                input=texts,
                dimensions=self.dimensions,
            )
        else:
            resp = self.client.embeddings.create(model=self.model, input=texts)
        embeddings = [r.embedding for r in resp.data]
        return embeddings


In [45]:
import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint

# 初始化Chroma客户端
client = chromadb.Client()

# 创建一个集合（类似于表）
collection = client.get_or_create_collection("products")

# 定义嵌入函数（假设使用预训练的嵌入模型）
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# 添加嵌入数据
collection.add(
    documents=["Galaxy S21", "iPhone 13", "MacBook Pro"],
    embeddings=embedding.embed_documents(["Galaxy S21", "iPhone 13", "MacBook Pro"]),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)

# 查询相似向量
results = collection.query(
    query_embeddings=embedding.embed_documents(["智能手机"]),
    n_results=2
)

pprint(results)



Add of existing embedding ID: prod1
Add of existing embedding ID: prod2
Add of existing embedding ID: prod3
Insert of existing embedding ID: prod1
Insert of existing embedding ID: prod2
Insert of existing embedding ID: prod3


{'data': None,
 'distances': [[1.1217098236083984, 1.2391629219055176]],
 'documents': [['Galaxy S21', 'iPhone 13']],
 'embeddings': None,
 'ids': [['prod1', 'prod2']],
 'included': [<IncludeEnum.distances: 'distances'>,
              <IncludeEnum.documents: 'documents'>,
              <IncludeEnum.metadatas: 'metadatas'>],
 'metadatas': [[{'category': '手机', 'price': 799.99},
                {'category': '手机', 'price': 999.99}]],
 'uris': None}


In [44]:
import chromadb
from chromadb.utils import embedding_functions

# 初始化Chroma客户端
client = chromadb.Client()

# 创建一个集合（类似于表）
collection = client.get_or_create_collection("products")

# 定义嵌入函数（假设使用预训练的嵌入模型）

# 添加嵌入数据
collection.add(
    documents=["Galaxy S21", "iPhone 13", "MacBook Pro"],
    embeddings=embedding.embed_documents(["Galaxy S21", "iPhone 13", "MacBook Pro"]),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)

# 查询相似向量



results = collection.query(
    query_embeddings=embedding.embed_documents(["智能手机"]),
    n_results=2
)

pprint(results)

Add of existing embedding ID: prod1
Add of existing embedding ID: prod2
Add of existing embedding ID: prod3
Insert of existing embedding ID: prod1
Insert of existing embedding ID: prod2
Insert of existing embedding ID: prod3


{'data': None,
 'distances': [[1.1217098236083984, 1.2391629219055176]],
 'documents': [['Galaxy S21', 'iPhone 13']],
 'embeddings': None,
 'ids': [['prod1', 'prod2']],
 'included': [<IncludeEnum.distances: 'distances'>,
              <IncludeEnum.documents: 'documents'>,
              <IncludeEnum.metadatas: 'metadatas'>],
 'metadatas': [[{'category': '手机', 'price': 799.99},
                {'category': '手机', 'price': 999.99}]],
 'uris': None}


In [46]:
# 导入所需的模块
from langchain.vectorstores import Chroma

# 从磁盘加载持久化数据库
persist_directory = "ch16_db"
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
# 执行相似性搜索并返回结果及其得分
query = "iphone"
print(query)
result = vectordb.similarity_search_with_score(query)
# 输出第一条搜索结果
print(result)

iphone
[]
