In [None]:
# Install chromadb into the environment of the running kernel.
# NOTE(review): the version is not pinned, and later cells also import zhipuai,
# langchain, langchain_core, pydantic and numpy, which this cell does not
# install — confirm they are provisioned elsewhere.
%pip install chromadb

In [32]:
# 初始化智谱 embedding
from typing import Any, Dict, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, Field, model_validator
import os
from zhipuai import ZhipuAI

class ZhipuAIEmbeddings(BaseModel, Embeddings):
    """ZhipuAI embedding model integration.

    Setup:

        To use, you should have the ``zhipuai`` python package installed, and the
        environment variable ``ZHIPUAI_API_KEY`` (or ``ZHIPU_API_KEY``) set with
        your API KEY.

        More instructions about ZhipuAi Embeddings, you can get it
        from  https://open.bigmodel.cn/dev/api#vector

        .. code-block:: bash

            pip install -U zhipuai
            export ZHIPUAI_API_KEY="your-api-key"

    Key init args — completion params:
        model: Optional[str]
            Name of ZhipuAI model to use.
        api_key: str
            Automatically inferred from env var `ZHIPUAI_API_KEY`
            (falling back to `ZHIPU_API_KEY`) if not provided.

    See full list of supported init args and their descriptions in the params section.

    Instantiate:

        .. code-block:: python

            from langchain_community.embeddings import ZhipuAIEmbeddings

            embed = ZhipuAIEmbeddings(
                model="embedding-2",
                # api_key="...",
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            embed.embed_query(input_text)

        .. code-block:: python

            [-0.003832892, 0.049372625, -0.035413884, -0.019301128, 0.0068899863, 0.01248398, -0.022153955, 0.006623926, 0.00778216, 0.009558191, ...]


    Embed multiple text:
        .. code-block:: python

            input_texts = ["This is a test query1.", "This is a test query2."]
            embed.embed_documents(input_texts)

        .. code-block:: python

            [
                [0.0083934665, 0.037985895, -0.06684559, -0.039616987, 0.015481004, -0.023952313, ...],
                [-0.02713102, -0.005470169, 0.032321047, 0.042484466, 0.023290444, 0.02170547, ...]
            ]
    """  # noqa: E501

    client: Any = Field(default=None, exclude=True)  #: :meta private:
    model: str = Field(default="embedding-2")
    """Model name"""
    api_key: str
    """Automatically inferred from env var `ZHIPUAI_API_KEY` (or `ZHIPU_API_KEY`) if not provided."""
    dimensions: Optional[int] = None
    """The number of dimensions the resulting output embeddings should have.

    Only supported in `embedding-3` and later models.
    """

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Resolve the API key and build the ZhipuAI client before field validation.

        The key is taken from ``values["api_key"]`` when it is set and truthy,
        otherwise from the ``ZHIPUAI_API_KEY`` environment variable, otherwise
        from ``ZHIPU_API_KEY`` (the name used by this class's documentation).

        Args:
            values: Raw keyword arguments passed to the constructor.

        Returns:
            The same mapping with ``api_key`` resolved and ``client`` populated.

        Raises:
            ValueError: If no API key can be found in any of the sources.
            ImportError: If the ``zhipuai`` package is not installed.
        """
        values["api_key"] = get_from_dict_or_env(
            values,
            "api_key",
            "ZHIPUAI_API_KEY",
            # Fall back to the documented variable name; an empty value is
            # treated as "not set" so the helper still raises a clear error.
            default=os.environ.get("ZHIPU_API_KEY") or None,
        )
        try:
            from zhipuai import ZhipuAI
        except ImportError as exc:
            raise ImportError(
                "Could not import zhipuai python package. "
                "Please install it with `pip install zhipuai`."
            ) from exc

        values["client"] = ZhipuAI(api_key=values["api_key"])
        return values

    def embed_query(self, text: str) -> List[float]:
        """Embed a single piece of text with the configured ZhipuAI model.

        Args:
            text: A text to embed.

        Returns:
            The embedding of ``text`` as a list of floats.
        """
        return self.embed_documents([text])[0]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts with the configured ZhipuAI model.

        Args:
            texts: A list of text documents to embed.

        Returns:
            A list of embeddings, one per entry of ``resp.data`` returned by
            the client. Each embedding is a list of floats. An empty input
            yields an empty list without calling the API.
        """
        if not texts:
            return []

        request: Dict[str, Any] = {"model": self.model, "input": texts}
        # Only forward `dimensions` when the caller asked for it, so models
        # that do not accept the parameter are not sent an explicit None.
        if self.dimensions is not None:
            request["dimensions"] = self.dimensions

        resp = self.client.embeddings.create(**request)
        return [item.embedding for item in resp.data]


In [None]:

import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint
import os

# Open (or create) the persistent on-disk Chroma database.
client = chromadb.PersistentClient(path="ch16_db")

# A collection is Chroma's analogue of a table.
collection = client.get_or_create_collection("products")

# Embedding model used to vectorise the product names.
# NOTE(review): the class documents `dimensions` as supported only by
# embedding-3 and later, yet it is passed together with embedding-2 — confirm.
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# The product names are both the stored documents and the text that is embedded.
product_names = ["Galaxy S21", "iPhone 13", "MacBook Pro"]

# upsert instead of add: add does not overwrite records whose IDs already
# exist, so re-running this cell against the persisted database would keep
# stale metadata (for example the price changed by the update further down).
collection.upsert(
    documents=product_names,
    embeddings=embedding.embed_documents(product_names),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)

print("数据添加完成！")

# Fetch every record in the collection.
all_data = collection.get()
print("集合中的所有数据：")
pprint(all_data)

# Fetch a single record by its ID.
specific_data = collection.get(ids=["prod1"])
print("\nID 为 'prod1' 的文档：")
pprint(specific_data)

# Fetch the records whose metadata matches a filter.
filtered_data = collection.get(where={"category": "手机"})
print("\n类别为 '手机' 的文档：")
pprint(filtered_data)

# Overwrite the metadata of an existing record.
collection.update(
    ids=["prod1"],
    metadatas=[{"category": "手机", "price": 749.99}]
)
print("\n已更新 ID 为 'prod1' 的文档价格。")

# Remove one record by ID.
collection.delete(ids=["prod2"])
print("\n已删除 ID 为 'prod2' 的文档。")

# Show what is left after the update and the delete.
remaining_data = collection.get()
print("\n剩余的文档：")
pprint(remaining_data)



In [None]:
import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint
import os

# Open (or create) the persistent on-disk Chroma database.
client = chromadb.PersistentClient(path="ch16_db")

# A collection is Chroma's analogue of a table.
collection = client.get_or_create_collection("products")

# Embedding model used to vectorise the product names.
# NOTE(review): the class documents `dimensions` as supported only by
# embedding-3 and later, yet it is passed together with embedding-2 — confirm.
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# The product names are both the stored documents and the text that is embedded.
product_names = ["Galaxy S21", "iPhone 13", "MacBook Pro"]

# upsert instead of add: add does not overwrite records whose IDs already
# exist, so re-running this cell against the persisted database would leave
# previously modified records out of sync with the data declared here.
collection.upsert(
    documents=product_names,
    embeddings=embedding.embed_documents(product_names),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)

print("数据添加完成！")

In [None]:
# Read, update and delete records in the `products` collection.
# This cell defines neither `collection` nor `pprint`; both must already exist
# in the kernel from an earlier cell.

# Fetch every record in the collection.
all_data = collection.get()
print("集合中的所有数据：")
pprint(all_data)

# Fetch a single record by its ID.
specific_data = collection.get(ids=["prod1"])
print("\nID 为 'prod1' 的文档：")
pprint(specific_data)

# Fetch the records whose metadata matches a filter.
filtered_data = collection.get(where={"category": "手机"})
print("\n类别为 '手机' 的文档：")
pprint(filtered_data)

# Overwrite the metadata of an existing record.
collection.update(
    ids=["prod1"],
    metadatas=[{"category": "手机", "price": 749.99}]
)
print("\n已更新 ID 为 'prod1' 的文档价格。")

# Remove one record by ID.
collection.delete(ids=["prod2"])
print("\n已删除 ID 为 'prod2' 的文档。")

# Show what is left after the update and the delete.
remaining_data = collection.get()
print("\n剩余的文档：")
pprint(remaining_data)

In [None]:
# 导入所需的模块
from langchain.vectorstores import Chroma
import os

# Load the persisted database from disk through LangChain's Chroma wrapper.
persist_directory = "ch16_db"
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# The wrapper opens a collection named "langchain" unless told otherwise.
# The records in this notebook are stored in the "products" collection, so it
# must be named explicitly or every search comes back empty.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="products",
)

# Run a similarity search that also returns the score of each hit.
query = "智能手机"
results = vectordb.similarity_search_with_score(query)

# Print every hit together with its score.
print("Search results:")
for doc, score in results:
    print(f"Document: {doc.page_content}, Score: {score}")

In [None]:

# Build a retriever on top of the Chroma vector store; "mmr" selects the
# maximal-marginal-relevance search type.
retriever = vectordb.as_retriever(search_type="mmr")

# Retrieve the documents related to the query. `invoke` is the supported entry
# point; `get_relevant_documents` is deprecated.
retrieved_docs = retriever.invoke(query)

# Print the first retrieved document, guarding against an empty result so the
# cell does not fail with an IndexError.
print("Retrieved document:")
if retrieved_docs:
    print(retrieved_docs[0].page_content)
else:
    print("No documents were retrieved.")

In [None]:
# 导入所需的模块
from langchain.vectorstores import Chroma
import os

# Load the persisted database from disk through LangChain's Chroma wrapper.
persist_directory = "ch16_db"
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),  # make sure the environment variable is set
    dimensions=1024
)

# The wrapper opens a collection named "langchain" unless told otherwise.
# The records in this notebook are stored in the "products" collection, so it
# must be named explicitly or every search comes back empty.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="products",
)

# Run a similarity search that also returns the score of each hit.
query = "智能手机"
result = vectordb.similarity_search_with_score(query)

# Print the full list of (document, score) pairs.
print(result)


In [None]:
# Build a retriever on top of the Chroma vector store; "mmr" selects the
# maximal-marginal-relevance search type.
retriever = vectordb.as_retriever(search_type="mmr")

# Retrieve the documents related to the query. `invoke` is the supported entry
# point; `get_relevant_documents` is deprecated.
retrieved_docs = retriever.invoke(query)

# Print the first retrieved document, guarding against an empty result so the
# cell does not fail with an IndexError.
if retrieved_docs:
    print(retrieved_docs[0].page_content)
else:
    print("No documents were retrieved.")

In [None]:

# Re-open the collection. `client`, `embedding` and `pprint` are not defined in
# this cell and must already exist in the kernel from earlier cells.
collection = client.get_or_create_collection("products")

# The product names are both the stored documents and the text that is embedded.
product_names = ["Galaxy S21", "iPhone 13", "MacBook Pro"]

# upsert instead of add: add does not overwrite records whose IDs already
# exist, so re-running against the persisted database would leave previously
# modified or deleted records out of sync with the data declared here.
collection.upsert(
    documents=product_names,
    embeddings=embedding.embed_documents(product_names),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)

# Query the two nearest stored vectors for the search phrase.
# query_embeddings expects a list of vectors, hence the single-element list.
results = collection.query(
    query_embeddings=[embedding.embed_query("智能手机")],
    n_results=2
)

pprint(results)

In [None]:
print(collection.count())  # sanity check: confirm the collection actually contains documents


In [None]:
query_embeddings = embedding.embed_documents(["智能手机"])
# Sanity check that the query embedding was produced. Print a compact summary
# (count, dimension, first few values) rather than dumping the whole vector
# into the notebook output.
print(
    f"{len(query_embeddings)} embedding(s), "
    f"dimension {len(query_embeddings[0]) if query_embeddings else 0}, "
    f"first values {query_embeddings[0][:5] if query_embeddings else []}"
)

In [None]:
embeddings = embedding.embed_documents(["Galaxy S21", "iPhone 13", "MacBook Pro"])
# Sanity check that the document embeddings were produced. Print a compact
# summary per vector rather than dumping every value into the notebook output.
print(f"{len(embeddings)} embedding(s)")
for position, vector in enumerate(embeddings):
    print(f"  [{position}] dimension {len(vector)}, first values {vector[:5]}")


In [None]:
# 导入所需的模块
from langchain.vectorstores import Chroma

# Load the persisted database from disk through LangChain's Chroma wrapper.
# `os` is not imported in this cell; it must already be imported in the kernel.
persist_directory = "ch16_db"
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)
# The wrapper opens a collection named "langchain" unless told otherwise.
# The records in this notebook are stored in the "products" collection, so it
# must be named explicitly or every search comes back empty.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="products",
)
# Run a similarity search that also returns the score of each hit.
query = "Galaxy S21"
print(query)
result = vectordb.similarity_search_with_score(query)
# Print the full list of (document, score) pairs.
print(result)

In [None]:
# 导入所需模块
from langchain.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from zhipuai import ZhipuAI
import os


# Directory that holds the persisted Chroma database.
persist_directory = "ch16_db"

# Open the database through LangChain's Chroma wrapper. `embedding` and
# `collection` are not defined in this cell and must already exist in the
# kernel from earlier cells.
# The wrapper opens a collection named "langchain" unless told otherwise; the
# records live in "products", so the collection is named explicitly.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="products",
)

# Query text for the similarity search.
query = "iphone"
print("Query:", query)

# Embed the query and print the vector for inspection.
query_embeddings = embedding.embed_documents([query])
print("Query embeddings:", query_embeddings)

# Count the stored records. `get()` returns a dict of parallel lists, so
# len() of the dict itself would count its keys; the number of records is the
# length of its "ids" list.
documents = collection.get()
document_count = len(documents["ids"])
print("Number of documents in database:", document_count)

# Only search when there is something to search.
if document_count > 0:
    result = vectordb.similarity_search_with_score(query, k=5)
    print("Search results:", result)
else:
    print("数据库中没有文档，无法进行搜索。")


In [None]:
import chromadb
from langchain.vectorstores import Chroma

import os

# Open the persisted database through LangChain's Chroma wrapper.
persist_directory = "ch16_db"
embedding = ZhipuAIEmbeddings(model="embedding-2", api_key=os.getenv("ZHIPU_API_KEY"), dimensions=1024)
# The wrapper opens a collection named "langchain" unless told otherwise; the
# records live in "products", so the collection is named explicitly.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
    collection_name="products",
)

# Query text and its embedding.
query = "Galaxy S21"
query_embeddings = embedding.embed_documents([query])

# Print the query embedding for inspection.
print("Query embeddings:", query_embeddings)

# Count the stored records. `get()` returns a dict of parallel lists, so
# len() of the dict itself would count its keys; the number of records is the
# length of its "ids" list.
documents = vectordb.get()
document_count = len(documents["ids"])
print("Number of documents in database:", document_count)

# Only search when there is something to search.
if document_count > 0:
    result = vectordb.similarity_search_with_score(query, k=5)
    print("Search results:", result)
else:
    print("数据库中没有文档，无法进行搜索。")


In [None]:
import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint
import os
import numpy as np

# Rebuild the `products` collection, then filter it by metadata and rank the
# matches by cosine similarity to a query.
# ZhipuAIEmbeddings is not defined in this cell; it must already be defined
# (or imported) in the kernel, or be replaced by another embedding class.

# Open the persistent on-disk Chroma database.
client = chromadb.PersistentClient(path="ch16_db")

# Delete the collection so it can be recreated from scratch.
# NOTE(review): every exception is caught and reported as "collection does not
# exist", and the bound `e` is never used — other failures would be hidden.
try:
    client.delete_collection(name="products")
    print("集合已删除。")
except Exception as e:
    print("集合不存在，创建新的集合。")

# Create the new, empty collection.
collection = client.create_collection(name="products")
print("新的集合已创建。")

# Embedding model used to vectorise the product names and the query.
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# Store the three products together with their embeddings and metadata.
collection.add(
    documents=["Galaxy S21", "iPhone 13", "MacBook Pro"],
    embeddings=embedding.embed_documents(["Galaxy S21", "iPhone 13", "MacBook Pro"]),
    metadatas=[
        {"category": "手机", "price": 799.99},
        {"category": "手机", "price": 999.99},
        {"category": "笔记本电脑", "price": 1299.99}
    ],
    ids=["prod1", "prod2", "prod3"]
)
print("数据添加完成！")

# Read everything back to verify the insert.
# "ids" must not be listed in `include` (an earlier attempt that did so was
# wrong, per the original author's note); the code below reads 'ids' from the
# result regardless.
all_data = collection.get(include=["documents", "metadatas"])  # "ids" left out on purpose; ids are returned automatically

print("当前集合中的数据：")
for metadata, document, doc_id in zip(all_data.get('metadatas', []), all_data.get('documents', []), all_data.get('ids', [])):
    print(f"ID: {doc_id}, 产品: {document}, 分类: {metadata['category']}, 价格: {metadata['price']}")

# Metadata filter: phones priced at or below 1000, returned with their embeddings.
# NOTE(review): the original comment said "below 1000" but the operator is
# `$lte` (less than or equal) — confirm which is intended.
filtered_data = collection.get(
    where={
        "$and": [
            {"category": "手机"},
            {"price": {"$lte": 1000}}
        ]
    },
    include=["embeddings", "documents", "metadatas"]  # "ids" left out on purpose
)

# Bail out when the filter matched nothing (or returned a falsy result).
if not filtered_data or not filtered_data.get('documents'):
    print("没有找到符合条件的产品。")
else:
    # Zip the parallel result lists into (metadata, document, id, embedding) tuples.
    combined_data = list(zip(
        filtered_data.get('metadatas', []),
        filtered_data.get('documents', []),
        filtered_data.get('ids', []),  # ids are returned by default
        filtered_data.get('embeddings', [])
    ))

    # Query to rank against (the text means "high-performance phone").
    query_text = "高性能手机"
    query_embedding = embedding.embed_query(query_text)

    # Cosine similarity between two vectors.
    def cosine_similarity(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Sort the filtered records by similarity to the query; x[3] is the embedding.
    sorted_data = sorted(
        combined_data,
        key=lambda x: cosine_similarity(query_embedding, x[3]),
        reverse=True  # highest similarity first
    )

    print(f"\n查询 '{query_text}' 的结果（按相似度排序）：")
    for metadata, document, doc_id, _ in sorted_data:
        print(f"ID: {doc_id}, 产品: {document}, 价格: {metadata['price']}")


In [None]:
import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint
import os
import numpy as np

# Rebuild the `products` collection and list its contents.
# ZhipuAIEmbeddings is not defined in this cell; it must already be defined
# (or imported) in the kernel, or be replaced by another embedding class.

# Open the persistent on-disk Chroma database.
client = chromadb.PersistentClient(path="ch16_db")

# Delete the collection so it can be recreated from scratch.
# NOTE(review): every exception is caught and reported as "collection does not
# exist", and the bound `e` is never used — other failures would be hidden.
try:
    client.delete_collection(name="products")
    print("集合已删除。")
except Exception as e:
    print("集合不存在，创建新的集合。")

# Create the new, empty collection.
collection = client.create_collection(name="products")
print("新的集合已创建。")

# Embedding model used to vectorise the product names.
embedding = ZhipuAIEmbeddings(
    model="embedding-2",
    api_key=os.getenv("ZHIPU_API_KEY"),
    dimensions=1024
)

# Parallel lists describing the records to store: text, metadata, ID, vector.
documents = ["Galaxy S21", "iPhone 13", "MacBook Pro"]
metadatas = [
    {"category": "手机", "price": 799.99},
    {"category": "手机", "price": 999.99},
    {"category": "笔记本电脑", "price": 1299.99}
]
ids = ["prod1", "prod2", "prod3"]
embeddings = embedding.embed_documents(documents)

collection.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)
print("数据添加完成！")

# Read everything back to verify the insert.
all_data = collection.get(include=["documents", "metadatas"])  # ids are returned automatically

print("当前集合中的数据：")
for metadata, document, doc_id in zip(all_data.get('metadatas', []), all_data.get('documents', []), all_data.get('ids', [])):
    print(f"ID: {doc_id}, 产品: {document}, 分类: {metadata['category']}, 价格: {metadata['price']}")

# Custom retriever that ranks stored documents by cosine similarity.
class CustomSimilarityRetriever:
    """Brute-force cosine-similarity retriever over a collection.

    The retriever embeds the query, loads every stored embedding from the
    collection and returns the ``k`` most similar records.

    Args:
        collection: Object whose ``get(include=[...])`` returns a mapping with
            parallel ``'embeddings'``, ``'documents'``, ``'metadatas'`` and
            ``'ids'`` sequences.
        embedding_function: Object exposing ``embed_query(text)`` that returns
            the query vector.
        k: Maximum number of records to return.
    """

    def __init__(self, collection, embedding_function, k=3):
        self.collection = collection
        self.embedding_function = embedding_function
        self.k = k

    def get_relevant_documents(self, query):
        """Return up to ``k`` records ordered from most to least similar.

        Args:
            query: Text to search for.

        Returns:
            A list of dicts with the keys ``'document'``, ``'metadata'`` and
            ``'id'``. The list is empty when ``k`` is not positive or the
            collection holds no documents.
        """
        # A slice of [-0:] would select every element, so a non-positive k has
        # to be handled explicitly; it also avoids a pointless embedding call.
        if self.k <= 0:
            return []

        query_vector = np.asarray(
            self.embedding_function.embed_query(query), dtype=float
        )

        # 'ids' is not listed in include, but it is read from the result below.
        all_data = self.collection.get(
            include=["embeddings", "documents", "metadatas"]
        )
        if not all_data or not all_data.get('documents'):
            return []

        documents = all_data.get('documents')
        metadatas = all_data.get('metadatas')
        ids = all_data.get('ids')
        doc_matrix = np.asarray(all_data.get('embeddings'), dtype=float)

        # Cosine similarity of the query against every row in one pass.
        dot_products = doc_matrix @ query_vector
        norm_products = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vector)
        # A zero-length vector has no direction; score it 0 instead of NaN,
        # because NaN would sort to the end and be reported as the best match.
        similarities = np.divide(
            dot_products,
            norm_products,
            out=np.zeros_like(dot_products),
            where=norm_products > 0,
        )

        # argsort is ascending: take the last k indices and reverse them.
        top_k_indices = np.argsort(similarities)[-self.k:][::-1]

        return [
            {
                'document': documents[idx],
                'metadata': metadatas[idx],
                'id': ids[idx],
            }
            for idx in top_k_indices
        ]

# Build the custom similarity retriever on top of the collection and the
# embedding model created earlier in this cell.
custom_retriever = CustomSimilarityRetriever(
    collection=collection,
    embedding_function=embedding,
    k=3  # return 3 documents
)

# Run a retrieval with the custom retriever (the query text means
# "high-performance phone").
query_text = "高性能手机"
retrieved_docs = custom_retriever.get_relevant_documents(query_text)

# Print the ID, product name and price of each returned record.
print(f"\n查询 '{query_text}' 的结果：")
for doc in retrieved_docs:
    print(f"ID: {doc['id']}, 产品: {doc['document']}, 价格: {doc['metadata']['price']}")
