## 测试openRouter


In [1]:
! pip install haystack-ai



In [2]:
! pip install -U chroma-haystack
! pip install python-docx
! pip install pypdf
! pip install "sentence-transformers>=3.0.0
! pip install haystack-ai



In [3]:
import os
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.openrouter import OpenRouterChatGenerator

os.environ["OPENROUTER_API_KEY"] = "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"

def show(chunk):                              # simple streaming callback
    print(chunk.content, end="", flush=True)

client = OpenRouterChatGenerator(
    model="microsoft/mai-ds-r1:free",                  # let OpenRouter pick a model
    # streaming_callback=show,
    generation_kwargs={
        "provider": {"sort": "throughput"},   # pick the fastest provider
    }
)

response = client.run([ChatMessage.from_user("吃了被门夹过的核桃能补脑吗")])

print (response)


{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text='“吃了被门夹过的核桃能补脑吗”这个问题，可以分两个角度解释：\n\n1. **字面科学角度**  \n   核桃本身富含Omega-3脂肪酸、抗氧化剂和维生素E等对大脑健康有益的成分。即使核桃壳被门夹裂，只要果肉未变质或受污染，营养价值和补脑作用并无改变。但需注意：若夹碎后存放不当（如受潮、发霉），则可能产生有害物质，此时应避免食用。\n\n2. **幽默双关角度**  \n   问题可能隐含“脑子被门夹过”的调侃梗，暗示某人的思维方式有问题。若以补脑治疗“被门夹过的脑子”，则纯属玩笑——核桃无法修复逻辑或情商问题，但吃点健康食物总比不吃强！（笑）\n\n**总结**：没变质的夹碎核桃照样补脑；而用食物治“脑子被门夹”的幽默，建议搭配一杯奶茶，疗效更佳（狗头保命）。')], _name=None, _meta={'model': 'microsoft/mai-ds-r1:free', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 581, 'prompt_tokens': 16, 'total_tokens': 597, 'completion_tokens_details': None, 'prompt_tokens_details': None}})]}


In [4]:
# -*- coding: utf-8 -*-


from pprint import pprint
from haystack.components.writers import DocumentWriter
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from test_chinese_document_spliter import chinese_DocumentSpliter
input_dir= r"C:\\Users\Administrator\Desktop\\操作-大模型测试文件"
# 存储在内存中
# document_store = InMemoryDocumentStore()


# 使用milvus进行向量数据库存储(只有linux能用，windows没办法使用）
# document_store = MilvusDocumentStore(
#     connection_args={"uri": r"F:\MC-PROJECT\CUDA_Preject\medical_assistant\RAG\haystack2.0\RAG\chroma_cache/milvus.db"},  # Milvus Lite
#     # connection_args={"uri": "http://localhost:19530"},  # Milvus standalone docker service.
#     drop_old=True,
# )

# 使用chromaDB作为document_store，需要执行安装命令pip install chroma-haystack
# 将向量数据库做持久化存储。
document_store = ChromaDocumentStore(persist_path=r"D:\\Project\\tetst_haystack_chinese/chroma_cache")

# 在下面添加MIME格式的docx文档匹配
# 文档路由器，通过MIME匹配文档类型，捕捉不同类型的文件
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"])
# 下面是多个converter对象，详情参考：https://docs.haystack.deepset.ai/docs/converters
# 组件：txt文件转换成Document对象
text_file_converter = TextFileToDocument()
# 组件：将docx文件转换成Document对象
docx_converter = converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
# 组将：将PDF转换成Document对象
pdf_converter = PyPDFToDocument()
# 将多个不同来源的文档组成列表，操作对象是Document对象。
document_joiner = DocumentJoiner()
# 文本清洗组件。功能比较简单：清理空行什么的，可以自定义正则表达式。  建议提前处理好文档。
document_cleaner = DocumentCleaner()
# 文档分割器，按照word进行切分，切分长度为150个单词，重叠窗口大小为50
document_splitter = chinese_DocumentSpliter(split_by="word", split_length=50, split_overlap=10,language='zh',respect_sentence_boundary=False)

model_path = "BAAI/bge-small-zh-v1.5"
# embedding组件，以及写入document_store
document_embedder = SentenceTransformersDocumentEmbedder(model=model_path)
document_writer = DocumentWriter(document_store)

# 实例化管道，将组件作为节点添加进管道
my_pipe = Pipeline()
my_pipe.add_component(instance=file_type_router, name="file_type_router")
my_pipe.add_component(instance=text_file_converter, name="text_file_converter")
my_pipe.add_component(instance=docx_converter, name="docx_converter")
my_pipe.add_component(instance=pdf_converter, name="pypdf_converter")
my_pipe.add_component(instance=document_joiner, name="document_joiner")
my_pipe.add_component(instance=document_cleaner, name="document_cleaner")
my_pipe.add_component(instance=document_splitter, name="document_splitter")
my_pipe.add_component(instance=document_embedder, name="document_embedder")
my_pipe.add_component(instance=document_writer, name="document_writer")

# 设定管道组件连接。
my_pipe.connect("file_type_router.text/plain", "text_file_converter.sources")
my_pipe.connect("file_type_router.application/pdf", "pypdf_converter.sources")
my_pipe.connect("file_type_router.application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx_converter.sources")
my_pipe.connect("text_file_converter", "document_joiner")
my_pipe.connect("pypdf_converter", "document_joiner")
my_pipe.connect("docx_converter", "document_joiner")
my_pipe.connect("document_joiner", "document_cleaner")
my_pipe.connect("document_cleaner", "document_splitter")
my_pipe.connect("document_splitter", "document_embedder")
my_pipe.connect("document_embedder", "document_writer")

from pathlib import Path

# 执行
result=my_pipe.run({"file_type_router": {"sources": list(Path(input_dir).glob("**/*"))}},include_outputs_from={"document_splitter"})
# print(result)





Could not read C:\Users\Administrator\Desktop\操作-大模型测试文件\~$果物】软件介绍（二级二等）.docx and convert it to a DOCX Document, skipping. Error: File is not a zip file


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Document 89397ed7359f5ccbb1c06cf5b6cc84bc1f1883255ba606f3af04afc2a28b43f0 contains `meta` values of unsupported types for the keys: docx, _split_overlap. These items will be discarded. Supported types are: str, int, float, bool.
Document 4080c55e424d939bc09e3faa1c297effaef8f48e9e6926f265bfb802068e95dc contains `meta` values of unsupported types for the keys: docx, _split_overlap. These items will be discarded. Supported types are: str, int, float, bool.
Document a1dc2da9f0c75006a826a592fb2ad8e9f3f91bc9f7b02248cb3eea04b77ce46a contains `meta` values of unsupported types for the keys: docx, _split_overlap. These items will be discarded. Supported types are: str, int, float, bool.
Document a23c986b9dd6bd0c3a88b19ccd6d91018f9c2f971c74bbb1283f0ab5cd625432 contains `meta` values of unsupported types for the keys: docx, _split_overlap. These items will be discarded. Supported types are: str, int, float, bool.
Document 5b895751524d42477b2add007ab0a3f09a59eeca0416a641e9de53254bef5d8c contains `

In [5]:
import os
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.openrouter import OpenRouterChatGenerator

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from haystack.components.builders import PromptBuilder,ChatPromptBuilder



os.environ["OPENROUTER_API_KEY"] = "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"
def show(chunk):                              # simple streaming callback
    print(chunk.content, end="", flush=True)


template = """
根据给出的文本来回答问题.

文本:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

问题: {{ question }}
你的回答:
"""
RAG_pipe = Pipeline()
RAG_pipe.add_component("embedder", SentenceTransformersTextEmbedder(model="BAAI/bge-small-zh-v1.5"))
RAG_pipe.add_component("retriever", ChromaEmbeddingRetriever(document_store=document_store))

RAG_pipe.connect("embedder.embedding", "retriever.query_embedding")



question = (
    "我们软件的优势是什么？"
)

# 将问题输入进embedder组件的“text”参数，以及prompt的“question”参数，llm组件的大模型配置参数字典，修改生成的最大长度为350
result=RAG_pipe.run(
    {
        "embedder": {"text": question},
        "retriever":{"top_k":3}

    }
)

pprint(result["retriever"]["documents"])
# for i in result["retriever"]["documents"]:
#     print(i.content)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(id=4bd467401da7e9c127864eead662cad67264c2aed0cded703c051b5082112d9b, content: '这样的定位。这是我们的核心优势。软件优势：软件属于原创创新，以人的行为过程为软件的主体，通过信息过程、统计过程和绩效过程，实现了工作人员行为的标准化过程。服务优势：有', meta: {'split_id': 6, 'page_number': 1, 'split_idx_start': 645, 'file_path': '【成果物】软件介绍（二级二等）.docx', 'source_id': '9a335b3e82c47ebeeb82aa9ff3a2d4e5165defacf98cf8280aeb7af896f50f04'}, score: 0.45290058851242065, embedding: vector of size 512),
 Document(id=b976cc884673bf171b6b89fc02de0f6f42eecf04d89c33ca4e470a26eef1a481, content: '：定位优势：提供用户最理想的结果，对最终的结果负责。我们是唯一的这样定位的技术服务团队。尤其是对象是企业管理这样的及其复杂的问题，给这样的定位。这是我们的核心优势。软件优势：软件属于原创创新，以人的行...', meta: {'page_number': 1, 'file_path': '【成果物】软件介绍（二级二等）.docx', 'split_id': 2, 'source_id': '9a335b3e82c47ebeeb82aa9ff3a2d4e5165defacf98cf8280aeb7af896f50f04', 'split_idx_start': 539}, score: 0.5787925720214844, embedding: vector of size 512),
 Document(id=704497d7c68cd81759daad131441a1ad63519303ea590a53f781b7b3600de916, content: '：定位优势：提供用户最理想的结果，对最终的结果负责。我们是唯一的这样定位的技术服务团队。尤其是对象是企业管理这样

In [9]:
import os
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.openrouter import OpenRouterChatGenerator

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from haystack.components.builders import PromptBuilder
import os
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.openrouter import OpenRouterChatGenerator


os.environ["OPENROUTER_API_KEY"] = "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"

def show(chunk):                              # simple streaming callback
    print(chunk.content, end="", flush=True)

client = OpenRouterChatGenerator(
    model="microsoft/mai-ds-r1:free",                  # let OpenRouter pick a model
    # streaming_callback=show,
    generation_kwargs={
        "provider": {"sort": "throughput"},   # pick the fastest provider
    }
)


template = [ChatMessage.from_user("""
根据给出的文本来回答问题.

文本:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

问题: {{ question }}
你的回答:
""")]

prompt_builder=ChatPromptBuilder(template=template)
llm= OpenRouterChatGenerator(
    model="microsoft/mai-ds-r1:free",                  # let OpenRouter pick a model
    # streaming_callback=show,
    generation_kwargs={
        "provider": {"sort": "throughput"},   # pick the fastest provider
    }
)




#   创建管道节点
RAG_pipe = Pipeline()
RAG_pipe.add_component("text_embedder", SentenceTransformersTextEmbedder(model="BAAI/bge-small-zh-v1.5"))
RAG_pipe.add_component("retriever", ChromaEmbeddingRetriever(document_store=document_store))
RAG_pipe.add_component("prompt_builder", prompt_builder)
RAG_pipe.add_component("llm", llm)


# 连接管道节点
RAG_pipe.connect("text_embedder.embedding", "retriever.query_embedding")
RAG_pipe.connect("retriever", "prompt_builder")
RAG_pipe.connect("prompt_builder.prompt", "llm.messages")


question = "我们软件的最大优势是什么"

response = RAG_pipe.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0].text)





ChatPromptBuilder has 2 prompt variables, but `required_variables` is not set. By default, all prompt variables are treated as optional, which may lead to unintended behavior in multi-branch pipelines. To avoid unexpected execution, ensure that variables intended to be required are explicitly set in `required_variables`.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

根据文本内容，我们软件的最大优势是**标准优势**，具体体现在：

1. **颠覆性创新标准**：提出了企业运行和生存的新方向，系统性地构建了完整的企业管理新标准，从根本上颠覆传统管理思路。"这个标准，颠覆了过去人们对企业管理的思路和方法，会给企业带来巨大的效益"。

2. **结果导向**：通过管理过程自动化、人员行为自主化与标准化，**直接实现企业管理结果升级**。在软硬件技术支撑下，"基础管理模块实现的管理结果是...企业带来巨大的效益"。

3. **唯一性定位**：强调"我们是唯一这样定位的技术服务团队"，将标准优势与核心定位绑定，突出其在复杂的企业管理领域的独特性。

其他如**技术整合优势（云服务+物联网全栈服务）**和**"软件即用性"（不用实施、立即会用）**是对标准优势的支撑，而标准优势本身成为定义行业新规则的差异化竞争力。
