In [9]:
import os
os.environ['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

# Blog posts that make up the retrieval corpus
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Alternative embedding model, kept for reference:
# hf_bge_embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-zh-v1.5")

# Embedding model and chat model, both addressed through the OpenAI client
# classes pointed at the SiliconFlow base URL.
# NOTE(review): no api_key is passed here, so credentials presumably come from
# the environment populated by load_dotenv above - confirm.
embeddings = OpenAIEmbeddings(model='Qwen/Qwen3-Embedding-8B', base_url="https://api.siliconflow.cn/v1")
llm = ChatOpenAI(model='Qwen/Qwen3-30B-A3B-Instruct-2507', base_url="https://api.siliconflow.cn/v1")

# Load documents from the URLs; every loader returns a list, so flatten the result
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
# (built with the tiktoken-based constructor)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

# Embed the chunks and add them to the in-memory vector store
vectorstore = InMemoryVectorStore.from_documents(
    documents=doc_splits,
    embedding=embeddings,
)

# Turn the vector store into a retriever. The result count has to travel in
# `search_kwargs`: a bare `k=3` keyword on as_retriever() is not forwarded to
# the similarity search, so the store's default count would be used instead.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
retriever.invoke("什么是Agent？")

[Document(id='fe0d1ac7-37e0-44cc-9402-5530cd98d8ea', metadata={'source': 'https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/', 'title': "Adversarial Attacks on LLMs | Lil'Log", 'description': 'The use of large language models in the real world has strongly accelerated by the launch of ChatGPT. We (including my team at OpenAI, shoutout to them) have invested a lot of effort to build default safe behavior into the model during the alignment process (e.g. via RLHF). However, adversarial attacks or jailbreak prompts could potentially trigger the model to output something undesired.\nA large body of ground work on adversarial attacks is on images, and differently it operates in the continuous, high-dimensional space. Attacks for discrete data like text have been considered to be a lot more challenging, due to lack of direct gradient signals. My past post on Controllable Text Generation is quite relevant to this topic, as attacking LLMs is essentially to control the model to outpu

In [10]:
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langsmith import traceable

# Chat model that writes the RAG answers, addressed through the OpenAI client
# class pointed at the SiliconFlow base URL.
# Configurations tried earlier, kept for reference:
#   ChatOpenAI(model="gpt-4o", temperature=1)
#   ChatDeepSeek(model="deepseek-chat", temperature=0, api_key=os.getenv("DEEPSEEK_API_KEY"))
llm = ChatOpenAI(
    base_url="https://api.siliconflow.cn/v1",
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
)
# Decorated so that each call is recorded as a run in LangSmith
@traceable()
def rag_bot(question: str) -> dict:
    """Answer ``question`` using documents fetched by the module-level retriever.

    The retrieved documents' text is placed in the system prompt and the
    module-level ``llm`` is asked to answer the question.

    Args:
        question: The user's question.

    Returns:
        A dict with ``"answer"`` (the content of the model's reply) and
        ``"documents"`` (the retrieved documents).
    """
    # The LangChain retriever call is traced automatically
    retrieved_docs = retriever.invoke(question)
    docs_string = "\n\n".join([doc.page_content for doc in retrieved_docs])

    system_prompt = f"""You are a helpful assistant who is good at analyzing source information and answering questions.       Use the following source documents to answer the user's questions.       If you don't know the answer, just say that you don't know.       Use three sentences maximum and keep the answer concise.
Documents:
{docs_string}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # The LangChain chat model call is traced automatically as well
    reply = llm.invoke(messages)

    result = {"answer": reply.content, "documents": retrieved_docs}
    return result

In [11]:
from langsmith import Client

client = Client()

# Question / reference-answer pairs for the evaluation dataset.
# The runtime strings are in Chinese; an English gloss of each question is
# given in the comment above it.
examples = [
    {
        # EN: "How does the ReAct agent use self-reflection?"
        "inputs": {"question": "ReAct 智能体如何使用自我反思？"},
        "outputs": {"answer": "ReAct 整合了推理和行动，执行各种操作（例如使用维基百科搜索 API 等工具），然后观察并推理工具的输出结果。"},
    },
    {
        # EN: "What are the types of biases that can arise with few-shot prompting?"
        "inputs": {"question": "少样本提示可能会产生哪些类型的偏差？"},
        "outputs": {"answer": "少样本提示可能产生的偏差包括：（1）多数标签偏差，（2）近因偏差，（3）常见标记偏差。"},
    },
    {
        # EN: "What are five types of adversarial attacks?"
        "inputs": {"question": "五种对抗性攻击类型是什么？"},
        "outputs": {"answer": "五种对抗性攻击类型是：（1）令牌操纵，（2）基于梯度的攻击，（3）越狱提示，（4）人类红队测试，（5）模型红队测试。"},
    },
]

dataset_name = "Lilian Weng Blogs Q&A"

# Create the dataset and its examples in LangSmith only when the dataset does
# not exist yet. This keeps the cell safe to re-run (an existing dataset is
# left untouched) while still letting a fresh workspace run the evaluation
# cell, which looks the dataset up by name.
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        examples=examples,
    )

In [12]:
from typing_extensions import Annotated, TypedDict

# Grade output schema: the structure the correctness grader LLM is asked to return
class CorrectnessGrade(TypedDict):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put the explanation before the verdict because it forces the model to think through
    # its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt: system message for the correctness grader
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM, constrained to return a CorrectnessGrade.
# temperature=0 is set explicitly, as in the earlier gpt-4o / deepseek-chat
# grader configurations, so that repeated evaluation runs grade consistently.
grader_llm = ChatOpenAI(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    base_url="https://api.siliconflow.cn/v1",
    temperature=0,
).with_structured_output(CorrectnessGrade, method="json_schema", strict=True)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a RAG answer against the reference answer.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.
        outputs: Target outputs; ``outputs["answer"]`` is read.
        reference_outputs: Reference outputs; ``reference_outputs["answer"]`` is read.

    Returns:
        The ``"correct"`` field of the grade returned by ``grader_llm``.
    """
    # One labelled line per item, in the order the grading prompt names them
    prompt_lines = (
        f"QUESTION: {inputs['question']}",
        f"GROUND TRUTH ANSWER: {reference_outputs['answer']}",
        f"STUDENT ANSWER: {outputs['answer']}",
    )
    messages = [
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": "\n".join(prompt_lines)},
    ]

    verdict = grader_llm.invoke(messages)
    return verdict["correct"]

In [13]:
# Grade output schema: the structure the answer-relevance grader LLM is asked to return.
# The explanation field is declared before the boolean verdict.
class RelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[bool, ..., "Provide the score on whether the answer addresses the question"]

# Grade prompt: system message for the answer-relevance grader
relevance_instructions="""You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM, constrained to return a RelevanceGrade.
# temperature=0 is set explicitly, as in the earlier gpt-4o grader
# configuration, so that repeated evaluation runs grade consistently.
relevance_llm = ChatOpenAI(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    base_url="https://api.siliconflow.cn/v1",
    temperature=0,
).with_structured_output(RelevanceGrade, method="json_schema", strict=True)
# Evaluator: does the answer address the question?
def relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer is relevant to the question.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.
        outputs: Target outputs; ``outputs["answer"]`` is read.

    Returns:
        The ``"relevant"`` field of the grade returned by ``relevance_llm``.
    """
    question_line = f"QUESTION: {inputs['question']}"
    answer_line = f"STUDENT ANSWER: {outputs['answer']}"
    messages = [
        {"role": "system", "content": relevance_instructions},
        {"role": "user", "content": question_line + "\n" + answer_line},
    ]
    verdict = relevance_llm.invoke(messages)
    return verdict["relevant"]

In [14]:
# Grade output schema: the structure the groundedness grader LLM is asked to return.
# The explanation field is declared before the boolean verdict.
class GroundedGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    # The description states the polarity explicitly: True must mean "grounded",
    # matching the grading prompt, rather than "the answer hallucinates".
    grounded: Annotated[bool, ..., "True if the answer is grounded in the documents, False if it contains hallucinated information."]

# Grade prompt: system message for the groundedness grader
grounded_instructions = """You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 
(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Grounded:
A grounded value of True means that the student's answer meets all of the criteria.
A grounded value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM, constrained to return a GroundedGrade.
# temperature=0 is set explicitly, as in the earlier gpt-4o grader
# configuration, so that repeated evaluation runs grade consistently.
grounded_llm = ChatOpenAI(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    base_url="https://api.siliconflow.cn/v1",
    temperature=0,
).with_structured_output(GroundedGrade, method="json_schema", strict=True)
# Evaluator: is the answer supported by the retrieved documents?
def groundedness(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer is grounded in the retrieved documents.

    Args:
        inputs: Example inputs; not read by this evaluator.
        outputs: Target outputs; ``outputs["documents"]`` (objects exposing
            ``page_content``) and ``outputs["answer"]`` are read.

    Returns:
        The ``"grounded"`` field of the grade returned by ``grounded_llm``.
    """
    facts = "\n\n".join([document.page_content for document in outputs["documents"]])
    grading_input = f"FACTS: {facts}\nSTUDENT ANSWER: {outputs['answer']}"
    messages = [
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": grading_input},
    ]
    verdict = grounded_llm.invoke(messages)
    return verdict["grounded"]

In [15]:
# Grade output schema: the structure the retrieval-relevance grader LLM is asked to return.
# The explanation field is declared before the boolean verdict.
class RetrievalRelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[bool, ..., "True if the retrieved documents are relevant to the question, False otherwise"]

# Grade prompt: system message for the retrieval-relevance grader
retrieval_relevance_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a set of FACTS provided by the student. 

Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met

Relevance:
A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
A relevance value of False means that the FACTS are completely unrelated to the QUESTION.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM, constrained to return a RetrievalRelevanceGrade.
# temperature=0 is set explicitly, as in the earlier gpt-4o grader
# configuration, so that repeated evaluation runs grade consistently.
retrieval_relevance_llm = ChatOpenAI(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    base_url="https://api.siliconflow.cn/v1",
    temperature=0,
).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)

def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the retrieved documents are relevant to the question.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.
        outputs: Target outputs; ``outputs["documents"]`` (objects exposing
            ``page_content``) is read.

    Returns:
        The ``"relevant"`` field of the grade returned by
        ``retrieval_relevance_llm``.
    """
    facts = "\n\n".join([document.page_content for document in outputs["documents"]])
    grading_input = "FACTS: {}\nQUESTION: {}".format(facts, inputs['question'])

    # Ask the grader for its structured verdict
    messages = [
        {"role": "system", "content": retrieval_relevance_instructions},
        {"role": "user", "content": grading_input},
    ]
    verdict = retrieval_relevance_llm.invoke(messages)
    return verdict["relevant"]

In [16]:
def target(inputs: dict) -> dict:
    """Adapter for the evaluation run: pass the example's question to ``rag_bot``.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.

    Returns:
        Whatever ``rag_bot`` returns for that question.
    """
    question = inputs["question"]
    return rag_bot(question)

# Run every example of the dataset through `target` and score each result with
# the four evaluators.
experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness, groundedness, relevance, retrieval_relevance],
    experiment_prefix="rag-doc-relevance",
    # Label the experiment with the chat model that actually answers the
    # questions; the previous hard-coded label named a model and a pipeline
    # style this notebook does not use.
    metadata={"version": f"InMemoryVectorStore RAG, {llm.model_name}"},
)
# Explore results locally as a dataframe if you have pandas installed
experiment_results.to_pandas()

View the evaluation results for experiment: 'rag-doc-relevance-1f0a23b2' at:
https://smith.langchain.com/o/0f8206cd-2f5b-4f7a-a5df-9ff907b631ac/datasets/6bcb7baf-76d9-461e-9437-4a683186dcfa/compare?selectedSessions=1b5c8fe5-85d0-45fc-882e-d20f1b7cb688




3it [03:05, 61.70s/it]


Unnamed: 0,inputs.question,outputs.answer,outputs.documents,error,reference.answer,feedback.correctness,feedback.groundedness,feedback.relevance,feedback.retrieval_relevance,execution_time,example_id,id
0,How does the ReAct agent use self-reflection?,The ReAct agent uses self-reflection by incorp...,"[page_content='$$\n\nabla_{\mathbf{x}_{i,j,a \...",,"ReAct integrates reasoning and acting, perform...",True,False,True,True,4.238499,03d5c270-4483-44ce-a655-1313f423144c,26c266ef-58ae-49d6-9c4c-9c3e83830c55
1,What are five types of adversarial attacks?,The provided text does not list or describe fi...,"[page_content='$$\n\nabla_{\mathbf{x}_{i,j,a \...",,Five types of adversarial attacks are (1) Toke...,False,True,False,True,4.4172,b7907f0c-2ecf-4568-a60c-dcf1d1628917,348f3e42-e70f-443e-83b4-62c405e05091
2,What are the types of biases that can arise wi...,Few-shot prompting can introduce bias through ...,[page_content='Augment: Generate multiple pseu...,,The biases that can arise with few-shot prompt...,False,True,True,True,3.559826,ce4b0a50-3a4e-4642-9181-e22cfbb2345d,0da3221b-2170-4e09-9eae-5c2407a3bf78
