In [None]:
# Definition: Format and print the contents of multiple documents, with each document separated by --
#定义：格式化并打印多个文档的内容,每个文档用--隔开

def pretty_print(docs):
    print(
      f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n + {d.page_content}\n{d.metadata}" for i,d in enumerate(docs)])
  )

In [4]:
import os
import json
from langchain.schema import Document  
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.chat_models import ChatZhipuAI
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

# Read JSON Files
# 读取 JSON 文件

def load_documents_from_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Parse JSON and convert back to Document object
    # 解析 JSON，转换回 Document 对象
    
    documents = []
    for item in data:
        doc = Document(
            metadata=item["metadata"], 
            page_content=json.dumps(item["page_content"], ensure_ascii=False)
        )
        documents.append(doc)
    
    return documents

# Specify the path of the JSON file
# 指定 JSON 文件路径

json_file_path = r"D:\knowledge_base\Data_doc_faiss\1.json" # ATTENTION! Here is the split JSON document # 注意，此处为分割后的json文档

# Load Document
# 加载 Document

docs1 = load_documents_from_json(json_file_path)

print(len(docs))

184


In [12]:
print(len(docs1))
pretty_print(docs1)

184
Document 1:

 + {"document_type": "journal article"}
{'source': 'D:/knowledge_base/Data/1.json', 'metadata_num': 1, 'metadata_content': 'document_type'}
----------------------------------------------------------------------------------------------------
Document 2:

 + {"document_title": "Effects of Cr and Al/W ratio on the microstructural stability, oxidation property and γ' phase nano-hardness of multi-component Co–Ni-base superalloys"}
{'source': 'D:/knowledge_base/Data/1.json', 'metadata_num': 2, 'metadata_content': 'document_title'}
----------------------------------------------------------------------------------------------------
Document 3:

 + {"authors": [{"name": "Wendao Li", "affiliation": "Beijing Advanced Innovation Center for Materials Genome Engineering, State Key Laboratory for Advanced Metals and Materials, University of Science and Technology Beijing, Beijing, 100083, China"}, {"name": "Longfei Li", "affiliation": "Beijing Advanced Innovation Center for Materials

Here, we only use the first 120 chunks to build the knowledge graph and generate the test set, as the subsequent additional_info chunks such as conflict of interest statements and references may have little significance for building the knowledge graph.

这里我们只使用前120个chunks进行构建知识图谱并用于生成测试集，因为后续的additional_info如利益声明，参考文献等chunks对于构建知识图谱来说可能是意义不大的。

In [13]:
# Define the docs used for generating knowledge graphs.
# 定义用于生成知识图谱的 docs

docs = docs1[:120]
print(len(docs))
pretty_print(docs)

120
Document 1:

 + {"document_type": "journal article"}
{'source': 'D:/knowledge_base/Data/1.json', 'metadata_num': 1, 'metadata_content': 'document_type'}
----------------------------------------------------------------------------------------------------
Document 2:

 + {"document_title": "Effects of Cr and Al/W ratio on the microstructural stability, oxidation property and γ' phase nano-hardness of multi-component Co–Ni-base superalloys"}
{'source': 'D:/knowledge_base/Data/1.json', 'metadata_num': 2, 'metadata_content': 'document_title'}
----------------------------------------------------------------------------------------------------
Document 3:

 + {"authors": [{"name": "Wendao Li", "affiliation": "Beijing Advanced Innovation Center for Materials Genome Engineering, State Key Laboratory for Advanced Metals and Materials, University of Science and Technology Beijing, Beijing, 100083, China"}, {"name": "Longfei Li", "affiliation": "Beijing Advanced Innovation Center for Materials

In [14]:
# Build a knowledge graph
# 建立知识图谱

from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph()

In [15]:
# Currently, no node content has been imported into the knowledge graph, so the print result is 0, 0.
# 目前没有向知识图谱中导入节点内容，因此打印结果为 0，0

print(kg)

KnowledgeGraph(nodes: 0, relationships: 0)


In [16]:
from ragas.testset.graph import Node, NodeType

# Traverse each chunk and take each chunk as a node of the knowledge graph.
# 遍历每一个chunks，将每一个 chunks 作为知识图谱的节点 node

for i, doc in enumerate(docs):
    try:
        
        # Check if page_content and metadata conform to the expected format
        # 检查 page_content 和 metadata 是否符合预期格式
        
        if not isinstance(doc.page_content, str):
            print(f"Error in doc {i}: 'page_content' should be a string, but got {type(doc.page_content)}")
        
        if not isinstance(doc.metadata, dict):
            print(f"Error in doc {i}: 'metadata' should be a dictionary, but got {type(doc.metadata)}")

        # If there are no issues, add the document to the knowledge graph
        # 如果没有问题，则将文档添加到知识图谱
        
        else:
            kg.nodes.append(
                Node(
                    type=NodeType.DOCUMENT,
                    properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
                )
            )
    
    except Exception as e:
        print(f"Error processing doc {i}: {e}")


In [18]:
# Currently, 120 chunks have been added to the knowledge graph as nodes, but no relationships have been established yet. 
# It is necessary to use embedding models and LLMs to generate relationships for each node.
# 现在 120chunks 被作为节点已经添加到知识图谱中，但目前还没有建立关系。需要借助嵌入模型和 LLM 进行对各个节点生成关系。

print(kg)

KnowledgeGraph(nodes: 120, relationships: 0)


#ATTENTION:
#注意：

#In the process of constructing the test set using the knowledge graph, we recommend selecting a different embedding model from the one used for embedding the chunks, as this helps effectively avoid overfitting caused by using the same embedding model for both the chunks and the knowledge graph

#在利用知识图谱构建测试集的过程中，我们建议选择与嵌入chunks时使用的embedding模型不同的模型，因为这样可以有效避免因使用相同的embedding模型进行嵌入而导致的过拟合现象

#Although not directly utilizing the knowledge graph in RAG system can already largely mitigate this overfitting issue, we have still implemented measures to fully avoid it

#尽管在RAG系统中不直接使用知识图谱已经能在很大程度上避免这种过拟合问题，但为了更好地控制，我们仍然采取了避免措施


#Next, we will select the jinaai/jina-embeddings-v3 model (the embedding model for chunks is BAAI/bge-large-en-v1.5) along with the GPT-4o model to generate the knowledge graph and construct the test set

#接下来，我们将选择 jinaai/jina-embeddings-v3 模型（chunks嵌入模型选择的是 BAAI/bge-large-en-v1.5）与 GPT-4o 模型来生成知识图谱，并构建测试集


In [20]:
# import jinaai/jina-embeddings-v3
# 导入本地 embedding 模型

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

model_path = r"D:\jina-embeddings-v3"
model_name =  model_path
model_kwargs = {'device': 'cuda', 'trust_remote_code': True} # jina embedding requires specifying the parameter 'trust_remote_code': True.
                                                             # jina embedding 需要指定 'trust_remote_code': True 参数
    
encode_kwargs = {'normalize_embeddings': True}  
embeddings = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [2]:
# import LLM
# 导入大模型

import os

from dotenv import load_dotenv

env_file_path = 'C:/Users/12279/OPENAI.env'

load_dotenv(env_file_path)

api_key = os.getenv('API_KEY')

from langchain_community.chat_models import ChatOpenAI

llm =  ChatOpenAI(
    temperature = 0,
    model = "gpt-4o", # Here we use the gpt-4o model. Researchers can also use other models
                          # 这里我们使用 gpt-4o 模型，研究者也可使用其他的模型
    api_key = api_key,
)
print(llm.model_name)

gpt-4o


In [22]:
# Invoke and define embedding models and LLMs used for generating a large number of relationships
# 调用和定义用于生成大量关系的嵌入模型和 llm

generator_llm = LangchainLLMWrapper(llm)
generator_embeddings = LangchainEmbeddingsWrapper(embeddings)

transformer_llm = generator_llm
embedding_model = generator_embeddings

In [None]:
# Enrich the knowledge graph with transformations. Currently, default transformations are used 
# Research can mix and match or create their own transformations as needed
#使用变换来丰富知识图谱，当前使用使用默认变换，研究者或可根据需要混合和匹配或创建自己的变换方式

from ragas.testset.transforms import default_transforms, apply_transforms

trans = default_transforms(documents = docs, llm=transformer_llm, embedding_model=embedding_model)


In [24]:
# Transform and generate knowledge graphs
# 变换生成知识图谱

apply_transforms(kg, trans)

Applying SummaryExtractor:   0%|          | 0/97 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/120 [00:00<?, ?it/s]

Node ca0fbf1e-b3ed-43c2-8ee8-6f79af9574c6 does not have a summary. Skipping filtering.
Node e60b77b8-4a67-4a3a-afc4-a182c9172df0 does not have a summary. Skipping filtering.
Node cb2bd38c-4dcc-421e-82bf-fa85180e2331 does not have a summary. Skipping filtering.
Node f214183d-e4f1-4a63-a3de-d46222aa1306 does not have a summary. Skipping filtering.
Node 79e53972-6fcc-4d36-9cd0-d9db93532454 does not have a summary. Skipping filtering.
Node 7181597e-f1f1-4a34-b0dd-2f69d58a5687 does not have a summary. Skipping filtering.
Node 6ea2a884-0fd4-47b5-b37a-fc7e8c68be60 does not have a summary. Skipping filtering.
Node 25c0b1df-59f3-4d53-9430-1167f822b1ed does not have a summary. Skipping filtering.
Node 7e86b347-881f-469b-8942-f77f4bce615d does not have a summary. Skipping filtering.
Node adfaee36-e420-45fb-84d5-9f335f66112d does not have a summary. Skipping filtering.
Node 7a84821a-89b2-4e76-b7a8-b9ca31eba078 does not have a summary. Skipping filtering.
Node 20d1b424-b38e-460f-a3e7-bfc1e963bdb4 d

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/337 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# The knowledge graph has been completed at present, with 120 nodes established and a total of 5,611 relationships.
# 目前知识图谱已经构建完成，建立节点 120，关系共 5611

print(kg)

KnowledgeGraph(nodes: 120, relationships: 5611)


In [26]:
# Save the knowledge graph to the local device.
# 保存知识图谱到本地

kg.save("D:/knowledge_base/Data_doc_faiss/knowledge_graph_1.json")

In [27]:
#Load the knowledge graph
# 加载知识图谱

loaded_kg = KnowledgeGraph.load("D:/knowledge_base/Data_doc_faiss/knowledge_graph_1.json")
loaded_kg

KnowledgeGraph(nodes: 120, relationships: 5611)

In [29]:
from ragas.testset import TestsetGenerator

# invoke TestsetGenerator to generate the test set, and pass in the parameters llm, embedding model, and knowledge graph
# 调用 TestsetGenerator 用于生成测试集，需传入参数 llm，嵌入模型，知识图谱

generator = TestsetGenerator(llm=generator_llm, embedding_model=embedding_model, knowledge_graph=loaded_kg)

from ragas.testset.synthesizers import default_query_distribution

query_distribution = default_query_distribution(generator_llm)

In [None]:
# Here, it can be obsered that the distribution of the generated test set. 
# By default, it is split as single-hop specific: multi-hop abstract: multi-hop specific in a 0.33:0.33:0.33 ratio.
# Researchers can customize this distribution according to their needs.
# 这里可以打印看到测试集的生成分布；默认为单跳具体：多跳抽象：多跳具体 为 0.33：0.33：0.33；研究者可根据他们的需要进行自定义

query_distribution

[(SingleHopSpecificQuerySynthesizer(name='single_hop_specifc_query_synthesizer', llm=LangchainLLMWrapper(langchain_llm=ChatZhipuAI(...)), generate_query_reference_prompt=QueryAnswerGenerationPrompt(instruction=Generate a single-hop query and answer based on the specified conditions (persona, term, style, length) and the provided context. Ensure the answer is entirely faithful to the context, using only the information directly from the provided context.### Instructions:
  1. **Generate a Query**: Based on the context, persona, term, style, and length, create a question that aligns with the persona's perspective and incorporates the term.
  2. **Generate an Answer**: Using only the content from the provided context, construct a detailed answer to the query. Do not add any information not included in or inferable from the context.
  , examples=[(QueryCondition(persona=Persona(name='Software Engineer', role_description='Focuses on coding best practices and system design.'), term='microser

In [31]:
# testset_size is used to define how many test sets to generate, This demo code demonstrates setting it to 30
# testset_size 用于定义生成多少个测试集，该代码演示设置了 30

testset = generator.generate(testset_size = 30, query_distribution=query_distribution)

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/30 [00:00<?, ?it/s]

#Here is the generated test set displayed

#这里显示了生成的测试集

#However, we strongly recommend that researchers upload these test sets to the dashboard for detailed browsing (including various metrics) to better assess the quality of these datasets for further filtering

#然而，我们强烈建议研究者上传这些测试集至仪表板 dashboard 进行详细的浏览（包括各项指标），以更好的对这些数据集的质量以进行进一步筛选

#For more detailed information, please visit the following official website

#具体详细信息可以访问 RAGAs官网

#https://docs.ragas.io/en/stable/getstarted/rag_testset_generation/?h=te#analyzing-the-testset

#https://app.ragas.io


In [32]:
testset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,how come a materials engineer need to read a j...,"[{""document_type"": ""journal article""}]",a journal article is a document type that can ...,single_hop_specifc_query_synthesizer
1,How do Cr and Al/W ratio affect microstructral...,"[{""document_title"": ""Effects of Cr and Al/W ra...",The document titled 'Effects of Cr and Al/W ra...,single_hop_specifc_query_synthesizer
2,Where is the State Key Laboratory for Advanced...,"[{""authors"": [{""name"": ""Wendao Li"", ""affiliati...",The State Key Laboratory for Advanced Metals a...,single_hop_specifc_query_synthesizer
3,What is the Journal of Alloys and Compounds an...,"[{""journal information"": [{""journal"": ""Journal...",The Journal of Alloys and Compounds is a publi...,single_hop_specifc_query_synthesizer
4,Wat iz partishuning behavir in Co-Ni-base supe...,"[{""keywords"": [""Co-Ni-base superalloys"", ""APT""...",Partitioning behavior is one of the key aspect...,single_hop_specifc_query_synthesizer
5,How does the increase in Al/W ratio affect the...,"[{""abstract"": ""Three Co–30Ni-xAl-(15-x)W–1Ta–4...",Increasing the Al/W ratio in Co–Ni-base alloys...,single_hop_specifc_query_synthesizer
6,how long-term aging affect alloys microstructure?,"[{""content"": ""Three Co–30Ni-xAl-(15-x)W–1Ta–4T...",The microstructural stabilities of Co–30Ni-xAl...,single_hop_specifc_query_synthesizer
7,How do the changes in elemental partitioning b...,"[{""content"": ""The APT results reveal that such...",The APT results indicate that the effects on C...,single_hop_specifc_query_synthesizer
8,how Ni-base superalloys diff from Co-base supe...,"[{""content"": ""Conventional carbide and solid s...",Conventional carbide and solid solution streng...,single_hop_specifc_query_synthesizer
9,how do Co-Al-V-base alloys compare in terms of...,"[{""content"": ""Since then, other γ' strengthene...",Co-Al-V-base alloys have been developed alongs...,single_hop_specifc_query_synthesizer


In [None]:
# ATTENTION! in this demo code demonstration, all the test sets are directly saved here without any filtering.
# 注意，在目前代码演示中，直接保存了所有的测试集，没有对数据集进行任何的筛选

import pandas as pd

df = testset.to_pandas()

# Modify the column names and set them to the universal "question" and "answer"
# 修改列名，设置为通用的 question和 answer

df = df.rename(columns={"user_input": "question", "reference": "answer"})

#Save as CSV file
# 保存为 CSV 文件

df.to_csv("D:/knowledge_base/Data_doc_faiss/testset_1.csv", index=False)

print("CSV file has been saved!")

CSV file has been saved!


In [34]:
# To facilitate subsequent RAG evaluation, only retain the questions and answers and store them as a jsonl file
# 为了便于后续 RAG 评估，只保留 questions 和 answers，存储为 jsonl 文件

df_filtered = df[["question", "answer"]]

df_filtered.to_json("D:/knowledge_base/Data_doc_faiss/testset_1.jsonl", orient="records", lines=True, force_ascii=False)

print("JSONL file has been saved!")

JSONL file has been saved!
