From 2b7206a6c24f283375fdbbc79ca15e0fcc3139fa Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Wed, 13 Aug 2025 17:08:53 +0800 Subject: [PATCH 01/10] feat: add cot data generation pipeline --- README.md | 3 +- graphgen/models/__init__.py | 3 + graphgen/models/community/__init__.py | 0 .../models/community/community_detector.py | 67 +++++++++++ graphgen/models/vis/__init__.py | 0 graphgen/models/vis/community_visualizer.py | 51 ++++++++ graphgen/operators/community/__init__.py | 0 graphgen/operators/community/generate_cot.py | 110 ++++++++++++++++++ graphgen/templates/__init__.py | 11 +- graphgen/templates/answer_rephrasing.py | 14 +-- graphgen/templates/community/__init__.py | 2 + .../templates/community/cot_generation.py | 87 ++++++++++++++ .../community/cot_template_design.py | 107 +++++++++++++++++ graphgen/templates/coreference_resolution.py | 9 +- requirements-dev.txt | 1 + requirements.txt | 7 ++ 16 files changed, 450 insertions(+), 22 deletions(-) create mode 100644 graphgen/models/community/__init__.py create mode 100644 graphgen/models/community/community_detector.py create mode 100644 graphgen/models/vis/__init__.py create mode 100644 graphgen/models/vis/community_visualizer.py create mode 100644 graphgen/operators/community/__init__.py create mode 100644 graphgen/operators/community/generate_cot.py create mode 100644 graphgen/templates/community/__init__.py create mode 100644 graphgen/templates/community/cot_generation.py create mode 100644 graphgen/templates/community/cot_template_design.py create mode 100644 requirements-dev.txt diff --git a/README.md b/README.md index 947c184a..3ee43817 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,8 @@ See [analysis](https://deepwiki.com/open-sciencelab/GraphGen) by deepwiki for a ## 🍀 Acknowledgements - [SiliconFlow](https://siliconflow.cn) Abundant LLM API, some models are free - [LightRAG](https://github.com/HKUDS/LightRAG) Simple and efficient graph retrieval solution -- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) ROGRAG: A Robustly Optimized GraphRAG Framework +- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) A robustly optimized GraphRAG framework +- [DB-GPT](https://github.com/eosphoros-ai/DB-GPT) An AI native data app development framework ## 📚 Citation diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 7e1f6e8a..8112b417 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -1,3 +1,4 @@ +from .community.community_detector import CommunityDetector from .evaluate.length_evaluator import LengthEvaluator from .evaluate.mtld_evaluator import MTLDEvaluator from .evaluate.reward_evaluator import RewardEvaluator @@ -38,4 +39,6 @@ "UniEvaluator", # strategy models "TraverseStrategy", + # community models + "CommunityDetector", ] diff --git a/graphgen/models/community/__init__.py b/graphgen/models/community/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/community/community_detector.py b/graphgen/models/community/community_detector.py new file mode 100644 index 00000000..1c43851e --- /dev/null +++ b/graphgen/models/community/community_detector.py @@ -0,0 +1,67 @@ +from dataclasses import dataclass +from typing import Any, Dict + +from graphgen.models.storage.networkx_storage import NetworkXStorage + + +@dataclass +class CommunityDetector: + """Class for community detection algorithms.""" + + graph_storage: NetworkXStorage = None + method: str = "leiden" + method_params: Dict[str, Any] = None + + async def detect_communities(self) -> 
Dict[str, int]: + """ + Detect communities based on the chosen method. + """ + if self.method == "leiden": + return await self._leiden_communities(**self.method_params or {}) + raise ValueError(f"Unknown community detection method: {self.method}") + + async def get_graph(self): + """ + Asynchronously get the graph from the storage. + """ + return await self.graph_storage.get_graph() + + async def _leiden_communities(self, **kwargs) -> Dict[str, int]: + """ + Detect communities using the Leiden algorithm. + """ + import igraph as ig + import networkx as nx + from leidenalg import ModularityVertexPartition, find_partition + + graph = await self.get_graph() + # Filter out isolated nodes + graph.remove_nodes_from(list(nx.isolates(graph))) + + # Convert NetworkX graph to igraph graph + ig_graph = ig.Graph.TupleList(graph.edges(), directed=False) + + random_seed = kwargs.get("random_seed", 42) + use_lcc = kwargs.get("use_lcc", False) + + communities = {} + if use_lcc: + # Use the largest connected component + lcc = ig_graph.components().giant() + partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed) + for part, cluster in enumerate(partition): + for v in cluster: + communities[v] = part + else: + offset = 0 + for component in ig_graph.components(): + subgraph = ig_graph.induced_subgraph(component) + partition = find_partition( + subgraph, ModularityVertexPartition, seed=random_seed + ) + for part, cluster in enumerate(partition): + for v in cluster: + original_node = subgraph.vs[v]["name"] + communities[original_node] = part + offset + offset += len(partition) + return communities diff --git a/graphgen/models/vis/__init__.py b/graphgen/models/vis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/models/vis/community_visualizer.py b/graphgen/models/vis/community_visualizer.py new file mode 100644 index 00000000..d074d150 --- /dev/null +++ b/graphgen/models/vis/community_visualizer.py @@ -0,0 +1,51 @@ +from dataclasses import dataclass +from typing import Dict + +import matplotlib.pyplot as plt +import networkx as nx + + +@dataclass +class Visualizer: + """ + Class for visualizing graphs using NetworkX and Matplotlib. 
+ """ + + graph: nx.Graph = None + communities: Dict[str, int] = None + layout: str = "spring" + max_nodes: int = 1000 + node_size: int = 10 + alpha: float = 0.6 + + def visualize(self, save_path: str = None): + n = self.graph.number_of_nodes() + print(f"Loaded graph: {n} nodes, {self.graph.number_of_edges()} edges") + + if self.layout == "spring": + k = max(0.1, 1.0 / (n**0.5)) + pos = nx.spring_layout(self.graph, k=k, seed=42) + else: + raise ValueError(f"Unknown layout: {self.layout}") + + plt.figure(figsize=(10, 10)) + + node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()] + print(node_colors) + + nx.draw_networkx_nodes( + self.graph, + pos, + node_size=self.node_size, + node_color=node_colors, + cmap="viridis", + alpha=self.alpha, + ) + nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2) + plt.axis("off") + + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print("Saved to", save_path) + else: + plt.show() diff --git a/graphgen/operators/community/__init__.py b/graphgen/operators/community/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/community/generate_cot.py b/graphgen/operators/community/generate_cot.py new file mode 100644 index 00000000..182095d2 --- /dev/null +++ b/graphgen/operators/community/generate_cot.py @@ -0,0 +1,110 @@ +import asyncio +from typing import Dict, List + +from tqdm.asyncio import tqdm as tqdm_async + +from graphgen.models import CommunityDetector, NetworkXStorage, OpenAIModel +from graphgen.templates import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT +from graphgen.utils import detect_main_language + + +async def generate_cot( + graph_storage: NetworkXStorage, + synthesizer_llm_client: OpenAIModel, + method: str = "leiden", +): + detector = CommunityDetector(graph_storage=graph_storage, method=method) + + results = await detector.detect_communities() + + # Convert results to a format suitable for summarization + communities = {} + for node, community_id in results.items(): + if community_id not in communities: + communities[community_id] = [] + communities[community_id].append(node) + + if not communities: + return {} + + semaphore = asyncio.Semaphore(value=1000) + + async def _generate_from_single_community( + community_id: int, nodes: List[str] + ) -> tuple[int, tuple]: + """Summarize a single community.""" + async with semaphore: + entities: List[str] = [] + relationships: List[str] = [] + + for node in nodes: + node_data = await graph_storage.get_node(node) + if node_data is not None: + entities.append(f"({node}: {node_data.get('description')})") + + edges = await graph_storage.get_node_edges(node) + for edge in edges: + target = edge[1] + if target in nodes: + edge_data = await graph_storage.get_edge(node, target) + relationships.append( + f"({node}) - [{edge_data['description']}] -> ({target})" + ) + + entities_str = "\n".join(entities) + relationships_str = "\n".join(relationships) + + language = ( + "English" + if detect_main_language(entities_str + relationships_str) == "en" + else "Chinese" + ) + + prompt = COT_TEMPLATE_DESIGN_PROMPT[language]["TEMPLATE"].format( + entities=entities_str, + relationships=relationships_str, + ) + + cot_template = await synthesizer_llm_client.generate_answer(prompt) + + if "问题:" in cot_template and "推理路径设计:" in cot_template: + question = cot_template.split("问题:")[1].split("推理路径设计:")[0].strip() + reasoning_path = cot_template.split("推理路径设计:")[1].strip() + elif ( + "Question:" in cot_template and "Reasoning-Path 
Design:" in cot_template + ): + question = ( + cot_template.split("Question:")[1] + .split("Reasoning-Path Design:")[0] + .strip() + ) + reasoning_path = cot_template.split("Reasoning-Path Design:")[1].strip() + else: + raise ValueError("COT template format is incorrect.") + + prompt = COT_GENERATION_PROMPT[language]["TEMPLATE"].format( + entities=entities_str, + relationships=relationships_str, + question=question, + reasoning_template=reasoning_path, + ) + + cot_answer = await synthesizer_llm_client.generate_answer(prompt) + + return community_id, (question, reasoning_path, cot_answer) + + cid_nodes = list(communities.items()) + + templates: Dict[int, (str, str)] = {} + async for coro in tqdm_async( + asyncio.as_completed( + [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes] + ), + total=len(cid_nodes), + desc="[Generate COT] Generating COT templates for communities", + unit="community", + ): + cid, (q, r, a) = await coro + templates[cid] = (q, r, a) + + return templates diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 6e362d08..a3d1e9ed 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -1,9 +1,10 @@ +from .answer_rephrasing import ANSWER_REPHRASING_PROMPT +from .community import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT +from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT +from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT from .kg_extraction import KG_EXTRACTION_PROMPT from .kg_summarization import KG_SUMMARIZATION_PROMPT +from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT -from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT -from .answer_rephrasing import ANSWER_REPHRASING_PROMPT -from .question_generation import QUESTION_GENERATION_PROMPT -from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT -from .coreference_resolution import COREFERENCE_RESOLUTION_TEMPLATE diff --git a/graphgen/templates/answer_rephrasing.py b/graphgen/templates/answer_rephrasing.py index a33e9d9e..fc988fa2 100644 --- a/graphgen/templates/answer_rephrasing.py +++ b/graphgen/templates/answer_rephrasing.py @@ -1,5 +1,4 @@ TEMPLATE_CONTEXT_EN: str = """---Role--- - You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements. Use {language} as output language. @@ -51,12 +50,10 @@ """ TEMPLATE_CONTEXT_ZH: str = """---角色--- - 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。 使用{language}作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 建立适当的因果关系 @@ -101,7 +98,6 @@ """ TEMPLATE_EN: str = """---Role--- - You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. Use {language} as output language. @@ -148,12 +144,10 @@ """ TEMPLATE_ZH: str = """---角色--- - 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。 使用{language}作为输出语言。 ---目标--- - 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: 1. 遵循清晰的逻辑流和结构 2. 
建立适当的因果关系 @@ -207,13 +201,13 @@ """ -ANSWER_REPHRASING_PROMPT= { +ANSWER_REPHRASING_PROMPT = { "English": { "TEMPLATE": TEMPLATE_EN + REQUIREMENT_EN, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_EN + REQUIREMENT_EN + "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_EN + REQUIREMENT_EN, }, "Chinese": { "TEMPLATE": TEMPLATE_ZH + REQUIREMENT_ZH, - "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_ZH + REQUIREMENT_ZH - } + "CONTEXT_TEMPLATE": TEMPLATE_CONTEXT_ZH + REQUIREMENT_ZH, + }, } diff --git a/graphgen/templates/community/__init__.py b/graphgen/templates/community/__init__.py new file mode 100644 index 00000000..4721d03e --- /dev/null +++ b/graphgen/templates/community/__init__.py @@ -0,0 +1,2 @@ +from .cot_generation import COT_GENERATION_PROMPT +from .cot_template_design import COT_TEMPLATE_DESIGN_PROMPT diff --git a/graphgen/templates/community/cot_generation.py b/graphgen/templates/community/cot_generation.py new file mode 100644 index 00000000..0494cd80 --- /dev/null +++ b/graphgen/templates/community/cot_generation.py @@ -0,0 +1,87 @@ +TEMPLATE_ZH = """根据给定的知识图谱原始信息及已生成的推理路径,产出一条符合模板要求、可直接用于下游训练或推理的 CoT 数据。\ +CoT(Chain-of-Thought,思维链)指在回答复杂问题时,把中间推理步骤一步一步显式写出来,使推理过程透明、可追溯,而不是直接给出最终答案。 + +-输入格式- +[Entities:] +(实体名:实体描述) +... + +[Relationships:] +(来源实体)-[关系描述]->(目标实体) +... + +[Question and Reasoning Path:] +(问题) +(推理路径) + +-输出要求- +1. 每一步只完成一个不可分割的子任务,并用自然语言衔接,但是要避免生硬的连接词。 +2. 使用中文。 +3. 不要使用有序列表或编号。 +4. 请直接给出答案,不要生成无关信息。 + +-真实数据- +输入: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +输出: + +""" + +TEMPLATE_EN = """Given the raw knowledge graph information and the provided reasoning-path, \ +produce one Chain-of-Thought (CoT) sample that strictly follows the template \ +and can be directly used for downstream training or inference. +CoT (Chain-of-Thought) means that when answering a complex question, the intermediate reasoning steps are \ +explicitly written out one by one, making the reasoning process transparent and traceable instead of giving \ +only the final answer. + +-Input Format- +[Entities:]: +(ENTITY_NAME: ENTITY_DESCRIPTION) +... + +[Relationships:]: +(ENTITY_SOURCE)-[RELATIONSHIP_DESCRIPTION]->(ENTITY_TARGET) +... + +[Question and Reasoning Path:]: +(QUESTION) +(REASONING_PATH) + +-Output Requirements- +1. Each step completes a single, indivisible sub-task and is naturally connected, avoiding abrupt transition words. +2. Use English. +3. Do not use ordered lists or numbering. +4. Do not generate extraneous information, just provide the answer. + +-Real Data- +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +[Question:]: +{question} + +[Reasoning_Template:]: +{reasoning_template} + +Output: +""" + +COT_GENERATION_PROMPT = { + "Chinese": {"TEMPLATE": TEMPLATE_ZH}, + "English": {"TEMPLATE": TEMPLATE_EN}, +} diff --git a/graphgen/templates/community/cot_template_design.py b/graphgen/templates/community/cot_template_design.py new file mode 100644 index 00000000..04cfa230 --- /dev/null +++ b/graphgen/templates/community/cot_template_design.py @@ -0,0 +1,107 @@ +TEMPLATE_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\ +而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\ + +-步骤- +1. 实体识别 +- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。 +- 实体信息的一般格式为: +(实体名:实体描述) + +2. 关系识别 +- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、目标实体名、关系描述信息。 +- 关系信息的一般格式为: +(来源实体名)-[关系描述]->(目标实体名) + +3. 图结构理解 +- 正确地将关系信息中的来源实体名与实体信息关联。 +- 根据提供的关系信息还原出图结构。 + +4. 
问题设计
+- 围绕知识图谱所表达的“核心主题”设计一个问题。
+- 问题必须能在图谱内部通过实体、关系或属性直接验证;避免主观判断。
+- 问题应该能够让模型进行足够深入的思考,充分利用图谱中的实体和关系,避免过于简单或无关的问题。
+
+5. 推理路径生成
+- 根据问题设计一个**可被后续模型直接执行的推理蓝图**。
+- 保持步骤最小化:每一步只解决一个“不可分割”的子问题。
+
+-约束条件-
+1. 不要在回答中描述你的思考过程,直接给出回复,只给出问题和推理路径设计,不要生成无关信息。
+2. 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的逻辑。
+3. 避免使用停用词和过于常见的词汇。
+4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。
+5. 使用中文作为输出语言。
+6. 输出格式为:
+问题:
+推理路径设计:
+
+-真实数据-
+输入:
+[Entities:]:
+{entities}
+
+[Relationships:]:
+{relationships}
+
+输出:
+"""
+
+
+TEMPLATE_EN = """You are a “meta-reasoning architect”. \
+Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \
+template based solely on the names and descriptions of entities and \
+relationships in the provided knowledge graph.
+
+- Steps -
+1. Entity Recognition
+- Accurately recognize entity information in the [Entities:] section, including entity names and descriptions.
+- The general formats for entity information are:
+(ENTITY_NAME: ENTITY_DESCRIPTION)
+
+2. Relationship Recognition
+- Accurately recognize relationship information in the [Relationships:] section, including source_entity_name, target_entity_name, and relationship descriptions.
+- The general formats for relationship information are:
+(SOURCE_ENTITY_NAME)-[RELATIONSHIP_DESCRIPTION]->(TARGET_ENTITY_NAME)
+
+3. Graph Structure Understanding
+- Correctly associate the source entity name in the relationship information with the entity information.
+- Reconstruct the graph structure based on the provided relationship information.
+
+4. Question Design
+- Design a question around the "core theme" expressed by the knowledge graph.
+- The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments.
+- The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions.
+
+5. Reasoning-Path Design
+- Output a **blueprint that any later model can directly execute**.
+- Keep steps minimal: each step solves one indivisible sub-problem.
+
+
+- Constraints -
+1. Do NOT describe your thinking; output only the question and the reasoning-path design.
+2. If the provided descriptions are contradictory, resolve conflicts and provide a single coherent logic.
+3. Avoid using stop words and overly common words.
+4. Do not include specific numerical values or conclusions, \
+and DO NOT describe meaningless operations like "Identify the entity" or "Identify the relationship".
+5. Use English as the output language.
+6. The output format is:
+Question:
+Reasoning-Path Design:
+
+Design the question and the reasoning path based on the following [Entities:] and [Relationships:] provided.
+ +- Real Data - +Input: +[Entities:]: +{entities} + +[Relationships:]: +{relationships} + +Output: +""" + +COT_TEMPLATE_DESIGN_PROMPT = { + "Chinese": {"TEMPLATE": TEMPLATE_ZH}, + "English": {"TEMPLATE": TEMPLATE_EN}, +} diff --git a/graphgen/templates/coreference_resolution.py b/graphgen/templates/coreference_resolution.py index b29394ad..bc03e671 100644 --- a/graphgen/templates/coreference_resolution.py +++ b/graphgen/templates/coreference_resolution.py @@ -1,4 +1,3 @@ -# pylint: disable=C0301 TEMPLATE_ZH: str = """请根据参考文本识别并消解文本中的指代词,明确每个代词所指代的具体实体,并直接输出消解后的文本。 -示例- @@ -16,7 +15,8 @@ 输出: """ -TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, specify the specific entities referred to by each pronoun, and directly output the resolved text. +TEMPLATE_EN: str = """Please identify and resolve the pronouns in the reference text, \ +specify the specific entities referred to by each pronoun, and directly output the resolved text. -Example- Input: @@ -33,7 +33,4 @@ Output: """ -COREFERENCE_RESOLUTION_TEMPLATE = { - "en": TEMPLATE_EN, - "zh": TEMPLATE_ZH -} +COREFERENCE_RESOLUTION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH} diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..55b033e9 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f169cb09..cf0674d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,10 @@ langcodes requests fastapi trafilatura + +leidenalg +igraph +python-louvain + +# For visualization +matplotlib \ No newline at end of file From 9879a444f4bb2ab1032fe85e55834fc732b01c90 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Wed, 13 Aug 2025 17:34:29 +0800 Subject: [PATCH 02/10] fix: fix import error --- graphgen/operators/resolute_coreference.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/graphgen/operators/resolute_coreference.py b/graphgen/operators/resolute_coreference.py index 4a1012fb..cdf702e2 100644 --- a/graphgen/operators/resolute_coreference.py +++ b/graphgen/operators/resolute_coreference.py @@ -1,12 +1,13 @@ from typing import List -from graphgen.models import Chunk -from graphgen.models import OpenAIModel -from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE + +from graphgen.models import Chunk, OpenAIModel +from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT from graphgen.utils import detect_main_language + async def resolute_coreference( - llm_client: OpenAIModel, - chunks: List[Chunk]) -> List[Chunk]: + llm_client: OpenAIModel, chunks: List[Chunk] +) -> List[Chunk]: """ Resolute conference @@ -23,9 +24,8 @@ async def resolute_coreference( for _, chunk in enumerate(chunks[1:]): language = detect_main_language(chunk.content) result = await llm_client.generate_answer( - COREFERENCE_RESOLUTION_TEMPLATE[language].format( - reference = results[0].content, - input_sentence = chunk.content + COREFERENCE_RESOLUTION_PROMPT[language].format( + reference=results[0].content, input_sentence=chunk.content ) ) results.append(Chunk(id=chunk.id, content=result)) From b18ac2ad428aa87b2a80f05cc12c8ef28b1f64f3 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:44:07 +0800 Subject: [PATCH 03/10] Update graphgen/operators/community/generate_cot.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- graphgen/operators/community/generate_cot.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgen/operators/community/generate_cot.py b/graphgen/operators/community/generate_cot.py index 182095d2..278ccde7 100644 --- a/graphgen/operators/community/generate_cot.py +++ b/graphgen/operators/community/generate_cot.py @@ -95,7 +95,7 @@ async def _generate_from_single_community( cid_nodes = list(communities.items()) - templates: Dict[int, (str, str)] = {} + templates: Dict[int, Tuple[str, str, str]] = {} async for coro in tqdm_async( asyncio.as_completed( [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes] From 7243b779cab1125a3cd11127e322d0d85d2bc6cf Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:44:17 +0800 Subject: [PATCH 04/10] Update graphgen/operators/community/generate_cot.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- graphgen/operators/community/generate_cot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgen/operators/community/generate_cot.py b/graphgen/operators/community/generate_cot.py index 278ccde7..0689e1d0 100644 --- a/graphgen/operators/community/generate_cot.py +++ b/graphgen/operators/community/generate_cot.py @@ -31,7 +31,7 @@ async def generate_cot( async def _generate_from_single_community( community_id: int, nodes: List[str] - ) -> tuple[int, tuple]: + ) -> Tuple[int, Tuple[str, str, str]]: """Summarize a single community.""" async with semaphore: entities: List[str] = [] From f5793a74e58e37fbcaacc6c351e29afe89a3db11 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:44:27 +0800 Subject: [PATCH 05/10] Update graphgen/operators/community/generate_cot.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- graphgen/operators/community/generate_cot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgen/operators/community/generate_cot.py b/graphgen/operators/community/generate_cot.py index 0689e1d0..c0a691e3 100644 --- a/graphgen/operators/community/generate_cot.py +++ b/graphgen/operators/community/generate_cot.py @@ -1,5 +1,5 @@ import asyncio -from typing import Dict, List +from typing import Dict, List, Tuple from tqdm.asyncio import tqdm as tqdm_async From e7b332f61aa927dd2605be5fa196eea204cf18f0 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 14 Aug 2025 13:51:27 +0800 Subject: [PATCH 06/10] feat: split community with max_size --- .../models/community/community_detector.py | 56 ++++++++++++++----- graphgen/models/vis/community_visualizer.py | 1 - 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/graphgen/models/community/community_detector.py b/graphgen/models/community/community_detector.py index 1c43851e..0041f4c4 100644 --- a/graphgen/models/community/community_detector.py +++ b/graphgen/models/community/community_detector.py @@ -1,5 +1,6 @@ +from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict +from typing import Any, Dict, List from graphgen.models.storage.networkx_storage import NetworkXStorage @@ -13,45 +14,40 @@ class CommunityDetector: method_params: Dict[str, Any] = None async def detect_communities(self) -> Dict[str, int]: - """ - Detect communities based on the chosen method. 
- """ if self.method == "leiden": return await self._leiden_communities(**self.method_params or {}) raise ValueError(f"Unknown community detection method: {self.method}") async def get_graph(self): - """ - Asynchronously get the graph from the storage. - """ return await self.graph_storage.get_graph() - async def _leiden_communities(self, **kwargs) -> Dict[str, int]: + async def _leiden_communities( + self, max_size: int = None, **kwargs + ) -> Dict[str, int]: """ Detect communities using the Leiden algorithm. + If max_size is given, any community larger than max_size will be split + into smaller sub-communities each having at most max_size nodes. """ import igraph as ig import networkx as nx from leidenalg import ModularityVertexPartition, find_partition graph = await self.get_graph() - # Filter out isolated nodes graph.remove_nodes_from(list(nx.isolates(graph))) - # Convert NetworkX graph to igraph graph ig_graph = ig.Graph.TupleList(graph.edges(), directed=False) random_seed = kwargs.get("random_seed", 42) use_lcc = kwargs.get("use_lcc", False) - communities = {} + communities: Dict[str, int] = {} if use_lcc: - # Use the largest connected component lcc = ig_graph.components().giant() partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed) for part, cluster in enumerate(partition): for v in cluster: - communities[v] = part + communities[lcc.vs[v]["name"]] = part else: offset = 0 for component in ig_graph.components(): @@ -64,4 +60,36 @@ async def _leiden_communities(self, **kwargs) -> Dict[str, int]: original_node = subgraph.vs[v]["name"] communities[original_node] = part + offset offset += len(partition) - return communities + + # split large communities if max_size is specified + if max_size is None or max_size <= 0: + return communities + + return await self._split_communities(communities, max_size) + + @staticmethod + async def _split_communities( + communities: Dict[str, int], max_size: int + ) -> Dict[str, int]: + """ + Split communities larger than max_size into smaller sub-communities. 
+ """ + cid2nodes: Dict[int, List[str]] = defaultdict(list) + for node, cid in communities.items(): + cid2nodes[cid].append(node) + + new_communities: Dict[str, int] = {} + new_cid = 0 + for cid, nodes in cid2nodes.items(): + if len(nodes) <= max_size: + for n in nodes: + new_communities[n] = new_cid + new_cid += 1 + else: + for start in range(0, len(nodes), max_size): + sub = nodes[start : start + max_size] + for n in sub: + new_communities[n] = new_cid + new_cid += 1 + + return new_communities diff --git a/graphgen/models/vis/community_visualizer.py b/graphgen/models/vis/community_visualizer.py index d074d150..2ae571b6 100644 --- a/graphgen/models/vis/community_visualizer.py +++ b/graphgen/models/vis/community_visualizer.py @@ -31,7 +31,6 @@ def visualize(self, save_path: str = None): plt.figure(figsize=(10, 10)) node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()] - print(node_colors) nx.draw_networkx_nodes( self.graph, From 2eee950d05dba38d7fe5be420a603e41c610da71 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 14 Aug 2025 14:44:55 +0800 Subject: [PATCH 07/10] fix: fix visualizer --- graphgen/models/vis/community_visualizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/graphgen/models/vis/community_visualizer.py b/graphgen/models/vis/community_visualizer.py index 2ae571b6..05551014 100644 --- a/graphgen/models/vis/community_visualizer.py +++ b/graphgen/models/vis/community_visualizer.py @@ -20,8 +20,6 @@ class Visualizer: def visualize(self, save_path: str = None): n = self.graph.number_of_nodes() - print(f"Loaded graph: {n} nodes, {self.graph.number_of_edges()} edges") - if self.layout == "spring": k = max(0.1, 1.0 / (n**0.5)) pos = nx.spring_layout(self.graph, k=k, seed=42) @@ -37,7 +35,7 @@ def visualize(self, save_path: str = None): pos, node_size=self.node_size, node_color=node_colors, - cmap="viridis", + cmap=plt.cm.tab20, alpha=self.alpha, ) nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2) From 9d203640063ba86d0d0131ea93f9fcd3d963ca60 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Thu, 14 Aug 2025 16:52:04 +0800 Subject: [PATCH 08/10] refactor: refact generating pipelines --- README.md | 10 +- baselines/EntiGraph/entigraph.py | 164 ++-- baselines/Genie/genie.py | 76 +- baselines/LongForm/longform.py | 76 +- baselines/SELF-QA/self-qa.py | 121 ++- baselines/Wrap/wrap.py | 77 +- graphgen/configs/aggregated_config.yaml | 18 + ...raphgen_config.yaml => atomic_config.yaml} | 10 +- graphgen/configs/cot_config.yaml | 13 + ...fig.yaml.example => multi_hop_config.yaml} | 6 +- graphgen/generate.py | 52 +- graphgen/graphgen.py | 37 +- graphgen/operators/__init__.py | 4 +- .../{community => generate}/__init__.py | 0 .../{community => generate}/generate_cot.py | 37 +- graphgen/operators/kg/__init__.py | 0 graphgen/operators/{ => kg}/extract_kg.py | 77 +- graphgen/operators/{ => kg}/merge_kg.py | 79 +- .../{split_graph.py => kg/split_kg.py} | 136 ++- graphgen/operators/preprocess/__init__.py | 0 .../{ => preprocess}/resolute_coreference.py | 0 graphgen/operators/traverse_graph.py | 331 ++++--- graphgen/templates/search_judgement.py | 2 +- .../chunked_demo.json | 0 .../keywords_demo.txt | 0 .../raw_demo.jsonl | 0 .../{examples => input_examples}/txt_demo.txt | 0 resources/output_examples/aggregated.json | 47 + resources/output_examples/atomic.json | 882 ++++++++++++++++++ resources/output_examples/cot.json | 47 + resources/output_examples/multi-hop.json | 167 ++++ scripts/generate.sh | 1 - 
scripts/generate/generate_aggregated.sh | 3 + scripts/generate/generate_atomic.sh | 3 + scripts/generate/generate_cot.sh | 3 + scripts/generate/generate_multi_hop.sh | 3 + webui/app.py | 2 +- 37 files changed, 1945 insertions(+), 539 deletions(-) create mode 100644 graphgen/configs/aggregated_config.yaml rename graphgen/configs/{graphgen_config.yaml => atomic_config.yaml} (67%) create mode 100644 graphgen/configs/cot_config.yaml rename graphgen/configs/{config.yaml.example => multi_hop_config.yaml} (75%) rename graphgen/operators/{community => generate}/__init__.py (100%) rename graphgen/operators/{community => generate}/generate_cot.py (77%) create mode 100644 graphgen/operators/kg/__init__.py rename graphgen/operators/{ => kg}/extract_kg.py (67%) rename graphgen/operators/{ => kg}/merge_kg.py (76%) rename graphgen/operators/{split_graph.py => kg/split_kg.py} (74%) create mode 100644 graphgen/operators/preprocess/__init__.py rename graphgen/operators/{ => preprocess}/resolute_coreference.py (100%) rename resources/{examples => input_examples}/chunked_demo.json (100%) rename resources/{examples => input_examples}/keywords_demo.txt (100%) rename resources/{examples => input_examples}/raw_demo.jsonl (100%) rename resources/{examples => input_examples}/txt_demo.txt (100%) create mode 100644 resources/output_examples/aggregated.json create mode 100644 resources/output_examples/atomic.json create mode 100644 resources/output_examples/cot.json create mode 100644 resources/output_examples/multi-hop.json delete mode 100644 scripts/generate.sh create mode 100644 scripts/generate/generate_aggregated.sh create mode 100644 scripts/generate/generate_atomic.sh create mode 100644 scripts/generate/generate_cot.sh create mode 100644 scripts/generate/generate_multi_hop.sh diff --git a/README.md b/README.md index 3ee43817..41ef05c5 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ Furthermore, GraphGen incorporates multi-hop neighborhood sampling to capture co ## 📌 Latest Updates -- **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends, perfect for closing data gaps. +- **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends. - **2025.04.21**: We have released the initial version of GraphGen. ## 🚀 Quick Start @@ -138,15 +138,15 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe ``` 2. (Optional) If you want to modify the default generated configuration, you can edit the content of the configs/graphgen_config.yaml file. ```yaml - # configs/graphgen_config.yaml + # configs/aggregated_config.yaml # Example configuration - data_type: "raw" - input_file: "resources/examples/raw_demo.jsonl" + input_data_type: "raw" + input_file: "resources/input_examples/raw_demo.jsonl" # more configurations... ``` 3. Run the generation script ```bash - bash scripts/generate.sh + bash scripts/generate/generate_aggregated.sh ``` 4. 
Get the generated data ```bash diff --git a/baselines/EntiGraph/entigraph.py b/baselines/EntiGraph/entigraph.py index 01e22dbc..3020c71d 100644 --- a/baselines/EntiGraph/entigraph.py +++ b/baselines/EntiGraph/entigraph.py @@ -1,11 +1,11 @@ # https://arxiv.org/abs/2409.07431 # https://github.com/zitongyang/synthetic_continued_pretraining -import os +import argparse +import asyncio import json +import os import random -import asyncio -import argparse from hashlib import md5 from tqdm.asyncio import tqdm as tqdm_async @@ -18,9 +18,9 @@ def compute_content_hash(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() -async def generate_entities(document_content: str, - system_message: str, - openai_model: str): +async def generate_entities( + document_content: str, system_message: str, openai_model: str +): prompt = f""" ### Document Content: {document_content} @@ -30,23 +30,25 @@ async def generate_entities(document_content: str, max_tries = 5 while not can_read_entities and max_tries > 0: try: - completion = await gptqa(prompt, - openai_model, - system_message, - json_format=False) - completion = completion[completion.find("{"): completion.rfind("}") + 1] + completion = await gptqa( + prompt, openai_model, system_message, json_format=False + ) + completion = completion[completion.find("{") : completion.rfind("}") + 1] response = json.loads(completion) - can_read_entities = response['entities'] + can_read_entities = response["entities"] return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Failed to generate entities: {str(e)}") max_tries -= 1 -async def generate_two_entity_relations(document_content: str, - entity1: str, - entity2: str, - system_message: str, - openai_model: str): + +async def generate_two_entity_relations( + document_content: str, + entity1: str, + entity2: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -54,17 +56,18 @@ async def generate_two_entity_relations(document_content: str, - {entity1} - {entity2} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion -async def generate_three_entity_relations(document_content: str, - entity1: str, - entity2: str, - entity3: str, - system_message: str, - openai_model: str): + +async def generate_three_entity_relations( + document_content: str, + entity1: str, + entity2: str, + entity3: str, + system_message: str, + openai_model: str, +): prompt = f""" ### Document Content: {document_content} @@ -73,11 +76,10 @@ async def generate_three_entity_relations(document_content: str, - {entity2} - {entity3} """ - completion = await gptqa(prompt, - openai_model, - system_message) + completion = await gptqa(prompt, openai_model, system_message) return completion + def _post_process_synthetic_data(data): block = data.split("\n\n") qas = {} @@ -87,7 +89,7 @@ def _post_process_synthetic_data(data): answer = line.split("Answer: ")[1] qas[compute_content_hash(question)] = { "question": question, - "answer": answer + "answer": answer, } break return qas @@ -105,25 +107,26 @@ async def generate_document_entities(doc): async with semaphore: try: entities = await generate_entities( - doc.text, - task.openai_system_generate_entities, - model_name) + doc.text, task.openai_system_generate_entities, model_name + ) if not entities: return None return { - 'document': doc.text, - 'entities': 
entities['entities'], - 'summary': entities['summary'] + "document": doc.text, + "entities": entities["entities"], + "summary": entities["summary"], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None entities_list = [] for result in tqdm_async( - asyncio.as_completed([generate_document_entities(doc) for doc in task.documents]), - total=len(task.documents), - desc="Generating entities" + asyncio.as_completed( + [generate_document_entities(doc) for doc in task.documents] + ), + total=len(task.documents), + desc="Generating entities", ): result = await result if result: @@ -132,38 +135,42 @@ async def generate_document_entities(doc): # iterate over triples of entities and generate relations pair_list = [] for doc in entities_list: - entities = doc['entities'] + entities = doc["entities"] temp = [] for i, entity_i in enumerate(entities): if i == len(entities) - 1: break for j in range(i + 1, len(entities)): entity_j = entities[j] - pair = (doc['document'], entity_i, entity_j) + pair = (doc["document"], entity_i, entity_j) temp.append(pair) # Compute all possible combinations of entities is impractical, so we randomly sample 10 pairs pair_list.extend(random.sample(temp, min(len(temp), 10))) - async def process_two_entity_relations(pair): async with semaphore: try: document, entity1, entity2 = pair response = await generate_two_entity_relations( - document, entity1, entity2, + document, + entity1, + entity2, task.openai_system_generate_two_entity_relations, - model_name) + model_name, + ) return response - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return None - corpus= [] + corpus = [] for result in tqdm_async( - asyncio.as_completed([process_two_entity_relations(pair) for pair in pair_list]), - total=len(pair_list), - desc="Generating two entity relations" + asyncio.as_completed( + [process_two_entity_relations(pair) for pair in pair_list] + ), + total=len(pair_list), + desc="Generating two entity relations", ): result = await result if result: @@ -194,51 +201,60 @@ async def process_two_entity_relations(pair): # ): # corpus.append(await result) - corpus = [doc['summary'] for doc in entities_list] + corpus + corpus = [doc["summary"] for doc in entities_list] + corpus qa_sft_results = {} async def generate_qa_sft(content): async with semaphore: - completion = await gptqa(content, model_name, task.openai_system_quality_qa_sft) + completion = await gptqa( + content, model_name, task.openai_system_quality_qa_sft + ) return completion - for result in tqdm_async( - asyncio.as_completed([generate_qa_sft(content) for content in corpus]), - total=len(corpus), - desc="Generating QA SFT" + asyncio.as_completed([generate_qa_sft(content) for content in corpus]), + total=len(corpus), + desc="Generating QA SFT", ): try: result = await result if result: qa_sft_results.update(_post_process_synthetic_data(result)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return qa_sft_results -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. 
(Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/entigraph.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/chunked_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/entigraph.json", + type=str, + ) args = parser.parse_args() - results = asyncio.run(generate_synthetic_data_for_document(args.input_file, args.data_type)) + results = asyncio.run( + generate_synthetic_data_for_document(args.input_file, args.data_type) + ) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Genie/genie.py b/baselines/Genie/genie.py index 4e740251..3ca529af 100644 --- a/baselines/Genie/genie.py +++ b/baselines/Genie/genie.py @@ -1,18 +1,19 @@ # https://arxiv.org/pdf/2401.14367 -import os -import json import argparse import asyncio -from typing import List +import json +import os from dataclasses import dataclass -from tqdm.asyncio import tqdm as tqdm_async +from typing import List + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ +PROMPT_TEMPLATE = """Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \ in the main point of the document, don't add any additional information that is not in the document. The [question] is \ by an information-seeking user and the [answer] is provided by a helping AI Agent. 
@@ -45,13 +46,13 @@ [document]: {doc} -### Response:''' +### Response:""" def _post_process(content: str) -> tuple: if "[question]:" in content and "[answer]:" in content: - question = content.split('[question]: ')[1].split('[answer]: ')[0] - answer = content.split('[answer]: ')[1] + question = content.split("[question]: ")[1].split("[answer]: ")[0] + answer = content.split("[answer]: ")[1] return question, answer return None, None @@ -77,35 +78,44 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Genie" + ): try: question, answer = _post_process(await result) if question and answer: final_results[compute_content_hash(question)] = { - 'question': question, - 'answer': answer + "question": question, + "answer": answer, } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/genie.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/chunked_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/genie.json", + type=str, + ) args = parser.parse_args() @@ -114,21 +124,21 @@ async def process_chunk(content: str): llm_client = OpenAIModel( model_name=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) genie = Genie(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = genie.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/LongForm/longform.py b/baselines/LongForm/longform.py index c37f1e8f..db352fed 100644 --- a/baselines/LongForm/longform.py +++ b/baselines/LongForm/longform.py @@ -1,24 +1,25 @@ # https://arxiv.org/pdf/2304.08460 # https://github.com/akoksal/LongForm/tree/main -import os -import json -from dataclasses import dataclass import argparse import asyncio +import json +import os +from dataclasses import dataclass from typing import List -from tqdm.asyncio import tqdm as tqdm_async + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash - +from graphgen.utils import compute_content_hash, create_event_loop -PROMPT_TEMPLATE = '''Instruction: X +PROMPT_TEMPLATE = """Instruction: X Output:{doc} What kind of instruction could this be the answer to? -X:''' +X:""" + @dataclass class LongForm: @@ -38,39 +39,50 @@ async def process_chunk(content: str): question = await self.llm_client.generate_answer(content) return { compute_content_hash(question): { - 'question': question, - 'answer': content + "question": question, + "answer": content, } } tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using LongForm"): + for result in tqdm_async( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Generating using LongForm", + ): try: qa = await result final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. 
(Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/longform.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/chunked_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. (Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/longform.json", + type=str, + ) args = parser.parse_args() @@ -79,21 +91,21 @@ async def process_chunk(content: str): llm_client = OpenAIModel( model_name=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) longform = LongForm(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = longform.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/SELF-QA/self-qa.py b/baselines/SELF-QA/self-qa.py index ba16b15c..d0a8b878 100644 --- a/baselines/SELF-QA/self-qa.py +++ b/baselines/SELF-QA/self-qa.py @@ -1,18 +1,19 @@ # https://arxiv.org/abs/2305.11952 -import os +import argparse +import asyncio import json +import os from dataclasses import dataclass from typing import List -import argparse -import asyncio -from tqdm.asyncio import tqdm as tqdm_async + from dotenv import load_dotenv +from tqdm.asyncio import tqdm as tqdm_async from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.utils import compute_content_hash, create_event_loop -INSTRUCTION_GENERATION_PROMPT = '''The background knowledge is: +INSTRUCTION_GENERATION_PROMPT = """The background knowledge is: {doc} Please generate ten instruction questions as diverse as possible based on the content of the above article. @@ -22,9 +23,9 @@ Please generate questions in the following format: 1. Question: ... 2. Question: ... -''' +""" -READING_COMPREHENSION_PROMPT = '''The background knowledge is: +READING_COMPREHENSION_PROMPT = """The background knowledge is: {doc} Please answer the following question based on the content of the article above: {question} @@ -34,24 +35,27 @@ Please generate the corresponding answer in the following format: Question: ... Answer: ... 
-''' +""" + def _post_process_instructions(content: str) -> list: - lines = content.split('\n') + lines = content.split("\n") questions = [] for line in lines: if "Question:" in line: - question = line.split('Question:')[1].strip() + question = line.split("Question:")[1].strip() questions.append(question) return questions + def _post_process_answers(content: str) -> tuple: if "Question:" in content and "Answer:" in content: - question = content.split('Question:')[1].split('Answer:')[0].strip() - answer = content.split('Answer:')[1].strip() + question = content.split("Question:")[1].split("Answer:")[0].strip() + answer = content.split("Answer:")[1].strip() return question, answer return None, None + @dataclass class SelfQA: llm_client: OpenAIModel = None @@ -73,58 +77,79 @@ async def process_chunk(content: str): instruction_questions = _post_process_instructions(response) qas = [] - for qa in tqdm_async(asyncio.as_completed([ - self.llm_client.generate_answer(READING_COMPREHENSION_PROMPT.format( - doc=content, - question=question - )) for question in instruction_questions]), - total=len(instruction_questions), desc="Generating QAs"): + for qa in tqdm_async( + asyncio.as_completed( + [ + self.llm_client.generate_answer( + READING_COMPREHENSION_PROMPT.format( + doc=content, question=question + ) + ) + for question in instruction_questions + ] + ), + total=len(instruction_questions), + desc="Generating QAs", + ): try: question, answer = _post_process_answers(await qa) if question and answer: - qas.append({ - compute_content_hash(question): { - 'question': question, - 'answer': answer + qas.append( + { + compute_content_hash(question): { + "question": question, + "answer": answer, + } } - }) - except Exception as e: # pylint: disable=broad-except + ) + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return [] tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using SelfQA"): + for result in tqdm_async( + asyncio.as_completed(tasks), + total=len(tasks), + desc="Generating using SelfQA", + ): try: qas = await result for qa in qas: final_results.update(qa) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/self-qa.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/chunked_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/self-qa.json", + type=str, + ) args = parser.parse_args() @@ -133,21 +158,21 @@ async def process_chunk(content: str): llm_client = OpenAIModel( model_name=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) self_qa = SelfQA(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = self_qa.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/baselines/Wrap/wrap.py b/baselines/Wrap/wrap.py index 1bfdcf4a..8618d613 100644 --- a/baselines/Wrap/wrap.py +++ b/baselines/Wrap/wrap.py @@ -1,19 +1,19 @@ # https://arxiv.org/abs/2401.16380 -import os -import json import argparse import asyncio +import json +import os from dataclasses import dataclass from typing import List + from dotenv import load_dotenv from tqdm.asyncio import tqdm as tqdm_async from graphgen.models import OpenAIModel -from graphgen.utils import create_event_loop, compute_content_hash +from graphgen.utils import compute_content_hash, create_event_loop - -PROMPT_TEMPLATE = '''A chat between a curious user and an artificial intelligence assistant. +PROMPT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the questions. USER: Convert the following paragraph into a conversational format with multiple tags of "Question:" followed by "Answer:":{doc}. @@ -26,18 +26,19 @@ --- Question: What was the revenue drop in the first quarter compared to the same period last year? Answer: The revenue dropped 15 percent. 
--- -''' +""" + def _post_process(content: str) -> list: - raw_qas = content.split('---') + raw_qas = content.split("---") qas = [] for item in raw_qas: try: if "Question:" in item and "Answer:" in item: - question = item.split('Question:')[1].split('Answer:')[0].strip() - answer = item.split('Answer:')[1].strip() + question = item.split("Question:")[1].split("Answer:")[0].strip() + answer = item.split("Answer:")[1].strip() qas.append((question, answer)) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") continue return qas @@ -64,36 +65,44 @@ async def process_chunk(content: str): tasks = [] for doc in docs: for chunk in doc: - tasks.append(process_chunk(chunk['content'])) + tasks.append(process_chunk(chunk["content"])) - for result in tqdm_async(asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap"): + for result in tqdm_async( + asyncio.as_completed(tasks), total=len(tasks), desc="Generating using Wrap" + ): try: qas = _post_process(await result) for qa in qas: final_results[compute_content_hash(qa[0])] = { - 'question': qa[0], - 'answer': qa[1] + "question": qa[0], + "answer": qa[1], } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Error: {e}") return final_results if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input_file', - help='Raw context jsonl path.', - default='resources/examples/chunked_demo.json', - type=str) - parser.add_argument('--data_type', - help='Data type of input file. (Raw context or chunked context)', - choices=['raw', 'chunked'], - default='raw', - type=str) - parser.add_argument('--output_file', - help='Output file path.', - default='cache/data/wrap.json', - type=str) + parser.add_argument( + "--input_file", + help="Raw context jsonl path.", + default="resources/input_examples/chunked_demo.json", + type=str, + ) + parser.add_argument( + "--data_type", + help="Data type of input file. 
(Raw context or chunked context)", + choices=["raw", "chunked"], + default="raw", + type=str, + ) + parser.add_argument( + "--output_file", + help="Output file path.", + default="cache/data/wrap.json", + type=str, + ) args = parser.parse_args() @@ -102,21 +111,21 @@ async def process_chunk(content: str): llm_client = OpenAIModel( model_name=os.getenv("SYNTHESIZER_MODEL"), api_key=os.getenv("SYNTHESIZER_API_KEY"), - base_url=os.getenv("SYNTHESIZER_BASE_URL") + base_url=os.getenv("SYNTHESIZER_BASE_URL"), ) wrap = Wrap(llm_client=llm_client) - if args.data_type == 'raw': - with open(args.input_file, "r", encoding='utf-8') as f: + if args.data_type == "raw": + with open(args.input_file, "r", encoding="utf-8") as f: data = [json.loads(line) for line in f] data = [[chunk] for chunk in data] - elif args.data_type == 'chunked': - with open(args.input_file, "r", encoding='utf-8') as f: + elif args.data_type == "chunked": + with open(args.input_file, "r", encoding="utf-8") as f: data = json.load(f) results = wrap.generate(data) # Save results - with open(args.output_file, "w", encoding='utf-8') as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump(results, f, indent=4, ensure_ascii=False) diff --git a/graphgen/configs/aggregated_config.yaml b/graphgen/configs/aggregated_config.yaml new file mode 100644 index 00000000..d620d60c --- /dev/null +++ b/graphgen/configs/aggregated_config.yaml @@ -0,0 +1,18 @@ +input_data_type: raw +input_file: resources/input_examples/raw_demo.jsonl +output_data_type: aggregated +tokenizer: cl100k_base +quiz_samples: 2 +traverse_strategy: + bidirectional: true + edge_sampling: max_loss + expand_method: max_width + isolated_node_strategy: ignore + max_depth: 5 + max_extra_edges: 20 + max_tokens: 256 + loss_strategy: only_edge +search: + enabled: false + search_types: ["google"] +re_judge: false diff --git a/graphgen/configs/graphgen_config.yaml b/graphgen/configs/atomic_config.yaml similarity index 67% rename from graphgen/configs/graphgen_config.yaml rename to graphgen/configs/atomic_config.yaml index 542786fa..1c026f48 100644 --- a/graphgen/configs/graphgen_config.yaml +++ b/graphgen/configs/atomic_config.yaml @@ -1,15 +1,15 @@ -data_type: raw -input_file: resources/examples/raw_demo.jsonl +input_data_type: raw +input_file: resources/input_examples/raw_demo.jsonl +output_data_type: atomic tokenizer: cl100k_base quiz_samples: 2 traverse_strategy: - qa_form: aggregated bidirectional: true edge_sampling: max_loss expand_method: max_width isolated_node_strategy: ignore - max_depth: 1 - max_extra_edges: 2 + max_depth: 3 + max_extra_edges: 5 max_tokens: 256 loss_strategy: only_edge search: diff --git a/graphgen/configs/cot_config.yaml b/graphgen/configs/cot_config.yaml new file mode 100644 index 00000000..fca14c56 --- /dev/null +++ b/graphgen/configs/cot_config.yaml @@ -0,0 +1,13 @@ +input_data_type: raw +input_file: resources/input_examples/raw_demo.jsonl +output_data_type: cot +tokenizer: cl100k_base +search: + enabled: false + search_types: [] +method_params: + method: leiden + method_params: + max_size: 20 # Maximum size of communities + use_lcc: false + random_seed: 42 diff --git a/graphgen/configs/config.yaml.example b/graphgen/configs/multi_hop_config.yaml similarity index 75% rename from graphgen/configs/config.yaml.example rename to graphgen/configs/multi_hop_config.yaml index 542786fa..92ee446f 100644 --- a/graphgen/configs/config.yaml.example +++ b/graphgen/configs/multi_hop_config.yaml @@ -1,9 +1,9 @@ -data_type: raw -input_file: 
resources/examples/raw_demo.jsonl +input_data_type: raw +input_file: resources/input_examples/raw_demo.jsonl +output_data_type: multi_hop tokenizer: cl100k_base quiz_samples: 2 traverse_strategy: - qa_form: aggregated bidirectional: true edge_sampling: max_loss expand_method: max_width diff --git a/graphgen/generate.py b/graphgen/generate.py index f840094c..b3a4a238 100644 --- a/graphgen/generate.py +++ b/graphgen/generate.py @@ -8,7 +8,7 @@ from .graphgen import GraphGen from .models import OpenAIModel, Tokenizer, TraverseStrategy -from .utils import read_file, set_logger +from .utils import logger, read_file, set_logger sys_path = os.path.abspath(os.path.dirname(__file__)) @@ -35,7 +35,7 @@ def main(): parser.add_argument( "--config_file", help="Config parameters for GraphGen.", - default=files("graphgen").joinpath("configs", "graphgen_config.yaml"), + default=files("graphgen").joinpath("configs", "aggregated_config.yaml"), type=str, ) parser.add_argument( @@ -50,22 +50,27 @@ def main(): working_dir = args.output_dir set_working_dir(working_dir) - unique_id = int(time.time()) - set_logger( - os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), if_stream=False - ) - print( - "GraphGen with unique ID", - unique_id, - "logging to", - os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), - ) with open(args.config_file, "r", encoding="utf-8") as f: config = yaml.load(f, Loader=yaml.FullLoader) - input_file = config["input_file"] data = read_file(input_file) + output_data_type = config["output_data_type"] + + unique_id = int(time.time()) + set_logger( + os.path.join( + working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log" + ), + if_stream=True, + ) + logger.info( + "GraphGen with unique ID %s logging to %s", + unique_id, + os.path.join( + working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log" + ), + ) synthesizer_llm_client = OpenAIModel( model_name=os.getenv("SYNTHESIZER_MODEL"), @@ -78,8 +83,6 @@ def main(): base_url=os.getenv("TRAINEE_BASE_URL"), ) - traverse_strategy = TraverseStrategy(**config["traverse_strategy"]) - graph_gen = GraphGen( working_dir=working_dir, unique_id=unique_id, @@ -87,19 +90,24 @@ def main(): trainee_llm_client=trainee_llm_client, search_config=config["search"], tokenizer_instance=Tokenizer(model_name=config["tokenizer"]), - traverse_strategy=traverse_strategy, ) - graph_gen.insert(data, config["data_type"]) + graph_gen.insert(data, config["input_data_type"]) if config["search"]["enabled"]: graph_gen.search() - graph_gen.quiz(max_samples=config["quiz_samples"]) - - graph_gen.judge(re_judge=config["re_judge"]) - - graph_gen.traverse() + # Use pipeline according to the output data type + if output_data_type in ["atomic", "aggregated", "multi_hop"]: + graph_gen.quiz(max_samples=config["quiz_samples"]) + graph_gen.judge(re_judge=config["re_judge"]) + traverse_strategy = TraverseStrategy(**config["traverse_strategy"]) + traverse_strategy.qa_form = output_data_type + graph_gen.traverse(traverse_strategy=traverse_strategy) + elif output_data_type == "cot": + graph_gen.generate_reasoning(method_params=config["method_params"]) + else: + raise ValueError(f"Unsupported output data type: {output_data_type}") path = os.path.join( working_dir, "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml" diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 0ba3a95b..0486f7fa 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -18,6 +18,7 @@ from .models.storage.base_storage import StorageNameSpace from 
.operators import ( extract_kg, + generate_cot, judge_statement, quiz, search_all, @@ -50,9 +51,6 @@ class GraphGen: default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]} ) - # traverse - traverse_strategy: TraverseStrategy = field(default_factory=TraverseStrategy) - # webui progress_bar: gr.Progress = None @@ -284,40 +282,53 @@ async def async_judge(self, re_judge=False, skip=False): ) await _update_relations.index_done_callback() - def traverse(self): + def traverse(self, traverse_strategy: TraverseStrategy): loop = create_event_loop() - loop.run_until_complete(self.async_traverse()) + loop.run_until_complete(self.async_traverse(traverse_strategy)) - async def async_traverse(self): - if self.traverse_strategy.qa_form == "atomic": + async def async_traverse(self, traverse_strategy: TraverseStrategy): + if traverse_strategy.qa_form == "atomic": results = await traverse_graph_atomically( self.synthesizer_llm_client, self.tokenizer_instance, self.graph_storage, - self.traverse_strategy, + traverse_strategy, self.text_chunks_storage, self.progress_bar, ) - elif self.traverse_strategy.qa_form == "multi_hop": + elif traverse_strategy.qa_form == "multi_hop": results = await traverse_graph_for_multi_hop( self.synthesizer_llm_client, self.tokenizer_instance, self.graph_storage, - self.traverse_strategy, + traverse_strategy, self.text_chunks_storage, self.progress_bar, ) - elif self.traverse_strategy.qa_form == "aggregated": + elif traverse_strategy.qa_form == "aggregated": results = await traverse_graph_by_edge( self.synthesizer_llm_client, self.tokenizer_instance, self.graph_storage, - self.traverse_strategy, + traverse_strategy, self.text_chunks_storage, self.progress_bar, ) else: - raise ValueError(f"Unknown qa_form: {self.traverse_strategy.qa_form}") + raise ValueError(f"Unknown qa_form: {traverse_strategy.qa_form}") + await self.qa_storage.upsert(results) + await self.qa_storage.index_done_callback() + + def generate_reasoning(self, method_params): + loop = create_event_loop() + loop.run_until_complete(self.async_generate_reasoning(method_params)) + + async def async_generate_reasoning(self, method_params): + results = await generate_cot( + self.graph_storage, + self.synthesizer_llm_client, + method_params=method_params, + ) await self.qa_storage.upsert(results) await self.qa_storage.index_done_callback() diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index a56d06b2..0f532cd3 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,6 +1,7 @@ +from graphgen.operators.generate.generate_cot import generate_cot +from graphgen.operators.kg.extract_kg import extract_kg from graphgen.operators.search.search_all import search_all -from .extract_kg import extract_kg from .judge import judge_statement, skip_judge_statement from .quiz import quiz from .traverse_graph import ( @@ -18,4 +19,5 @@ "traverse_graph_by_edge", "traverse_graph_atomically", "traverse_graph_for_multi_hop", + "generate_cot", ] diff --git a/graphgen/operators/community/__init__.py b/graphgen/operators/generate/__init__.py similarity index 100% rename from graphgen/operators/community/__init__.py rename to graphgen/operators/generate/__init__.py diff --git a/graphgen/operators/community/generate_cot.py b/graphgen/operators/generate/generate_cot.py similarity index 77% rename from graphgen/operators/community/generate_cot.py rename to graphgen/operators/generate/generate_cot.py index c0a691e3..b87bce2b 100644 --- 
a/graphgen/operators/community/generate_cot.py +++ b/graphgen/operators/generate/generate_cot.py @@ -5,15 +5,18 @@ from graphgen.models import CommunityDetector, NetworkXStorage, OpenAIModel from graphgen.templates import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT -from graphgen.utils import detect_main_language +from graphgen.utils import compute_content_hash, detect_main_language async def generate_cot( graph_storage: NetworkXStorage, synthesizer_llm_client: OpenAIModel, - method: str = "leiden", + method_params: Dict = None, ): - detector = CommunityDetector(graph_storage=graph_storage, method=method) + method = method_params.get("method", "leiden") + detector = CommunityDetector( + graph_storage=graph_storage, method=method, method_params=method_params + ) results = await detector.detect_communities() @@ -30,25 +33,25 @@ async def generate_cot( semaphore = asyncio.Semaphore(value=1000) async def _generate_from_single_community( - community_id: int, nodes: List[str] + c_id: int, nodes: List[str] ) -> Tuple[int, Tuple[str, str, str]]: """Summarize a single community.""" async with semaphore: entities: List[str] = [] relationships: List[str] = [] - for node in nodes: - node_data = await graph_storage.get_node(node) + for n in nodes: + node_data = await graph_storage.get_node(n) if node_data is not None: - entities.append(f"({node}: {node_data.get('description')})") + entities.append(f"({n}: {node_data.get('description')})") - edges = await graph_storage.get_node_edges(node) + edges = await graph_storage.get_node_edges(n) for edge in edges: target = edge[1] if target in nodes: - edge_data = await graph_storage.get_edge(node, target) + edge_data = await graph_storage.get_edge(n, target) relationships.append( - f"({node}) - [{edge_data['description']}] -> ({target})" + f"({n}) - [{edge_data['description']}] -> ({target})" ) entities_str = "\n".join(entities) @@ -91,20 +94,24 @@ async def _generate_from_single_community( cot_answer = await synthesizer_llm_client.generate_answer(prompt) - return community_id, (question, reasoning_path, cot_answer) + return c_id, (question, reasoning_path, cot_answer) cid_nodes = list(communities.items()) - templates: Dict[int, Tuple[str, str, str]] = {} + results: Dict = {} async for coro in tqdm_async( asyncio.as_completed( [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes] ), total=len(cid_nodes), - desc="[Generate COT] Generating COT templates for communities", + desc="[Generating COT] Generating CoT data from communities", unit="community", ): cid, (q, r, a) = await coro - templates[cid] = (q, r, a) + results[compute_content_hash(q)] = { + "question": q, + "reasoning_path": r, + "answer": a, + } - return templates + return results diff --git a/graphgen/operators/kg/__init__.py b/graphgen/operators/kg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/extract_kg.py b/graphgen/operators/kg/extract_kg.py similarity index 67% rename from graphgen/operators/extract_kg.py rename to graphgen/operators/kg/extract_kg.py index 3fad5525..406e400b 100644 --- a/graphgen/operators/extract_kg.py +++ b/graphgen/operators/kg/extract_kg.py @@ -1,27 +1,33 @@ -import re import asyncio -from typing import List +import re from collections import defaultdict +from typing import List import gradio as gr from tqdm.asyncio import tqdm as tqdm_async + from graphgen.models import Chunk, OpenAIModel, Tokenizer from graphgen.models.storage.base_storage import BaseGraphStorage +from graphgen.operators.kg.merge_kg 
import merge_edges, merge_nodes from graphgen.templates import KG_EXTRACTION_PROMPT -from graphgen.utils import (logger, pack_history_conversations, split_string_by_multi_markers, - handle_single_entity_extraction, handle_single_relationship_extraction, - detect_if_chinese) -from graphgen.operators.merge_kg import merge_nodes, merge_edges +from graphgen.utils import ( + detect_if_chinese, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) # pylint: disable=too-many-statements async def extract_kg( - llm_client: OpenAIModel, - kg_instance: BaseGraphStorage, - tokenizer_instance: Tokenizer, - chunks: List[Chunk], - progress_bar: gr.Progress = None, - max_concurrent: int = 1000 + llm_client: OpenAIModel, + kg_instance: BaseGraphStorage, + tokenizer_instance: Tokenizer, + chunks: List[Chunk], + progress_bar: gr.Progress = None, + max_concurrent: int = 1000, ): """ :param llm_client: Synthesizer LLM model to extract entities and relationships @@ -50,25 +56,25 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3): ) final_result = await llm_client.generate_answer(hint_prompt) - logger.info('First result: %s', final_result) + logger.info("First result: %s", final_result) history = pack_history_conversations(hint_prompt, final_result) for loop_index in range(max_loop): if_loop_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], - history=history + text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history ) if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() if if_loop_result != "yes": break glean_result = await llm_client.generate_answer( - text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], - history=history + text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history ) - logger.info('Loop %s glean: %s', loop_index, glean_result) + logger.info("Loop %s glean: %s", loop_index, glean_result) - history += pack_history_conversations(KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result) + history += pack_history_conversations( + KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) final_result += glean_result if loop_index == max_loop - 1: break @@ -76,8 +82,9 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3): records = split_string_by_multi_markers( final_result, [ - KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], - KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"]], + KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], ) nodes = defaultdict(list) @@ -87,16 +94,20 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3): record = re.search(r"\((.*)\)", record) if record is None: continue - record = record.group(1) # 提取括号内的内容 + record = record.group(1) # 提取括号内的内容 record_attributes = split_string_by_multi_markers( record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] ) - entity = await handle_single_entity_extraction(record_attributes, chunk_id) + entity = await handle_single_entity_extraction( + record_attributes, chunk_id + ) if entity is not None: nodes[entity["entity_name"]].append(entity) continue - relation = await handle_single_relationship_extraction(record_attributes, chunk_id) + relation = await handle_single_relationship_extraction( + record_attributes, chunk_id + ) if relation is not None: edges[(relation["src_id"], relation["tgt_id"])].append(relation) return dict(nodes), dict(edges) @@ 
-106,17 +117,25 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3): async for result in tqdm_async( asyncio.as_completed([_process_single_content(c) for c in chunks]), total=len(chunks), - desc="[3/4]Extracting entities and relationships from chunks", + desc="[2/4]Extracting entities and relationships from chunks", unit="chunk", ): try: if progress_bar is not None: - progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks") + progress_bar( + len(results) / chunk_number, + desc="[3/4]Extracting entities and relationships from chunks", + ) results.append(await result) if progress_bar is not None and len(results) == chunk_number: - progress_bar(1, desc="[3/4]Extracting entities and relationships from chunks") - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while extracting entities and relationships from chunks: %s", e) + progress_bar( + 1, desc="[3/4]Extracting entities and relationships from chunks" + ) + except Exception as e: # pylint: disable=broad-except + logger.error( + "Error occurred while extracting entities and relationships from chunks: %s", + e, + ) nodes = defaultdict(list) edges = defaultdict(list) diff --git a/graphgen/operators/merge_kg.py b/graphgen/operators/kg/merge_kg.py similarity index 76% rename from graphgen/operators/merge_kg.py rename to graphgen/operators/kg/merge_kg.py index 33aa1395..30379e66 100644 --- a/graphgen/operators/merge_kg.py +++ b/graphgen/operators/kg/merge_kg.py @@ -1,19 +1,21 @@ -from collections import Counter import asyncio +from collections import Counter + from tqdm.asyncio import tqdm as tqdm_async -from graphgen.utils.format import split_string_by_multi_markers -from graphgen.utils import logger, detect_main_language -from graphgen.models import TopkTokenModel, Tokenizer +from graphgen.models import Tokenizer, TopkTokenModel from graphgen.models.storage.base_storage import BaseGraphStorage -from graphgen.templates import KG_SUMMARIZATION_PROMPT, KG_EXTRACTION_PROMPT +from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT +from graphgen.utils import detect_main_language, logger +from graphgen.utils.format import split_string_by_multi_markers + async def _handle_kg_summary( entity_or_relation_name: str, description: str, llm_client: TopkTokenModel, tokenizer_instance: Tokenizer, - max_summary_tokens: int = 200 + max_summary_tokens: int = 200, ) -> str: """ 处理实体或关系的描述信息 @@ -33,17 +35,19 @@ async def _handle_kg_summary( KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language tokens = tokenizer_instance.encode_string(description) - if len(tokens) < max_summary_tokens: + if len(tokens) < max_summary_tokens: return description use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens]) prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( entity_name=entity_or_relation_name, - description_list=use_description.split(''), - **KG_SUMMARIZATION_PROMPT["FORMAT"] + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], ) new_description = await llm_client.generate_answer(prompt) - logger.info("Entity or relation %s summary: %s", entity_or_relation_name, new_description) + logger.info( + "Entity or relation %s summary: %s", entity_or_relation_name, new_description + ) return new_description @@ -52,7 +56,7 @@ async def merge_nodes( kg_instance: BaseGraphStorage, llm_client: TopkTokenModel, tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 + max_concurrent: int = 1000, 
): """ Merge nodes @@ -77,39 +81,34 @@ async def process_single_node(entity_name: str, node_data: list[dict]): if node is not None: entity_types.append(node["entity_type"]) source_ids.extend( - split_string_by_multi_markers(node["source_id"], ['']) + split_string_by_multi_markers(node["source_id"], [""]) ) descriptions.append(node["description"]) # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type entity_type = sorted( - Counter( - [dp["entity_type"] for dp in node_data] + entity_types - ).items(), + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), key=lambda x: x[1], reverse=True, )[0][0] - description = ''.join( + description = "".join( sorted(set([dp["description"] for dp in node_data] + descriptions)) ) description = await _handle_kg_summary( entity_name, description, llm_client, tokenizer_instance ) - source_id = ''.join( + source_id = "".join( set([dp["source_id"] for dp in node_data] + source_ids) ) node_data = { "entity_type": entity_type, "description": description, - "source_id": source_id + "source_id": source_id, } - await kg_instance.upsert_node( - entity_name, - node_data=node_data - ) + await kg_instance.upsert_node(entity_name, node_data=node_data) node_data["entity_name"] = entity_name return node_data @@ -125,7 +124,7 @@ async def process_single_node(entity_name: str, node_data: list[dict]): ): try: entities_data.append(await result) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while inserting entities into storage: %s", e) @@ -134,7 +133,7 @@ async def merge_edges( kg_instance: BaseGraphStorage, llm_client: TopkTokenModel, tokenizer_instance: Tokenizer, - max_concurrent: int = 1000 + max_concurrent: int = 1000, ): """ Merge edges @@ -157,14 +156,14 @@ async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): edge = await kg_instance.get_edge(src_id, tgt_id) if edge is not None: source_ids.extend( - split_string_by_multi_markers(edge["source_id"], ['']) + split_string_by_multi_markers(edge["source_id"], [""]) ) descriptions.append(edge["description"]) - description = ''.join( + description = "".join( sorted(set([dp["description"] for dp in edge_data] + descriptions)) ) - source_id = ''.join( + source_id = "".join( set([dp["source_id"] for dp in edge_data] + source_ids) ) @@ -175,8 +174,8 @@ async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): node_data={ "source_id": source_id, "description": description, - "entity_type": "UNKNOWN" - } + "entity_type": "UNKNOWN", + }, ) description = await _handle_kg_summary( @@ -186,24 +185,20 @@ async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): await kg_instance.upsert_edge( src_id, tgt_id, - edge_data={ - "source_id": source_id, - "description": description - } + edge_data={"source_id": source_id, "description": description}, ) - edge_data = { - "src_id": src_id, - "tgt_id": tgt_id, - "description": description - } + edge_data = {"src_id": src_id, "tgt_id": tgt_id, "description": description} return edge_data logger.info("Inserting relationships into storage...") relationships_data = [] for result in tqdm_async( asyncio.as_completed( - [process_single_edge(src_id, tgt_id, v) for (src_id, tgt_id), v in edges_data.items()] + [ + process_single_edge(src_id, tgt_id, v) + for (src_id, tgt_id), v in edges_data.items() + ] ), total=len(edges_data), desc="Inserting relationships into storage", @@ -211,5 +206,7 @@ async def 
process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]): ): try: relationships_data.append(await result) - except Exception as e: # pylint: disable=broad-except - logger.error("Error occurred while inserting relationships into storage: %s", e) + except Exception as e: # pylint: disable=broad-except + logger.error( + "Error occurred while inserting relationships into storage: %s", e + ) diff --git a/graphgen/operators/split_graph.py b/graphgen/operators/kg/split_kg.py similarity index 74% rename from graphgen/operators/split_graph.py rename to graphgen/operators/kg/split_kg.py index e2e2b5ca..a3307a86 100644 --- a/graphgen/operators/split_graph.py +++ b/graphgen/operators/kg/split_kg.py @@ -1,14 +1,16 @@ import random from collections import defaultdict + from tqdm.asyncio import tqdm as tqdm_async -from graphgen.utils import logger from graphgen.models import NetworkXStorage, TraverseStrategy +from graphgen.utils import logger + async def _get_node_info( node_id: str, graph_storage: NetworkXStorage, -)-> dict: +) -> dict: """ Get node info @@ -17,10 +19,7 @@ async def _get_node_info( :return: node info """ node_data = await graph_storage.get_node(node_id) - return { - "node_id": node_id, - **node_data - } + return {"node_id": node_id, **node_data} def _get_level_n_edges_by_max_width( @@ -33,7 +32,7 @@ def _get_level_n_edges_by_max_width( bidirectional: bool, max_extra_edges: int, edge_sampling: str, - loss_strategy: str = "only_edge" + loss_strategy: str = "only_edge", ) -> list: """ Get level n edges for an edge. @@ -71,10 +70,17 @@ def _get_level_n_edges_by_max_width( if len(candidate_edges) >= max_extra_edges: if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] - candidate_edges = _sort_tuples(er_tuples, edge_sampling)[:max_extra_edges] + er_tuples = [ + ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) + for edge in candidate_edges + ] + candidate_edges = _sort_tuples(er_tuples, edge_sampling)[ + :max_extra_edges + ] elif loss_strategy == "only_edge": - candidate_edges = _sort_edges(candidate_edges, edge_sampling)[:max_extra_edges] + candidate_edges = _sort_edges(candidate_edges, edge_sampling)[ + :max_extra_edges + ] else: raise ValueError(f"Invalid loss strategy: {loss_strategy}") @@ -101,16 +107,16 @@ def _get_level_n_edges_by_max_width( def _get_level_n_edges_by_max_tokens( - edge_adj_list: dict, - node_dict: dict, - edges: list, - nodes: list, - src_edge: tuple, - max_depth: int, - bidirectional: bool, - max_tokens: int, - edge_sampling: str, - loss_strategy: str = "only_edge" + edge_adj_list: dict, + node_dict: dict, + edges: list, + nodes: list, + src_edge: tuple, + max_depth: int, + bidirectional: bool, + max_tokens: int, + edge_sampling: str, + loss_strategy: str = "only_edge", ) -> list: """ Get level n edges for an edge. 
@@ -129,8 +135,11 @@ def _get_level_n_edges_by_max_tokens( """ src_id, tgt_id, src_edge_data = src_edge - max_tokens -= (src_edge_data["length"] + nodes[node_dict[src_id]][1]["length"] - + nodes[node_dict[tgt_id]][1]["length"]) + max_tokens -= ( + src_edge_data["length"] + + nodes[node_dict[src_id]][1]["length"] + + nodes[node_dict[tgt_id]][1]["length"] + ) level_n_edges = [] @@ -151,7 +160,10 @@ def _get_level_n_edges_by_max_tokens( break if loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges] + er_tuples = [ + ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) + for edge in candidate_edges + ] candidate_edges = _sort_tuples(er_tuples, edge_sampling) elif loss_strategy == "only_edge": candidate_edges = _sort_edges(candidate_edges, edge_sampling) @@ -196,15 +208,22 @@ def _sort_tuples(er_tuples: list, edge_sampling: str) -> list: if edge_sampling == "random": er_tuples = random.sample(er_tuples, len(er_tuples)) elif edge_sampling == "min_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"]) + er_tuples = sorted( + er_tuples, + key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"], + ) elif edge_sampling == "max_loss": - er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"], - reverse=True) + er_tuples = sorted( + er_tuples, + key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"], + reverse=True, + ) else: raise ValueError(f"Invalid edge sampling: {edge_sampling}") edges = [edge for _, edge in er_tuples] return edges + def _sort_edges(edges: list, edge_sampling: str) -> list: """ Sort edges with edge sampling strategy @@ -223,11 +242,12 @@ def _sort_edges(edges: list, edge_sampling: str) -> list: raise ValueError(f"Invalid edge sampling: {edge_sampling}") return edges -async def get_batches_with_strategy( # pylint: disable=too-many-branches + +async def get_batches_with_strategy( # pylint: disable=too-many-branches nodes: list, edges: list, graph_storage: NetworkXStorage, - traverse_strategy: TraverseStrategy + traverse_strategy: TraverseStrategy, ): expand_method = traverse_strategy.expand_method if expand_method == "max_width": @@ -256,7 +276,10 @@ async def get_cached_node_info(node_id: str) -> dict: node_dict[node_name] = i if traverse_strategy.loss_strategy == "both": - er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in edges] + er_tuples = [ + ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) + for edge in edges + ] edges = _sort_tuples(er_tuples, edge_sampling) elif traverse_strategy.loss_strategy == "only_edge": edges = _sort_edges(edges, edge_sampling) @@ -279,21 +302,36 @@ async def get_cached_node_info(node_id: str) -> dict: src_id = edge[0] tgt_id = edge[1] - _process_nodes.extend([await get_cached_node_info(src_id), - await get_cached_node_info(tgt_id)]) + _process_nodes.extend( + [await get_cached_node_info(src_id), await get_cached_node_info(tgt_id)] + ) _process_edges.append(edge) if expand_method == "max_width": level_n_edges = _get_level_n_edges_by_max_width( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_extra_edges, - edge_sampling, traverse_strategy.loss_strategy + edge_adj_list, + node_dict, + edges, + nodes, + edge, + max_depth, + traverse_strategy.bidirectional, + traverse_strategy.max_extra_edges, + edge_sampling, + 
traverse_strategy.loss_strategy, ) else: level_n_edges = _get_level_n_edges_by_max_tokens( - edge_adj_list, node_dict, edges, nodes, edge, max_depth, - traverse_strategy.bidirectional, traverse_strategy.max_tokens, - edge_sampling, traverse_strategy.loss_strategy + edge_adj_list, + node_dict, + edges, + nodes, + edge, + max_depth, + traverse_strategy.bidirectional, + traverse_strategy.max_tokens, + edge_sampling, + traverse_strategy.loss_strategy, ) for _edge in level_n_edges: @@ -302,8 +340,12 @@ async def get_cached_node_info(node_id: str) -> dict: _process_edges.append(_edge) # 去重 - _process_nodes = list({node['node_id']: node for node in _process_nodes}.values()) - _process_edges = list({(edge[0], edge[1]): edge for edge in _process_edges}.values()) + _process_nodes = list( + {node["node_id"]: node for node in _process_nodes}.values() + ) + _process_edges = list( + {(edge[0], edge[1]): edge for edge in _process_edges}.values() + ) processing_batches.append((_process_nodes, _process_edges)) @@ -312,15 +354,21 @@ async def get_cached_node_info(node_id: str) -> dict: # isolate nodes isolated_node_strategy = traverse_strategy.isolated_node_strategy if isolated_node_strategy == "add": - processing_batches = await _add_isolated_nodes(nodes, processing_batches, graph_storage) - logger.info("Processing batches after adding isolated nodes: %d", len(processing_batches)) + processing_batches = await _add_isolated_nodes( + nodes, processing_batches, graph_storage + ) + logger.info( + "Processing batches after adding isolated nodes: %d", + len(processing_batches), + ) return processing_batches + async def _add_isolated_nodes( - nodes: list, - processing_batches: list, - graph_storage: NetworkXStorage, + nodes: list, + processing_batches: list, + graph_storage: NetworkXStorage, ) -> list: visited_nodes = set() for _process_nodes, _process_edges in processing_batches: diff --git a/graphgen/operators/preprocess/__init__.py b/graphgen/operators/preprocess/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/graphgen/operators/resolute_coreference.py b/graphgen/operators/preprocess/resolute_coreference.py similarity index 100% rename from graphgen/operators/resolute_coreference.py rename to graphgen/operators/preprocess/resolute_coreference.py diff --git a/graphgen/operators/traverse_graph.py b/graphgen/operators/traverse_graph.py index 947033ed..ad1cda0d 100644 --- a/graphgen/operators/traverse_graph.py +++ b/graphgen/operators/traverse_graph.py @@ -1,49 +1,67 @@ import asyncio -import gradio as gr +import gradio as gr from tqdm.asyncio import tqdm as tqdm_async -from graphgen.models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer, JsonKVStorage -from graphgen.templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT -from graphgen.utils import detect_main_language, compute_content_hash, logger -from graphgen.operators.split_graph import get_batches_with_strategy - - -async def _pre_tokenize(graph_storage: NetworkXStorage, - tokenizer: Tokenizer, - edges: list, - nodes: list) -> tuple: +from graphgen.models import ( + JsonKVStorage, + NetworkXStorage, + OpenAIModel, + Tokenizer, + TraverseStrategy, +) +from graphgen.operators.kg.split_kg import get_batches_with_strategy +from graphgen.templates import ( + ANSWER_REPHRASING_PROMPT, + MULTI_HOP_GENERATION_PROMPT, + QUESTION_GENERATION_PROMPT, +) +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +async def _pre_tokenize( + graph_storage: 
NetworkXStorage, tokenizer: Tokenizer, edges: list, nodes: list +) -> tuple: sem = asyncio.Semaphore(1000) + async def handle_edge(edge: tuple) -> tuple: async with sem: - if 'length' not in edge[2]: - edge[2]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - edge[2]['description'])) + if "length" not in edge[2]: + edge[2]["length"] = len( + await asyncio.get_event_loop().run_in_executor( + None, tokenizer.encode_string, edge[2]["description"] + ) + ) return edge async def handle_node(node: dict) -> dict: async with sem: - if 'length' not in node[1]: - node[1]['length'] = len( - await asyncio.get_event_loop().run_in_executor(None, - tokenizer.encode_string, - node[1]['description'])) + if "length" not in node[1]: + node[1]["length"] = len( + await asyncio.get_event_loop().run_in_executor( + None, tokenizer.encode_string, node[1]["description"] + ) + ) return node new_edges = [] new_nodes = [] - for result in tqdm_async(asyncio.as_completed([handle_edge(edge) for edge in edges]), - total=len(edges), desc="Pre-tokenizing edges"): + for result in tqdm_async( + asyncio.as_completed([handle_edge(edge) for edge in edges]), + total=len(edges), + desc="Pre-tokenizing edges", + ): new_edge = await result await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2]) new_edges.append(new_edge) - for result in tqdm_async(asyncio.as_completed([handle_node(node) for node in nodes]), - total=len(nodes), desc="Pre-tokenizing nodes"): + for result in tqdm_async( + asyncio.as_completed([handle_node(node) for node in nodes]), + total=len(nodes), + desc="Pre-tokenizing nodes", + ): new_node = await result await graph_storage.update_node(new_node[0], new_node[1]) new_nodes.append(new_node) @@ -51,46 +69,62 @@ async def handle_node(node: dict) -> dict: await graph_storage.index_done_callback() return new_edges, new_nodes -async def _construct_rephrasing_prompt(_process_nodes: list, - _process_edges: list, - text_chunks_storage: JsonKVStorage, - add_context: bool = False - ) -> str: + +async def _construct_rephrasing_prompt( + _process_nodes: list, + _process_edges: list, + text_chunks_storage: JsonKVStorage, + add_context: bool = False, +) -> str: entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes + f"{_process_node['node_id']}: {_process_node['description']}" + for _process_node in _process_nodes ] relations = [ f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}" for _process_edge in _process_edges ] - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)]) - language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English" + entities_str = "\n".join( + [f"{index + 1}. {entity}" for index, entity in enumerate(entities)] + ) + relations_str = "\n".join( + [f"{index + 1}. 
{relation}" for index, relation in enumerate(relations)] + ) + language = ( + "Chinese" + if detect_main_language(entities_str + relations_str) == "zh" + else "English" + ) if add_context: - original_ids = ([node['source_id'].split('')[0] for node in _process_nodes] + - [edge[2]['source_id'].split('')[0] for edge in _process_edges]) + original_ids = [ + node["source_id"].split("")[0] for node in _process_nodes + ] + [edge[2]["source_id"].split("")[0] for edge in _process_edges] original_ids = list(set(original_ids)) original_text = await text_chunks_storage.get_by_ids(original_ids) - original_text = "\n".join([f"{index + 1}. {text['content']}" for index, text in enumerate(original_text)]) + original_text = "\n".join( + [ + f"{index + 1}. {text['content']}" + for index, text in enumerate(original_text) + ] + ) - prompt = ANSWER_REPHRASING_PROMPT[language]['CONTEXT_TEMPLATE'].format( + prompt = ANSWER_REPHRASING_PROMPT[language]["CONTEXT_TEMPLATE"].format( language=language, original_text=original_text, entities=entities_str, - relationships=relations_str + relationships=relations_str, ) return prompt - prompt = ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format( - language=language, - entities=entities_str, - relationships=relations_str + prompt = ANSWER_REPHRASING_PROMPT[language]["TEMPLATE"].format( + language=language, entities=entities_str, relationships=relations_str ) return prompt + def get_loss_tercile(losses: list) -> (float, float): losses = sorted(losses) q1_index = int(len(losses) * (1 / 3)) @@ -98,14 +132,17 @@ def get_loss_tercile(losses: list) -> (float, float): return losses[q1_index], losses[q2_index] + def get_average_loss(batch: tuple, loss_strategy: str) -> float: if loss_strategy == "only_edge": - return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1]) + return sum(edge[2]["loss"] for edge in batch[1]) / len(batch[1]) if loss_strategy == "both": - return sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0]) / \ - (len(batch[0]) + len(batch[1])) + return sum(edge[2]["loss"] for edge in batch[1]) + sum( + node["loss"] for node in batch[0] + ) / (len(batch[0]) + len(batch[1])) raise ValueError("Invalid loss strategy") + def _post_process_synthetic_data(data): block = data.split("\n\n") qas = [] @@ -113,26 +150,18 @@ def _post_process_synthetic_data(data): if "Question:" in line and "Answer:" in line: question = line.split("Question:")[1].split("Answer:")[0].strip() answer = line.split("Answer:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) + qas.append({"question": question, "answer": answer}) elif "问题:" in line and "答案:" in line: question = line.split("问题:")[1].split("答案:")[0].strip() answer = line.split("答案:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) + qas.append({"question": question, "answer": answer}) elif "问题:" in line and "回答:" in line: question = line.split("问题:")[1].split("回答:")[0].strip() answer = line.split("回答:")[1].strip() - qas.append({ - "question": question, - "answer": answer - }) + qas.append({"question": question, "answer": answer}) return qas + async def traverse_graph_by_edge( llm_client: OpenAIModel, tokenizer: Tokenizer, @@ -140,7 +169,7 @@ async def traverse_graph_by_edge( traverse_strategy: TraverseStrategy, text_chunks_storage: JsonKVStorage, progress_bar: gr.Progress = None, - max_concurrent: int = 1000 + max_concurrent: int = 1000, ) -> dict: """ Traverse the graph @@ -158,28 +187,24 @@ async def traverse_graph_by_edge( semaphore = 
asyncio.Semaphore(max_concurrent) async def _process_nodes_and_edges( - _process_nodes: list, - _process_edges: list, + _process_nodes: list, + _process_edges: list, ) -> str: prompt = await _construct_rephrasing_prompt( - _process_nodes, - _process_edges, - text_chunks_storage, - add_context = False + _process_nodes, _process_edges, text_chunks_storage, add_context=False ) context = await llm_client.generate_answer(prompt) # post-process the context if context.startswith("Rephrased Text:"): - context = context[len("Rephrased Text:"):].strip() + context = context[len("Rephrased Text:") :].strip() elif context.startswith("重述文本:"): - context = context[len("重述文本:"):].strip() + context = context[len("重述文本:") :].strip() return context async def _process_single_batch( - _process_batch: tuple, - question_type: str = "single" + _process_batch: tuple, question_type: str = "single" ) -> dict: async with semaphore: context = await _process_nodes_and_edges( @@ -188,21 +213,26 @@ async def _process_single_batch( ) language = "Chinese" if detect_main_language(context) == "zh" else "English" - pre_length = sum(node['length'] for node in _process_batch[0]) \ - + sum(edge[2]['length'] for edge in _process_batch[1]) + pre_length = sum(node["length"] for node in _process_batch[0]) + sum( + edge[2]["length"] for edge in _process_batch[1] + ) if question_type == "single": question = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format( + QUESTION_GENERATION_PROMPT[language]["SINGLE_TEMPLATE"].format( answer=context ) ) if question.startswith("Question:"): - question = question[len("Question:"):].strip() + question = question[len("Question:") :].strip() elif question.startswith("问题:"): - question = question[len("问题:"):].strip() + question = question[len("问题:") :].strip() - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) + logger.info( + "%d nodes and %d edges processed", + len(_process_batch[0]), + len(_process_batch[1]), + ) logger.info("Pre-length: %s", pre_length) logger.info("Question: %s", question) logger.info("Answer: %s", context) @@ -211,12 +241,14 @@ async def _process_single_batch( compute_content_hash(context): { "question": question, "answer": context, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) + "loss": get_average_loss( + _process_batch, traverse_strategy.loss_strategy + ), } } content = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format( + QUESTION_GENERATION_PROMPT[language]["MULTI_TEMPLATE"].format( doc=context ) ) @@ -224,19 +256,27 @@ async def _process_single_batch( if len(qas) == 0: print(content) - logger.error("Error occurred while processing batch, question or answer is None") + logger.error( + "Error occurred while processing batch, question or answer is None" + ) return {} final_results = {} - logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1])) + logger.info( + "%d nodes and %d edges processed", + len(_process_batch[0]), + len(_process_batch[1]), + ) logger.info("Pre-length: %s", pre_length) for qa in qas: - logger.info("Question: %s", qa['question']) - logger.info("Answer: %s", qa['answer']) - final_results[compute_content_hash(qa['question'])] = { - "question": qa['question'], - "answer": qa['answer'], - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy) + logger.info("Question: %s", qa["question"]) + logger.info("Answer: %s", qa["answer"]) + 
final_results[compute_content_hash(qa["question"])] = { + "question": qa["question"], + "answer": qa["answer"], + "loss": get_average_loss( + _process_batch, traverse_strategy.loss_strategy + ), } return final_results @@ -247,22 +287,25 @@ async def _process_single_batch( edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy + nodes, edges, graph_storage, traverse_strategy ) - for result in tqdm_async(asyncio.as_completed( - [_process_single_batch(batch) for batch in processing_batches] - ), total=len(processing_batches), desc="[4/4]Generating QAs"): + for result in tqdm_async( + asyncio.as_completed( + [_process_single_batch(batch) for batch in processing_batches] + ), + total=len(processing_batches), + desc="[4/4]Generating QAs", + ): try: if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") + progress_bar( + len(results) / len(processing_batches), desc="[4/4]Generating QAs" + ) results.update(await result) if progress_bar is not None and len(results) == len(processing_batches): progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while generating QA: %s", e) return results @@ -275,7 +318,7 @@ async def traverse_graph_atomically( traverse_strategy: TraverseStrategy, text_chunks_storage: JsonKVStorage, progress_bar: gr.Progress = None, - max_concurrent: int = 1000 + max_concurrent: int = 1000, ) -> dict: """ Traverse the graph atomicly @@ -292,22 +335,21 @@ async def traverse_graph_atomically( assert traverse_strategy.qa_form == "atomic" semaphore = asyncio.Semaphore(max_concurrent) - async def _generate_question( - node_or_edge: tuple - ): + + async def _generate_question(node_or_edge: tuple): if len(node_or_edge) == 2: - des = node_or_edge[0] + ": " + node_or_edge[1]['description'] - loss = node_or_edge[1]['loss'] + des = node_or_edge[0] + ": " + node_or_edge[1]["description"] + loss = node_or_edge[1]["loss"] else: - des = node_or_edge[2]['description'] - loss = node_or_edge[2]['loss'] + des = node_or_edge[2]["description"] + loss = node_or_edge[2]["loss"] async with semaphore: try: language = "Chinese" if detect_main_language(des) == "zh" else "English" qa = await llm_client.generate_answer( - QUESTION_GENERATION_PROMPT[language]['SINGLE_QA_TEMPLATE'].format( + QUESTION_GENERATION_PROMPT[language]["SINGLE_QA_TEMPLATE"].format( doc=des ) ) @@ -321,8 +363,8 @@ async def _generate_question( else: return {} - question = question.strip("\"") - answer = answer.strip("\"") + question = question.strip('"') + answer = answer.strip('"') logger.info("Question: %s", question) logger.info("Answer: %s", answer) @@ -330,10 +372,10 @@ async def _generate_question( compute_content_hash(question): { "question": question, "answer": answer, - "loss": loss + "loss": loss, } } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while generating question: %s", e) return {} @@ -345,24 +387,26 @@ async def _generate_question( tasks = [] for node in nodes: - if "" in node[1]['description']: - description_list = node[1]['description'].split("") + if "" in node[1]["description"]: + description_list = node[1]["description"].split("") for item in description_list: - tasks.append((node[0], {"description": item, 
'loss': node[1]['loss']})) + tasks.append((node[0], {"description": item, "loss": node[1]["loss"]})) else: tasks.append((node[0], node[1])) for edge in edges: - if "" in edge[2]['description']: - description_list = edge[2]['description'].split("") + if "" in edge[2]["description"]: + description_list = edge[2]["description"].split("") for item in description_list: - tasks.append((edge[0], edge[1], {"description": item, 'loss': edge[2]['loss']})) + tasks.append( + (edge[0], edge[1], {"description": item, "loss": edge[2]["loss"]}) + ) else: tasks.append((edge[0], edge[1], edge[2])) for result in tqdm_async( asyncio.as_completed([_generate_question(task) for task in tasks]), total=len(tasks), - desc="[4/4]Generating QAs" + desc="[4/4]Generating QAs", ): try: if progress_bar is not None: @@ -370,10 +414,11 @@ async def _generate_question( results.update(await result) if progress_bar is not None and len(results) == len(tasks): progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while generating QA: %s", e) return results + async def traverse_graph_for_multi_hop( llm_client: OpenAIModel, tokenizer: Tokenizer, @@ -381,7 +426,7 @@ async def traverse_graph_for_multi_hop( traverse_strategy: TraverseStrategy, text_chunks_storage: JsonKVStorage, progress_bar: gr.Progress = None, - max_concurrent: int = 1000 + max_concurrent: int = 1000, ) -> dict: """ Traverse the graph for multi-hop @@ -406,24 +451,24 @@ async def traverse_graph_for_multi_hop( edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes) processing_batches = await get_batches_with_strategy( - nodes, - edges, - graph_storage, - traverse_strategy + nodes, edges, graph_storage, traverse_strategy ) - async def _process_single_batch( - _process_batch: tuple - ) -> dict: + async def _process_single_batch(_process_batch: tuple) -> dict: async with semaphore: try: - language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" else "English" + language = ( + "Chinese" + if detect_main_language(_process_batch[0][0]["description"]) == "zh" + else "English" + ) _process_nodes = _process_batch[0] _process_edges = _process_batch[1] entities = [ - f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes + f"{_process_node['node_id']}: {_process_node['description']}" + for _process_node in _process_nodes ] relations = [ @@ -431,12 +476,18 @@ async def _process_single_batch( for _process_edge in _process_edges ] - entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)]) - relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)]) + entities_str = "\n".join( + [f"{index + 1}. {entity}" for index, entity in enumerate(entities)] + ) + relations_str = "\n".join( + [ + f"{index + 1}. 
{relation}" + for index, relation in enumerate(relations) + ] + ) prompt = MULTI_HOP_GENERATION_PROMPT[language].format( - entities=entities_str, - relationships=relations_str + entities=entities_str, relationships=relations_str ) context = await llm_client.generate_answer(prompt) @@ -451,8 +502,8 @@ async def _process_single_batch( else: return {} - question = question.strip("\"") - answer = answer.strip("\"") + question = question.strip('"') + answer = answer.strip('"') logger.info("Question: %s", question) logger.info("Answer: %s", answer) @@ -461,25 +512,31 @@ async def _process_single_batch( compute_content_hash(question): { "question": question, "answer": answer, - "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy), + "loss": get_average_loss( + _process_batch, traverse_strategy.loss_strategy + ), } } - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while processing batch: %s", e) return {} async for result in tqdm_async( - asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]), + asyncio.as_completed( + [_process_single_batch(batch) for batch in processing_batches] + ), total=len(processing_batches), - desc="[4/4]Generating QAs" + desc="[4/4]Generating QAs", ): try: if progress_bar is not None: - progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs") + progress_bar( + len(results) / len(processing_batches), desc="[4/4]Generating QAs" + ) results.update(await result) if progress_bar is not None and len(results) == len(processing_batches): progress_bar(1, desc="[4/4]Generating QAs") - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except logger.error("Error occurred while generating QA: %s", e) return results diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py index ca9e7e12..e85b0097 100644 --- a/graphgen/templates/search_judgement.py +++ b/graphgen/templates/search_judgement.py @@ -17,7 +17,7 @@ ################ -Examples- ################ -{examples} +{input_examples} ################ -Real Data- diff --git a/resources/examples/chunked_demo.json b/resources/input_examples/chunked_demo.json similarity index 100% rename from resources/examples/chunked_demo.json rename to resources/input_examples/chunked_demo.json diff --git a/resources/examples/keywords_demo.txt b/resources/input_examples/keywords_demo.txt similarity index 100% rename from resources/examples/keywords_demo.txt rename to resources/input_examples/keywords_demo.txt diff --git a/resources/examples/raw_demo.jsonl b/resources/input_examples/raw_demo.jsonl similarity index 100% rename from resources/examples/raw_demo.jsonl rename to resources/input_examples/raw_demo.jsonl diff --git a/resources/examples/txt_demo.txt b/resources/input_examples/txt_demo.txt similarity index 100% rename from resources/examples/txt_demo.txt rename to resources/input_examples/txt_demo.txt diff --git a/resources/output_examples/aggregated.json b/resources/output_examples/aggregated.json new file mode 100644 index 00000000..1e6ba648 --- /dev/null +++ b/resources/output_examples/aggregated.json @@ -0,0 +1,47 @@ +{ + "b917f19103fb6184aae1b37f903ce112": { + "question": "云粳26号在西南稻区的推广情况如何?", + "answer": "在2012年,云粳26号被农业部正式列为西南稻区的农业推广主导品种。这一决定标志着云粳26号在该区域的重要性,进一步推动了西南稻区的农业发展。", + "loss": 0.7650802039228166 + }, + "62cc58c678ae70ed9a03684efede4d8b": { + "question": "What is the 
classification and phenotype of the Bg1-D mutant in terms of its grain size compared to normal varieties?", + "answer": "The Bg1-D mutant is classified as a dominant mutant, which means it expresses its traits more prominently than recessive variants. This particular mutant is characterized by an extra-large grain phenotype, showcasing grains that are significantly larger than those of normal varieties.", + "loss": 0.31326545774936676 + }, + "7afdff9aa72d418f706fff5353fd69e7": { + "question": "What is the role of the Tiller Angle Control 4 (TAC4) gene in rice plants, and how does its loss of function affect the plant's architecture and grain yield?", + "answer": "The loss of function of the Tiller Angle Control 4 (TAC4) gene leads to a significant increase in the tiller angle of rice plants, which in turn affects their overall architecture. The tiller angle is a crucial aspect of plant structure that has a direct impact on the grain yield of rice. TAC4 has been identified as a key regulator of this trait, as it influences the tiller angle in rice plants. Furthermore, TAC4 encodes a highly conserved nuclear protein, which is essential for the regulation of the tiller angle and, consequently, the plant's yield potential.", + "loss": 0.3675571912313429 + }, + "50fb8fdad05d10217da553bade12ca15": { + "question": "How does auxin influence grain size and crop yield in plants, particularly in rice?", + "answer": "Grain size is a critical factor that significantly impacts crop yield, especially in rice. This aspect of plant development is heavily influenced by auxin, a class of plant hormones essential for regulating growth and development. Specifically, auxin affects grain size through its role in various developmental signals that guide plant growth.\n\nOne important type of auxin is indole acetic acid, which not only contributes to overall plant growth and development but also plays a vital role in regulating gravitropism—the plant's growth response to gravity. This gravitropic response is modulated by auxin, which determines how plants orient themselves in relation to gravitational forces.\n\nHowever, the transport of auxin can be inhibited by substances such as N-1-naphthylphthalamic acid. This auxin transport inhibitor can affect the sensitivity of plants to auxin, thereby influencing their growth and development. 
Overall, the interplay between grain size, auxin, and various developmental signals is crucial for understanding how plants adapt and thrive in their environments.", + "loss": 1.9266283028019138e-07 + }, + "5ae7a76b5ecb56fb5c0be5b7f0c6cc4b": { + "question": "云粳26号是什么品种的稻米,它的培育背景和市场潜力如何?", + "answer": "稻米是水稻的种子,作为一种主要的粮食作物,其重要性不言而喻。在众多稻米品种中,云粳26号以其良好的食用价值和市场潜力而受到关注。云粳26号是由云南省农业科学院粮食作物研究所于2005年培育出的早熟稻品种,具有优良的外观和食味品质。\n\n为了确保云粳26号的高产和优质,种植过程中需要采用相应的种植技术。这些技术旨在提高作物的产量和质量,从而满足市场的需求。市场需求反映了消费者对稻米等农产品的偏好,因此,云粳26号的推广与市场需求密切相关。通过有效的种植技术和对市场需求的把握,云粳26号有望在稻米市场中占据一席之地。", + "loss": 4.89174685946466e-07 + }, + "a3b291ba0fe28ca48e9447944bb9581c": { + "question": "云粳26号是什么品种,它的主要特征和推广情况如何?", + "answer": "云粳26号是一种早熟稻品种,由云南省农业科学院粮食作物研究所于2005年育成。该品种具有良好的外观和食味品质,其中米粒大、颖尖无色、无芒以及谷壳黄色等特征,使其在市场上更具吸引力。此外,云粳26号还具备高抗稻瘟病的特性,增强了其在种植过程中的可行性。\n\n在2012年,农业部将云粳26号列为西南稻区的农业推广主导品种,标志着其在该地区的重要性。云粳26号特别适合在中海拔稻区种植,尤其是在海拔1500至1800米的云南中海拔稻区。其落粒性适中和有香味的特性进一步提升了其食用价值,满足了消费者的需求。\n\n通过科学的品种育成方法,云粳26号的推广旨在提高农业生产效率,促进农业生产的发展。因此,云粳26号不仅是一个优质的稻米品种,也是推动当地农业发展的重要力量。", + "loss": 0.8043609434100357 + }, + "eb671e24d1721884126bd70a6b0a654d": { + "question": "What is the role of the Big Grain1 (BG1) gene in rice, and how does it affect grain size and auxin transport?", + "answer": "The gene known as Big Grain1 (BG1) plays a significant role in regulating grain size and is preferentially expressed in vascular tissue, which is essential for nutrient transport. This gene is also involved in the transport mechanisms of auxin, a class of plant hormones crucial for growth and development. Specifically, BG1 is induced by auxin treatment, highlighting its importance in auxin response and transport.\n\nIn the context of rice, the Bg1-D mutant, which is associated with the BG1 gene, exhibits an extra-large grain phenotype. This mutant demonstrates increased auxin basipetal transport, suggesting that BG1 is integral to the movement of auxin within the plant. Furthermore, the expression of BG1 is observed in young panicles, indicating its role in the reproductive development of rice.\n\nManipulating BG1 through techniques such as knockdown results in reduced sensitivity to auxin and smaller grain sizes, thereby affecting overall seed weight and plant biomass. The protein encoded by BG1 is membrane-localized, which implies its involvement in cellular transport mechanisms. Additionally, the expression of BG1 in culms further supports its contribution to the growth and development of the rice plant.\n\nResearch involving both rice and Arabidopsis has provided insights into the regulatory functions of BG1, particularly in enhancing crop yield. The Bg1-D mutant's altered gravitropism also suggests a connection between grain size and the plant's growth responses to gravity. 
Overall, the manipulation of BG1 is crucial for improving traits such as biomass and yield in crop plants, emphasizing its significance in agricultural advancements.", + "loss": 0.4372811872636896 + }, + "c88ce9480857feee561cbb1facb64dea": { + "question": "隆两优1212是什么水稻品种,它的引入和推广过程是怎样的?", + "answer": "隆两优1212是一种水稻品种,于2017年首次引入福建省龙岩市长汀县进行试种。这一品种表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高和适应性好的优良特性,显示出其在该地区的适应性。隆两优1212的种植过程始于2017年6月15日,当天进行了机播,随后在7月5日进行了机插,标志着其生长周期的推进。\n\n在经过一段时间的生长后,隆两优1212于2017年10月21日成熟,标志着其生长周期的结束。由于其优良的表现,隆两优1212在2018年得以进一步扩大示范种植,长汀润丰优质稻专业合作社成为该品种在这一年示范种植的合作社之一。\n\n随着推广的成功,隆两优1212在2019年继续在长汀县进行示范种植,长汀县绿丰优质稻专业合作社也参与了这一过程。此外,隆两优1212还被用于作烟后稻的种植方式,进一步展示了其适应性。值得一提的是,圣丰家庭农场也是隆两优1212试种的具体地点之一,表明该农场积极参与了该品种的推广。", + "loss": 0.8527936783199727 + }, + "8c098daa808c8eed3304ebe951b89ba8": { + "question": "What role does the TAC4 gene play in the domestication and improvement of Indica rice cultivars, particularly in relation to tiller angle and grain yield?", + "answer": "Indica cultivars, which are varieties of rice that have undergone domestication and improvement, exhibit a fixed bottleneck in the function of the TAC4 gene. This bottleneck has significantly impacted the regulation of tiller angle in these cultivars. TAC4 plays a crucial role in influencing the gravitropic response of rice shoots by regulating the levels of indole acetic acid, a plant hormone essential for growth and development. \n\nThe domestication process of rice, scientifically known as Oryza sativa, has shaped the regulatory role of TAC4, which is classified as a plant-specific protein. This protein is vital for determining the overall architecture of rice plants, particularly through its effect on tiller angle. Consequently, TAC4's regulation of tiller angle is directly linked to the grain yield of rice, as the angle significantly affects the amount of rice produced per unit area.\n\nFurthermore, the improvement of rice varieties involves a comprehensive understanding of TAC4's role in plant architecture. The distribution of auxin, which is crucial for proper plant growth, is also influenced by TAC4. This relationship highlights the importance of TAC4 in regulating gravitropism, which is the plant's growth response to gravity, and is affected by the spatial distribution of auxin.\n\nIn severe lines of rice, genetic mutations can lead to altered gravitropism, demonstrating the connection between genetic factors and growth responses. 
Additionally, grain size is a critical determinant of overall grain yield, further emphasizing the intricate relationships between TAC4, tiller angle, and the various factors that influence rice yield.", + "loss": 0.22054819647759094 + } +} diff --git a/resources/output_examples/atomic.json b/resources/output_examples/atomic.json new file mode 100644 index 00000000..e2c20df4 --- /dev/null +++ b/resources/output_examples/atomic.json @@ -0,0 +1,882 @@ +{ + "8fc4b703d23bc7c3ef8f20044ac1f5a1": { + "question": "What does TAC4 regulate in rice plants?", + "answer": "TAC4 regulates rice shoot gravitropism, which is the plant's growth response to gravity.", + "loss": 0.00020721811449675394 + }, + "558806cbd9b43cf1755282f61bfd4ae4": { + "question": "隆两优1212何时在福建省龙岩市长汀县进行试种?", + "answer": "隆两优1212在2017年引入福建省龙岩市长汀县进行试种。", + "loss": 0.1943468487790341 + }, + "38fb101a957dff2d347f3795e9d498de": { + "question": "What type of protein is TAC4 and what does it regulate in rice plants?", + "answer": "TAC4 is classified as a plant-specific protein that regulates key traits in rice plants.", + "loss": 0.04473321555027118 + }, + "dca7567b4ff830415e4d7939ccbed49b": { + "question": "What role does auxin play in the gravitropic response of plants?", + "answer": "Auxin is involved in the gravitropic response of plants, affecting how they grow in relation to gravity.", + "loss": 1.2660724378656726e-07 + }, + "59d256fd67fbe38726c9f4c16e17e713": { + "question": "云粳26号的落粒性如何?", + "answer": "云粳26号的落粒性适中,影响其在收割时的表现。", + "loss": 0.1650506654987112 + }, + "401e01ab2f43f0fdd1ef6128af1e3086": { + "question": "How does TAC4 affect the gravitropic response of rice shoots?", + "answer": "TAC4 influences the gravitropic response of rice shoots by regulating the levels of indole acetic acid.", + "loss": 0.00020721811449675394 + }, + "0fbd0aae3a0c8507e447530e8e735f09": { + "question": "How does N-1-naphthylphthalamic acid affect plant growth and development?", + "answer": "N-1-naphthylphthalamic acid inhibits auxin transport, which affects plant growth and development.", + "loss": 6.453754981617878e-08 + }, + "ec67d3230420a5ff9da6bfbe53f40ddf": { + "question": "What is ARABIDOPSIS commonly used for in plant biology?", + "answer": "ARABIDOPSIS is used as a model organism for studying genetic functions and plant development.", + "loss": 6.034219097644969e-07 + }, + "41b0254a9dec4f30baf425e742935a2d": { + "question": "云粳26号在西南稻区的地位是什么?", + "answer": "云粳26号被列为西南稻区的农业推广主导品种。", + "loss": 0.912369919935317 + }, + "a8071628666cfc7cd059987a04dd5e1d": { + "question": "隆两优1212在长汀县的试种结果如何?", + "answer": "隆两优1212在长汀县进行试种,显示出该地区对该品种的适应性。", + "loss": 0.07428048545261845 + }, + "9a1319119fa5344cad2ffae84a4d94f5": { + "question": "隆两优1212在2019年有什么表现?", + "answer": "隆两优1212在2019年继续示范种植,表明其在长汀县的持续推广。", + "loss": 0.036480231750829255 + }, + "ed974c970e399ecbe818e6eba46e369e": { + "question": "隆两优1212在什么地方进行试种?", + "answer": "隆两优1212在圣丰家庭农场进行试种。", + "loss": 0.006842017582584937 + }, + "c5879d941d81109d33dcc25733cc97fd": { + "question": "What is the effect of losing TAC4 function on the tiller angle of rice plants?", + "answer": "Loss of TAC4 function leads to a significant increase in the tiller angle of rice plants.", + "loss": 8.998404945691856 + }, + "5786fba44124297bb4056b905ab13909": { + "question": "What is the function of the TILLER ANGLE CONTROL 4 (TAC4) gene in rice?", + "answer": "TAC4 is a novel regulator that controls tiller angle in rice and encodes a highly conserved nuclear protein.", + "loss": 3.363024097783987e-05 + }, + 
"40e8e0fcad7ec55ec0fa14b91ea536f0": { + "question": "What are \"severe lines\" in the context of genetic research?", + "answer": "Severe lines\" refer to genetic lines that exhibit extreme phenotypes, such as those with significantly altered gravitropism due to BG1 manipulation.", + "loss": 0.0677631951402873 + }, + "3ec6a0860b0cc52278a0972a9849ed96": { + "question": "农业推广的目的是什么?", + "answer": "农业推广的目的是将优质农作物品种推广到农民中,以提高农业生产效率和质量。", + "loss": 0.0005425948230547652 + }, + "919fcd2678b5419e889fd4582820b86c": { + "question": "云粳26号适宜在哪个海拔范围的稻区种植?", + "answer": "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植。", + "loss": 0.17507959033052126 + }, + "8900f870f0c30d148c7640466d00ef5d": { + "question": "What does the Bg1-D mutant indicate about the relationship between grain size and growth responses?", + "answer": "The Bg1-D mutant exhibits altered gravitropism, indicating a link between grain size and growth responses.", + "loss": 0.003970473083124177 + }, + "b8aaca034f9d35e09ae508c09de641f4": { + "question": "How does TAC4 influence plant growth and development?", + "answer": "TAC4 affects the distribution of auxin, which is crucial for the regulation of plant growth and development.", + "loss": 3.2774787920447374e-07 + }, + "2860bb26879cf4faa38e1b7b8051b69c": { + "question": "What is a dominant mutant in genetics?", + "answer": "A dominant mutant is a genetic variant that expresses a trait more strongly than its recessive counterparts, such as the Bg1-D mutant.", + "loss": 0.7038410826333497 + }, + "ef1dea66ec062e4ebb67b7deb4c5e686": { + "question": "What effect does manipulating BG1 have on plant biomass?", + "answer": "Manipulating BG1 can enhance plant biomass, indicating its importance in growth.", + "loss": 3.3525097293767964e-06 + }, + "6de16594543122315aeab7d988705ef6": { + "question": "什么是作烟后稻?", + "answer": "作烟后稻是指在烟草收获后种植的水稻,隆两优1212被用于这种种植方式。", + "loss": 0.16025123000144953 + }, + "df74a578fffcdbb405891d5e66e4aae0": { + "question": "What do the \"sensitivities\" of plants refer to in the context of hormones and inhibitors?", + "answer": "The \"sensitivities\" of plants refer to their responsiveness to various hormones and inhibitors, such as auxin and N-1-naphthylphthalamic acid.", + "loss": 0.00029019961948506084 + }, + "d53ecf39048949e747c8efd49c821e89": { + "question": "隆两优1212在长汀县的机插日期是什么时候?", + "answer": "隆两优1212在长汀县进行机插的日期是7月5日。", + "loss": 5.835247260401488 + }, + "20347b4952b32b0119a10a1f9c4379bf": { + "question": "云粳26号在云南省的情况如何?", + "answer": "云粳26号在云南省进行种植和推广。", + "loss": 5.512236427143802e-07 + }, + "4db933005bd2cd0836930b4fc0b39710": { + "question": "What is a characteristic feature of indica cultivars of rice regarding TAC4 function?", + "answer": "Indica cultivars show a fixed bottleneck in TAC4 function.", + "loss": 4.560647790112646 + }, + "6a9519091e81ba54e2bdbb7eefc7e2f4": { + "question": "云粳26号稻米品种有什么特点?", + "answer": "云粳26号作为稻米品种具有良好的食用价值。", + "loss": 2.4827877114415276e-07 + }, + "03ff1e419a72174ad92373bed5b90833": { + "question": "隆两优1212的生长周期何时结束?", + "answer": "隆两优1212在10月21日成熟,标志着其生长周期的结束。", + "loss": 5.004641309711946 + }, + "4716b2e661dd13d3c6a8d6acc68babd4": { + "question": "隆两优1212的种植过程何时开始?", + "answer": "隆两优1212在6月15日进行机播,标志着其种植过程的开始。", + "loss": 2.008934394148294 + }, + "5d648964f09ec4ee58a5a14af16e0148": { + "question": "长汀县在哪个市?", + "answer": "长汀县是龙岩市的一个县。", + "loss": 0.5276962733190051 + }, + "8c92d2e36b86a17076bf6f24540f89dd": { + "question": "What is the role of the TILLER ANGLE CONTROL 4 gene in rice plants?", + "answer": "The TILLER ANGLE CONTROL 4 gene is a 
novel regulator of rice tiller angle, influencing plant architecture and grain yield.", + "loss": 0.0001787313643338491 + }, + "8c631862cb197af687f26b795bb7619e": { + "question": "What does the term \"phenotype\" refer to in the context of the Bg1-D mutant?", + "answer": "The term \"phenotype\" refers to the observable characteristics or traits of an organism, such as grain size in the Bg1-D mutant.", + "loss": 1.2606540320739281 + }, + "c9895a29281ab6a3b3ffc2c38949613e": { + "question": "云粳26号的食味品质中有哪些特点?", + "answer": "有香味是云粳26号的食味品质之一,增加了其市场吸引力。", + "loss": 0.19347324164300522 + }, + "dbcb3bef6cafdbd272af09186db55b62": { + "question": "What does the sensitivity of the Bg1-D mutant to N-1-naphthylphthalamic acid indicate?", + "answer": "It indicates the involvement of the Bg1-D mutant in auxin transport mechanisms.", + "loss": 1.4213754904512588 + }, + "99654153cbc972f6310f8caee0250000": { + "question": "隆两优1212在2019年示范种植的平均产量是多少?", + "answer": "隆两优1212在2019年示范种植的平均产量是8.74 t/hm^2。", + "loss": 2.503329696521127 + }, + "1d7b28291f633da994851af128aabb11": { + "question": "农业部在中国政府中负责什么?", + "answer": "农业部负责农业政策和品种推广的管理。", + "loss": 0.002764938996165263 + }, + "fc5a9717d96f500ff63d26a60c794cec": { + "question": "What is the function of the nuclear protein encoded by TAC4?", + "answer": "The nuclear protein encoded by TAC4 is involved in regulating plant traits.", + "loss": 1.9638977341952133 + }, + "a9f5a373014289148b47ecb9472f37a1": { + "question": "云粳26号在2012年被农业部列为哪个地区的农业推广主导品种?", + "answer": "云粳26号在2012年被农业部列为西南稻区农业推广主导品种。", + "loss": 1.148128436397871 + }, + "096051c952bc98c64a1a75493399c09c": { + "question": "What is the primary purpose of cultivating crop plants like rice?", + "answer": "The primary purpose of cultivating crop plants like rice is for agricultural purposes, specifically to study and improve yield.", + "loss": 7.457303081540263e-05 + }, + "576c5f30a90f6fa61b74d0f5595e85bf": { + "question": "云粳26号的特性之一是什么?", + "answer": "落粒性适中是云粳26号的特性之一。", + "loss": 0.12270220625214279 + }, + "09ead68a9248b556db67abc58113dfcb": { + "question": "隆两优1212在种植方式上有什么应用?", + "answer": "隆两优1212被用于作烟后稻的种植方式,显示其适应性。", + "loss": 0.012229025208701708 + }, + "c02ce02d80931b7ca6215fb4f0e0f83a": { + "question": "云粳26号在什么区域被列为主导品种?", + "answer": "云粳26号被列为西南稻区的主导品种。", + "loss": 2.1393262670705567e-05 + }, + "6a40541aeac4ff01ff27ee9e1220f998": { + "question": "云粳26号是由哪个机构育成的?", + "answer": "云粳26号是由云南省农业科学院粮食作物研究所育成的。", + "loss": 0.12513624576483076 + }, + "8ff2922303daa9a9a72f052c5a9c1af5": { + "question": "云粳26号的一个重要特性是什么?", + "answer": "食味品质好是云粳26号的一个重要特性。", + "loss": 0.00032564003083735554 + }, + "a88e828680780c2373470e547ffde183": { + "question": "米粒大的特征对云粳26号有什么影响?", + "answer": "米粒大是云粳26号的一个重要特征,影响其食用价值。", + "loss": 0.9131141534554827 + }, + "16a1c3442b09a541ea1183ff2f9f4a88": { + "question": "市场需求在云粳26号的推广中扮演什么角色?", + "answer": "市场需求与云粳26号的推广密切相关。", + "loss": 3.352136559578442e-07 + }, + "c2b1f263f244832716f1f7b8868f6afa": { + "question": "What does the term \"CULMS\" refer to in the context of rice plants?", + "answer": "CULMS\" refers to the stem of the rice plant, where BG1 expression is observed.", + "loss": 1.6983356725699963 + }, + "99f926a61c70c5b5a57923bd2701a6ef": { + "question": "云粳26号是什么时候育成的?", + "answer": "云粳26号是在2005年育成的早熟稻品种。", + "loss": 3.296899015603895 + }, + "1ac36a6756745d61a2e6635617d87b3e": { + "question": "How does domestication affect the function of TAC4?", + "answer": "Domestication impacts the function of TAC4 by cultivating and improving wild species for agricultural 
purposes.", + "loss": 4.745760155323239 + }, + "16578072c3065d3bc299260c0eebdcc1": { + "question": "How does the expression of BG1 affect seed weight and yield?", + "answer": "The expression of BG1 influences seed weight, contributing to overall yield.", + "loss": 2.49821685354467e-06 + }, + "2293cd15c685b5fa198ba10962fd9da3": { + "question": "What role do developmental signals play in plants regarding grain size?", + "answer": "Developmental signals play a crucial role in regulating grain size in plants.", + "loss": 2.7561182491159133e-07 + }, + "d5a42ae23a09de3c130d2fc071b88634": { + "question": "Why is grain size important in agriculture?", + "answer": "Grain size is a critical determinant of overall grain yield in crops.", + "loss": 1.0876401574818144e-06 + }, + "e5962fad358edcd184373ef55d710e64": { + "question": "What do severe lines indicate about the relationship between genetic mutations and growth responses?", + "answer": "Severe lines exhibit altered gravitropism, indicating a relationship between genetic mutations and growth responses.", + "loss": 0.0002387586577014825 + }, + "1f01190302db474acb8abbc763f0750f": { + "question": "What factors can influence seed weight in plants?", + "answer": "Seed weight can be influenced by genetic factors such as BG1.", + "loss": 0.0021608307427776244 + }, + "06472540d42042a800dec83c298fccb5": { + "question": "What is observed in the Bg1-D mutant regarding auxin transport?", + "answer": "The Bg1-D mutant shows enhanced movement of auxin towards the base of the plant.", + "loss": 0.61003755970295 + }, + "de114086a8bd588f1eede9b30bd8475b": { + "question": "What methods are used for the improvement of rice varieties mentioned in the text?", + "answer": "The improvement of rice varieties is achieved through selective breeding and genetic modification.", + "loss": 0.004592334122062172 + }, + "1ece7403e3d929e2d533240a3f438ce2": { + "question": "How has the TAC4 gene bottleneck impacted indica cultivars?", + "answer": "The bottleneck in the TAC4 gene has affected the tiller angle regulation in indica cultivars.", + "loss": 0.042615559037509854 + }, + "f253c7d4fb2bd4d569134b2fa6316ce1": { + "question": "云粳26号的外观特点是什么?", + "answer": "云粳26号的外观特点之一是颖尖无色、无芒。", + "loss": 1.448043743927099 + }, + "91438e810834eebb3214fc9b956705ab": { + "question": "圣丰家庭农场位于哪个县?", + "answer": "圣丰家庭农场位于长汀县。", + "loss": 3.338190455054042 + }, + "6507645acf35a3f01c4fbd717a4a9345": { + "question": "What factors influence the total amount of grain produced from a crop?", + "answer": "The total amount of grain produced from a crop is influenced by various factors, including grain size.", + "loss": 1.5854104661895019 + }, + "8a312b9cd3dc5a58a3f0ee2a84aca766": { + "question": "云粳26号的外观特征之一是什么?", + "answer": "谷壳黄色是云粳26号的外观特征之一。", + "loss": 1.4384042079781831 + }, + "7eb9dfccb3b36c53823fcc1e330aaf26": { + "question": "云粳26号是什么类型的品种?", + "answer": "云粳26号是一种早熟品种,生长周期较短。", + "loss": 0.05963051443298655 + }, + "18555f3352ef57d00c2f30a15d098f35": { + "question": "什么是早熟品种?", + "answer": "早熟品种是指生长周期较短的作物品种,云粳26号属于此类。", + "loss": 0.024306114763021497 + }, + "c13cef8ac76334ebdfa80b218fb6f919": { + "question": "2019年在长汀县发生了什么事件?", + "answer": "2019年是隆两优1212在长汀县继续示范种植的年份。", + "loss": 1.571420667330729 + }, + "9361662357bfe22246e14dcb06abd40c": { + "question": "How can plant biomass be enhanced according to the text?", + "answer": "Plant biomass can be enhanced by manipulating genes like BG1.", + "loss": 0.2813629664603256 + }, + "3e0264f61f77973584e020c9701d250a": { + "question": "What is the 
distinct phenotype of the Bg1-D mutant?", + "answer": "The Bg1-D mutant displays a distinct phenotype characterized by extra-large grains.", + "loss": 0.3556477129091149 + }, + "de65c9dd060fa58887a3e6bc7b27bb4c": { + "question": "How does altered auxin distribution impact plant growth and development?", + "answer": "Altered auxin distribution affects growth and development by changing the spatial distribution of auxin within the plant.", + "loss": 7.896309739486327e-07 + }, + "3e955c5841ac3a90f3da4eccd8a426c9": { + "question": "How has the domestication of rice impacted the TAC4 gene?", + "answer": "The domestication of rice has caused a bottleneck in the TAC4 gene, affecting its function in indica cultivars.", + "loss": 0.7150320114210279 + }, + "f2a50f83b1cd7c921693daf2b66610cd": { + "question": "What does BG1 expression in culms indicate about its role in rice plants?", + "answer": "BG1 expression in culms suggests its involvement in the growth and development of the rice plant.", + "loss": 1.5640816284886636e-07 + }, + "336dfcf835de87bc229f046e6964827f": { + "question": "云粳26号在西南稻区的地位如何?", + "answer": "云粳26号被列为西南稻区的主导品种,表明其在该地区的重要性。", + "loss": 0.0008515005374647119 + }, + "ceb4da988a0541b944acf36431fd76cb": { + "question": "What is the role of TAC4 in rice plant architecture?", + "answer": "TAC4 is crucial for determining the plant architecture of rice by regulating the tiller angle.", + "loss": 7.177022780518039e-06 + }, + "8ba052e897dccbbf68f44205a2b13d6c": { + "question": "How does tiller angle affect rice grain yield?", + "answer": "The tiller angle significantly influences the amount of rice produced per unit area, thereby affecting grain yield.", + "loss": 1.5854104661895019 + }, + "1e7847bb6784c626f939beb5b6f5952b": { + "question": "What is the role of AUXIN in plants?", + "answer": "AUXIN is a class of plant hormones that play a crucial role in regulating plant growth and development.", + "loss": 2.0090522023619024 + }, + "08e14e5b8e5fab928dac963eead1ddab": { + "question": "种植技术的定义是什么?", + "answer": "种植技术是指为提高作物产量和质量而采用的农业技术。", + "loss": 2.458109094033553e-07 + }, + "244cb50818224e7fd2b29d207c69e03c": { + "question": "What is the role of auxin in plants?", + "answer": "Auxin is a plant hormone that plays a crucial role in regulating growth and development, including grain size.", + "loss": 2.0090522023619024 + }, + "043d7930d2325dcaedf4878293041a59": { + "question": "How does TAC4 influence the architecture of rice plants?", + "answer": "TAC4 influences the overall architecture of rice plants by affecting the tiller angle.", + "loss": 7.177022780518039e-06 + }, + "b75f7fe791e0be7db8c39d033ca2663a": { + "question": "河田镇南塘村在哪个县内进行隆两优1212的试种?", + "answer": "河田镇南塘村是长汀县内隆两优1212试种的具体村庄。", + "loss": 3.3123129141253984 + }, + "44101c560c53e95f3a21933a3cd4dcd4": { + "question": "What does \"plant architecture\" refer to in the context of botany?", + "answer": "Plant architecture\" refers to the structural design and arrangement of various parts of a plant, including its tiller angle.", + "loss": 2.333420687403566 + }, + "dc4ab28cc3040ea4bc13421055172e5d": { + "question": "What is gravitropism and what factors influence it in plants?", + "answer": "Gravitropism is the growth response of plants to gravity, which is regulated by hormones like indole acetic acid and influenced by TAC4.", + "loss": 0.8259349847750537 + }, + "4c468c49040e86bc49be604e8ac43ae6": { + "question": "云粳26号的推广有什么目的?", + "answer": "云粳26号的推广旨在提高农业生产效率,满足市场需求。", + "loss": 1.169639290293123e-06 + }, + 
"ab66b52ddff55b31c798473eaf9b8329": { + "question": "What does the sensitivity of the Bg1-D mutant to auxin suggest about its function?", + "answer": "The sensitivity of the Bg1-D mutant to auxin indicates its role in growth regulation.", + "loss": 2.238042513150685e-05 + }, + "883ac11bb60cd8f0fe2f764eee03e140": { + "question": "What is the effect of losing TAC4 function on tiller angle in plants?", + "answer": "The loss of TAC4 function results in an increased tiller angle, affecting plant architecture.", + "loss": 1.0634637919853458 + }, + "890f0a5481300b38d26e79b557a0399f": { + "question": "What are proteins and what is their relationship with genes like BG1?", + "answer": "Proteins are molecules that perform various functions in living organisms, and they are encoded by genes such as BG1.", + "loss": 1.4121341062231416 + }, + "604d203823749d2e336dd4beb3995e86": { + "question": "What role does BG1 play in crop plants, specifically in rice?", + "answer": "BG1 is identified as a positive regulator for improving yield in crop plants, particularly in rice.", + "loss": 0.021197290586618064 + }, + "b358c59097b3640abd20ac8c1a578314": { + "question": "What does the localization of the protein encoded by BG1 suggest about its function?", + "answer": "The membrane localization of the protein encoded by BG1 suggests its role in cellular transport mechanisms.", + "loss": 0.4468337439779575 + }, + "0ef97d9a8479608769f793f62b9e4566": { + "question": "隆两优1212在2018年的推广情况如何?", + "answer": "隆两优1212在2018年进一步扩大示范种植,显示其推广的成功。", + "loss": 0.2437448643440381 + }, + "da1a042a73763ff90604980747c8ca89": { + "question": "What is a bottleneck in the context of genetic diversity in populations?", + "answer": "A bottleneck is a genetic phenomenon where a population experiences a significant reduction in genetic diversity, which can affect traits like TAC4 in indica cultivars.", + "loss": 0.03912297745894949 + }, + "90704274b79cecd12d2d5ea7f51ad004": { + "question": "How do genetic factors like BG1 affect rice grain yield?", + "answer": "Genetic factors like BG1 significantly influence the grain yield of rice.", + "loss": 0.3467021710806151 + }, + "31d5f888b6d6e7ec333dff2ac6a54b06": { + "question": "濯田镇永巫村与长汀润丰优质稻专业合作社有什么关系?", + "answer": "濯田镇永巫村是长汀润丰优质稻专业合作社的具体位置。", + "loss": 5.690190202714729 + }, + "42203838d5c245e37c0ad45caaec78c1": { + "question": "云粳26号的主要目标是什么?", + "answer": "云粳26号被列为主导品种,旨在通过农业推广提高其种植面积和产量。", + "loss": 0.0012081633904017803 + }, + "eea823c502b6dc35793e0a94a4a7cd17": { + "question": "What defines a membrane-localized protein?", + "answer": "A membrane-localized protein is defined as a protein that is located within or associated with cellular membranes.", + "loss": 0.09087045925116397 + }, + "21856b095ca30838d0b893a866bdec63": { + "question": "What is a plant-specific protein and can you give an example?", + "answer": "A plant-specific protein is a type of protein that is unique to plants, such as the TAC4 protein, which plays a role in regulating plant traits.", + "loss": 2.5723295018461854 + }, + "1cdc8e988342502fcbeb4736a11d50a1": { + "question": "How does the manipulation of BG1 influence plant development?", + "answer": "Manipulation of BG1 affects auxin distribution, which is crucial for proper plant development.", + "loss": 2.8801332519304187e-07 + }, + "811f16be3cf374455c54e528b24a55f8": { + "question": "What does the study of BG1 in rice and Arabidopsis reveal about its function?", + "answer": "The study of BG1 in both rice and Arabidopsis provides insights into its role in regulating 
grain size and plant productivity.", + "loss": 4.186935257127697e-06 + }, + "dab00a112752a927b7ab25aac7a36769": { + "question": "龙岩市在哪个省份?", + "answer": "龙岩市是福建省的一个城市。", + "loss": 3.2501133071586494 + }, + "d60e24d8188fdcf7430734f6cc7fc32d": { + "question": "隆两优1212水稻品种试种在哪个省份进行?", + "answer": "隆两优1212水稻品种试种在福建省进行。", + "loss": 3.1677237672614025 + }, + "d33ff6ff06752745ed97904f94e96d87": { + "question": "云粳26号是什么的结果?", + "answer": "云粳26号是通过科学方法培育出的新的作物品种的结果。", + "loss": 5.140191774269691e-07 + }, + "aea7c92c1bf912a3c54c2969589c6a5a": { + "question": "What is the purpose of the rice T-DNA insertion population?", + "answer": "The rice T-DNA insertion population is used for genetic studies, particularly for identifying mutants like Bg1-D.", + "loss": 0.40683872975935675 + }, + "925298aabd4ce05f193d8f4499c4ad12": { + "question": "How does TAC4 influence rice grain yield?", + "answer": "TAC4's regulation of tiller angle ultimately impacts the grain yield of rice.", + "loss": 0.5317985935423521 + }, + "e1a8111e472dd766a08b76a715998633": { + "question": "云粳26号的一个重要特征是什么?", + "answer": "米粒大是云粳26号的一个重要特征。", + "loss": 2.2840864064429938 + }, + "d1c90ce25ecf7efdcd3a3354f055a1ba": { + "question": "云粳26号是什么时候育成的,属于什么类型的稻品种?", + "answer": "云粳26号是于2005年育成的早熟稻品种。", + "loss": 0.2890754962572828 + }, + "3d922cfc2368752a79c884b971d1bfb3": { + "question": "How does TAC4 influence gravitropism in plants?", + "answer": "TAC4 regulates the distribution of auxin, which influences gravitropism in plants.", + "loss": 0.00011275943037008666 + }, + "c7a04dba6c03a4b086f903d22ba7927b": { + "question": "What does the BG1 gene encode and what is its role in plants?", + "answer": "The BG1 gene encodes a protein that is involved in regulating auxin transport and grain size.", + "loss": 0.0025066489567203954 + }, + "9e502f734ed87a35e39d40cdc1489afa": { + "question": "What does the Bg1-D mutant indicate about the role of the BG1 gene in auxin transport?", + "answer": "The Bg1-D mutant shows increased auxin basipetal transport, suggesting that BG1 plays a role in auxin movement.", + "loss": 0.7033061152256165 + }, + "13e374cccee0c3f9f3e5066d9941f5c1": { + "question": "What is the scientific name of rice and what factors influence its growth and yield?", + "answer": "The scientific name of rice is Oryza sativa, and its growth and yield are influenced by various genetic factors including TAC4.", + "loss": 4.617459285327665e-05 + }, + "866d573f2f297061002de628a2d41a9e": { + "question": "What does the expression of BG1 in young panicles indicate about its function in rice?", + "answer": "The expression of BG1 in young panicles indicates its role in the reproductive development of rice.", + "loss": 0.3803147644199621 + }, + "baafb35d76a2e4e72754ee2e02837e97": { + "question": "What is the purpose of the knockdown technique in genetics?", + "answer": "The knockdown technique is used to reduce the expression of a specific gene, such as BG1, resulting in smaller grains.", + "loss": 1.1819866932528378 + }, + "c9892bf81db62710652aa1a4df6d4b28": { + "question": "中海拔稻区的海拔范围是多少米?", + "answer": "中海拔稻区是指海拔在1500至1800米之间的稻米种植区域。", + "loss": 8.109651826946951 + }, + "b3d116c8baf1e3c1c24faae3045b7eeb": { + "question": "How has the function of TAC4 changed during the domestication of indica cultivars?", + "answer": "The function of TAC4 has become fixed in indica cultivars during the domestication process.", + "loss": 1.7112298922019014 + }, + "411393e23fb7be3b93440c83cf623b70": { + "question": "What does the induction of BG1 by auxin treatment 
indicate about its function?", + "answer": "The induction of BG1 by auxin treatment indicates its role in auxin response and transport.", + "loss": 1.3951637372738468 + }, + "c662270fb0855a401f06b78ec7614805": { + "question": "What is the primary function of vascular tissue in plants?", + "answer": "The primary function of vascular tissue in plants is to transport nutrients and water.", + "loss": 1.4059810507194868 + }, + "65391b2acbd48738bd74607c18df8b67": { + "question": "How does TAC4 influence rice shoot gravitropism?", + "answer": "TAC4 influences rice shoot gravitropism by increasing the content of indole acetic acid.", + "loss": 0.5071944590623009 + }, + "6c8afa1bbed42737a90a6327d055708a": { + "question": "云粳26号适宜在哪个海拔范围内种植?", + "answer": "云粳26号适宜在1500至1800米的海拔范围内种植。", + "loss": 0.46691513806581497 + }, + "debb6bbeb1012c754fcb00a572302d44": { + "question": "What is the effect of knocking down the BG1 gene on grain size and auxin sensitivity?", + "answer": "Knockdown of BG1 results in decreased sensitivities to auxin and smaller grain size.", + "loss": 1.175097647990172 + }, + "8596971cc73665376243a606dec81932": { + "question": "2018年在隆两优1212的种植中有什么重要事件?", + "answer": "2018年是隆两优1212在长汀县进一步扩大示范种植的年份。", + "loss": 1.1295489961280851 + }, + "79240f6ab5887ba097cc6c3659838b41": { + "question": "云粳26号在何时被列为西南稻区的农业推广主导品种?", + "answer": "云粳26号在2012年被列为西南稻区农业推广主导品种。", + "loss": 0.7650802039228166 + }, + "12d55d7631d787a33b4716cfca81b3cd": { + "question": "云南省农业科学院的主要职责是什么?", + "answer": "云南省农业科学院负责农业相关的研究和品种育成。", + "loss": 1.5640816284886636e-07 + }, + "d91eae23ea5e22d671024ae897104681": { + "question": "Where is BG1 preferentially expressed, and what does this indicate about its function?", + "answer": "BG1 is preferentially expressed in the vascular tissue, indicating its role in nutrient transport.", + "loss": 2.6113451392768074 + }, + "01e2d60dad5ac03b1fdade163ae70e1f": { + "question": "Why is the tiller angle important in rice cultivation?", + "answer": "The tiller angle is a critical factor that affects the grain yield of rice.", + "loss": 5.736209800751819e-07 + }, + "9745c183eac605943269ad4245c6aa86": { + "question": "云粳26号的推广旨在实现什么目标?", + "answer": "云粳26号的推广旨在提高农业生产效率。", + "loss": 3.6748243321680464e-07 + }, + "a6d71ebd98510437b234b78d7182134f": { + "question": "云粳26号在何时被农业部列为西南稻区农业推广主导品种?", + "answer": "云粳26号在2012年被农业部列为西南稻区农业推广主导品种。", + "loss": 0.5534002898677657 + }, + "ada64ad160c44a4433de00a08c02594d": { + "question": "河田镇中街村与哪个合作社有关联?", + "answer": "河田镇中街村是长汀县绿丰优质稻专业合作社的具体位置。", + "loss": 3.517918152359073 + }, + "cc4880aa16a927bf6ca9d166557e19fb": { + "question": "云粳26号的抗病特性是什么?", + "answer": "云粳26号具有高抗稻瘟病的特性,增强了其种植的可行性。", + "loss": 0.437758534580301 + }, + "415f6fedeeded46ae3c79164ad5d0f48": { + "question": "Why is rice considered a significant crop in agricultural research?", + "answer": "Rice is considered a significant crop in agricultural research because it is a staple crop and a model organism, particularly in studies of grain size and yield.", + "loss": 4.617459285327665e-05 + }, + "2fac96ce4fd81203d8d499e11ed89f39": { + "question": "What is the role of TAC4 in rice plants?", + "answer": "TAC4 is identified as a regulator that influences the tiller angle in rice plants.", + "loss": 0.024331670298124643 + }, + "0ea45e4ae02367a0b6d1c753f7462548": { + "question": "What does TAC4 encode and what is its role in rice plants?", + "answer": "TAC4 encodes a nuclear protein that plays a role in the regulation of rice tiller angle.", + "loss": 0.014876111410558224 + }, + 
"8aaa2e4913b155bc148b2e617387c1c0": { + "question": "How does auxin influence grain size in plants?", + "answer": "Auxin regulates grain size by affecting growth and development.", + "loss": 3.674824332542619e-07 + }, + "5b28c9eb45130f4b2a8e82596729f221": { + "question": "What is the function of the TAC4 gene in rice plants?", + "answer": "The TAC4 gene regulates the architecture of rice by specifically affecting its tiller angle.", + "loss": 0.13206698219679916 + }, + "21ad04b6f115dd1a70ebacc1882cca7d": { + "question": "What is the relationship between the Bg1-D mutant and the BG1 gene in rice?", + "answer": "The Bg1-D mutant is associated with the BG1 gene, which regulates grain size in rice.", + "loss": 0.005603051259337632 + }, + "0ce4a17582ae236c253cd7cbf54209db": { + "question": "What is the function of N-1-Naphthylphthalamic acid in plants?", + "answer": "N-1-Naphthylphthalamic acid is an auxin transport inhibitor that affects the sensitivity of plants to auxin.", + "loss": 0.012305542981872948 + }, + "92c16b7125ea8b5d7dd51ee8383774b9": { + "question": "What is the function of the BG1 gene in relation to grain size and auxin transport?", + "answer": "The BG1 gene, when overexpressed, leads to increased grain size and is involved in the regulation of auxin transport.", + "loss": 0.38225379123502873 + }, + "88c7f31116c93a825901faf664843abc": { + "question": "How has the domestication of rice affected TAC4's role in tiller angle regulation?", + "answer": "The domestication of rice has influenced the regulatory role of TAC4 in controlling tiller angle.", + "loss": 0.10460714693084584 + }, + "ae137b7e8295e038c4fbdfa984a51517": { + "question": "How does TAC4 influence auxin distribution in plants?", + "answer": "TAC4 affects the spatial distribution of auxin hormones within a plant, which is crucial for plant growth.", + "loss": 4.1819995810189575e-06 + }, + "7850dde8a721228f1565056215b208d6": { + "question": "What was identified from the rice T-DNA insertion population in relation to its genetic basis?", + "answer": "Bg1-D was identified from the rice T-DNA insertion population, highlighting its genetic basis.", + "loss": 0.22322877934147434 + }, + "a8c6f560f4329fef58b134e60eb2f043": { + "question": "What is indole acetic acid and what role does it play in plants?", + "answer": "Indole acetic acid is a type of auxin that plays a role in plant growth and development.", + "loss": 1.2907509963235755e-07 + }, + "ae8be48e67d8c2add117d17464d91eb1": { + "question": "云粳26号是什么类型的稻品种,在哪里育成的?", + "answer": "云粳26号是早熟稻品种,在云南省育成。", + "loss": 2.9819501947351417 + }, + "79b7e7fba7722ab8029be1bbd07a3539": { + "question": "How is TAC4 related to the grain yield of rice plants?", + "answer": "TAC4's regulation of tiller angle is directly linked to the grain yield of rice plants.", + "loss": 0.5317985935423521 + }, + "aa08e87d4b534d3910a84eec103d6c42": { + "question": "What is the specific trait exhibited by the Bg1-D mutant in terms of grain size?", + "answer": "The Bg1-D mutant exhibits an \"extra-large grain phenotype,\" characterized by significantly larger grains compared to normal.", + "loss": 4.747928125914951 + }, + "d33b082fea393b2f82878506633200e0": { + "question": "隆两优1212有哪些特征?", + "answer": "隆两优1212的特征包括分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好。", + "loss": 0.0379364202963188 + }, + "5f9003d9cdb4d6cb4e5d4f401b4842b6": { + "question": "隆两优1212在长汀县的机播日期是什么时候?", + "answer": "隆两优1212在长汀县进行机播的日期是6月15日。", + "loss": 3.9376944857096556 + }, + "96d45879686436690a445eba766e1775": { + "question": "What phenotype 
is exhibited by the dominant mutant BIG GRAIN1 (BG1-D) in rice?", + "answer": "The dominant mutant BIG GRAIN1 (BG1-D) exhibits an extra-large grain phenotype, affecting grain size regulation.", + "loss": 0.08021527799428438 + }, + "6067737f1d47489e15279e37c39c5bef": { + "question": "隆两优1212在2019年示范种植的面积是多少?", + "answer": "隆两优1212在2019年示范种植的面积是13.50 hm^2。", + "loss": 3.0636376298296395 + }, + "9b762fe480055ddef0917289a512c4d5": { + "question": "Where is BG1 preferentially expressed in the rice plant?", + "answer": "BG1 is preferentially expressed in the young panicles, which are the flowering part of the rice plant.", + "loss": 0.9476254495473939 + }, + "a1669112bb9f021a37bdf1a5dcf21db3": { + "question": "长汀润丰优质稻专业合作社是什么时候示范种植隆两优1212的?", + "answer": "长汀润丰优质稻专业合作社在2018年示范种植隆两优1212。", + "loss": 1.7738511032028708 + }, + "98cab38ea668ee6861f3315a04046f00": { + "question": "云粳26号对稻瘟病的抗性如何?", + "answer": "云粳26号具有高抗性。", + "loss": 0.19220860377117788 + }, + "c4666a40186f02854aeeb47e3d6e50b0": { + "question": "What is the significance of TAC4 in the improvement of rice varieties?", + "answer": "TAC4 plays a significant role in understanding plant architecture, which is important for the improvement of rice varieties.", + "loss": 0.05359335437443253 + }, + "12f867eff5c8a1d8cef09816d9da6abd": { + "question": "What phenotype is exhibited by the Bg1-D dominant mutant?", + "answer": "The Bg1-D dominant mutant exhibits an extra-large grain phenotype.", + "loss": 0.31326545774936676 + }, + "5aa3f6cf208530ccaaff6c9d78714873": { + "question": "隆两优1212在长汀县的产量是多少?", + "answer": "隆两优1212在长汀县的产量是8.78 t/hm^2。", + "loss": 2.8553758704349064 + }, + "8afa4992b308aeefbbd20b190f261b3b": { + "question": "隆两优1212在哪里进行示范种植?", + "answer": "隆两优1212在长汀润丰优质稻专业合作社进行示范种植。", + "loss": 0.04478021708549916 + }, + "89be36de1561ebf2d4d9be888c8724f9": { + "question": "Why is grain size considered important in crop production, especially in rice?", + "answer": "Grain size is a key factor that influences grain yield in crops, particularly in rice.", + "loss": 0.06938895229298701 + }, + "4d320e65556f22d27f9258c6c4dc69de": { + "question": "What is the role of indole acetic acid in plants?", + "answer": "Indole acetic acid is a plant hormone that influences growth and development, including the regulation of gravitropism in rice.", + "loss": 0.36837029734096155 + }, + "5ccfac83d3a9e7cd8b9a756d9f7a0eeb": { + "question": "隆两优1212在长汀县的成熟日期是什么时候?", + "answer": "隆两优1212在长汀县的成熟日期是10月21日。", + "loss": 3.889435500154262 + }, + "3a393c1d40121007f634742d9c3b3c04": { + "question": "How does tiller angle influence rice production?", + "answer": "Tiller angle significantly affects the grain yield of rice, making it an important aspect of plant architecture.", + "loss": 0.00038276082659649714 + }, + "1f3bb8f8f683d4dcb4b379d9e7e6788b": { + "question": "云粳26号有哪些特性?", + "answer": "云粳26号具有高抗稻瘟病的特性,适合种植。", + "loss": 0.0004952482704538482 + }, + "bafcaf510c615078854418a9809806a2": { + "question": "云粳26号的食味品质之一是什么?", + "answer": "有香味是云粳26号的食味品质之一。", + "loss": 0.07535676845873245 + }, + "7b58d98a18f753fe6c12d45fe3108703": { + "question": "隆两优1212在2018年示范种植的平均产量是多少?", + "answer": "隆两优1212在2018年示范种植的平均产量是8.72 t/hm^2。", + "loss": 5.291381644470648 + }, + "fa48f84fd40533b2ca7a214722ad1e0f": { + "question": "What is gravitropism in plants?", + "answer": "Gravitropism is the growth response of plants to gravity, which is affected in the severe lines of the Bg1-D mutant.", + "loss": 0.8259349847750537 + }, + "d1b2c30ec8be68fde163061cef0d259b": { + "question": 
"云粳26号是什么时候被育成的?", + "answer": "云粳26号是在2005年被育成的。", + "loss": 1.2757854678825942 + }, + "ca0bc071dad7cc7cdfe1e4a9c329c70d": { + "question": "What is the role of the TAC4 gene in rice cultivation?", + "answer": "The TAC4 gene plays a significant role in the growth and yield of rice by regulating the tiller angle.", + "loss": 0.00041262686485408725 + }, + "892efc63c9cfe98da3b9494122143143": { + "question": "隆两优1212适合在哪个地区推广种植?", + "answer": "隆两优1212适合在长汀县推广种植。", + "loss": 0.11312617113192873 + }, + "5a8a85bed2bc5f4f7212180271267b06": { + "question": "隆两优1212在2018年示范种植的面积是多少?", + "answer": "隆两优1212在2018年示范种植的面积是4.00 hm^2。", + "loss": 4.626307918465909 + }, + "de1fa70aa2b47b46c5f7dac395fbe33a": { + "question": "云粳26号适宜种植在哪个地理区域?", + "answer": "云粳26号适宜种植在云南中海拔 1 500∼1 800 m 稻区。", + "loss": 0.10301619147260987 + }, + "8f679a45d1a431d078c6912887c1e5e4": { + "question": "What are developmental signals in the context of plant growth?", + "answer": "Developmental signals are factors that influence the growth and development of plants, including grain size regulation.", + "loss": 0.6492638748206391 + }, + "95b7a1b205e4858e0dfd7ec90c695a60": { + "question": "长汀县绿丰优质稻专业合作社在何时示范种植隆两优1212?", + "answer": "长汀县绿丰优质稻专业合作社在2019年示范种植隆两优1212。", + "loss": 2.9061057158497205 + }, + "dc156ef4ade6d6c5a92142d808440dd3": { + "question": "隆两优1212的机插时间是什么时候?", + "answer": "隆两优1212在7月5日进行机插。", + "loss": 2.4750936400954147 + }, + "a32f3e889baec282ae58a993663d12c5": { + "question": "隆两优1212是什么品种,何时引入福建省进行试种?", + "answer": "隆两优1212是一种水稻品种,于2017年引入福建省龙岩市长汀县进行试种。", + "loss": 0.3354722399381748 + }, + "5ca116ec91a2c12ed734654c8c06cb0f": { + "question": "种植云粳26号需要注意什么?", + "answer": "种植云粳26号需要采用适当的种植技术,以确保其高产和优质。", + "loss": 4.320199830704407e-07 + }, + "91105a125573ed42f2bae39b8e8c302f": { + "question": "云粳26号在何时被农业部列为主导品种?", + "answer": "云粳26号在2012年被农业部列为主导品种。", + "loss": 2.114203438647301 + }, + "f6e7dfd65f03ad8a3c36d2eb2953597d": { + "question": "云粳26号的推广与什么因素密切相关?", + "answer": "云粳26号的推广与市场需求密切相关,反映了消费者对优质稻米的偏好。", + "loss": 1.0427210856591091e-07 + }, + "656d5230809aeae83b025aff88994177": { + "question": "云南省农业科学院粮食作物研究所负责育成哪种早熟品种?", + "answer": "云南省农业科学院粮食作物研究所负责育成早熟品种云粳26号。", + "loss": 0.4100542664527893 + }, + "9d97b79cd0a097abe894c1fe0b2c0b23": { + "question": "What is the scientific name for rice and what aspect of its architecture is influenced by tiller angle?", + "answer": "The scientific name for rice is \"Oryza sativa L.\" and its architecture is influenced by tiller angle.", + "loss": 1.063461342675712 + }, + "fcd7886b3252416cf81d00db42564c3e": { + "question": "How does the manipulation of BG1 in rice affect crop traits?", + "answer": "Manipulation of BG1 in rice can enhance traits such as biomass and yield, demonstrating its significance in crop improvement.", + "loss": 2.8801332519304187e-07 + }, + "5b5f79dd7fb573c24343440090d22cf9": { + "question": "隆两优1212在何时引入试种于福建省龙岩市长汀县?", + "answer": "隆两优1212在2017年引入试种于福建省龙岩市长汀县。", + "loss": 1.5090347017819814 + }, + "bec5b1527c24113026bbf872d0073417": { + "question": "谷壳黄色对云粳26号的市场接受度有什么影响?", + "answer": "谷壳黄色是云粳26号的外观特征之一,影响其市场接受度。", + "loss": 0.16414476682742438 + }, + "f8572a27d49bad7beba1c28a2fc8731f": { + "question": "云粳26号是什么类型的品种,谁育成的?", + "answer": "云粳26号是由云南省农业科学院育成的早熟品种。", + "loss": 0.7699170204937471 + }, + "0b9613c1c35a277045d3540ea95ab0b3": { + "question": "云粳26号是什么类型的稻品种?", + "answer": "云粳26号是通过品种育成过程而获得的早熟稻品种。", + "loss": 0.0595000738285307 + } +} diff --git a/resources/output_examples/cot.json b/resources/output_examples/cot.json new file mode 100644 
index 00000000..01a4925d --- /dev/null +++ b/resources/output_examples/cot.json @@ -0,0 +1,47 @@ +{ + "f8faeacb05a885ec5dfc090ffc97e5af": { + "question": "隆两优1212在长汀县的示范种植是在哪些合作社进行的?", + "reasoning_path": "1. 查找隆两优1212的示范种植关系。\n2. 确定示范种植的合作社名称。", + "answer": "隆两优1212在长汀县的示范种植关系显示出其在多个合作社进行推广。首先,长汀县绿丰优质稻专业合作社是隆两优1212进行示范种植的一个合作社。其次,长汀润丰优质稻专业合作社也是隆两优1212示范种植的另一个合作社。因此,隆两优1212在长汀县的示范种植主要是在长汀县绿丰优质稻专业合作社和长汀润丰优质稻专业合作社进行的。" + }, + "71a4fb6d6b35c3a46c376b02c11db686": { + "question": "云粳26号的推广如何影响农业生产效率?", + "reasoning_path": "1. 确定云粳26号的推广目的。\n2. 识别云粳26号与农业生产之间的关系。\n3. 验证云粳26号的推广是否满足市场需求。", + "answer": "云粳26号的推广旨在提高农业生产效率。通过推广这一品种,农民能够获得更高的产量和更好的品质,从而提升整体的农业生产水平。云粳26号作为一种优质稻米品种,具有良好的食用价值和市场潜力,这使得它在农业生产中扮演了重要角色。此外,云粳26号的推广与市场需求密切相关,反映了消费者对优质稻米的偏好。因此,云粳26号的推广不仅提高了农业生产效率,还满足了市场对优质稻米的需求。" + }, + "1303aff788fa38c5bfe049121ac8a2c5": { + "question": "2012年云粳26号被列为哪个区域的农业推广主导品种?", + "reasoning_path": "1. 确定“2012年”对应的关系。\n2. 从“2012年”出发,找到与之相关的目标实体。\n3. 验证目标实体的描述信息,确认其对应的区域。", + "answer": "2012年云粳26号被列为农业推广主导品种。这个信息表明了“2012年”与“西南稻区”之间的关系。接下来,从“2012年”出发,可以找到与之相关的目标实体,即“西南稻区”。根据描述,西南稻区是云粳26号被列为农业推广主导品种的区域。因此,2012年云粳26号被列为西南稻区的农业推广主导品种。" + }, + "47807be5144177709792642a482b992c": { + "question": "What is the relationship between the Bg1-D dominant mutant and the extra-large grain phenotype it exhibits?", + "reasoning_path": "1. Recognize entities: Identify \"EXTRA-LARGE GRAIN PHENOTYPE\" and \"DOMINANT MUTANT\" with their descriptions.\n2. Recognize relationships: Identify the relationship between \"EXTRA-LARGE GRAIN PHENOTYPE\" and \"DOMINANT MUTANT\" as described.\n3. Associate the source entity \"EXTRA-LARGE GRAIN PHENOTYPE\" with its description and the target entity \"DOMINANT MUTANT\" with its description.\n4. Formulate the question based on the identified relationship and entities.\n5. Ensure the question allows for verification through the relationships and entities in the graph.", + "answer": "The Bg1-D dominant mutant is associated with the extra-large grain phenotype, which is a specific trait characterized by significantly larger grains compared to normal. The relationship indicates that the Bg1-D is a dominant mutant that expresses this extra-large grain phenotype more strongly than its recessive counterparts. This connection highlights how the genetic variant of the Bg1-D mutant directly leads to the observable trait of larger grains. Thus, the relationship between the Bg1-D dominant mutant and the extra-large grain phenotype it exhibits is that the mutant expresses this specific trait prominently." + }, + "549e295cb71cb1609654806177130875": { + "question": "How does the expression of the BG1 gene influence seed weight and overall yield in crop plants?", + "reasoning_path": "1. Identify the entity \"BG1\" and its description.\n2. Identify the entity \"SEED WEIGHT\" and its description.\n3. Recognize the relationship between \"BG1\" and \"SEED WEIGHT\" that states BG1 influences seed weight.\n4. Identify the entity \"CROP PLANT\" and its description.\n5. Recognize the relationship that indicates BG1 is a positive regulator for improving yield in crop plants.\n6. Connect the influence of BG1 on seed weight to its role in overall yield in crop plants.", + "answer": "The entity \"BG1\" is a gene that, when overexpressed, leads to increased grain size and is involved in auxin transport regulation. The entity \"SEED WEIGHT\" refers to the mass of seeds produced by a plant, which can be influenced by genetic factors such as BG1. 
There is a relationship that states BG1 influences seed weight, contributing to overall yield. The entity \"CROP PLANT\" refers to plants cultivated for agricultural purposes, such as rice, which are studied for yield improvement. Additionally, BG1 is identified as a positive regulator in crop plants, particularly in rice, for improving yield. Therefore, the influence of BG1 on seed weight directly connects to its role in enhancing overall yield in crop plants." + }, + "b3fa8fa35bddade955b059d656120620": { + "question": "How does auxin influence grain size in plants, and what role do developmental signals play in this process?", + "reasoning_path": "1. Identify the role of AUXIN in regulating GRAIN SIZE.\n2. Explore the relationship between GRAIN SIZE and DEVELOPMENTAL SIGNALS.\n3. Investigate how INDole ACETIC ACID relates to AUXIN and its impact on plant growth.\n4. Examine the effect of N-1-NAPHTHYLPHTHALAMIC ACID on AUXIN transport and its implications for plant development.\n5. Analyze the SENSITIVITIES of the Bg1-D mutant to AUXIN and its relevance to growth regulation.", + "answer": "Auxin plays a significant role in regulating grain size through its effects on growth and development. This indicates that auxin is crucial for determining how large grains can become in plants. Developmental signals also play a crucial role in regulating grain size, suggesting that various factors influence this process alongside auxin. Indole acetic acid, a type of auxin, is involved in plant growth and development, further emphasizing the importance of auxin in the overall growth mechanisms. N-1-naphthylphthalamic acid inhibits auxin transport, which can negatively affect plant growth and development by disrupting the normal auxin signaling pathways. The sensitivities of the Bg1-D mutant to auxin indicate its role in growth regulation, highlighting how variations in auxin response can impact the growth and development of plants, including grain size." + }, + "9913a8e476a0232a35f577dbf6b7678f": { + "question": "What is the relationship between the loss of TAC4 function and the tiller angle in rice plants?", + "reasoning_path": "1. Identify the entities related to \"TILLER ANGLE\" and \"LOSS OF TAC4 FUNCTION\".\n2. Examine the relationship between \"LOSS OF TAC4 FUNCTION\" and \"TILLER ANGLE\".\n3. Analyze how \"TILLER ANGLE CONTROL 4 (TAC4)\" influences \"TILLER ANGLE\".\n4. Investigate the role of \"NUCLEAR PROTEIN\" in the context of \"TILLER ANGLE CONTROL 4 (TAC4)\".\n5. Synthesize the information to explain the impact of TAC4 on tiller angle and plant architecture.", + "answer": "The entities involved in the question are \"TILLER ANGLE\" and \"LOSS OF TAC4 FUNCTION\". \n\nThe relationship between \"LOSS OF TAC4 FUNCTION\" and \"TILLER ANGLE\" indicates that the loss of TAC4 function results in an increased tiller angle, which significantly affects the plant architecture of rice.\n\nNext, \"TILLER ANGLE CONTROL 4 (TAC4)\" is identified as a regulator that influences the tiller angle in rice plants. This suggests that TAC4 plays a crucial role in maintaining the appropriate tiller angle for optimal plant architecture.\n\nFurthermore, \"TAC4\" encodes a nuclear protein that is involved in the regulation of the tiller angle. 
This nuclear protein is essential for the proper functioning of TAC4 in controlling the tiller angle.\n\nBy synthesizing this information, it can be concluded that the loss of TAC4 function leads to an increased tiller angle, which negatively impacts the overall architecture of rice plants. Thus, TAC4 is vital for regulating the tiller angle and ensuring the structural integrity of the plant." + }, + "4949e9970b7b2af69fdbf3d5b084cc84": { + "question": "What is the relationship between the BIG GRAIN1 (BG1-D) mutant and its effects on gravitropism and phenotype in rice?", + "reasoning_path": "1. Identify the entity \"BIG GRAIN1 (BG1-D)\" and its description.\n2. Identify the relationship between \"BIG GRAIN1 (BG1-D)\" and \"PHENOTYPE\" to understand its observable characteristics.\n3. Identify the relationship between \"BIG GRAIN1 (BG1-D)\" and \"GRAVITROPISM\" to explore its impact on growth responses.\n4. Identify the relationship between \"GRAVITROPISM\" and \"SEVERE LINES\" to assess how genetic mutations affect gravitropism.\n5. Identify the relationship between \"GRAVITROPISM\" and \"AUXIN DISTRIBUTION\" to understand the hormonal influence on growth responses.\n6. Compile the information from the identified relationships to summarize the connections between the entities regarding the effects of BIG GRAIN1 (BG1-D) on phenotype and gravitropism.", + "answer": "The entity \"BIG GRAIN1 (BG1-D)\" is a dominant mutant in rice known for its extra-large grain phenotype. This mutant exhibits a distinct phenotype characterized by its large grains, which is directly linked to its observable characteristics. The relationship between \"BIG GRAIN1 (BG1-D)\" and \"GRAVITROPISM\" reveals that this mutant also shows altered gravitropism, indicating a connection between grain size and the plant's growth responses to gravity. Furthermore, examining the relationship between \"GRAVITROPISM\" and \"SEVERE LINES\" indicates that severe lines, which are genetic lines with extreme phenotypes, exhibit altered gravitropism, suggesting that genetic mutations can significantly affect how plants respond to gravity. Additionally, the relationship between \"GRAVITROPISM\" and \"AUXIN DISTRIBUTION\" highlights that gravitropism is influenced by the distribution of auxin hormones, which are regulated by proteins like TAC4, further linking hormonal regulation to growth responses. By compiling this information, it becomes clear that the BIG GRAIN1 (BG1-D) mutant not only affects the phenotype of rice by producing larger grains but also alters gravitropism, demonstrating the interconnectedness of genetic mutations, observable traits, and growth responses in plants." + }, + "dcb8419b97651f5d21e2b579ce926efe": { + "question": "How does the TAC4 gene influence the grain yield of rice through its regulation of tiller angle?", + "reasoning_path": "1. Identify the role of TAC4 in regulating tiller angle.\n2. Determine the relationship between TAC4's regulation of tiller angle and grain yield.\n3. Explore how the domestication process has affected TAC4's function in rice.\n4. Investigate the impact of indica cultivars on the function of TAC4 and its regulation of tiller angle.\n5. Analyze the overall influence of TAC4 on rice plant architecture and its implications for grain yield.", + "answer": "The TAC4 gene plays a crucial role in regulating the tiller angle of rice plants. This regulation is significant because the tiller angle directly affects how the plant grows and develops. 
The relationship between TAC4's regulation of tiller angle and grain yield is established, as changes in tiller angle can lead to variations in the amount of grain produced. \n\nThe domestication process of rice has influenced the function of TAC4, as it has undergone changes that affect its regulatory capabilities. Specifically, during domestication, the function of TAC4 has become fixed in indica cultivars, which may limit its variability and adaptability in regulating tiller angle.\n\nIndica cultivars have experienced a genetic bottleneck that impacts the function of TAC4, leading to a more uniform regulation of tiller angle across these varieties. This uniformity can affect the overall plant architecture, which is essential for optimizing grain yield.\n\nOverall, TAC4 influences the architecture of rice plants by regulating the tiller angle, which in turn has significant implications for grain yield. The interplay between TAC4, tiller angle, and grain yield highlights the importance of this gene in rice cultivation and agricultural productivity." + } +} diff --git a/resources/output_examples/multi-hop.json b/resources/output_examples/multi-hop.json new file mode 100644 index 00000000..49842731 --- /dev/null +++ b/resources/output_examples/multi-hop.json @@ -0,0 +1,167 @@ +{ + "6a4aaafd4628c0d0081366b1230922fe": { + "question": "How does the expression of the BG1 gene in young panicles contribute to the yield improvement in rice crop plants?", + "answer": "The expression of the BG1 gene in young panicles indicates its role in reproductive development, and since BG1 is identified as a positive regulator for improving yield in crop plants like rice, it contributes to increased grain size and overall yield enhancement.", + "loss": 0.1357050354219726 + }, + "08bab7d44dc742dd5327e422a8c8fab2": { + "question": "How does the knockdown of the BG1 gene affect grain size and auxin sensitivity in plants?", + "answer": "The knockdown of BG1 results in smaller grain size and decreased sensitivities to auxin.", + "loss": 0.7750791690645821 + }, + "2323c4afc0aa3677c8e102b6677548ef": { + "question": "How does the genetic bottleneck affect the tiller angle regulation in indica cultivars of rice?", + "answer": "It affects the tiller angle regulation by causing a reduction in genetic diversity in the TAC4 gene.", + "loss": 0.042615559037509854 + }, + "802343be53da9fbf8a67ec72a94638ef": { + "question": "How does the manipulation of the BG1 gene in Arabidopsis affect the overall growth and productivity of plants?", + "answer": "It enhances plant biomass, indicating its importance in growth and productivity.", + "loss": 0.0008380628005689666 + }, + "21a261da0befb9f5e7616a29c223e954": { + "question": "How does the alteration in gravitropism in the Bg1-D mutant relate to its observable characteristics, specifically regarding grain size?", + "answer": "The alteration in gravitropism in the Bg1-D mutant indicates a link between the growth response to gravity and the phenotype of extra-large grains, suggesting that the genetic changes affecting gravitropism also influence grain size regulation.", + "loss": 0.19428232177790447 + }, + "fa714644151cfff0d77970a1ba255aae": { + "question": "在什么年份,云粳26号被列为西南稻区的农业推广主导品种?", + "answer": "2012年", + "loss": 0.7650802039228166 + }, + "c6aab621d809b4e7871427d8f2b929ee": { + "question": "What is the role of the protein encoded by the TAC4 gene in rice, and how does it affect the plant's characteristics?", + "answer": "The protein encoded by the TAC4 gene regulates the tiller 
angle in rice, which influences the overall architecture and yield of the plant.", + "loss": 0.014876111410558224 + }, + "f997a69a39ba43e7ce81f14199cc7dd9": { + "question": "云粳26号是由哪个机构育成的,并且在什么年份被列为西南稻区农业推广主导品种?", + "answer": "云南省农业科学院,2012年。", + "loss": 0.7469301817580426 + }, + "09054e433f0298ab2ed54518caab744f": { + "question": "How does the gene TAC4 affect the grain yield of rice through its influence on grain size?", + "answer": "TAC4 regulates the tiller angle in rice, which in turn affects the growth and arrangement of rice plants, ultimately influencing grain size; since grain size is a critical determinant of overall grain yield, TAC4 indirectly impacts the grain yield of rice.", + "loss": 0.11570529519520889 + }, + "a693075a3684ba38ac44980686d376ba": { + "question": "隆两优1212在长汀县绿丰优质稻专业合作社和圣丰家庭农场的种植方式有什么共同点?", + "answer": "隆两优1212被用于作烟后稻的种植方式。", + "loss": 0.012698659113356001 + }, + "1da6d08dcb77c4c121f0f72f31237484": { + "question": "How does the bottleneck phenomenon affect the TAC4 gene in indica cultivars, and what is the ultimate impact of TAC4 on rice grain yield?", + "answer": "The bottleneck phenomenon has led to a fixed function of the TAC4 gene in indica cultivars, which affects its role in regulating the gravitropic response of rice shoots. This regulation of TAC4 influences the tiller angle, which ultimately impacts the grain yield of rice plants.", + "loss": 0.9860201657217603 + }, + "1c5b5edcea0a3478f75b604f49286a80": { + "question": "How does N-1-naphthylphthalamic acid affect the role of indole acetic acid in plant growth and development?", + "answer": "N-1-naphthylphthalamic acid inhibits auxin transport, which affects the sensitivity of plants to indole acetic acid, thereby influencing plant growth and development.", + "loss": 9.680632472426816e-08 + }, + "65f0768e366029849a44df721d9efcee": { + "question": "云粳26号是如何通过品种育成和农业推广在西南稻区成为主导品种的?", + "answer": "云粳26号是通过品种育成过程获得的早熟稻品种,随后被列为主导品种,通过农业推广提高其种植面积和产量,从而在西南稻区成为主导品种。", + "loss": 0.02051991258546573 + }, + "169b79c15459f437327606555d0a1543": { + "question": "云粳26号在2012年被哪个政府部门列为主导品种,并且它的外观特点有哪些?", + "answer": "农业部;颖尖无色、无芒和谷壳黄色。", + "loss": 1.6668837968508612 + }, + "2ab613f72194988d46224cb78778f364": { + "question": "What is the role of the gene BG1 in the growth of rice plants, and where is its expression observed?", + "answer": "BG1 is involved in the growth and development of the rice plant, and its expression is observed in the culms.", + "loss": 1.5640816284886636e-07 + }, + "51d2351f43d26fa20a0bb97e020828ab": { + "question": "How does the manipulation of the Bg1-D mutant in severe lines affect the growth response of plants to gravity, and what role does auxin play in this process?", + "answer": "The manipulation of the Bg1-D mutant in severe lines leads to altered gravitropism, indicating that genetic mutations can significantly impact how plants respond to gravity. 
Auxin, a crucial plant hormone, is involved in this gravitropic response by influencing the distribution of auxin within the plant, which is regulated by proteins like TAC4.", + "loss": 0.00011721489843845191 + }, + "4fee5777f433a06093a5efa897a08bb7": { + "question": "How does the TAC4 protein contribute to the improvement of rice varieties in relation to their growth response to gravity?", + "answer": "TAC4 influences the gravitropic response of rice shoots by regulating the levels of indole acetic acid, which is essential for understanding and enhancing rice varieties through selective breeding and genetic modification.", + "loss": 0.0327960422161646 + }, + "5581ad988938db9f72113259f75a9139": { + "question": "云粳26号的哪个特征影响了它的食用价值,并且这个品种是在哪一年和哪个省份被育成的?", + "answer": "米粒大;2005年;云南省", + "loss": 2.8543118722606766 + }, + "700ee905416b63b66ffbb566b77b2f06": { + "question": "云粳26号的哪些特性使其在市场上更具吸引力,并且这些特性与其生长周期有什么关系?", + "answer": "云粳26号的特性包括落粒性适中和有香味,这些特性增加了其市场吸引力。同时,云粳26号是一种早熟品种,生长周期较短。", + "loss": 0.08589649638128727 + }, + "1aec0d81190e97295ffe2848533d4997": { + "question": "What type of genetic variant is responsible for the extra-large grain phenotype observed in the Bg1-D mutant?", + "answer": "Dominant mutant", + "loss": 0.31326545774936676 + }, + "6f6506a49aeeda15889361e838b8aa70": { + "question": "隆两优1212的生长周期是从什么时候开始,到什么时候结束?", + "answer": "生长周期从6月15日开始,到10月21日结束。", + "loss": 3.1628897813185515 + }, + "e03c364a79bfe35a03957fcdd7b75c3c": { + "question": "隆两优1212在长汀县的示范种植是在哪一年开始的,并且在2019年继续进行的合作社是什么?", + "answer": "长汀润丰优质稻专业合作社。", + "loss": 0.05184697809631562 + }, + "e179cf5967359215381832f4c60c38ef": { + "question": "How does the loss of TAC4 function in rice plants affect their grain yield?", + "answer": "The loss of TAC4 function leads to an increased tiller angle, which negatively impacts the plant architecture and ultimately reduces the grain yield of rice.", + "loss": 0.3625986786348168 + }, + "14201c917ac08b622a5cd57766abece9": { + "question": "云粳26号作为一种稻米品种,其推广与什么因素密切相关,这个因素又与消费者对什么的需求有关?", + "answer": "市场需求,消费者对优质稻米的需求。", + "loss": 4.89174685946466e-07 + }, + "72d959b82c302eb4fa644bc1986e8544": { + "question": "云粳26号的哪些特性使其在农业生产中受到推广,以应对稻瘟病和满足市场需求?", + "answer": "云粳26号具有高抗稻瘟病的特性和食味品质好的特性。", + "loss": 0.0002740193135271656 + }, + "afbf76dbc78c093f318ab088b777d88e": { + "question": "隆两优1212在2018年推广成功的原因是什么?", + "answer": "隆两优1212表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等优良特性。", + "loss": 0.18373929475166698 + }, + "27ea3a56a1ecf68c1565dc4ed88b9dcb": { + "question": "How does the expression of the BG1 gene in vascular tissue relate to the plant's sensitivity to auxin transport inhibitors like N-1-naphthylphthalamic acid?", + "answer": "The expression of the BG1 gene in vascular tissue indicates its role in nutrient transport, and the sensitivity of the Bg1-D mutant to N-1-naphthylphthalamic acid suggests that BG1 is involved in auxin transport mechanisms, which are affected by this inhibitor.", + "loss": 1.8092947890006377 + }, + "99a9f111b7f1f93666126f0e3f61c42f": { + "question": "How does the manipulation of the BG1 gene in a rice T-DNA insertion population affect seed weight and plant development?", + "answer": "The manipulation of the BG1 gene influences seed weight by contributing to overall yield, and it also alters auxin distribution, which is crucial for proper plant development.", + "loss": 1.0247478346435845e-06 + }, + "bf1a5fdce87674c5386f89ceaf37205b": { + "question": "云粳26号是由哪个机构育成的,并且适合在哪个海拔范围的稻区种植?", + "answer": "云粳26号是由云南省农业科学院粮食作物研究所育成的,适合在1500至1800米的中海拔稻区种植。", + 
"loss": 0.25571032472038896 + }, + "caf8a993ece46b37372c5982da3ec8b6": { + "question": "How does TAC4 influence the growth and structural design of rice plants through its effect on auxin distribution and tiller angle?", + "answer": "TAC4 regulates the distribution of auxin, which is essential for proper plant growth, and it also influences the tiller angle, thereby determining the overall architecture of rice plants.", + "loss": 7.328495219848076e-06 + }, + "eb2085106243976a07daa33a713d8f37": { + "question": "How does TAC4 influence grain yield in rice through its effect on auxin distribution and plant growth regulation?", + "answer": "TAC4 influences grain yield in rice by regulating the distribution of auxin, which plays a significant role in the growth and development of the plant, ultimately affecting grain size.", + "loss": 7.691885147988529e-06 + }, + "d55437c4a02401b6b8fc5b050e0e0056": { + "question": "How does the domestication of rice affect the regulation of indole acetic acid levels in relation to the growth response of rice shoots?", + "answer": "The domestication of rice influences the regulatory role of TAC4, which in turn regulates the levels of indole acetic acid, affecting the gravitropic response of rice shoots.", + "loss": 0.2479561960633153 + }, + "6307049c2892597a87535553f0bce4e7": { + "question": "What factor, influenced by developmental signals, is crucial for determining the yield of rice crops?", + "answer": "Grain size", + "loss": 2.7561182491159133e-07 + } +} \ No newline at end of file diff --git a/scripts/generate.sh b/scripts/generate.sh deleted file mode 100644 index be0bee9b..00000000 --- a/scripts/generate.sh +++ /dev/null @@ -1 +0,0 @@ -python3 -m graphgen.generate --config_file graphgen/configs/graphgen_config.yaml --output_dir cache/ diff --git a/scripts/generate/generate_aggregated.sh b/scripts/generate/generate_aggregated.sh new file mode 100644 index 00000000..6da9cd7f --- /dev/null +++ b/scripts/generate/generate_aggregated.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/aggregated_config.yaml \ +--output_dir cache/ diff --git a/scripts/generate/generate_atomic.sh b/scripts/generate/generate_atomic.sh new file mode 100644 index 00000000..22cd4198 --- /dev/null +++ b/scripts/generate/generate_atomic.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/atomic_config.yaml \ +--output_dir cache/ diff --git a/scripts/generate/generate_cot.sh b/scripts/generate/generate_cot.sh new file mode 100644 index 00000000..451d8f82 --- /dev/null +++ b/scripts/generate/generate_cot.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/cot_config.yaml \ +--output_dir cache/ diff --git a/scripts/generate/generate_multi_hop.sh b/scripts/generate/generate_multi_hop.sh new file mode 100644 index 00000000..a3e2b5c7 --- /dev/null +++ b/scripts/generate/generate_multi_hop.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/multi_hop_config.yaml \ +--output_dir cache/ diff --git a/webui/app.py b/webui/app.py index 153f159a..f28a109e 100644 --- a/webui/app.py +++ b/webui/app.py @@ -439,7 +439,7 @@ def sum_tokens(client): file_types=[".txt", ".json", ".jsonl"], interactive=True, ) - examples_dir = os.path.join(root_dir, "webui", "examples") + examples_dir = os.path.join(root_dir, "webui", "input_examples") gr.Examples( examples=[ [os.path.join(examples_dir, "txt_demo.txt")], From 2a75c3eab2314a7387f12a955b8731dd96967095 Mon Sep 17 00:00:00 2001 From: 
chenzihong-gavin
Date: Thu, 14 Aug 2025 16:58:05 +0800
Subject: [PATCH 09/10] refactor: rename scripts

---
 scripts/{ => evaluate}/evaluate.sh | 0
 scripts/judge.sh | 2 --
 2 files changed, 2 deletions(-)
 rename scripts/{ => evaluate}/evaluate.sh (100%)
 delete mode 100644 scripts/judge.sh

diff --git a/scripts/evaluate.sh b/scripts/evaluate/evaluate.sh
similarity index 100%
rename from scripts/evaluate.sh
rename to scripts/evaluate/evaluate.sh
diff --git a/scripts/judge.sh b/scripts/judge.sh
deleted file mode 100644
index f6fc134e..00000000
--- a/scripts/judge.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-python3 -m graphgen.judge --input cache \
- --output cache/output/new_graph.graphml \

From 288d67164f1454d838739e6fdd0a00d688aef306 Mon Sep 17 00:00:00 2001
From: chenzihong-gavin
Date: Thu, 14 Aug 2025 17:17:18 +0800
Subject: [PATCH 10/10] docs: update README

---
 README.md | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 41ef05c5..a6ef8cd1 100644
--- a/README.md
+++ b/README.md
@@ -136,18 +136,31 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
    TRAINEE_BASE_URL=your_base_url_for_trainee_model
    TRAINEE_API_KEY=your_api_key_for_trainee_model
    ```
-2. (Optional) If you want to modify the default generated configuration, you can edit the content of the configs/graphgen_config.yaml file.
+2. (Optional) Customize generation parameters in the `graphgen/configs/` folder.
+
+   Edit the corresponding YAML file, e.g.:
+   ```yaml
-   # configs/aggregated_config.yaml
-   # Example configuration
-   input_data_type: "raw"
-   input_file: "resources/input_examples/raw_demo.jsonl"
-   # more configurations...
+   # configs/cot_config.yaml
+   input_data_type: raw
+   input_file: resources/input_examples/raw_demo.jsonl
+   output_data_type: cot
+   tokenizer: cl100k_base
+   # additional settings...
    ```
-3. Run the generation script
-   ```bash
-   bash scripts/generate/generate_aggregated.sh
-   ```
+
+3. Generate data
+
+   Pick the desired format and run the matching script:
+
+   | Format       | Script to run                                  | Notes                                                              |
+   | ------------ | ---------------------------------------------- | ------------------------------------------------------------------ |
+   | `cot`        | `bash scripts/generate/generate_cot.sh`        | Chain-of-Thought Q\&A pairs                                        |
+   | `atomic`     | `bash scripts/generate/generate_atomic.sh`     | Atomic Q\&A pairs covering basic knowledge                         |
+   | `aggregated` | `bash scripts/generate/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge  |
+   | `multi-hop`  | `bash scripts/generate/generate_multi_hop.sh`  | Multi-hop reasoning Q\&A pairs                                     |
+
+
 4. Get the generated data
    ```bash
    ls cache/data/graphgen