open-sciencelab · ChenZiHong-Gavin · Sep 23, 2025 · Sep 23, 2025 · Sep 23, 2025 · Sep 23, 2025
diff --git a/README.md b/README.md
@@ -100,7 +100,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
 ### Run Gradio Demo
 
    ```bash
-   python -m webui.app.py
+   python -m webui.app
    ```
 
 ![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84)
@@ -148,7 +148,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
     ```yaml
       # configs/cot_config.yaml
       input_data_type: raw
-      input_file: resources/input_examples/raw_demo.jsonl
+      input_file: resources/input_examples/jsonl_demo.jsonl
       output_data_type: cot
       tokenizer: cl100k_base
       # additional settings...

diff --git a/README_ZH.md b/README_ZH.md
@@ -99,7 +99,7 @@ GraphGen 首先根据源文本构建细粒度的知识图谱，然后利用期
 ### 运行 Gradio 演示
 
    ```bash
-   python -m webui.app.py
+   python -m webui.app
    ```
 
 ![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84)
@@ -147,7 +147,7 @@ GraphGen 首先根据源文本构建细粒度的知识图谱，然后利用期
     ```yaml
       # configs/cot_config.yaml
       input_data_type: raw
-      input_file: resources/input_examples/raw_demo.jsonl
+      input_file: resources/input_examples/jsonl_demo.jsonl
       output_data_type: cot
       tokenizer: cl100k_base
       # 其他设置...

diff --git a/baselines/EntiGraph/entigraph.py b/baselines/EntiGraph/entigraph.py
@@ -232,7 +232,7 @@ async def generate_qa_sft(content):
     parser.add_argument(
         "--input_file",
         help="Raw context jsonl path.",
-        default="resources/input_examples/chunked_demo.json",
+        default="resources/input_examples/json_demo.json",
         type=str,
     )
     parser.add_argument(

diff --git a/baselines/Genie/genie.py b/baselines/Genie/genie.py
@@ -100,7 +100,7 @@ async def process_chunk(content: str):
     parser.add_argument(
         "--input_file",
         help="Raw context jsonl path.",
-        default="resources/input_examples/chunked_demo.json",
+        default="resources/input_examples/json_demo.json",
         type=str,
     )
     parser.add_argument(

diff --git a/baselines/LongForm/longform.py b/baselines/LongForm/longform.py
@@ -67,7 +67,7 @@ async def process_chunk(content: str):
     parser.add_argument(
         "--input_file",
         help="Raw context jsonl path.",
-        default="resources/input_examples/chunked_demo.json",
+        default="resources/input_examples/json_demo.json",
         type=str,
     )
     parser.add_argument(

diff --git a/baselines/SELF-QA/self-qa.py b/baselines/SELF-QA/self-qa.py
@@ -134,7 +134,7 @@ async def process_chunk(content: str):
     parser.add_argument(
         "--input_file",
         help="Raw context jsonl path.",
-        default="resources/input_examples/chunked_demo.json",
+        default="resources/input_examples/json_demo.json",
         type=str,
     )
     parser.add_argument(

diff --git a/baselines/Wrap/wrap.py b/baselines/Wrap/wrap.py
@@ -87,7 +87,7 @@ async def process_chunk(content: str):
     parser.add_argument(
         "--input_file",
         help="Raw context jsonl path.",
-        default="resources/input_examples/chunked_demo.json",
+        default="resources/input_examples/json_demo.json",
         type=str,
     )
     parser.add_argument(

diff --git a/graphgen/version.py → graphgen/_version.py b/graphgen/version.py → graphgen/_version.py
@@ -1,7 +1,6 @@
-
 from typing import Tuple
 
-__version__ = '20250416'
+__version__ = "20250416"
 short_version = __version__
 
 
@@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple:
         tuple: A sequence of integer and string represents version.
     """
     _version_info = []
-    for x in version_str.split('.'):
+    for x in version_str.split("."):
         if x.isdigit():
             _version_info.append(int(x))
-        elif x.find('rc') != -1:
-            patch_version = x.split('rc')
+        elif x.find("rc") != -1:
+            patch_version = x.split("rc")
             _version_info.append(int(patch_version[0]))
-            _version_info.append(f'rc{patch_version[1]}')
+            _version_info.append(f"rc{patch_version[1]}")
     return tuple(_version_info)
 
 

diff --git a/graphgen/models/embed/__init__.py → graphgen/bases/__init__.py b/graphgen/models/embed/__init__.py → graphgen/bases/__init__.py
diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+
+class BaseReader(ABC):
+    """
+    Abstract base class for reading and processing data.
+    """
+
+    def __init__(self, text_column: str = "content"):
+        self.text_column = text_column
+
+    @abstractmethod
+    def read(self, file_path: str) -> List[Dict[str, Any]]:
+        """
+        Read data from the specified file path.
+
+        :param file_path: Path to the input file.
+        :return: List of dictionaries containing the data.
+        """
diff --git a/graphgen/models/storage/base_storage.py → graphgen/bases/base_storage.py b/graphgen/models/storage/base_storage.py → graphgen/bases/base_storage.py
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
 from typing import Generic, TypeVar, Union
 
-from graphgen.models.embed.embedding import EmbeddingFunc
-
 T = TypeVar("T")
 
 
@@ -62,8 +60,6 @@ async def drop(self):
 
 @dataclass
 class BaseGraphStorage(StorageNameSpace):
-    embedding_func: EmbeddingFunc = None
-
     async def has_node(self, node_id: str) -> bool:
         raise NotImplementedError
 

diff --git a/graphgen/configs/aggregated_config.yaml b/graphgen/configs/aggregated_config.yaml
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: aggregated # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path

diff --git a/graphgen/configs/atomic_config.yaml b/graphgen/configs/atomic_config.yaml
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
 output_data_type: atomic # atomic, aggregated, multi_hop, cot
 output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path

diff --git a/graphgen/configs/cot_config.yaml b/graphgen/configs/cot_config.yaml
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl  # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/txt_demo.txt  # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: cot # atomic, aggregated, multi_hop, cot
 output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path

diff --git a/graphgen/configs/multi_hop_config.yaml b/graphgen/configs/multi_hop_config.yaml
@@ -1,5 +1,4 @@
-input_data_type: raw # raw, chunked
-input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
 output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
 output_data_format: ChatML # Alpaca, Sharegpt, ChatML
 tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path

diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -7,16 +7,18 @@
 import gradio as gr
 from tqdm.asyncio import tqdm as tqdm_async
 
-from .models import (
+from graphgen.bases.base_storage import StorageNameSpace
+from graphgen.models import (
     Chunk,
     JsonKVStorage,
     JsonListStorage,
     NetworkXStorage,
     OpenAIModel,
     Tokenizer,
     TraverseStrategy,
+    read_file,
 )
-from .models.storage.base_storage import StorageNameSpace
+
 from .operators import (
     extract_kg,
     generate_cot,
@@ -32,7 +34,6 @@
     create_event_loop,
     format_generation_results,
     logger,
-    read_file,
 )
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -108,94 +109,54 @@ def __post_init__(self):
             namespace=f"qa-{self.unique_id}",
         )
 
-    async def async_split_chunks(
-        self, data: List[Union[List, Dict]], data_type: str
-    ) -> dict:
+    async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
         # TODO: configurable whether to use coreference resolution
         if len(data) == 0:
             return {}
 
         inserting_chunks = {}
-        if data_type == "raw":
-            assert isinstance(data, list) and isinstance(data[0], dict)
-            # compute hash for each document
-            new_docs = {
-                compute_content_hash(doc["content"], prefix="doc-"): {
-                    "content": doc["content"]
-                }
-                for doc in data
-            }
-            _add_doc_keys = await self.full_docs_storage.filter_keys(
-                list(new_docs.keys())
-            )
-            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-            if len(new_docs) == 0:
-                logger.warning("All docs are already in the storage")
-                return {}
-            logger.info("[New Docs] inserting %d docs", len(new_docs))
-
-            cur_index = 1
-            doc_number = len(new_docs)
-            async for doc_key, doc in tqdm_async(
-                new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
-            ):
-                chunks = {
-                    compute_content_hash(dp["content"], prefix="chunk-"): {
-                        **dp,
-                        "full_doc_id": doc_key,
-                    }
-                    for dp in self.tokenizer_instance.chunk_by_token_size(
-                        doc["content"], self.chunk_overlap_size, self.chunk_size
-                    )
-                }
-                inserting_chunks.update(chunks)
-
-                if self.progress_bar is not None:
-                    self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
-                    cur_index += 1
+        assert isinstance(data, list) and isinstance(data[0], dict)
 
-            _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+        # compute hash for each document
+        new_docs = {
+            compute_content_hash(doc["content"], prefix="doc-"): {
+                "content": doc["content"]
             }
-        elif data_type == "chunked":
-            assert isinstance(data, list) and isinstance(data[0], list)
-            new_docs = {
-                compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
-                    "content": "".join(chunk["content"])
+            for doc in data
+        }
+        _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
+        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        if len(new_docs) == 0:
+            logger.warning("All docs are already in the storage")
+            return {}
+        logger.info("[New Docs] inserting %d docs", len(new_docs))
+
+        cur_index = 1
+        doc_number = len(new_docs)
+        async for doc_key, doc in tqdm_async(
+            new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
+        ):
+            chunks = {
+                compute_content_hash(dp["content"], prefix="chunk-"): {
+                    **dp,
+                    "full_doc_id": doc_key,
                 }
-                for doc in data
-                for chunk in doc
-            }
-            _add_doc_keys = await self.full_docs_storage.filter_keys(
-                list(new_docs.keys())
-            )
-            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-            if len(new_docs) == 0:
-                logger.warning("All docs are already in the storage")
-                return {}
-            logger.info("[New Docs] inserting %d docs", len(new_docs))
-            async for doc in tqdm_async(
-                data, desc="[1/4]Chunking documents", unit="doc"
-            ):
-                doc_str = "".join([chunk["content"] for chunk in doc])
-                for chunk in doc:
-                    chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
-                    inserting_chunks[chunk_key] = {
-                        **chunk,
-                        "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
-                    }
-            _add_chunk_keys = await self.text_chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+                for dp in self.tokenizer_instance.chunk_by_token_size(
+                    doc["content"], self.chunk_overlap_size, self.chunk_size
+                )
             }
-        else:
-            raise ValueError(f"Unknown data type: {data_type}")
+            inserting_chunks.update(chunks)
+
+            if self.progress_bar is not None:
+                self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
+                cur_index += 1
 
+        _add_chunk_keys = await self.text_chunks_storage.filter_keys(
+            list(inserting_chunks.keys())
+        )
+        inserting_chunks = {
+            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+        }
         await self.full_docs_storage.upsert(new_docs)
         await self.text_chunks_storage.upsert(inserting_chunks)
 
@@ -211,10 +172,8 @@ async def async_insert(self):
         """
 
         input_file = self.config["input_file"]
-        data_type = self.config["input_data_type"]
         data = read_file(input_file)
-
-        inserting_chunks = await self.async_split_chunks(data, data_type)
+        inserting_chunks = await self.async_split_chunks(data)
 
         if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")