Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
### Run Gradio Demo

```bash
python -m webui.app.py
python -m webui.app
```

![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84)
Expand Down Expand Up @@ -148,7 +148,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
```yaml
# configs/cot_config.yaml
input_data_type: raw
input_file: resources/input_examples/raw_demo.jsonl
input_file: resources/input_examples/jsonl_demo.jsonl
output_data_type: cot
tokenizer: cl100k_base
# additional settings...
Expand Down
4 changes: 2 additions & 2 deletions README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ GraphGen 首先根据源文本构建细粒度的知识图谱,然后利用期
### 运行 Gradio 演示

```bash
python -m webui.app.py
python -m webui.app
```

![ui](https://github.com/user-attachments/assets/3024e9bc-5d45-45f8-a4e6-b57bd2350d84)
Expand Down Expand Up @@ -147,7 +147,7 @@ GraphGen 首先根据源文本构建细粒度的知识图谱,然后利用期
```yaml
# configs/cot_config.yaml
input_data_type: raw
input_file: resources/input_examples/raw_demo.jsonl
input_file: resources/input_examples/jsonl_demo.jsonl
output_data_type: cot
tokenizer: cl100k_base
# 其他设置...
Expand Down
2 changes: 1 addition & 1 deletion baselines/EntiGraph/entigraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ async def generate_qa_sft(content):
parser.add_argument(
"--input_file",
help="Raw context jsonl path.",
default="resources/input_examples/chunked_demo.json",
default="resources/input_examples/json_demo.json",
type=str,
)
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion baselines/Genie/genie.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ async def process_chunk(content: str):
parser.add_argument(
"--input_file",
help="Raw context jsonl path.",
default="resources/input_examples/chunked_demo.json",
default="resources/input_examples/json_demo.json",
type=str,
)
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion baselines/LongForm/longform.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async def process_chunk(content: str):
parser.add_argument(
"--input_file",
help="Raw context jsonl path.",
default="resources/input_examples/chunked_demo.json",
default="resources/input_examples/json_demo.json",
type=str,
)
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion baselines/SELF-QA/self-qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ async def process_chunk(content: str):
parser.add_argument(
"--input_file",
help="Raw context jsonl path.",
default="resources/input_examples/chunked_demo.json",
default="resources/input_examples/json_demo.json",
type=str,
)
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion baselines/Wrap/wrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ async def process_chunk(content: str):
parser.add_argument(
"--input_file",
help="Raw context jsonl path.",
default="resources/input_examples/chunked_demo.json",
default="resources/input_examples/json_demo.json",
type=str,
)
parser.add_argument(
Expand Down
11 changes: 5 additions & 6 deletions graphgen/version.py → graphgen/_version.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

from typing import Tuple

__version__ = '20250416'
__version__ = "20250416"
short_version = __version__


Expand All @@ -15,13 +14,13 @@ def parse_version_info(version_str: str) -> Tuple:
tuple: A sequence of integer and string represents version.
"""
_version_info = []
for x in version_str.split('.'):
for x in version_str.split("."):
if x.isdigit():
_version_info.append(int(x))
elif x.find('rc') != -1:
patch_version = x.split('rc')
elif x.find("rc") != -1:
patch_version = x.split("rc")
_version_info.append(int(patch_version[0]))
_version_info.append(f'rc{patch_version[1]}')
_version_info.append(f"rc{patch_version[1]}")
return tuple(_version_info)


Expand Down
File renamed without changes.
20 changes: 20 additions & 0 deletions graphgen/bases/base_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List


class BaseReader(ABC):
"""
Abstract base class for reading and processing data.
"""

def __init__(self, text_column: str = "content"):
self.text_column = text_column

@abstractmethod
def read(self, file_path: str) -> List[Dict[str, Any]]:
"""
Read data from the specified file path.

:param file_path: Path to the input file.
:return: List of dictionaries containing the data.
"""
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from dataclasses import dataclass
from typing import Generic, TypeVar, Union

from graphgen.models.embed.embedding import EmbeddingFunc

T = TypeVar("T")


Expand Down Expand Up @@ -62,8 +60,6 @@ async def drop(self):

@dataclass
class BaseGraphStorage(StorageNameSpace):
embedding_func: EmbeddingFunc = None

async def has_node(self, node_id: str) -> bool:
raise NotImplementedError

Expand Down
3 changes: 1 addition & 2 deletions graphgen/configs/aggregated_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
input_data_type: raw # raw, chunked
input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
output_data_type: aggregated # atomic, aggregated, multi_hop, cot
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
Expand Down
3 changes: 1 addition & 2 deletions graphgen/configs/atomic_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
input_data_type: raw # raw, chunked
input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
output_data_type: atomic # atomic, aggregated, multi_hop, cot
output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
Expand Down
3 changes: 1 addition & 2 deletions graphgen/configs/cot_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
input_data_type: raw # raw, chunked
input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
output_data_type: cot # atomic, aggregated, multi_hop, cot
output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
Expand Down
3 changes: 1 addition & 2 deletions graphgen/configs/multi_hop_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
input_data_type: raw # raw, chunked
input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
Expand Down
127 changes: 43 additions & 84 deletions graphgen/graphgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@
import gradio as gr
from tqdm.asyncio import tqdm as tqdm_async

from .models import (
from graphgen.bases.base_storage import StorageNameSpace
from graphgen.models import (
Chunk,
JsonKVStorage,
JsonListStorage,
NetworkXStorage,
OpenAIModel,
Tokenizer,
TraverseStrategy,
read_file,
)
from .models.storage.base_storage import StorageNameSpace

from .operators import (
extract_kg,
generate_cot,
Expand All @@ -32,7 +34,6 @@
create_event_loop,
format_generation_results,
logger,
read_file,
)

sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
Expand Down Expand Up @@ -108,94 +109,54 @@ def __post_init__(self):
namespace=f"qa-{self.unique_id}",
)

async def async_split_chunks(
self, data: List[Union[List, Dict]], data_type: str
) -> dict:
async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
# TODO: configurable whether to use coreference resolution
if len(data) == 0:
return {}

inserting_chunks = {}
if data_type == "raw":
assert isinstance(data, list) and isinstance(data[0], dict)
# compute hash for each document
new_docs = {
compute_content_hash(doc["content"], prefix="doc-"): {
"content": doc["content"]
}
for doc in data
}
_add_doc_keys = await self.full_docs_storage.filter_keys(
list(new_docs.keys())
)
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if len(new_docs) == 0:
logger.warning("All docs are already in the storage")
return {}
logger.info("[New Docs] inserting %d docs", len(new_docs))

cur_index = 1
doc_number = len(new_docs)
async for doc_key, doc in tqdm_async(
new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
):
chunks = {
compute_content_hash(dp["content"], prefix="chunk-"): {
**dp,
"full_doc_id": doc_key,
}
for dp in self.tokenizer_instance.chunk_by_token_size(
doc["content"], self.chunk_overlap_size, self.chunk_size
)
}
inserting_chunks.update(chunks)

if self.progress_bar is not None:
self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
cur_index += 1
assert isinstance(data, list) and isinstance(data[0], dict)

_add_chunk_keys = await self.text_chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
# compute hash for each document
new_docs = {
compute_content_hash(doc["content"], prefix="doc-"): {
"content": doc["content"]
}
elif data_type == "chunked":
assert isinstance(data, list) and isinstance(data[0], list)
new_docs = {
compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
"content": "".join(chunk["content"])
for doc in data
}
_add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if len(new_docs) == 0:
logger.warning("All docs are already in the storage")
return {}
logger.info("[New Docs] inserting %d docs", len(new_docs))

cur_index = 1
doc_number = len(new_docs)
async for doc_key, doc in tqdm_async(
new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
):
chunks = {
compute_content_hash(dp["content"], prefix="chunk-"): {
**dp,
"full_doc_id": doc_key,
}
for doc in data
for chunk in doc
}
_add_doc_keys = await self.full_docs_storage.filter_keys(
list(new_docs.keys())
)
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if len(new_docs) == 0:
logger.warning("All docs are already in the storage")
return {}
logger.info("[New Docs] inserting %d docs", len(new_docs))
async for doc in tqdm_async(
data, desc="[1/4]Chunking documents", unit="doc"
):
doc_str = "".join([chunk["content"] for chunk in doc])
for chunk in doc:
chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
inserting_chunks[chunk_key] = {
**chunk,
"full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
}
_add_chunk_keys = await self.text_chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
for dp in self.tokenizer_instance.chunk_by_token_size(
doc["content"], self.chunk_overlap_size, self.chunk_size
)
}
else:
raise ValueError(f"Unknown data type: {data_type}")
inserting_chunks.update(chunks)

if self.progress_bar is not None:
self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
cur_index += 1

_add_chunk_keys = await self.text_chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}
await self.full_docs_storage.upsert(new_docs)
await self.text_chunks_storage.upsert(inserting_chunks)

Expand All @@ -211,10 +172,8 @@ async def async_insert(self):
"""

input_file = self.config["input_file"]
data_type = self.config["input_data_type"]
data = read_file(input_file)

inserting_chunks = await self.async_split_chunks(data, data_type)
inserting_chunks = await self.async_split_chunks(data)

if len(inserting_chunks) == 0:
logger.warning("All chunks are already in the storage")
Expand Down
Loading