graphgen/configs/aggregated_config.yaml (6 changes: 3 additions & 3 deletions)

```diff
@@ -3,12 +3,12 @@ read:
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
-output_data_type: aggregated # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+output_data_type: aggregated # atomic, aggregated, multi_hop, cot
+output_data_format: ChatML # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
```
graphgen/configs/atomic_config.yaml (6 changes: 3 additions & 3 deletions)

```diff
@@ -3,12 +3,12 @@ read:
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
-output_data_type: atomic # atomic, aggregated, multi_hop, cot
-output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+output_data_type: atomic # atomic, aggregated, multi_hop, cot
+output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
```
graphgen/configs/cot_config.yaml (6 changes: 3 additions & 3 deletions)

```diff
@@ -3,12 +3,12 @@ read:
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
-output_data_type: cot # atomic, aggregated, multi_hop, cot
-output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+output_data_type: cot # atomic, aggregated, multi_hop, cot
+output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 method_params:
   method: leiden
   max_size: 20 # Maximum size of communities
```
graphgen/configs/multi_hop_config.yaml (6 changes: 3 additions & 3 deletions)

```diff
@@ -3,12 +3,12 @@ read:
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
-output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
+output_data_format: ChatML # Alpaca, Sharegpt, ChatML
+tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
   enabled: false
   quiz_samples: 2 # number of quiz samples to generate
```
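All four configs receive the same edit: the output settings (output_data_type, output_data_format, tokenizer) move from above the search block to below it, so each file reads in pipeline order. The move is purely cosmetic for consumers of the config; a quick sanity check (our own sketch, not part of the PR; assumes PyYAML is installed):

```python
# Key order in a YAML mapping is presentation only; lookups are unchanged.
import yaml  # PyYAML, assumed available

with open("graphgen/configs/aggregated_config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Values taken from the diff above.
assert cfg["output_data_type"] == "aggregated"
assert cfg["output_data_format"] == "ChatML"
assert cfg["tokenizer"] == "cl100k_base"
```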
webui/app.py (109 changes: 63 additions & 46 deletions)
```diff
@@ -13,11 +13,10 @@
 from graphgen.models.llm.limitter import RPM, TPM
 from graphgen.utils import set_logger
 from webui.base import WebuiParams
-from webui.cache_utils import cleanup_workspace, setup_workspace
-from webui.count_tokens import count_tokens
 from webui.i18n import Translate
 from webui.i18n import gettext as _
 from webui.test_api import test_api_connection
+from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace

 root_dir = files("webui").parent
 sys.path.append(root_dir)
```
```diff
@@ -391,6 +390,58 @@ def sum_tokens(client):
                 with gr.Column(scale=1):
                     test_connection_btn = gr.Button(_("Test Connection"))

+            with gr.Row(equal_height=True):
+                with gr.Column(scale=1):
+                    with gr.Blocks():
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=1):
+                                upload_file = gr.File(
+                                    label=_("Upload File"),
+                                    file_count="single",
+                                    file_types=[".txt", ".json", ".jsonl", ".csv"],
+                                    interactive=True,
+                                )
+                                examples_dir = os.path.join(root_dir, "webui", "examples")
+                                gr.Examples(
+                                    examples=[
+                                        [os.path.join(examples_dir, "txt_demo.txt")],
+                                        [os.path.join(examples_dir, "jsonl_demo.jsonl")],
+                                        [os.path.join(examples_dir, "json_demo.json")],
+                                        [os.path.join(examples_dir, "csv_demo.csv")],
+                                    ],
+                                    inputs=upload_file,
+                                    label=_("Example Files"),
+                                    examples_per_page=4,
+                                )
+                            with gr.Column(scale=1):
+                                with gr.Blocks():
+                                    preview_code = gr.Code(
+                                        label=_("File Preview"),
+                                        interactive=False,
+                                        visible=True,
+                                        elem_id="preview_code",
+                                    )
+                                    preview_df = gr.DataFrame(
+                                        label=_("File Preview"),
+                                        interactive=False,
+                                        visible=False,
+                                        elem_id="preview_df",
+                                    )
+
+                    with gr.Blocks():
+                        token_counter = gr.DataFrame(
+                            label="Token Stats",
+                            headers=[
+                                "Source Text Token Count",
+                                "Estimated Token Usage",
+                                "Token Used",
+                            ],
+                            datatype="str",
+                            interactive=False,
+                            visible=False,
+                            wrap=True,
+                        )
+
             with gr.Blocks():
                 with gr.Row(equal_height=True):
                     with gr.Column():
```
```diff
@@ -415,46 +466,12 @@ def sum_tokens(client):
                     )

             with gr.Blocks():
-                with gr.Row(equal_height=True):
-                    with gr.Column(scale=1):
-                        upload_file = gr.File(
-                            label=_("Upload File"),
-                            file_count="single",
-                            file_types=[".txt", ".json", ".jsonl", ".csv"],
-                            interactive=True,
-                        )
-                        examples_dir = os.path.join(root_dir, "webui", "examples")
-                        gr.Examples(
-                            examples=[
-                                [os.path.join(examples_dir, "txt_demo.txt")],
-                                [os.path.join(examples_dir, "jsonl_demo.jsonl")],
-                                [os.path.join(examples_dir, "json_demo.json")],
-                                [os.path.join(examples_dir, "csv_demo.csv")],
-                            ],
-                            inputs=upload_file,
-                            label=_("Example Files"),
-                            examples_per_page=4,
-                        )
-                    with gr.Column(scale=1):
-                        output = gr.File(
-                            label="Output(See Github FAQ)",
-                            file_count="single",
-                            interactive=False,
-                        )
-
-            with gr.Blocks():
-                token_counter = gr.DataFrame(
-                    label="Token Stats",
-                    headers=[
-                        "Source Text Token Count",
-                        "Estimated Token Usage",
-                        "Token Used",
-                    ],
-                    datatype="str",
-                    interactive=False,
-                    visible=False,
-                    wrap=True,
-                )
+                with gr.Column(scale=1):
+                    output = gr.File(
+                        label=_("Output File"),
+                        file_count="single",
+                        interactive=False,
+                    )

             submit_btn = gr.Button(_("Run GraphGen"))

```
```diff
@@ -494,13 +511,13 @@ def sum_tokens(client):
         )

         upload_file.change(
-            lambda x: (gr.update(visible=True)),
-            inputs=[upload_file],
-            outputs=[token_counter],
+            preview_file, inputs=upload_file, outputs=[preview_code, preview_df]
+        ).then(
+            lambda x: gr.update(visible=True), inputs=upload_file, outputs=token_counter
         ).then(
             count_tokens,
             inputs=[upload_file, tokenizer, token_counter],
-            outputs=[token_counter],
+            outputs=token_counter,
         )

         # run GraphGen
```

Copilot AI (Sep 26, 2025), on lines +514 to +516: The lambda function should be extracted to a named function to improve code readability and maintainability.
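The suggestion is reasonable; a minimal sketch of the extraction (the function name is ours, not the PR's, and the snippet assumes the surrounding Blocks context from app.py):

```python
def show_token_counter(_file):
    # Named replacement for the inline lambda: reveal the token-stats
    # table once a file has been uploaded.
    return gr.update(visible=True)

upload_file.change(
    preview_file, inputs=upload_file, outputs=[preview_code, preview_df]
).then(
    show_token_counter, inputs=upload_file, outputs=token_counter
).then(
    count_tokens,
    inputs=[upload_file, tokenizer, token_counter],
    outputs=token_counter,
)
```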
webui/translation.json (16 changes: 12 additions & 4 deletions)

```diff
@@ -1,8 +1,10 @@
 {
   "en": {
     "Title": "✨Easy-to-use LLM Training Data Generation Framework✨",
+    "\n\n": "\n\n",
+    "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
     "Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.",
     "# ": "# ",
     "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud",
     "Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default",
     "Synthesizer Model Info": "Model for constructing KGs and generating QAs",
@@ -11,16 +13,20 @@
     "SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model",
     "Model Config": "Model Configuration",
     "Generation Config": "Generation Config",
     "API Config": "API Config",
     "### ": "### ",
     "SiliconFlow Token": "SiliconFlow API Key",
     "Test Connection": "Test Connection",
     "Upload File": "Upload File",
     "Example Files": "Example Files",
-    "Run GraphGen": "Run GraphGen"
+    "Run GraphGen": "Run GraphGen",
+    "Output File": "Output File",
+    "File Preview": "File Preview"
   },
   "zh": {
     "Title": "✨开箱即用的LLM训练数据生成框架✨",
+    "\n\n": "\n\n",
+    "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
     "Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
     "# ": "# ",
     "Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用",
     "Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动",
     "Synthesizer Model Info": "用于构建知识图谱和生成问答的模型",
@@ -29,10 +35,12 @@
     "SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model",
     "Model Config": "模型配置",
     "Generation Config": "生成配置",
     "API Config": "API Config",
     "### ": "### ",
     "SiliconFlow Token": "SiliconFlow Token",
     "Test Connection": "测试接口",
     "Upload File": "上传文件",
     "Example Files": "示例文件",
-    "Run GraphGen": "运行GraphGen"
+    "Run GraphGen": "运行GraphGen",
+    "Output File": "输出文件",
+    "File Preview": "文件预览"
   }
 }
```
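The new keys back the `_()` labels added in webui/app.py (`_("Output File")`, `_("File Preview")`). A minimal sketch of how a JSON-backed lookup behaves against this table (our illustration of the post-merge file; webui.i18n's actual implementation may differ):

```python
import json

with open("webui/translation.json", encoding="utf-8") as f:
    TABLE = json.load(f)

def translate(key: str, lang: str) -> str:
    # Fall back to the key itself when a language or entry is missing.
    return TABLE.get(lang, {}).get(key, key)

assert translate("Output File", "zh") == "输出文件"        # new zh entry
assert translate("File Preview", "en") == "File Preview"  # new en entry
```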
webui/utils/__init__.py (3 changes: 3 additions & 0 deletions, new file)

```python
from .cache import cleanup_workspace, setup_workspace
from .count_tokens import count_tokens
from .preview_file import preview_file
```
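With the package in place, call sites need a single import. Before/after at the app.py call site (mirroring the import hunk above):

```python
# Before: helpers scattered across top-level modules
# from webui.cache_utils import cleanup_workspace, setup_workspace
# from webui.count_tokens import count_tokens

# After: one package re-exports everything
from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace
```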
webui/cache_utils.py → webui/utils/cache.py (3 changes: 2 additions & 1 deletion)

```diff
@@ -1,6 +1,7 @@
 import os
-import uuid
 import shutil
+import uuid
+

 def setup_workspace(folder):
     request_id = str(uuid.uuid4())
```
webui/count_tokens.py → webui/utils/count_tokens.py (file renamed without changes)
webui/utils/preview_file.py (29 changes: 29 additions & 0 deletions, new file)

```python
import codecs
import os

import gradio as gr
import pandas as pd


def preview_file(file):
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    path = file.name
    ext = os.path.splitext(path)[1].lower()

    try:
        if ext == ".csv":
            df = pd.read_csv(path, nrows=10)
            return gr.update(visible=False), gr.update(value=df, visible=True)
        with codecs.open(path, "r", encoding="utf-8") as f:
            text = f.read(5000)
        if len(text) == 5000:
            text += "\n\n... (truncated at 5000 chars)"
        return gr.update(
            value=text, visible=True, language="json" if ext != ".txt" else None
        ), gr.update(visible=False)
    except Exception as e:  # pylint: disable=broad-except
        return gr.update(
            value=f"Preview failed: {e}", visible=True, language=None
        ), gr.update(visible=False)
```

Copilot AI (Sep 26, 2025), on the pd.read_csv call: The magic number 10 for CSV row limit should be defined as a named constant at the module level to improve maintainability.

Copilot AI (Sep 26, 2025), on lines +19 to +22: The magic number 5000 should be defined as a named constant at the module level to improve maintainability and make the truncation limit configurable.
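Both comments point at the same fix; a sketch of the module with the flagged limits hoisted to module-level constants (constant names are ours, not Copilot's or the PR's):

```python
import codecs
import os

import gradio as gr
import pandas as pd

# Named limits replacing the magic numbers flagged above (names assumed).
CSV_PREVIEW_ROWS = 10      # rows shown when previewing a CSV upload
TEXT_PREVIEW_CHARS = 5000  # characters shown when previewing a text-like upload


def preview_file(file):
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    path = file.name
    ext = os.path.splitext(path)[1].lower()

    try:
        if ext == ".csv":
            df = pd.read_csv(path, nrows=CSV_PREVIEW_ROWS)
            return gr.update(visible=False), gr.update(value=df, visible=True)
        with codecs.open(path, "r", encoding="utf-8") as f:
            text = f.read(TEXT_PREVIEW_CHARS)
        if len(text) == TEXT_PREVIEW_CHARS:
            text += f"\n\n... (truncated at {TEXT_PREVIEW_CHARS} chars)"
        return gr.update(
            value=text, visible=True, language="json" if ext != ".txt" else None
        ), gr.update(visible=False)
    except Exception as e:  # pylint: disable=broad-except
        return gr.update(
            value=f"Preview failed: {e}", visible=True, language=None
        ), gr.update(visible=False)
```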