合并数据

### ShareGPT 的 OpenAI 风格

In [None]:
import os
import json
import re
import random


def load_processed_obj_file(obj_file_path):
    """Return the full text of the OBJ file at ``obj_file_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default locale encoding.

    Args:
        obj_file_path: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(obj_file_path, 'r', encoding='utf-8') as f:
        return f.read()


def extract_object_id_from_content(content):
    """Return the word following the first ``Object ID `` in ``content``.

    Args:
        content: Text to search.

    Returns:
        The matched run of word characters, or ``None`` when the text
        contains no ``Object ID <word>`` occurrence.
    """
    found = re.search(r"Object ID (\w+)", content)
    if found is None:
        return None
    return found.group(1)


def migrate_legacy_format(entry):
    """Convert a legacy record into the ``uid`` / ``Object_ID`` / ``Name`` / ``messages`` layout.

    The legacy keys read are ``uid``, ``call``, ``instruction`` and
    ``output``; each one defaults to an empty string when absent.

    Args:
        entry: Legacy record (a mapping supporting ``.get``).

    Returns:
        A new dict with a three-turn ``messages`` list (system, user,
        assistant).
    """
    legacy_uid = entry.get("uid", "")
    prompt = entry.get("instruction", "")

    # Prefer an "Object ID <word>" mention inside the instruction; fall back
    # to the legacy uid when the instruction carries none.
    id_hit = re.search(r"Object ID (\w+)", prompt)
    if id_hit:
        resolved_id = id_hit.group(1)
    else:
        resolved_id = legacy_uid

    conversation = [
        {"role": "system", "content": "You are a helpful 3D mesh modeling AI assistant."},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": entry.get("output", "")},
    ]

    return {
        "uid": legacy_uid,
        "Object_ID": resolved_id,
        "Name": entry.get("call", ""),
        "messages": conversation,
    }


def update_json_file(obj_folder, reference_json_path, output_json_path):
    """Merge every ``.obj`` file under ``obj_folder`` into a chat-style JSON dataset.

    For each OBJ file, the file stem is used as the object id. The reference
    JSON is consulted for that id's ``Name`` and ``text`` (both default to an
    empty string when the id is unknown). A user prompt is built from a
    randomly chosen template, and the raw OBJ text becomes the assistant
    reply. Entries already present in ``output_json_path`` with the same
    ``Object_ID`` are replaced in place; all others are appended.

    All files are read and written as UTF-8 so that non-ASCII text (kept
    verbatim by ``ensure_ascii=False``) round-trips regardless of the
    platform's default encoding.

    Args:
        obj_folder: Directory that is walked recursively for ``*.obj`` files.
        reference_json_path: JSON file holding a list of records; only
            records carrying an ``"Object ID"`` key are used.
        output_json_path: Dataset file to update. It is read first if it
            exists, then overwritten with the merged result.

    Raises:
        OSError: If the reference file or an OBJ file cannot be opened.
        json.JSONDecodeError: If the reference file is not valid JSON.
    """
    print("开始更新 JSON 文件")

    # User-prompt variants; one is drawn at random per OBJ file. The assistant
    # turn is always the raw OBJ text, so no assistant template is needed.
    user_prompts = (
        "Create a 3D model of {description}",
        "Generate a 3D obj file for {description}",
        "I need a 3D model of {description}",
        "Make a 3D representation of {description}",
        "Design a 3D object based on {description}",
        "Create a 3D scene with {description}",
        "Build a 3D model according to {description}",
        "Produce a 3D model for {description}",
        "Craft a 3D structure of {description}",
        "Develop a 3D mesh for {description}",
    )

    # Load the existing dataset, if there is one.
    existing_data = []
    if os.path.exists(output_json_path):
        print(f"读取现有文件: {output_json_path}")
        with open(output_json_path, 'r', encoding='utf-8') as f:
            try:
                raw_data = json.load(f)
                for entry in raw_data:
                    if all(key in entry for key in ["uid", "Object_ID", "Name"]):
                        existing_data.append(entry)
                    else:  # Upgrade records written in the legacy layout.
                        existing_data.append(migrate_legacy_format(entry))
                print(f"成功加载 {len(raw_data)} 条数据")
            except json.JSONDecodeError:
                # Best effort: an unparsable dataset is treated as empty and
                # will be overwritten when the result is saved.
                print("警告：JSON 解析失败，初始化空数据集")

    # Load the reference metadata and key it by object id.
    print(f"加载参考文件: {reference_json_path}")
    with open(reference_json_path, 'r', encoding='utf-8') as f:
        reference_data = json.load(f)
    ref_map = {item["Object ID"]: item for item in reference_data if "Object ID" in item}

    # Map each Object_ID to the position of its first occurrence so that the
    # per-file lookup below is O(1) instead of a scan over the whole dataset.
    # Ids derived from file names are always strings, so only string ids can
    # ever match and need indexing.
    position_by_id = {}
    for idx, entry in enumerate(existing_data):
        known_id = entry.get("Object_ID") if isinstance(entry, dict) else None
        if isinstance(known_id, str):
            position_by_id.setdefault(known_id, idx)

    # Process the OBJ files.
    new_count = 0
    for root, _, files in os.walk(obj_folder):
        for file in files:
            if not file.endswith('.obj'):
                continue

            # The file stem doubles as the object id.
            object_id = os.path.splitext(file)[0]
            obj_path = os.path.join(root, file)

            # Metadata for this object; empty strings when the id is unknown.
            ref_info = ref_map.get(object_id, {})
            name = ref_info.get("Name", "")
            description = ref_info.get("text", "")

            obj_content = load_processed_obj_file(obj_path)
            user_content = random.choice(user_prompts).format(description=description)

            new_entry = {
                "uid": object_id,
                "Object_ID": object_id,
                "Name": name,
                "messages": [
                    {"role": "system", "content": ""},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": obj_content}
                ]
            }

            # Replace the entry with the same Object_ID, or append a new one.
            idx = position_by_id.get(object_id)
            if idx is None:
                position_by_id[object_id] = len(existing_data)
                existing_data.append(new_entry)
                new_count += 1
            else:
                existing_data[idx] = new_entry

    # Save the merged dataset.
    print(f"写入文件: {output_json_path}")
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, indent=2, ensure_ascii=False)
    print(f"完成！新增 {new_count} 条，总计 {len(existing_data)} 条数据")


if __name__ == "__main__":
    # All three paths are empty strings in the source — presumably
    # placeholders to be set before running this cell.
    obj_folder = ""
    reference_json_path = ""
    output_json_path = ""
    update_json_file(
        obj_folder=obj_folder,
        reference_json_path=reference_json_path,
        output_json_path=output_json_path,
    )

修改 JSON 文件，去掉多余的字段（uid、Name、Object_ID）

In [None]:
import os
import json


def _drop_fields(record, field_names=("uid", "Name", "Object_ID")):
    """Delete the listed keys from ``record`` in place; return how many were removed."""
    removed = 0
    for field in field_names:
        if field in record:
            del record[field]
            removed += 1
    return removed


def remove_fields(json_path):
    """Strip the ``uid``, ``Name`` and ``Object_ID`` keys from a JSON file in place.

    The file may hold either a list of objects or a single object. In a
    list, items that are not JSON objects are left untouched. Any other
    top-level value is written back unchanged. Progress is printed with a
    running step number, which also advances once per removed key.

    The file is read and written as UTF-8, and non-ASCII text is written
    verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        json_path: Path of the JSON file to rewrite. If it does not exist,
            a message is printed and nothing else happens.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    step_count = 1
    print(f"[{step_count}] 开始检查 JSON 文件是否存在...")
    if not os.path.exists(json_path):
        step_count += 1
        print(f"[{step_count}] 文件 {json_path} 不存在。")
        return

    step_count += 1
    print(f"[{step_count}] 文件 {json_path} 存在，开始读取文件内容...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    step_count += 1
    print(f"[{step_count}] 文件内容读取成功，开始处理数据中的字段...")
    if isinstance(data, list):
        for item in data:
            # Only JSON objects can carry the fields; skip strings, numbers, etc.
            if isinstance(item, dict):
                step_count += _drop_fields(item)
    elif isinstance(data, dict):
        step_count += _drop_fields(data)

    step_count += 1
    print(f"[{step_count}] 字段处理完成，开始将处理后的数据写回文件...")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    step_count += 1
    print(f"[{step_count}] 数据已成功写回文件 {json_path}。")


if __name__ == "__main__":
    # Dataset file to clean, relative to the current working directory.
    json_path = "./data_700face/dataxyz.json"
    remove_fields(json_path=json_path)