From 03a6f498245bfa0d6bac71832c5e15585dd44cb2 Mon Sep 17 00:00:00 2001
From: John-Saxon
Date: Mon, 6 Nov 2023 13:37:38 +0800
Subject: [PATCH] #122: support multi-turn datasets (chase/sparc/cosql) and
 merge all data together

---
 dbgpt_hub/configs/config.py                | 27 ++++++++++++++++-
 dbgpt_hub/data_process/sql_data_process.py | 87 ++++++++++++++++++----------
 2 files changed, 82 insertions(+), 32 deletions(-)

diff --git a/dbgpt_hub/configs/config.py b/dbgpt_hub/configs/config.py
index b3f4853..cf267c8 100644
--- a/dbgpt_hub/configs/config.py
+++ b/dbgpt_hub/configs/config.py
@@ -44,6 +44,7 @@ EXT2TYPE = {"csv": "csv", "json": "json", "jsonl": "json", "txt": "text"}
 
 
 # text2sql dataset information for processing sql data
+# TODO: add BIRD / WikiSQL / ...
 SQL_DATA_INFO = [
     {
         "data_source": "spider",
@@ -53,6 +54,30 @@
         "db_id_name": "db_id",
         "is_multiple_turn": False,
-    }
+    },
+    {
+        "data_source": "chase",
+        "train_file": ["Chase/chase_train.json"],
+        "dev_file": ["Chase/chase_dev.json"],
+        "tables_file": "Chase/chase_tables.json",
+        "db_id_name": "database_id",
+        "is_multiple_turn": True,
+    },
+    {
+        "data_source": "cosql_dataset",
+        "train_file": ["sql_state_tracking/cosql_train.json"],
+        "dev_file": ["sql_state_tracking/cosql_dev.json"],
+        "tables_file": "tables.json",
+        "db_id_name": "database_id",
+        "is_multiple_turn": True,
+    },
+    {
+        "data_source": "sparc",
+        "train_file": ["train.json"],
+        "dev_file": ["dev.json"],
+        "tables_file": "tables.json",
+        "db_id_name": "database_id",
+        "is_multiple_turn": True,
+    },
 ]
 INSTRUCTION_PROMPT = """\
 I want you to act as a SQL terminal in front of an example database, \
diff --git a/dbgpt_hub/data_process/sql_data_process.py b/dbgpt_hub/data_process/sql_data_process.py
index 8b0286e..fd67c93 100644
--- a/dbgpt_hub/data_process/sql_data_process.py
+++ b/dbgpt_hub/data_process/sql_data_process.py
@@ -17,15 +17,15 @@
 
 
 class ProcessSqlData:
-    def __init__(self) -> None:
-        pass
+    def __init__(self, train_file=None, dev_file=None) -> None:
+        self.train_file = train_file
+        self.dev_file = dev_file
 
-    def decode_json_file(self, data_file_list, table_file, out_file):
+    def decode_json_file(self, data_file_list, table_file, db_id_name, is_multiple_turn=False):
         """
         TODO:
             1. move the related prompts into the config
             2. move each data source's field names into the config
-            3. support multi-turn dialogue datasets
         """
 
         if table_file.endswith(".jsonl"):
@@ -87,46 +87,71 @@ def decode_json_file(self, data_file_list, table_file, out_file):
-        # 单论对话
+        # expand each sample into SFT record(s); multi-turn samples carry dialogue history
         res = []
         for data in tqdm(datas):
-            if data["db_id"] in db_dict.keys():
-                input = {
-                    "db_id": data["db_id"],
-                    "instruction": INSTRUCTION_PROMPT.format(db_dict[data["db_id"]]),
-                    "input": INPUT_PROMPT.format(data["question"]),
-                    "output": data["query"],
-                    "history": [],
-                }
-                res.append(input)
-
-        with open(out_file, "w", encoding="utf-8") as s:
-            json.dump(res, s, indent=4, ensure_ascii=False)
+            if data[db_id_name] in db_dict.keys():
+                if is_multiple_turn:
+                    history = []
+                    for interaction in data["interaction"]:
+                        input = {
+                            "db_id": data[db_id_name],
+                            "instruction": INSTRUCTION_PROMPT.format(db_dict[data[db_id_name]]),
+                            "input": INPUT_PROMPT.format(interaction["utterance"]),
+                            "output": interaction["query"],
+                            "history": list(history),  # copy, so later turns don't mutate stored records
+                        }
+                        res.append(input)
+                        history.append((INPUT_PROMPT.format(interaction["utterance"]), interaction["query"]))
+                else:
+                    input = {
+                        "db_id": data[db_id_name],
+                        "instruction": INSTRUCTION_PROMPT.format(db_dict[data[db_id_name]]),
+                        "input": INPUT_PROMPT.format(data["question"]),
+                        "output": data["query"],
+                        "history": [],
+                    }
+                    res.append(input)
+        return res
 
     def create_sft_raw_data(self):
+        train_data = []
+        dev_data = []
         for data_info in SQL_DATA_INFO:
             train_data_file_list = [
                 os.path.join(DATA_PATH, data_info["data_source"], file)
                 for file in data_info["train_file"]
             ]
-            self.decode_json_file(
-                data_file_list=train_data_file_list,
-                table_file=os.path.join(
-                    DATA_PATH, data_info["data_source"], data_info["tables_file"]
-                ),
-                out_file=os.path.join(DATA_PATH, "example_text2sql_train.json"),
+            train_data.extend(
+                self.decode_json_file(
+                    data_file_list=train_data_file_list,
+                    table_file=os.path.join(
+                        DATA_PATH, data_info["data_source"], data_info["tables_file"]
+                    ),
+                    db_id_name=data_info["db_id_name"],
+                    is_multiple_turn=data_info["is_multiple_turn"],
+                )
             )
 
             dev_data_file_list = [
                 os.path.join(DATA_PATH, data_info["data_source"], file)
                 for file in data_info["dev_file"]
             ]
-            self.decode_json_file(
-                data_file_list=dev_data_file_list,
-                table_file=os.path.join(
-                    DATA_PATH, data_info["data_source"], data_info["tables_file"]
-                ),
-                out_file=os.path.join(DATA_PATH, "example_text2sql_dev.json"),
+            dev_data.extend(
+                self.decode_json_file(
+                    data_file_list=dev_data_file_list,
+                    table_file=os.path.join(
+                        DATA_PATH, data_info["data_source"], data_info["tables_file"]
+                    ),
+                    db_id_name=data_info["db_id_name"],
+                    is_multiple_turn=data_info["is_multiple_turn"],
+                )
             )
+        with open(self.train_file, "w", encoding="utf-8") as s:
+            json.dump(train_data, s, indent=4, ensure_ascii=False)
+        with open(self.dev_file, "w", encoding="utf-8") as s:
+            json.dump(dev_data, s, indent=4, ensure_ascii=False)
 
 
 if __name__ == "__main__":
-    precess = ProcessSqlData()
-    precess.create_sft_raw_data()
+    all_in_one_train_file = os.path.join(DATA_PATH, "example_text2sql_train.json")
+    all_in_one_dev_file = os.path.join(DATA_PATH, "example_text2sql_dev.json")
+    process = ProcessSqlData(train_file=all_in_one_train_file, dev_file=all_in_one_dev_file)
+    process.create_sft_raw_data()
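
Note: below is a minimal, self-contained sketch (not part of the patch) that
mirrors the multi-turn branch of decode_json_file, to show how one
CoSQL/SParC-style interaction expands into per-turn SFT records with
cumulative history. The prompt templates, the expand_interaction helper, and
the sample interaction are illustrative stand-ins; the real templates live in
dbgpt_hub/configs/config.py.

INSTRUCTION_PROMPT = "Schema: {}"  # stand-in for the real template
INPUT_PROMPT = "Question: {}"      # stand-in for the real template

def expand_interaction(data, db_id_name="database_id", schema="singer(...)"):
    history, records = [], []
    for turn in data["interaction"]:
        records.append(
            {
                "db_id": data[db_id_name],
                "instruction": INSTRUCTION_PROMPT.format(schema),
                "input": INPUT_PROMPT.format(turn["utterance"]),
                "output": turn["query"],
                "history": list(history),  # snapshot of the turns so far
            }
        )
        history.append((INPUT_PROMPT.format(turn["utterance"]), turn["query"]))
    return records

sample = {
    "database_id": "concert_singer",
    "interaction": [
        {"utterance": "How many singers do we have?",
         "query": "SELECT count(*) FROM singer"},
        {"utterance": "Only the French ones.",
         "query": "SELECT count(*) FROM singer WHERE country = 'France'"},
    ],
}
for record in expand_interaction(sample):
    print(record["input"], "->", record["output"], "| history:", record["history"])

Each turn becomes its own record, so the model is trained on every prefix of
the dialogue; the second record carries the first (question, query) pair as
history, which is what lets it resolve follow-up references like "the French
ones".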
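
A quick way to sanity-check the merged output after running the script; the
data directory below is an assumption, substitute whatever DATA_PATH resolves
to in dbgpt_hub/configs/config.py:

import json

# Count how many of the merged records actually carry multi-turn history.
with open("dbgpt_hub/data/example_text2sql_train.json", encoding="utf-8") as f:
    records = json.load(f)
multi_turn = sum(1 for r in records if r["history"])
print(f"{len(records)} records total, {multi_turn} with non-empty history")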