From 0bda78ec35ae8f878646e5ab36a8ae8b4c95cdaf Mon Sep 17 00:00:00 2001
From: SwayamInSync
Date: Wed, 7 May 2025 08:49:03 +0000
Subject: [PATCH] adding data preparation script and usage

---
 src/train/README.md    | 45 ++++++++++++++++++++++++++++++++++++++++++++++---
 src/train/data_prep.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100644 src/train/data_prep.py

diff --git a/src/train/README.md b/src/train/README.md
index 795dc6a..64b81b0 100644
--- a/src/train/README.md
+++ b/src/train/README.md
@@ -9,6 +9,17 @@
 - `sft.py` contains the code for training model with Full Supervised Finetuning
 
-## Usgae
+## Usage
+### Preparing the dataset
+- Download both the instruction and conversational dataset variants from Hugging Face:
+  - [microsoft/NextCoderDataset](https://huggingface.co/datasets/microsoft/NextCoderDataset)
+  - [microsoft/NextCoderDataset-Conversational](https://huggingface.co/datasets/microsoft/NextCoderDataset-Conversational)
+
+- Run the `data_prep.py` script with the corresponding data path for the instruction variant; this will fetch the commitpackft subset and create the complete dataset for instruction tuning:
+  ```bash
+  python data_prep.py --commitpackft_mapping ../data/commitpackft_subset.csv --save_dir .
+  ```
+  This saves the final instruction dataset as `instruction_dataset`, which is used for stage-1 training.
+
 ### Training with SFT
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
 - set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
@@ -36,7 +47,21 @@
       --debug True \
   ```
   Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 175, 176 and 185 and run the same command with proper dataset path
+- To train on conversation data by applying the loss only on the response, uncomment lines 175, 176, and 185, then run the same command with the proper conversational dataset path:
+  ```python
+  response_template = "#RESPONSE\n"
+  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
+
+  # Initialize trainer
+  trainer = SFTTrainer(
+      model=model,
+      processing_class=tokenizer,
+      train_dataset=dataset,
+      args=training_config,
+      callbacks=[Callback(flush_steps=1)],
+      data_collator=collator,  # pass the collator to the trainer
+  )
+  ```
 
 ### Training with LoRA
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
@@ -101,4 +126,18 @@
       --alpha "Enter value for desired alpha parameter for SeleKT" \
   ```
   Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 291, 292 and 301 and run the same command with proper dataset path
\ No newline at end of file
+- To train on conversation data by applying the loss only on the response, uncomment lines 291, 292, and 301, then run the same command with the proper conversational dataset path:
+  ```python
+  response_template = "#RESPONSE\n"
+  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
+
+  # Initialize trainer
+  trainer = SFTTrainer(
+      model=model,
+      processing_class=tokenizer,
+      train_dataset=dataset,
+      args=training_config,
+      callbacks=[Callback(flush_steps=1)],
+      data_collator=collator,  # pass the collator to the trainer
+  )
+  ```
\ No newline at end of file
diff --git a/src/train/data_prep.py b/src/train/data_prep.py
new file mode 100644
index 0000000..0f0f619
--- /dev/null
+++ b/src/train/data_prep.py
@@ -0,0 +1,58 @@
+from datasets import load_dataset, concatenate_datasets, Dataset
+import pandas as pd
+import argparse
+
+
+# Build an instruction/completion pair from a single commitpackft example
+def process(example):
+    lang = example['lang'].lower()
+    message = example['message']
+    instruction = f'''
+    Rewrite the given {lang} program as per the following instruction.\n
+    {message}
+    Write the entire code and no other text in the response.
+    ```{lang}\n{example['old_contents']}
+    ```
+    '''
+
+    completion = f'''
+    ```{lang}
+    {example['new_contents']}
+    ```'''
+
+    return {
+        'prompt': instruction,
+        'completion': completion,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--commitpackft_mapping", type=str, default="../data/commitpackft_subset.csv")
+    parser.add_argument("--save_dir", type=str, default="")
+    args = parser.parse_args()
+
+    # Download the commitpackft subsets for all supported languages
+    langs = ["python", "javascript", "java", "go", "c", "c++", "kotlin", "rust"]
+    commitpck = []
+    for lang in langs:
+        commitpck.append(load_dataset("bigcode/commitpackft", lang, trust_remote_code=True))
+
+    commitpack_final = concatenate_datasets([_["train"] for _ in commitpck])
+
+    # Keep only the rows listed in the subset mapping (matched on repo and commit)
+    commitpack_subset_map = pd.read_csv(args.commitpackft_mapping)
+    commitpack_subset_map.columns = ["repos", "commit"]
+    commitpack_final_df = commitpack_final.to_pandas()
+    matched_df = commitpack_final_df.merge(commitpack_subset_map, on=['commit', 'repos'])
+    filtered_dataset = Dataset.from_pandas(matched_df)
+
+    processed_dataset = filtered_dataset.map(process, remove_columns=filtered_dataset.column_names)
+
+    synthetic_ins = load_dataset("microsoft/NextCoderDataset")
+
+    # Combine the synthetic instruction data with the processed commitpackft subset
+    final_instruction_ds = concatenate_datasets([synthetic_ins['train'], processed_dataset])
+    final_instruction_ds.save_to_disk(f"{args.save_dir}/instruction_dataset")
+
+    print(f"Dataset {args.save_dir}/instruction_dataset saved to disk.")
\ No newline at end of file
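
A quick way to verify the prepared dataset after running the patched `data_prep.py` (a minimal sketch, not part of the patch above; it assumes the script was invoked with `--save_dir .` as in the README example, so the output lives at `./instruction_dataset`):

```python
# Sanity-check sketch: reload the dataset that data_prep.py saved to disk.
# Assumes --save_dir . was used, i.e. the output path is ./instruction_dataset.
from datasets import load_from_disk

dataset = load_from_disk("./instruction_dataset")
print(dataset)                         # expect 'prompt' and 'completion' columns
print(dataset[0]["prompt"][:300])      # peek at one instruction
print(dataset[0]["completion"][:300])  # and its paired completion
```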