From 0bda78ec35ae8f878646e5ab36a8ae8b4c95cdaf Mon Sep 17 00:00:00 2001
From: SwayamInSync
Date: Wed, 7 May 2025 08:49:03 +0000
Subject: [PATCH] adding data preparation script and usage

---
 src/train/README.md    | 45 ++++++++++++++++++++++++++++++++++++++++++++++---
 src/train/data_prep.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100644 src/train/data_prep.py

diff --git a/src/train/README.md b/src/train/README.md
index 795dc6a..64b81b0 100644
--- a/src/train/README.md
+++ b/src/train/README.md
@@ -9,6 +9,17 @@
 - `sft.py` contains the code for training model with Full Supervised Finetuning
 
-## Usgae
+## Usage
+### Preparing the dataset
+- Download both the instruction and conversational dataset variants from Hugging Face:
+  - [microsoft/NextCoderDataset](https://huggingface.co/datasets/microsoft/NextCoderDataset)
+  - [microsoft/NextCoderDataset-Conversational](https://huggingface.co/datasets/microsoft/NextCoderDataset-Conversational)
+
+- Run the `data_prep.py` script with the corresponding data path for the instruction variant; this will fetch the commitpackft subset and create the complete dataset for instruction tuning:
+  ```bash
+  python data_prep.py --commitpackft_mapping ../data/commitpackft_subset.csv --save_dir .
+  ```
+  This saves the final instruction dataset as `instruction_dataset`, which is used for stage-1 training.
+
 ### Training with SFT
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
 - set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
@@ -36,7 +47,21 @@
       --debug True \
   ```
   Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 175, 176 and 185 and run the same command with proper dataset path
+- To train on conversation data by applying the loss only on the response, uncomment lines 175, 176, and 185, then run the same command with the proper conversational dataset path:
+  ```python
+  response_template = "#RESPONSE\n"
+  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
+
+  # Initialize trainer
+  trainer = SFTTrainer(
+      model=model,
+      processing_class=tokenizer,
+      train_dataset=dataset,
+      args=training_config,
+      callbacks=[Callback(flush_steps=1)],
+      data_collator=collator,  # pass the collator to the trainer
+  )
+  ```
 
 ### Training with LoRA
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
@@ -101,4 +126,18 @@
       --alpha "Enter value for desired alpha parameter for SeleKT" \
   ```
   Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 291, 292 and 301 and run the same command with proper dataset path
\ No newline at end of file
+- To train on conversation data by applying the loss only on the response, uncomment lines 291, 292, and 301, then run the same command with the proper conversational dataset path:
+  ```python
+  response_template = "#RESPONSE\n"
+  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
+
+  # Initialize trainer
+  trainer = SFTTrainer(
+      model=model,
+      processing_class=tokenizer,
+      train_dataset=dataset,
+      args=training_config,
+      callbacks=[Callback(flush_steps=1)],
+      data_collator=collator,  # pass the collator to the trainer
+  )
+  ```
\ No newline at end of file
diff --git a/src/train/data_prep.py b/src/train/data_prep.py
new file mode 100644
index 0000000..0f0f619
--- /dev/null
+++ b/src/train/data_prep.py
@@ -0,0 +1,58 @@
+from datasets import load_dataset, concatenate_datasets, Dataset
+import pandas as pd
+import argparse
+
+
+# Build an instruction/completion pair from a single commitpackft example
+def process(example):
+    lang = example['lang'].lower()
+    message = example['message']
+    instruction = f'''
+    Rewrite the given {lang} program as per the following instruction.\n
+    {message}
+    Write the entire code and no other text in the response.
+    ```{lang}\n{example['old_contents']}
+    ```
+    '''
+
+    completion = f'''
+    ```{lang}
+    {example['new_contents']}
+    ```'''
+
+    return {
+        'prompt': instruction,
+        'completion': completion,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--commitpackft_mapping", type=str, default="../data/commitpackft_subset.csv")
+    parser.add_argument("--save_dir", type=str, default="")
+    args = parser.parse_args()
+
+    # Download the commitpackft subsets for all supported languages
+    langs = ["python", "javascript", "java", "go", "c", "c++", "kotlin", "rust"]
+    commitpck = []
+    for lang in langs:
+        commitpck.append(load_dataset("bigcode/commitpackft", lang, trust_remote_code=True))
+
+    commitpack_final = concatenate_datasets([_["train"] for _ in commitpck])
+
+    # Keep only the rows listed in the subset mapping (matched on repo and commit)
+    commitpack_subset_map = pd.read_csv(args.commitpackft_mapping)
+    commitpack_subset_map.columns = ["repos", "commit"]
+    commitpack_final_df = commitpack_final.to_pandas()
+    matched_df = commitpack_final_df.merge(commitpack_subset_map, on=['commit', 'repos'])
+    filtered_dataset = Dataset.from_pandas(matched_df)
+
+    processed_dataset = filtered_dataset.map(process, remove_columns=filtered_dataset.column_names)
+
+    synthetic_ins = load_dataset("microsoft/NextCoderDataset")
+
+    # Combine the synthetic instruction data with the processed commitpackft subset
+    final_instruction_ds = concatenate_datasets([synthetic_ins['train'], processed_dataset])
+    final_instruction_ds.save_to_disk(f"{args.save_dir}/instruction_dataset")
+
+    print(f"Dataset {args.save_dir}/instruction_dataset saved to disk.")
\ No newline at end of file
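
A quick way to verify the prepared dataset after running the patched `data_prep.py` (a minimal sketch, not part of the patch above; it assumes the script was invoked with `--save_dir .` as in the README example, so the output lives at `./instruction_dataset`):

```python
# Sanity-check sketch: reload the dataset that data_prep.py saved to disk.
# Assumes --save_dir . was used, i.e. the output path is ./instruction_dataset.
from datasets import load_from_disk

dataset = load_from_disk("./instruction_dataset")
print(dataset)                         # expect 'prompt' and 'completion' columns
print(dataset[0]["prompt"][:300])      # peek at one instruction
print(dataset[0]["completion"][:300])  # and its paired completion
```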