# Code Github

## Training

In [1]:
import torch
torch.cuda.empty_cache()


In [2]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [3]:
current_device = torch.cuda.current_device()  # e.g., returns 0
current_device

0

In [4]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

Torch version: 2.6.0+cu118
CUDA available: True
GPU name: Tesla P40


In [5]:
import torch

# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla P40. Max memory = 22.413 GB.
0.0 GB of memory reserved.


In [6]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk, Dataset
from peft import LoraConfig, get_peft_model

def explode_dataset(dataset):
    """
    Explodes the "text" list in each example so that each element becomes its own example.
    """
    new_examples = []
    for example in dataset:
        # example["text"] is expected to be a list
        for t in example["text"]:
            new_examples.append({"text": t})
    return Dataset.from_list(new_examples)

def main():
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map="auto",
        torch_dtype=torch.float16
    )
    model.config.pretraining_tp = 1
    model.config.window = 256

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        task_type="CAUSAL_LM", 
        bias="none"
    )

    model = get_peft_model(model, lora_config)

    raw_datasets = load_from_disk('E:\\fine_tuning\\dataset_code')

    system_prompt = (
        "\n**Code Generation Request**\n\n"
        " * Read the provided **Method Descriptions & Summaries**\n"
        " * **Complete the Body of Each Block code** based on the corresponding summaries\n"
        " * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability\n"
        " * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text\n"
    )

    def chunk_text(text, tokenizer, max_length=1024, stride=512):
        """
        Tokenize the full text and then split it into overlapping chunks.
        Each chunk is decoded back to a string.
        """
        tokens = tokenizer.encode(text, add_special_tokens=False)
        chunks = []
        for i in range(0, len(tokens), stride):
            chunk_tokens = tokens[i: i + max_length]
            chunk_str = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_str)
        return chunks

    def preprocess_example(batch):
        """
        Process a batch of examples by creating sliding window chunks.
        Each example's text is transformed into a list of chunk strings.
        """
        all_chunks = []
        for prompt, response in zip(batch["prompt"], batch["response"]):
            if isinstance(prompt, list):
                prompt = " ".join(prompt)
            if isinstance(response, list):
                response = " ".join(response)
            
            full_text = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + prompt.strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n" + response.strip() + "<|im_end|>\n"
        )
            
            chunks = chunk_text(full_text, tokenizer, max_length=1024, stride=512)
            all_chunks.append(chunks)
        
        return {"text": all_chunks}

    # Process examples in batches.
    processed_datasets = raw_datasets.map(preprocess_example, batched=True, num_proc=4)

    # Explode the "text" list into individual examples for each split.
    for split in processed_datasets.keys():
        processed_datasets[split] = explode_dataset(processed_datasets[split])

    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=1024, padding="max_length")

    # Remove unnecessary columns.
    remove_columns = processed_datasets["train"].column_names
    tokenized_datasets = processed_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=remove_columns,
        num_proc=4
    )

    print("Train dataset size:", len(tokenized_datasets["train"]))
    print("Validation dataset size:", len(tokenized_datasets["validation"]))
    if len(tokenized_datasets["validation"]) > 0:
        print("A sample validation example (tokenized):")
        print(tokenized_datasets["validation"][0])
    else:
        print("Validation dataset is empty!")

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    save_dir = "./Qwen_finetuned_v4"
    print("Saving output to:", save_dir)
    
    training_args = TrainingArguments(
        output_dir=save_dir,
        evaluation_strategy="steps",
        logging_first_step=True,
        eval_steps=1000,
        logging_steps=100,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,  # Lower the eval batch size
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        save_steps=100,
        save_total_limit=2,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
    )

    if os.path.isdir(save_dir) and os.listdir(save_dir):
    #     main_path = save_dir+"/checkpoint-900"
    #     trainer.train(resume_from_checkpoint=main_path)
    # else:
        trainer.train()
        # raise
            
    eval_metrics = trainer.evaluate()
    print("Evaluation metrics:", eval_metrics)

    model.save_pretrained(save_dir + "_lora")

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/7841 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2028 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2380 [00:00<?, ? examples/s]

Train dataset size: 7841
Validation dataset size: 2028
A sample validation example (tokenized):
{'input_ids': [8948, 271, 334, 2078, 23470, 6145, 56177, 353, 4457, 279, 3897, 3070, 3523, 3874, 24685, 609, 8116, 89333, 1019, 353, 3070, 12548, 279, 13958, 315, 8886, 8362, 2038, 334, 3118, 389, 279, 12159, 68922, 198, 353, 3070, 4061, 75938, 66963, 42187, 678, 7907, 3070, 2078, 45678, 334, 304, 23725, 1182, 35078, 320, 13874, 32881, 369, 23922, 91494, 198, 353, 3070, 38121, 7036, 66963, 39565, 1172, 279, 8145, 3070, 2078, 45678, 334, 2041, 92466, 6042, 476, 1467, 271, 872, 198, 2575, 366, 2231, 397, 2575, 5100, 92967, 1963, 54097, 481, 1096, 4473, 374, 8480, 369, 8241, 19721, 311, 10091, 24083, 304, 279, 92450, 3766, 12626, 624, 2575, 690, 2231, 397, 2575, 366, 4684, 397, 2575, 92450, 39949, 6482, 31282, 481, 1096, 536, 5707, 8894, 5413, 369, 7842, 323, 18150, 1651, 24083, 304, 279, 92450, 3766, 624, 2575, 690, 4684, 1339, 2231, 5100, 92967, 1963, 54097, 1476, 262, 1099, 7130, 536, 92450,



Step,Training Loss,Validation Loss
1000,0.5814,
2000,0.5555,
3000,0.553,
4000,0.4939,
5000,0.5114,


Evaluation metrics: {'eval_loss': nan, 'eval_runtime': 6539.9675, 'eval_samples_per_second': 0.31, 'eval_steps_per_second': 0.31, 'epoch': 2.9988521872210177}


In [7]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, PeftModel  # Added PeftModel import

def main():
    # [Previous code remains identical until after model.save_pretrained()...]

    save_dir = "./Qwen_finetuned_v4"

    # New section: Merge LoRA with base model
    # --------------------------------------------------
    print("\nMerging LoRA adapters with base model...")
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    # Reload base model in FP16 (without 4-bit quantization)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    # Load LoRA adapter weights
    lora_model = PeftModel.from_pretrained(
        base_model,
        save_dir+"_lora",
        torch_dtype=torch.float16
    )
    
    # Merge and save
    merged_model = lora_model.merge_and_unload()
    merged_model.save_pretrained(save_dir+"-merged")
    tokenizer.save_pretrained(save_dir+"-merged")
    
    print("Merged model saved to"+ save_dir +"-merged")
    # --------------------------------------------------

if __name__ == "__main__":
    main()


Merging LoRA adapters with base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


Saving checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Merged model saved to./Qwen_finetuned_v4-merged


## Test

### Code Bleu

In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from codebleu import calc_codebleu

def main():
    # Specify the model path from Hugging Face
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v4-merged"

    # Load the tokenizer and set the padding token (using eos_token if not already set)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Adjust the tokenizer padding side to "right"
    tokenizer.padding_side = "right"

    # Load the base model in 4-bit mode using bitsandbytes for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,      # requires bitsandbytes; lowers memory usage
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Additional model configurations
    model.config.pretraining_tp = 1
    model.config.window = 256

    # Load test dataset and select only 10 examples
    full_test_dataset = load_from_disk("E:\\fine_tuning\\dataset_code")["test"]
    # test_dataset = full_test_dataset.select(range(10))
    test_dataset = full_test_dataset
    # Define system prompt
    system_prompt = '''
        **Code Generation Request** 

        * Read the provided **Method Descriptions & Summaries**
        * **Complete the Body of Each Block code** based on the corresponding summaries
        * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability
        * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text
    '''

    # Preprocess function to combine the system prompt and user prompt into a single input string
    def preprocess_example(example):
        example["text"] = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + example["prompt"].strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        return example

    # Preprocess test dataset
    processed_test_dataset = test_dataset.map(preprocess_example)

    # Tokenization function (using truncation to 2048 tokens for compatibility with fine-tuning)
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048, padding="max_length")

    # Tokenize dataset
    tokenized_test_dataset = processed_test_dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=test_dataset.column_names
    )

    # Generate predictions using max_new_tokens instead of max_length
    def generate_code(example):
        input_ids = tokenizer(example["text"], return_tensors="pt").input_ids.to("cuda")
        output = model.generate(
            input_ids,
            max_new_tokens=1024,  # Generate 512 new tokens after the prompt
            pad_token_id=tokenizer.pad_token_id
        )
        example["generated_code"] = tokenizer.decode(output[0], skip_special_tokens=True)
        return example

    generated_results = tokenized_test_dataset.map(generate_code)

    # Evaluate with CodeBLEU on these 10 examples
    references = [ex["response"] for ex in test_dataset]
    hypotheses = [ex["generated_code"] for ex in generated_results]
    # Compute CodeBLEU score
    codebleu_score = calc_codebleu(
        references,        # list of reference code (or list of lists if there are multiple references)
        hypotheses,        # list of candidate code
        lang="c_sharp",           # specify the programming language
        weights=(0.25, 0.25, 0.25, 0.25),  # weights for n-gram, weighted n-gram, syntax, and data-flow matches
        tokenizer=None           # if None, the default string split is used
    )

    print("CodeBLEU Score for all examples:", codebleu_score)

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CodeBLEU Score for all examples: {'codebleu': 0.31630421135449427, 'ngram_match_score': 0.11803688139341223, 'weighted_ngram_match_score': 0.356302488844282, 'syntax_match_score': 0.36877137044611563, 'dataflow_match_score': 0.4221061047341671}


### perplaxity

# Code Bank

## Training

In [6]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk, Dataset
from peft import LoraConfig, get_peft_model

def explode_dataset(dataset):
    """
    Explodes the "text" list in each example so that each element becomes its own example.
    """
    new_examples = []
    for example in dataset:
        # example["text"] is expected to be a list
        for t in example["text"]:
            new_examples.append({"text": t})
    return Dataset.from_list(new_examples)
def main():
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v4-merged"

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map="auto",
        torch_dtype=torch.float16
    )
    model.config.pretraining_tp = 1
    model.config.window = 256

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        task_type="CAUSAL_LM", 
        bias="none"
    )

    model = get_peft_model(model, lora_config)

    raw_datasets = load_from_disk('E:\\fine_tuning\\dataset_code_bank')

    system_prompt = (
        "\n**Code Generation Request**\n\n"
        " * Read the provided **Method Descriptions & Summaries**\n"
        " * **Complete the Body of Each Block code** based on the corresponding summaries\n"
        " * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability\n"
        " * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text\n"
    )

    def chunk_text(text, tokenizer, max_length=1024, stride=512):
        """
        Tokenize the full text and then split it into overlapping chunks.
        Each chunk is decoded back to a string.
        """
        tokens = tokenizer.encode(text, add_special_tokens=False)
        chunks = []
        for i in range(0, len(tokens), stride):
            chunk_tokens = tokens[i: i + max_length]
            chunk_str = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_str)
        return chunks

    def preprocess_example(batch):
        """
        Process a batch of examples by creating sliding window chunks.
        Each example's text is transformed into a list of chunk strings.
        """
        all_chunks = []
        for prompt, response in zip(batch["prompt"], batch["response"]):
            if isinstance(prompt, list):
                prompt = " ".join(prompt)
            if isinstance(response, list):
                response = " ".join(response)
            
            full_text = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + prompt.strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n" + response.strip() + "<|im_end|>\n"
        )
            
            chunks = chunk_text(full_text, tokenizer, max_length=1024, stride=512)
            all_chunks.append(chunks)
        
        return {"text": all_chunks}

    # Process examples in batches.
    processed_datasets = raw_datasets.map(preprocess_example, batched=True, num_proc=4)

    # Explode the "text" list into individual examples for each split.
    for split in processed_datasets.keys():
        processed_datasets[split] = explode_dataset(processed_datasets[split])

    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=1024, padding="max_length")

    # Remove unnecessary columns.
    remove_columns = processed_datasets["train"].column_names
    tokenized_datasets = processed_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=remove_columns,
        num_proc=4
    )

    print("Train dataset size:", len(tokenized_datasets["train"]))
    print("Validation dataset size:", len(tokenized_datasets["validation"]))
    if len(tokenized_datasets["validation"]) > 0:
        print("A sample validation example (tokenized):")
        print(tokenized_datasets["validation"][0])
    else:
        print("Validation dataset is empty!")

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


    save_dir = "./Qwen_finetuned_v5"
    
    print(save_dir)
    training_args = TrainingArguments(
        output_dir=save_dir,
        evaluation_strategy="steps",
        logging_first_step=True,
        eval_steps=1000,
        logging_steps=100,
        num_train_epochs=2,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,  # Lower the eval batch size
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        save_steps=100,
        save_total_limit=2,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    print("Evaluation metrics:", eval_metrics)

    model.save_pretrained(save_dir+"_lora")

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/1188 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/132 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/147 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/7692 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/647 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/746 [00:00<?, ? examples/s]

Train dataset size: 7692
Validation dataset size: 647
A sample validation example (tokenized):
{'input_ids': [8948, 271, 334, 2078, 23470, 6145, 56177, 353, 4457, 279, 3897, 3070, 3523, 3874, 24685, 609, 8116, 89333, 1019, 353, 3070, 12548, 279, 13958, 315, 8886, 8362, 2038, 334, 3118, 389, 279, 12159, 68922, 198, 353, 3070, 4061, 75938, 66963, 42187, 678, 7907, 3070, 2078, 45678, 334, 304, 23725, 1182, 35078, 320, 13874, 32881, 369, 23922, 91494, 198, 353, 3070, 38121, 7036, 66963, 39565, 1172, 279, 8145, 3070, 2078, 45678, 334, 2041, 92466, 6042, 476, 1467, 271, 872, 198, 2575, 366, 2231, 397, 2575, 4895, 6563, 1321, 11603, 481, 1096, 4473, 374, 8480, 369, 11589, 29679, 6813, 304, 279, 3482, 5333, 11, 11689, 369, 1196, 16653, 323, 23715, 624, 2575, 690, 2231, 397, 2575, 366, 4684, 397, 2575, 547, 2668, 24684, 481, 1096, 536, 28872, 279, 16653, 1882, 369, 19393, 10130, 7388, 11, 22573, 429, 3847, 525, 18630, 1573, 31788, 2617, 4963, 624, 2575, 690, 4684, 1339, 2231, 4895, 6563, 1321, 



Step,Training Loss,Validation Loss
1000,0.2216,0.353531
2000,0.2167,0.324561
3000,0.1744,0.311933


Evaluation metrics: {'eval_loss': 0.3071354627609253, 'eval_runtime': 2082.6593, 'eval_samples_per_second': 0.311, 'eval_steps_per_second': 0.311, 'epoch': 2.0}


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import  PeftModel  # Added PeftModel import

def main():
    # [Previous code remains identical until after model.save_pretrained()...]

    save_dir =  "./Qwen_finetuned_v5"

    # New section: Merge LoRA with base model
    # --------------------------------------------------
    print("\nMerging LoRA adapters with base model...")
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v4-merged"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    # Reload base model in FP16 (without 4-bit quantization)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    # Load LoRA adapter weights
    lora_model = PeftModel.from_pretrained(
        base_model,
        save_dir+"_lora",
        torch_dtype=torch.float16
    )
    
    # Merge and save
    merged_model = lora_model.merge_and_unload()
    merged_model.save_pretrained(save_dir+"-merged")
    tokenizer.save_pretrained(save_dir+"-merged")
    
    print("Merged model saved to"+ save_dir +"-merged")
    # --------------------------------------------------

if __name__ == "__main__":
    main()


Merging LoRA adapters with base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Saving checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Merged model saved to./Qwen_finetuned_v5-merged


In [14]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from codebleu import calc_codebleu

def main():
    # Specify the model path from Hugging Face
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v5-merged"

    # Load the tokenizer and set the padding token (using eos_token if not already set)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Adjust the tokenizer padding side to "right"
    tokenizer.padding_side = "right"

    # Load the base model in 4-bit mode using bitsandbytes for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,      # requires bitsandbytes; lowers memory usage
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Additional model configurations
    model.config.pretraining_tp = 1
    model.config.window = 256

    # Load test dataset and select only 10 examples
    full_test_dataset = load_from_disk("E:\\fine_tuning\\dataset_code_bank")["test"]
    # test_dataset = full_test_dataset.select(range(10))
    test_dataset = full_test_dataset
    # Define system prompt
    system_prompt = '''
        **Code Generation Request** 

        * Read the provided **Method Descriptions & Summaries**
        * **Complete the Body of Each Block code** based on the corresponding summaries
        * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability
        * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text
    '''

    # Preprocess function to combine the system prompt and user prompt into a single input string
    def preprocess_example(example):
        example["text"] = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + example["prompt"].strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        return example

    # Preprocess test dataset
    processed_test_dataset = test_dataset.map(preprocess_example)

    # Tokenization function (using truncation to 2048 tokens for compatibility with fine-tuning)
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048, padding="max_length")

    # Tokenize dataset
    tokenized_test_dataset = processed_test_dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=test_dataset.column_names
    )

    # Generate predictions using max_new_tokens instead of max_length
    def generate_code(example):
        input_ids = tokenizer(example["text"], return_tensors="pt").input_ids.to("cuda")
        output = model.generate(
            input_ids,
            max_new_tokens=1024,  # Generate 512 new tokens after the prompt
            pad_token_id=tokenizer.pad_token_id
        )
        example["generated_code"] = tokenizer.decode(output[0], skip_special_tokens=True)
        return example

    generated_results = tokenized_test_dataset.map(generate_code)

    # Evaluate with CodeBLEU on these 10 examples
    references = [ex["response"] for ex in test_dataset]
    hypotheses = [ex["generated_code"] for ex in generated_results]
    # Compute CodeBLEU score
    codebleu_score = calc_codebleu(
        references,        # list of reference code (or list of lists if there are multiple references)
        hypotheses,        # list of candidate code
        lang="c_sharp",           # specify the programming language
        weights=(0.25, 0.25, 0.25, 0.25),  # weights for n-gram, weighted n-gram, syntax, and data-flow matches
        tokenizer=None           # if None, the default string split is used
    )

    print("CodeBLEU Score for all examples:", codebleu_score)

if __name__ == "__main__":
    main()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CodeBLEU Score for all examples: {'codebleu': 0.08708562792715882, 'ngram_match_score': 0.050650360881428415, 'weighted_ngram_match_score': 0.0889081122479649, 'syntax_match_score': 0.16321260622060885, 'dataflow_match_score': 0.045571432358633125}


# Reapeted train on bank code

In [7]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk, Dataset
from peft import LoraConfig, get_peft_model

def explode_dataset(dataset):
    """
    Explodes the "text" list in each example so that each element becomes its own example.
    """
    new_examples = []
    for example in dataset:
        # example["text"] is expected to be a list
        for t in example["text"]:
            new_examples.append({"text": t})
    return Dataset.from_list(new_examples)
def main():
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v4-merged"

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map="auto",
        torch_dtype=torch.float16
    )
    model.config.pretraining_tp = 1
    model.config.window = 256

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        task_type="CAUSAL_LM", 
        bias="none"
    )

    model = get_peft_model(model, lora_config)

    raw_datasets = load_from_disk('E:\\fine_tuning\\dataset_code_bank')

    system_prompt = (
        "\n**Code Generation Request**\n\n"
        " * Read the provided **Method Descriptions & Summaries**\n"
        " * **Complete the Body of Each Block code** based on the corresponding summaries\n"
        " * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability\n"
        " * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text\n"
    )

    def chunk_text(text, tokenizer, max_length=1024, stride=512):
        """
        Tokenize the full text and then split it into overlapping chunks.
        Each chunk is decoded back to a string.
        """
        tokens = tokenizer.encode(text, add_special_tokens=False)
        chunks = []
        for i in range(0, len(tokens), stride):
            chunk_tokens = tokens[i: i + max_length]
            chunk_str = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_str)
        return chunks

    def preprocess_example(batch):
        """
        Process a batch of examples by creating sliding window chunks.
        Each example's text is transformed into a list of chunk strings.
        """
        all_chunks = []
        for prompt, response in zip(batch["prompt"], batch["response"]):
            if isinstance(prompt, list):
                prompt = " ".join(prompt)
            if isinstance(response, list):
                response = " ".join(response)
            
            full_text = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + prompt.strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n" + response.strip() + "<|im_end|>\n"
        )
            
            chunks = chunk_text(full_text, tokenizer, max_length=1024, stride=512)
            all_chunks.append(chunks)
        
        return {"text": all_chunks}

    # Process examples in batches.
    processed_datasets = raw_datasets.map(preprocess_example, batched=True, num_proc=4)

    # Explode the "text" list into individual examples for each split.
    for split in processed_datasets.keys():
        processed_datasets[split] = explode_dataset(processed_datasets[split])

    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=1024, padding="max_length")

    # Remove unnecessary columns.
    remove_columns = processed_datasets["train"].column_names
    tokenized_datasets = processed_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=remove_columns,
        num_proc=4
    )

    print("Train dataset size:", len(tokenized_datasets["train"]))
    print("Validation dataset size:", len(tokenized_datasets["validation"]))
    if len(tokenized_datasets["validation"]) > 0:
        print("A sample validation example (tokenized):")
        print(tokenized_datasets["validation"][0])
    else:
        print("Validation dataset is empty!")

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


    save_dir = "./Qwen_finetuned_v5"
    
    print(save_dir)
    training_args = TrainingArguments(
        output_dir=save_dir,
        evaluation_strategy="steps",
        logging_first_step=True,
        eval_steps=1000,
        logging_steps=100,
        num_train_epochs=2,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,  # Lower the eval batch size
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        save_steps=100,
        save_total_limit=2,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    print("Evaluation metrics:", eval_metrics)

    model.save_pretrained(save_dir+"_lora")

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/7692 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/647 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/746 [00:00<?, ? examples/s]

Train dataset size: 7692
Validation dataset size: 647
A sample validation example (tokenized):
{'input_ids': [8948, 271, 334, 2078, 23470, 6145, 56177, 353, 4457, 279, 3897, 3070, 3523, 3874, 24685, 609, 8116, 89333, 1019, 353, 3070, 12548, 279, 13958, 315, 8886, 8362, 2038, 334, 3118, 389, 279, 12159, 68922, 198, 353, 3070, 4061, 75938, 66963, 42187, 678, 7907, 3070, 2078, 45678, 334, 304, 23725, 1182, 35078, 320, 13874, 32881, 369, 23922, 91494, 198, 353, 3070, 38121, 7036, 66963, 39565, 1172, 279, 8145, 3070, 2078, 45678, 334, 2041, 92466, 6042, 476, 1467, 271, 872, 198, 2575, 366, 2231, 397, 2575, 4895, 6563, 1321, 11603, 481, 1096, 4473, 374, 8480, 369, 11589, 29679, 6813, 304, 279, 3482, 5333, 11, 11689, 369, 1196, 16653, 323, 23715, 624, 2575, 690, 2231, 397, 2575, 366, 4684, 397, 2575, 547, 2668, 24684, 481, 1096, 536, 28872, 279, 16653, 1882, 369, 19393, 10130, 7388, 11, 22573, 429, 3847, 525, 18630, 1573, 31788, 2617, 4963, 624, 2575, 690, 4684, 1339, 2231, 4895, 6563, 1321, 



Step,Training Loss,Validation Loss
1000,0.2224,0.357036
2000,0.2173,0.32647
3000,0.1751,0.313923


Evaluation metrics: {'eval_loss': 0.3091481029987335, 'eval_runtime': 2082.384, 'eval_samples_per_second': 0.311, 'eval_steps_per_second': 0.311, 'epoch': 2.0}


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import  PeftModel  # Added PeftModel import

def main():
    # [Previous code remains identical until after model.save_pretrained()...]

    save_dir =  "./Qwen_finetuned_v5"

    # New section: Merge LoRA with base model
    # --------------------------------------------------
    print("\nMerging LoRA adapters with base model...")
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v4-merged"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    # Reload base model in FP16 (without 4-bit quantization)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    
    # Load LoRA adapter weights
    lora_model = PeftModel.from_pretrained(
        base_model,
        save_dir+"_lora",
        torch_dtype=torch.float16
    )
    
    # Merge and save
    merged_model = lora_model.merge_and_unload()
    merged_model.save_pretrained(save_dir+"-merged")
    tokenizer.save_pretrained(save_dir+"-merged")
    
    print("Merged model saved to"+ save_dir +"-merged")
    # --------------------------------------------------

if __name__ == "__main__":
    main()


Merging LoRA adapters with base model...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Merged model saved to./Qwen_finetuned_v5-merged


In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from codebleu import calc_codebleu

def main():
    # Specify the model path from Hugging Face
    model_name = "E:\\fine_tuning\\Qwen2.5-Coder-7B-Instruct\\Qwen_finetuned_v5-merged"

    # Load the tokenizer and set the padding token (using eos_token if not already set)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Adjust the tokenizer padding side to "right"
    tokenizer.padding_side = "right"

    # Load the base model in 4-bit mode using bitsandbytes for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,      # requires bitsandbytes; lowers memory usage
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Additional model configurations
    model.config.pretraining_tp = 1
    model.config.window = 256

    # Load test dataset and select only 10 examples
    full_test_dataset = load_from_disk("E:\\fine_tuning\\dataset_code_bank")["test"]
    # test_dataset = full_test_dataset.select(range(10))
    test_dataset = full_test_dataset
    # Define system prompt
    system_prompt = '''
        **Code Generation Request** 

        * Read the provided **Method Descriptions & Summaries**
        * **Complete the Body of Each Block code** based on the corresponding summaries
        * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability
        * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text
    '''

    # Preprocess function to combine the system prompt and user prompt into a single input string
    def preprocess_example(example):
        example["text"] = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + example["prompt"].strip() + "<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        return example

    # Preprocess test dataset
    processed_test_dataset = test_dataset.map(preprocess_example)

    # Tokenization function (using truncation to 2048 tokens for compatibility with fine-tuning)
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048, padding="max_length")

    # Tokenize dataset
    tokenized_test_dataset = processed_test_dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=test_dataset.column_names
    )

    # Generate predictions using max_new_tokens instead of max_length
    def generate_code(example):
        input_ids = tokenizer(example["text"], return_tensors="pt").input_ids.to("cuda")
        output = model.generate(
            input_ids,
            max_new_tokens=1024,  # Generate 512 new tokens after the prompt
            pad_token_id=tokenizer.pad_token_id
        )
        example["generated_code"] = tokenizer.decode(output[0], skip_special_tokens=True)
        return example

    generated_results = tokenized_test_dataset.map(generate_code)

    # Evaluate with CodeBLEU on these 10 examples
    references = [ex["response"] for ex in test_dataset]
    hypotheses = [ex["generated_code"] for ex in generated_results]
    # Compute CodeBLEU score
    codebleu_score = calc_codebleu(
        references,        # list of reference code (or list of lists if there are multiple references)
        hypotheses,        # list of candidate code
        lang="c_sharp",           # specify the programming language
        weights=(0.25, 0.25, 0.25, 0.25),  # weights for n-gram, weighted n-gram, syntax, and data-flow matches
        tokenizer=None           # if None, the default string split is used
    )

    print("CodeBLEU Score for all examples:", codebleu_score)

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CodeBLEU Score for all examples: {'codebleu': 0.28420149107786136, 'ngram_match_score': 0.09453043295781977, 'weighted_ngram_match_score': 0.21753594188659264, 'syntax_match_score': 0.6813711739955449, 'dataflow_match_score': 0.14336841547148804}
