In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from codebleu import calc_codebleu

def main():
    # Specify the model path from Hugging Face
    model_name = "mistralai/Mistral-7B-Instruct-v0.3"

    # Load the tokenizer and set the padding token (using eos_token if not already set)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Adjust the tokenizer padding side to "right"
    tokenizer.padding_side = "right"

    # Load the base model in 4-bit mode using bitsandbytes for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,      # requires bitsandbytes; lowers memory usage
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Additional model configurations
    model.config.pretraining_tp = 1
    model.config.window = 256

    # Load test dataset and select only 10 examples
    full_test_dataset = load_from_disk("E:\\fine_tuning\\dataset_code")["test"]
    # test_dataset = full_test_dataset.select(range(10))
    test_dataset = full_test_dataset
    # Define system prompt
    system_prompt = '''
        **Code Generation Request** 

        * Read the provided **Method Descriptions & Summaries**
        * **Complete the Body of Each Block code** based on the corresponding summaries
        * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability
        * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text
    '''

    # Preprocess function to combine the system prompt and user prompt into a single input string
    def preprocess_example(example):
        example["text"] = (
            "<s>[INST] "
            "System: " + system_prompt + "\n"
            "User: " + example["prompt"].strip() + " [/INST] \n"
            "Assistant: "
        )
        return example

    # Preprocess test dataset
    processed_test_dataset = test_dataset.map(preprocess_example)

    # Tokenization function (using truncation to 2048 tokens for compatibility with fine-tuning)
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048, padding="max_length")

    # Tokenize dataset
    tokenized_test_dataset = processed_test_dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=test_dataset.column_names
    )

    # Generate predictions using max_new_tokens instead of max_length
    def generate_code(example):
        input_ids = tokenizer(example["text"], return_tensors="pt").input_ids.to("cuda")
        output = model.generate(
            input_ids,
            max_new_tokens=1024,  # Generate 512 new tokens after the prompt
            pad_token_id=tokenizer.pad_token_id
        )
        example["generated_code"] = tokenizer.decode(output[0], skip_special_tokens=True)
        return example

    generated_results = tokenized_test_dataset.map(generate_code)

    # Evaluate with CodeBLEU on these 10 examples
    references = [ex["response"] for ex in test_dataset]
    hypotheses = [ex["generated_code"] for ex in generated_results]
    # Compute CodeBLEU score
    codebleu_score = calc_codebleu(
        references,        # list of reference code (or list of lists if there are multiple references)
        hypotheses,        # list of candidate code
        lang="c_sharp",           # specify the programming language
        weights=(0.25, 0.25, 0.25, 0.25),  # weights for n-gram, weighted n-gram, syntax, and data-flow matches
        tokenizer=None           # if None, the default string split is used
    )

    print("CodeBLEU Score for all examples:", codebleu_score)

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CodeBLEU Score for all examples: {'codebleu': 0.2558254453861192, 'ngram_match_score': 0.10074367646481623, 'weighted_ngram_match_score': 0.2673363819340307, 'syntax_match_score': 0.2880239617385143, 'dataflow_match_score': 0.3671977614071155}


In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk
from codebleu import calc_codebleu

def main():
    # Specify the model path from Hugging Face
    model_name = "mistralai/Mistral-7B-Instruct-v0.3"

    # Load the tokenizer and set the padding token (using eos_token if not already set)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Adjust the tokenizer padding side to "right"
    tokenizer.padding_side = "right"

    # Load the base model in 4-bit mode using bitsandbytes for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,      # requires bitsandbytes; lowers memory usage
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Additional model configurations
    model.config.pretraining_tp = 1
    model.config.window = 256

    # Load test dataset and select only 10 examples
    full_test_dataset = load_from_disk("E:\\fine_tuning\\dataset_code_bank")["test"]
    # test_dataset = full_test_dataset.select(range(10))
    test_dataset = full_test_dataset
    # Define system prompt
    system_prompt = '''
        **Code Generation Request** 

        * Read the provided **Method Descriptions & Summaries**
        * **Complete the Body of Each Block code** based on the corresponding summaries
        * **Format Requirement:** Wrap all generated **Code Blocks** in triple backticks (```) for enhanced readability
        * **Delivery Note:** Provide only the completed **Code Blocks** without explanatory comments or text
    '''

    # Preprocess function to combine the system prompt and user prompt into a single input string
    def preprocess_example(example):
        example["text"] = (
            "<s>[INST] "
            "System: " + system_prompt + "\n"
            "User: " + example["prompt"].strip() + " [/INST] \n"
            "Assistant: "
        )
        return example

    # Preprocess test dataset
    processed_test_dataset = test_dataset.map(preprocess_example)

    # Tokenization function (using truncation to 2048 tokens for compatibility with fine-tuning)
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048, padding="max_length")

    # Tokenize dataset
    tokenized_test_dataset = processed_test_dataset.map(
        tokenize_function, 
        batched=True, 
        remove_columns=test_dataset.column_names
    )

    # Generate predictions using max_new_tokens instead of max_length
    def generate_code(example):
        input_ids = tokenizer(example["text"], return_tensors="pt").input_ids.to("cuda")
        output = model.generate(
            input_ids,
            max_new_tokens=1024,  # Generate 512 new tokens after the prompt
            pad_token_id=tokenizer.pad_token_id
        )
        example["generated_code"] = tokenizer.decode(output[0], skip_special_tokens=True)
        return example

    generated_results = tokenized_test_dataset.map(generate_code)

    # Evaluate with CodeBLEU on these 10 examples
    references = [ex["response"] for ex in test_dataset]
    hypotheses = [ex["generated_code"] for ex in generated_results]
    # Compute CodeBLEU score
    codebleu_score = calc_codebleu(
        references,        # list of reference code (or list of lists if there are multiple references)
        hypotheses,        # list of candidate code
        lang="c_sharp",           # specify the programming language
        weights=(0.25, 0.25, 0.25, 0.25),  # weights for n-gram, weighted n-gram, syntax, and data-flow matches
        tokenizer=None           # if None, the default string split is used
    )

    print("CodeBLEU Score for all examples:", codebleu_score)

if __name__ == "__main__":
    main()




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CodeBLEU Score for all examples: {'codebleu': 0.18295523459326507, 'ngram_match_score': 0.0872878987416189, 'weighted_ngram_match_score': 0.16032361693947714, 'syntax_match_score': 0.375967329428265, 'dataflow_match_score': 0.10824209326369927}
