In [19]:
!pip install evaluate nltk rouge-score absl-py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [20]:
# Imports
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)
from evaluate import load

In [21]:
# Checkpoint directory of the fine-tuned model under test
finetuned_model_path = "../Models/meta-llama/CodeLlama-7b-Python-hf_run5e7bfd2ad529/checkpoint-200"

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("CUDA is available. Using GPU." if device == "cuda" else "No GPU available. Using CPU.")

# Tokenizer: pad on the right and reuse the EOS token as the padding token
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Model: load in bfloat16 without a device map, move it to the chosen device,
# and switch it to evaluation mode
print("Loading the model...")
finetuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_path,
    device_map=None,
    torch_dtype=torch.bfloat16,
).to(device)
finetuned_model.eval()
print("Model successfully loaded.")

No GPU available. Using CPU.
Loading the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model successfully loaded.


In [22]:
# Report the identifiers the model and tokenizer objects carry after loading
print(
    "Checking the loaded model and tokeniser...",
    f"Loaded model: {finetuned_model.name_or_path}",
    f"Loaded tokenizer: {tokenizer.name_or_path}",
    sep="\n",
)

Checking the loaded model and tokeniser...
Loaded model: meta-llama/CodeLlama-7b-Python-hf
Loaded tokenizer: ../Models/meta-llama/CodeLlama-7b-Python-hf_run5e7bfd2ad529/checkpoint-200


# Test the Finetuned Model

In [23]:
def test_finetuned_model(prompts, settings, generator):
    """
    Run text generation tests with specified prompts and settings.

    Args:
        prompts (list): List of input prompts.
        settings (dict): Generation settings (e.g., max_new_tokens, temperature).
        generator (Pipeline): Hugging Face text-generation pipeline.

    Returns:
        list: Generated responses for each prompt.
    """
    responses = []
    with torch.no_grad():
        print("=" * 40)
        for i, prompt in enumerate(prompts):
            print(f"Processing prompt {i + 1}...")
            response = generator(prompt, **settings)
            responses.append(response[0]['generated_text'])
            print("=" * 40)
            print(f"Prompt {i + 1} - Answer:\n{response[0]['generated_text']}")
            print("=" * 40)
    return responses

In [24]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
# Any failure is reported and aborts the run, since every later cell needs the pipeline.
print("Initialisation of the pipeline...")
try:
    text_generator = pipeline(
        task="text-generation",
        model=finetuned_model,
        tokenizer=tokenizer,
    )
except Exception as err:
    print("Error during initialisation of the pipeline:", err)
    raise SystemExit("Abort: Pipeline could not be initialised.")
else:
    print("Pipeline successfully initialised.")

Device set to use cpu


Initialisation of the pipeline...
Pipeline successfully initialised.


### Basic Model Testing
We test the model with simple prompts and basic generation settings.

In [25]:
# Each prompt wraps a one-line function in the "[Function] ... [Docstring]" template.
function_sources = (
    "def add_numbers(a, b): return a + b",
    "def subtract_numbers(a, b): return a - b",
)
prompts = [f"[Function]\n{source} \n [Docstring]\n" for source in function_sources]

In [26]:
# Short deterministic run: at most 10 new tokens per prompt, sampling disabled
print("Carry out a test run with basic settings...")
basic_settings = dict(max_new_tokens=10, do_sample=False)
responses_reduced = test_finetuned_model(
    prompts=prompts,
    settings=basic_settings,
    generator=text_generator,
)
print("Test run completed. All prompts processed.")

Carry out a test run with basic settings...
Processing prompt 1...
Prompt 1 - Answer:
[Function]
def add_numbers(a, b): return a + b 
 [Docstring]
Adds two numbers.

Parameters
----------
Processing prompt 2...
Prompt 2 - Answer:
[Function]
def subtract_numbers(a, b): return a - b 
 [Docstring]
Subtracts two numbers.

Parameters
Test run completed. All prompts processed.


### Test with Extended Settings
Here, we allow the model to generate longer responses using advanced sampling settings.

In [27]:
# Longer run with sampling enabled (top-k 10, temperature 0.7, up to 100 new tokens).
# Sampling draws from PyTorch's random number generator; seed it so that
# re-running this cell on the same setup yields the same samples.
torch.manual_seed(42)
print("Testing the Finetuned Model with Extended Settings...")
extended_settings = {"max_new_tokens": 100, "do_sample": True, "top_k": 10, "temperature": 0.7}
responses_extended = test_finetuned_model(prompts, extended_settings, text_generator)
print("Test run completed. All prompts processed.")

Testing the Finetuned Model with Extended Settings...
Processing prompt 1...
Prompt 1 - Answer:
[Function]
def add_numbers(a, b): return a + b 
 [Docstring]
Add two numbers and return the result. 
 [EOS]
[Function]
def add_numbers(a, b): return a + b 
 [Docstring]
Add two numbers and return the result. 
[EOS]
[Function]
def add_numbers(a, b): return a + b 
[Docstring]
Add two numbers and return the result. 
[EOS]
[Function]
def add_
Processing prompt 2...
Prompt 2 - Answer:
[Function]
def subtract_numbers(a, b): return a - b 
 [Docstring]
Returns the difference of two numbers. 

Parameters
----------
a, b : int
    numbers to subtract

Returns
-------
int
    the difference of a and b
 [EOS]
>>> subtract_numbers(2,5)
-3 [Docstring]
>>> subtract_numbers(2,5)
-3 [EOS]
>>> subtract_numbers(10,100)
-80
Test run completed. All prompts processed.


We clean the generated responses by truncating them after the first occurrence of the `[EOS]` marker.

In [28]:
def postprocess_response(response, eos_marker="[EOS]"):
    """
    Truncate a generated response after the first end-of-sequence marker.

    Everything following the first occurrence of ``eos_marker`` is dropped;
    the marker itself is kept at the end of the returned text. If the marker
    does not occur, a warning is printed and the unmodified response is
    returned with the prefix ``"[WARNING: NO EOS MARKER]"``.

    Args:
        response (str): The generated text response.
        eos_marker (str, optional): Textual marker that ends a response.
            Must be non-empty. Defaults to "[EOS]".

    Returns:
        str: The text up to and including the first marker, or the prefixed
        original response when no marker is present.

    Raises:
        ValueError: If ``eos_marker`` is an empty string.
    """
    # partition stops at the first occurrence instead of splitting on every marker
    head, found, _ = response.partition(eos_marker)
    if found:
        return head + found
    print(f'Warning: No {eos_marker} marker found in response.')
    return "[WARNING: NO EOS MARKER]" + response

# Apply the [EOS] post-processing to every response of the extended run
print("\nPostprocessing responses...")
processed_responses = list(map(postprocess_response, responses_extended))


Postprocessing responses...


In [29]:
# Print every post-processed response, framed by separator lines
separator = "=" * 40
print("\nAll Responses:")
for number, processed in enumerate(processed_responses, start=1):
    print(separator)
    print(f"Prompt {number} - Full Response:\n{processed}")
    print(separator)


All Responses:
Prompt 1 - Full Response:
[Function]
def add_numbers(a, b): return a + b 
 [Docstring]
Add two numbers and return the result. 
 [EOS]
Prompt 2 - Full Response:
[Function]
def subtract_numbers(a, b): return a - b 
 [Docstring]
Returns the difference of two numbers. 

Parameters
----------
a, b : int
    numbers to subtract

Returns
-------
int
    the difference of a and b
 [EOS]


## Advanced Function Testing
Here we test the model with a more complex function and its corresponding docstring.

In [30]:
# Reference docstring for the function in `prompt` below.
# Stored as a one-element list; the evaluation cell reads it as docstring[0].
docstring = ["""
    Tests the model's response to a list of prompts using Hugging Face's pipeline.

    Args:
        model (PreTrainedModel): The loaded model.
        tokenizer (PreTrainedTokenizer): The tokenizer associated with the model.
        prompts (list): A list of input prompts as strings.
        max_new_tokens (int, optional): Maximum number of tokens to generate. Defaults to 50.

    Returns:
        list: A list of the model's responses to the prompts.
    """]

# One-element list holding a single prompt: the adjacent string literals are
# concatenated by Python into one string of the form
# "[Function] <function source> [Docstring]".
prompt = [
    '[Function]\n def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50): \n'
    '    text_generator = pipeline(\n'
    '        "text-generation",\n'
    '        model=model,\n'
    '        tokenizer=tokenizer,\n'
    '    )\n\n'
    '    responses = [\n'
    '        text_generator(prompt, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=0.7)[0][\n'
    '            "generated_text"]\n'
    '        for prompt in prompts\n'
    '    ]\n    return responses\n [Docstring]\n'
]

We use advanced generation settings: sampling with a temperature, top-k filtering, and a constraint against repeated n-grams.

In [31]:
# Test with advanced settings: sampling (top-k 3, temperature 0.7), up to 250 new
# tokens, and no repeated 10-grams.
# Sampling draws from PyTorch's random number generator; seed it so that
# re-running this cell on the same setup yields the same sample.
torch.manual_seed(42)
advanced_settings = {"max_new_tokens": 250, "do_sample": True, "temperature": 0.7, "top_k": 3, "no_repeat_ngram_size": 10}
# `prompt` holds a single entry, so keep only the first (and only) response
response_advanced = test_finetuned_model(prompt, advanced_settings, text_generator)[0]

Processing prompt 1...
Prompt 1 - Answer:
[Function]
 def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50): 
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    responses = [
        text_generator(prompt, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=0.7)[0][
            "generated_text"]
        for prompt in prompts
    ]
    return responses
 [Docstring]
 Generate text using a given model and tokenizer
 [EOS] [EOS]
 [Function]
 def test_model_generate(model, tokenizer, prompts, num_generations=100, max_new_tokens=5, temperature=0.7, repetition_penalty=1.2, top_k=50, top_p=0.95): 
    responses = []
    for prompt in prompts:
        text_generator = model.generate(
            prompt,
            max_length=len(prompt) + max_new_tokens,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k

In [32]:
# Post-process the sampled response (truncate after the first [EOS] marker)
postprocessed_advanced = postprocess_response(response_advanced)
# Original, misspelled name kept as an alias so existing references keep working
porstprocessed_advanced = postprocessed_advanced

# Refined Settings for Improved Output
As the sampling-based settings did not deliver satisfactory results, we refine them by deactivating sampling.

In [33]:
# Re-run the same prompt with sampling switched off, allowing up to 250 new tokens
refined_settings = dict(max_new_tokens=250, do_sample=False)
refined_outputs = test_finetuned_model(prompt, refined_settings, text_generator)
# `prompt` holds a single entry, so the first output is the only one
response_refined = refined_outputs[0]

Processing prompt 1...
Prompt 1 - Answer:
[Function]
 def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50): 
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    responses = [
        text_generator(prompt, max_new_tokens=max_new_tokens, do_sample=True, top_k=10, temperature=0.7)[0][
            "generated_text"]
        for prompt in prompts
    ]
    return responses
 [Docstring]
 Generates responses for a list of prompts using a model and tokenizer.

 Args:
    model (str): The model to use for generating responses.
    tokenizer (str): The tokenizer to use for generating responses.
    prompts (list): The list of prompts to generate responses for.
    max_new_tokens (int): The maximum number of tokens to generate.

 Returns:
    list: The list of generated responses.
 [EOS]

 [Function]
 def test_model_response_pipeline(model, tokenizer, prompts, max_new_tokens=50): 
    text_generator = pip

In [34]:
# Post-process the refined response (truncate after the first [EOS] marker);
# the result is the text scored in the evaluation section.
postprocessed_refined = postprocess_response(response_refined)

Here we see a docstring that comes close to the original and adheres to all conventions.

# Evaluation

### Metrics
We use BLEU and ROUGE scores to compare the generated responses with the reference docstring.

In [35]:
def evaluate_response(generated, reference):
    """
    Score one generated text against one reference text with BLEU and ROUGE.

    Both metrics are loaded on every call. The single prediction is compared
    with the single reference; BLEU receives the reference wrapped in a
    nested list, ROUGE receives it in a flat list.

    Args:
        generated (str): The generated text response.
        reference (str): The reference text (e.g., docstring).

    Returns:
        dict: ``"BLEU"`` holds the ``"bleu"`` entry of the BLEU result,
        ``"ROUGE"`` holds the complete ROUGE result.
    """
    bleu = load("bleu")
    rouge = load("rouge")

    candidates = [generated]

    # BLEU: one list of references per prediction
    bleu_value = bleu.compute(predictions=candidates, references=[[reference]])["bleu"]

    # ROUGE: one reference string per prediction
    rouge_values = rouge.compute(predictions=candidates, references=[reference])

    return {"BLEU": bleu_value, "ROUGE": rouge_values}

In [36]:
# Score the post-processed output of the refined (non-sampling) run against the reference docstring.
# NOTE(review): postprocessed_refined still starts with the prompt (the [Function] source and the
# [Docstring] tag) and ends with the [EOS] marker, so those tokens are scored too - confirm this is intended.
print("Evaluating advanced response...")
evaluation_results = evaluate_response(generated=postprocessed_refined, reference=docstring[0])
print("Evaluation Results:", evaluation_results, sep="\n")

Evaluating advanced response...
Evaluation Results:
{'BLEU': 0.1772512572486886, 'ROUGE': {'rouge1': 0.5257142857142857, 'rouge2': 0.23121387283236994, 'rougeL': 0.3885714285714285, 'rougeLsum': 0.5142857142857143}}


### First Evaluation Results Summary

The evaluation metrics provide insights into the alignment between the generated text and the reference docstring:

- **BLEU: 0.177**
  - Measures n-gram precision; a score of 17.7% indicates only limited overlap in phrasing with the reference.

- **ROUGE Scores:**
  - **ROUGE-1 (Unigrams): 0.526** – Moderate overlap in word choice.
  - **ROUGE-2 (Bigrams): 0.231** – Low alignment in multi-word expressions, indicating only weak matching in phrase structure.
  - **ROUGE-L (Longest Common Subsequence): 0.389** – Some structural similarity.
  - **ROUGE-Lsum (Summary-level Longest Common Subsequence): 0.514** – Decent alignment when the longest common subsequence is computed line by line and aggregated.
