# Testing Generation for Each Model

In [1]:
import pandas as pd

### Load Data

In [4]:
df = pd.read_csv("/media/mujtaba/DATA/nick/UnitTestExamples/data/prompts/humanevalx/zero_shot_first_prompts.csv")

In [5]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,prompt
0,0,0,//Source method\nimport java.util.*;\nimport j...
1,1,1,//Source method\nimport java.util.*;\nimport j...
2,2,2,//Source method\nimport java.util.*;\nimport j...
3,3,3,//Source method\nimport java.util.*;\nimport j...
4,4,4,//Source method\nimport java.util.*;\nimport j...


In [10]:
from datasets import load_dataset, Dataset
ds = Dataset.from_csv("../../../data/prompts/humanevalx/first_formatted_prompts.csv")

In [12]:
ds[0]

{'Unnamed: 0': 0,
 'original_index': 0,
 'prompt': '\nimport java.util.*;\nimport java.lang.*;\n\nclass Solution {\n    public boolean hasCloseElements(List<Double> numbers, double threshold) {\n        for (int i = 0; i < numbers.size(); i++) {\n            for (int j = i + 1; j < numbers.size(); j++) {\n                double distance = Math.abs(numbers.get(i) - numbers.get(j));\n                if (distance < threshold) return true;\n            }\n        }\n        return false;\n    }\n}\n\n//Create Java unit tests for the Java method given above\n        \npublic class Main {\n    public static void main(String[] args) {\n        '}

In [13]:
from datasets import load_dataset, Dataset
ds = load_dataset("THUDM/humaneval-x", "java")

Each Given Attribute:
* task_id: original location in dataset
* prompt: imports and class declaration + instructions/information for desired function
* declaration: imports, class, and method decalaration
* canonical_solution: desired code to finish method with no class or method declaration before (continuation after declaration)
* test: hidden valid test cases that verify the solution works properly
* example_test: public (in prompt) test cases written as tests

#### Getting the 10 Shortest Full Solutions from HumanEval

In [None]:
def k_smallest_strings(k, list):
    smallest = []
    code = ''

    for i, example in enumerate(list):
        code = example['declaration'] + example['canonical_solution']
        if len(smallest) < k:
            add_code((i,code), smallest)
        elif len(code) < len(smallest[-1][1]):
            smallest.pop()
            add_code((i,code), smallest)
    return smallest

def add_code(code, list):
    for i in range(len(list)):
        if len(code[1]) < len(list[i][1]):
            list.insert(i, code)
            #print(f"Inserted this: {code}")
            return
    list.append(code)
    #print(f"Appended this: {code}")

ten_smallest = k_smallest_strings(10, ds['test'])

for ex in ten_smallest:
    print(ex)

Best/Simplest Methods for Reference:
* 53: add (two int)
* 23: strlen (one string)
* 45: traingleArea (two double)
* 2: truncateNumber (one double)

## Models for Loading

In [None]:
#from accelerate import PartialState
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

checkpoint = "Salesforce/codet5p-770m"
device = "cuda:1" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)

# inputs = tokenizer.encode(ds[2]['prompt'], return_tensors="pt").to(device)
# outputs = model.generate(inputs, num_beams = 4, do_sample = True, max_length=1000)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf",
    #quantization_config=bnb_config, #Model quantization
    trust_remote_code=True,
    low_cpu_mem_usage=True, #Automatic with quantized models
).to(device)


In [None]:
model.to('cuda:0')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b")
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoder2-3b",
    #quantization_config=bnb_config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
model.to('cuda:0')

In [6]:
import json
#Save model generation values
gen_config = model.generation_config
config_dict = gen_config.to_dict()
with open("generation_config.json", "w") as f:
    json.dump(config_dict, f, indent=4)

## Different Prompts

In [None]:
prompt = r'''Sum the elements of an array and return the sum with unit tests following the ZOMBIES testing methodology.
Here is a breakdown of the ZOMBIES testing methodology:
Z - Zero
O - One
M - Many
B - Boundaries
I - Interface
E - Exceptions
S - Simplicity

import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;

class Solution {
    public double sumArray(List<Double> numbers) {'''

In [None]:
prompt = r'''//Generate Unit Tests.

import java.util.*;
import java.lang.*;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;


class Solution {
    public boolean hasCloseElements(List<Double> numbers, double threshold) {
        for (int i = 0; i < numbers.size(); i++) {
            for (int j = i + 1; j < numbers.size(); j++) {
                double distance = Math.abs(numbers.get(i) - numbers.get(j));
                if (distance < threshold) return true;
            }
        }
        return false;
    }
}

/* Example Test       
    @Test
    void shouldReturnTrueIfHasClosestElementExample1() {
        // Arrange
        final var s = new Solution();
        final var arg1 = List.of(11.0, 2.0, 3.9, 4.0, 5.0, 2.2);
        final var arg2 = 0.3;
        final var expected = true;
        // Act
        final var actual = s.hasCloseElements(arg1, arg2);
        
        // Assert
        assertEquals(expected, actual);
    }
*/

//Include tests for empty and singleton Lists.
class SolutionTest {'''

In [None]:
from jinja2 import Environment, FileSystemLoader
import os

# Set up Jinja environment
env = Environment(loader=FileSystemLoader("templates"))
template = env.get_template("java_unit_test_template.j2")

# Example Java class to test
java_class = """
import java.util.*;
import java.lang.*;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;


class Solution {
    public boolean hasCloseElements(List<Double> numbers, double threshold) {
        for (int i = 0; i < numbers.size(); i++) {
            for (int j = i + 1; j < numbers.size(); j++) {
                double distance = Math.abs(numbers.get(i) - numbers.get(j));
                if (distance < threshold) return true;
            }
        }
        return false;
    }
}
"""
# Render the template with context
formatted_prompt = template.render(
    source_code=java_class,
    class_name="Solution",
    #package_name="com.example.calculator",
    dependencies=["JUnit 5", "Mockito"],
    specific_requirements="""
    - Generate Unit Tests.
    """,
    examples=[
        """
        @Test
        void shouldReturnTrueIfHasClosestElementExample1() {
            // Arrange
            final var s = new Solution();
            final var arg1 = List.of(11.0, 2.0, 3.9, 4.0, 5.0, 2.2);
            final var arg2 = 0.3;
            final var expected = true;

            // Act
            final var actual = s.hasCloseElements(arg1, arg2);
            
            // Assert
            assertEquals(expected, actual);
        }
        """
    ]
)

print(formatted_prompt)

In [None]:
prompt = r'''from jinja2 import Environment, FileSystemLoader
import os

# Set up Jinja environment
env = Environment(loader=FileSystemLoader("templates"))
template = env.get_template("java_unit_test_template.j2")

# Example Java class to test
java_class = """
package com.example.calculator;

public class Calculator {
    public int add(int a, int b) {
        return a + b;
    }
    
    public int divide(int numerator, int denominator) {
        if (denominator == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return numerator / denominator;
    }
}
"""
# Render the template with context
formatted_prompt = template.render(
    source_code=java_class,
    class_name="Calculator",
    package_name="com.example.calculator",
    dependencies=["JUnit 5", "Mockito"],
    specific_requirements="""
    - Test both the add() and divide() methods
    - For divide(), include tests for the divide-by-zero exception
    - Test with both positive and negative numbers
    - Test with boundary values
    """,
    examples=[
        """
        @Test
        void shouldReturnSumWhenAddingTwoPositiveNumbers() {
            // Arrange
            Calculator calculator = new Calculator();
            
            // Act
            int result = calculator.add(3, 5);
            
            // Assert
            assertEquals(8, result, "3 + 5 should equal 8");
        }
        """
    ]
)

print(formatted_prompt)'''

#### Strlen Prompt

In [None]:
from jinja2 import Environment, FileSystemLoader
import os

# Set up Jinja environment
env = Environment(loader=FileSystemLoader("templates"))
template = env.get_template("java_unit_test_template.j2")

# Example Java class to test
java_class = """
import java.util.*;
import java.lang.*;

class Solution {
    public int strlen(String string) {
        return string.length();
    }
}
"""
# Render the template with context
formatted_prompt = template.render(
    source_code=java_class,
    class_name="Solution",
    #package_name="com.example.calculator",
    dependencies=[],
    specific_requirements="""
    - Generate Unit Tests.
    - Test for Empty Cases
    - Test for Failing Cases
    - Test for Error Causing Cases
    """,
    examples=[],
    test_framework = "Any"
)

print(formatted_prompt)

In [None]:
ds['test'][23]['declaration']

In [None]:
ds['test'][23]['canonical_solution']

In [None]:
ds['test'][23]['test']

In [None]:
print(ds['test'][23]['prompt'] + ds['test'][23]['canonical_solution'])

In [15]:
prompt = prompt + "\n\n\n//Source method\n" + remove_block_comments(ds['test'][23]['prompt']) + ds['test'][23]['canonical_solution'] + "\n\n//Unit Tests with Three Cases\n" + "public class Main {\n    public static void main(String[] args) {"

In [14]:
prompt = "//Source method\n" + remove_block_comments(ds['test'][45]['prompt']) + ds['test'][45]['canonical_solution'] + "\n\n//Unit Tests with Three Cases\n" + ds['test'][45]['test']

In [None]:
prompt = "//Source method\nClass Solution{" + ds['test'][45]['canonical_solution'] + "\n\n//Unit Tests with Three Cases\n" + ds['test'][45]['test']

In [None]:
import re

def remove_block_comments(java_code):
    """
    Removes all block comments (/* ... */) from the given Java code.
    
    Args:
        java_code (str): The Java code as a string.
    
    Returns:
        str: The Java code with block comments removed.
    """
    # Regex pattern to match block comments
    pattern = re.compile(r'/\*.*?\*/', re.DOTALL)  # re.DOTALL makes .* match newlines
    
    # Substitute block comments with empty string
    cleaned_code = re.sub(pattern, '', java_code)
    
    return cleaned_code

In [8]:
import re

def remove_block_comments(java_code):
    pattern = re.compile(r'/\*.*?\*/\s*\n?', re.DOTALL)  # Finds block and newline character after
    
    # Substitute block comments (and following newline) with empty string
    cleaned_code = re.sub(pattern, '', java_code)
    
    return cleaned_code


In [None]:
remove_block_comments(ds['test'][23]['prompt'])

In [None]:
ds['test'][23]['test']

In [None]:
print(prompt)

## Generating outputs

In [17]:
inputs = tokenizer.encode(prompt, return_tensors="pt").to('cuda:0')
outputs = model.generate(inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


//Source method
import java.util.*;
import java.lang.*;

class Solution {
    public double triangleArea(double a, double h) {
        return a * h / 2;
    }
}

//Unit Tests with Three Cases
public class Main {
    public static void main(String[] args) {
        Solution s = new Solution();
        List<Boolean> correct = Arrays.asList(
                s.triangleArea(5, 3) == 7.5,
                s.triangleArea(2, 2) == 2.0,
                s.triangleArea(10, 8) == 40.0
        );
        if (correct.contains(false)) {
            throw new AssertionError();
        }
    }
}


//Source method
import java.util.*;
import java.lang.*;

class Solution {
    public int strlen(String string) {
        return string.length();
    }
}

//Unit Tests with Three Cases
public class Main {
    public static void main(String[] args) {
        Solution s = new Solution();
        List<Boolean> correct = Arrays.asList(
                s.strlen("Hello") == 5,
                s.strlen("World") == 5,


In [None]:
inputs = tokenizer.encode(prompt, return_tensors="pt").to('cuda:0')
outputs = model.generate(inputs, 
                         
                         ####OUTPUT LENGTH CONTROL####
                         #max_length = 1200, #optional, default = 20, max length of tokens in input prompt + newly generated tokens
                         max_new_tokens = 300, #optional, max number of model generated tokens (not including prompt)
                         min_length = 0, #optional, default = 0, min length of tokens in input prompt + newly generated tokens
                         #min_new_tokens, #optional, min number of model generated tokens (not including prompt)
                         early_stopping = False, #optional, default = False, True: stops with num_beams completion candidates; False: Stops when better candidates are unlikely; "never": only stops when there cannot be any better candidates
                         max_time = 120, #optional (in seconds)
                         #stop_strings, #optional (string or list of strings that terminate generation)
                         
                         ####MANIPULATION OF MODEL OUTPUT LOGITS####
                         temperature = 1.5, #optional, default = 1.0, >1.0: more random; <1.0: more controlled/repetitive; 1.0: balance between both (check generation.config file for default values)
                         top_k = 50, #optional, default = 50, considers the k most probable tokens for generation
                         top_p = 1, #optional, default = 1.0, cumulative probability of all top_k tokens needed; if set <1.0, only the number of tokens that achieve that probability are considered (check generation.config for alterations)
                         #min_p = 0.01, #optional, minimum token probability for consideration, scaled based on the most likely token
                         typical_p = 1.0, #optional, default = 1.0, ?
                         epsilon_cutoff = 0.0, #optional, default = 0.0, tokens must have a conditional probability greater than the value (3e-4 - 9e-4)
                         eta_cutoff = 0.0, #optional, default = 0.0, hybrid between epsilon and typical p sampling (3e-4 - 2e-3)
                         diversity_penalty = 2.0, #optional, default = 0.0, value subtracted from beam score if another beam generates the same token; group beam search must be enabled
                         repetition_penalty = 10.0, #optional, default = 1.0, 1.0: no penalty, ****need to experiment with values****
                         #encoder_repitition_penalty = 1.0, #optional, default = 1.0, exponential penalty on sequences not in the original input
                         length_penalty = 0.0, #optional, default = 1.0, exponential penalty to length; <0.0 means shorter sequences; >0.0 means longer sequences
                         no_repeat_ngram_size = 0, #optional, default = 0, if set, ngrams of that size can never be repeated once used
                         #bad_words_ids, #optional, tokens that are prevented from being generated
                         #force_word_ids, #optional, list of token ids that must be generated
                         renormalize_logits = True, #optional, default = False, whether logits are renormalized,recommended to be set to True as logit processors can break normalization
                         #constraints, #optional, list of constraints added to generation to promote usage of certain tokens
                         forced_bos_token_id = model.config.forced_bos_token_id, #optional, defaults to model.config.forced_bos_token_id, this is the token forced to be the first generation
                         forced_eos_token_id = model.config.forced_eos_token_id, #optional, default = model.config.forced_eos_token_id, this is the token forced to be the last generation
                         remove_invalid_values = model.config.remove_invalid_values, #optional, defaults to model.config.remove_invalid_values, if true, removes nan and inf outputs to prevent crash, but slows down generation
                         #exponential_decay_length_penalty, #optional, given tuple that adds increasing penalty after the given number of tokens are generated tuple(int(num tokens), float(penalty))
                         #suppress_tokens, #optional, list of tokens suppressed at generation; log probs set to inf
                         #begin_suppress_tokens, #optional, list of tokens suppressed at beginning of generation; log probs set to inf
                         #forced_decoder_ids, #optional, list of generation indicies to token indicies that will be forced before sampling begins
                         #sequence_bias, #optional, dictionary mapping sequences to bias terms; positive bias = increased odds, negative bias = decreased odds
                         token_healing = False, #optional, default = False
                         #guidance_scale, #optional
                         #low_memory, #optional
                         #watermarking_config, #optional

                         ####GENERATION STRATEGY####
                         do_sample = False, #optional, default = False, whether sampling is used; greedy decoding if False
                         num_beams = 8, #optional, default = 1, number of beams used for beam search; 1 means no beam search
                         num_beam_groups = 4, #optional, default = 1, number of beam groups that beams divide into to ensure diversity among beam groups
                         #penalty_alpha, #optional, balances model confidence and degeneration penalty in contrastive search decoding
                         #dola_layers, #optional, number of layers used for DoLa decoding (LOOK INTO MORE THROUGH DOCS AND PAPER)

                         ####SPECIAL TOKENS AT GENERATION TIME####
                         pad_token_id = tokenizer.eos_token_id, #optional, id of padding token
                         #bos_token_id, #optional, id of beginning-of-sequence token
                         #eos_token_id, #optional, id of end-of-sequence token

                         ####CACHE CONTROL####
                         use_cache = True, #optional, default = True, whether model uses past key/values attentions to speed up decoding
                         cache_implementation = None, #optional, default = None, type of cache implementation used
                         cache_config = None, #optional, default = None, 
                         return_legacy_cache = True, #optional, default = True                   
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
del(model)
del(tokenizer)

# Models

## SantaCoder

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/santacoder"
device = "cuda:2" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)


## StarCoder2-3b

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoder2-3b"
device = "cuda:2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)

inputs = tokenizer.encode(ds[0]['prompt'], return_tensors="pt").to(device)
outputs = model.generate(inputs, 
                         max_new_tokens=400,
                         top_k = 0.7)
print(tokenizer.decode(outputs[0]))

In [None]:
inputs = tokenizer.encode(ds[0]['prompt'], return_tensors="pt").to(device)
outputs = model.generate(inputs, 
                         max_new_tokens=400,
                         top_k = 0.7)
print(tokenizer.decode(outputs[0]))

## CodeGen2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "Salesforce/codegen2-1B_P"
device = "cuda:2" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)

inputs = tokenizer.encode("//Create a function that sums two numbers together and returns the result\npublic static int sum(int a, int b){", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))