In [40]:
import torch
import re
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import numpy as np
import random
from torch.utils.data import DataLoader
from tokenizer import *
from generation_processing import *

class MySantaCoder(nn.Module):
    """Wrapper around the ``bigcode/santacoder`` causal language model.

    The constructor loads the checkpoint and its tokenizer and prepares a
    ``GenerationConfig`` (banned token sequences, beam count, token budget).
    The remaining methods generate, decode and post-process code.
    """

    def __init__(self, list_of_bad_words = ['#'], max_tokens = 128, num_sol = 1):
        """Load model and tokenizer and build the generation configuration.

        Args:
            list_of_bad_words: strings the model must not generate; each one is
                tokenized and passed as ``bad_words_ids``. The default list is
                only read, never mutated.
            max_tokens: value used for ``max_new_tokens``.
            num_sol: value used for both ``num_beams`` and
                ``num_return_sequences``.
        """
        super(MySantaCoder, self).__init__()
        self.checkpoint = "bigcode/santacoder"
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = max_tokens
        # Token-id sequences that generation is forbidden to produce.
        self.bad_words = self.get_input_ids_as_list(list_of_bad_words)

        self.generation_config = GenerationConfig(
                bad_words_ids = self.bad_words,
                num_beams = num_sol,
                num_return_sequences = num_sol,
                max_new_tokens = self.max_new_tokens,
                eos_token_id=self.model.generation_config.eos_token_id,
                bos_token_id=self.model.generation_config.bos_token_id
                )

    def get_input_ids_as_list(self, list_of_bad_words):
        """Return one tokenizer encoding per entry of ``list_of_bad_words``."""
        return [self.tokenizer.encode(element) for element in list_of_bad_words]

    def forward(self, input_ids):
        """Generate from ``input_ids`` using the stored generation config.

        Args:
            input_ids: encoded prompt, forwarded as-is to ``model.generate``
                (presumably a batched tensor such as the output of
                ``tokenizer.encode(..., return_tensors='pt')`` - confirm).

        Returns:
            Whatever ``self.model.generate`` returns for these inputs.
        """
        # Pass the config by keyword so it cannot be mistaken for another
        # positional argument of ``generate``.
        outputs = self.model.generate(input_ids, generation_config=self.generation_config)
        return outputs

    def decode_output(self, encoded_output):
        """Decode a single sequence of token ids back into text."""
        output = self.tokenizer.decode(encoded_output)
        return output

    def post_generation_processing(self,code):
        """Keep only the text up to the second block keyword in ``code``.

        ``code`` is split on ``'def '``, ``'class '``, ``'assert '`` and
        ``'print '``. The result is the text before the first keyword, a
        re-inserted keyword, and the first block that follows it. Because the
        split discards the matched keyword, it is re-inserted heuristically:
        ``'\\nclass '`` when the first block contains ``'init'``, otherwise
        ``'\\ndef '``.

        Args:
            code: generated source text.

        Returns:
            The truncated text, or ``code`` unchanged when it contains none of
            the split keywords (this case used to raise ``IndexError``).
        """
        # split it into list of blocks
        list_blocks = re.split('def |class |assert |print ', code)
        if len(list_blocks) < 2:
            # No keyword found: there is no "first block" to isolate.
            return code
        if 'init' in list_blocks[1]:
            fill_word = '\nclass '
        else:
            fill_word = '\ndef '
        # keep only the first block
        result = list_blocks[0] + fill_word + list_blocks[1]
        return result


In [23]:
from get_data import * 

# Load the two MTBP files through read_json_line_format (star-imported from
# get_data; presumably returns a pandas DataFrame, since `.iloc` is used on the
# results below - confirm). Note that the files on disk are spelled "mtpb"
# while the variables are spelled "mtbp".
converted_mtbp = read_json_line_format('data/MTBP/converted_mtpb.jsonl')
mtbp = read_json_line_format('data/MTBP/mtpb.jsonl')

In [25]:
# Display the first row of the converted frame to inspect its columns.
converted_mtbp.iloc[0]

text                    Append a string in the middle of another string.
prompt                  Append a string in the middle of another string.
code                                     def sandwich_string(A):\r\npass
task_id                                                                1
test_setup_code                                                         
test_list              [assert sandwich_string('abcde') == ['a', 'b',...
challenge_test_list                                                   []
entry_point                                              sandwich_string
test                   def check(candidate):\n\tassert sandwich_strin...
signature                                        def sandwich_string(A):
Name: 0, dtype: object

In [24]:
# Display the first row of the raw (multi-prompt) frame to inspect its columns.
mtbp.iloc[0]

prompts           [Assign the string "{A}" to a variable named "...
inputs            [{'A': 'abcde'}, {'A': 'abcdecadeCADE'}, {'A':...
outputs           [[a, b, c, d, e], [a, b, c, d, e], [a], [ , e,...
max_gen_length                                                128.0
category                                                     string
name                                                Sandwich string
description        Append a string in the middle of another string.
id                                                                1
Name: 0, dtype: object

In [41]:
import re
import random
import string

def generate_random_name(signature, rng=None):
    """Replace the function name in a ``def`` signature with random letters.

    The new name has the same length as the original one and consists of
    lowercase ASCII letters. Names that are reserved Python keywords (for
    example ``if`` or ``as``) are rejected and redrawn, so the returned
    signature stays syntactically valid.

    Args:
        signature: text expected to start with ``def <name>(<params>):``.
        rng: optional object exposing ``choice(seq)`` (e.g. a seeded
            ``random.Random``) for reproducible names. Defaults to the global
            ``random`` module.

    Returns:
        ``'def <random_name>(<params>):'`` when the signature matches the
        expected pattern. Otherwise the signature is printed and returned
        unchanged.
    """
    import keyword  # local import: only needed for the keyword check below

    chooser = random if rng is None else rng

    # Find the function name and the parameter list.
    match = re.match(r'def ([\w\-_%.]+)\((.*)\):', signature)
    if not match:
        # Not a recognised signature: report it and leave it untouched.
        print(signature)
        return signature

    original_name, parameters = match.groups()

    # Draw names of the same length until one is not a reserved keyword.
    while True:
        random_name = ''.join(
            chooser.choice(string.ascii_lowercase) for _ in range(len(original_name))
        )
        if not keyword.iskeyword(random_name):
            break

    return f'def {random_name}({parameters}):'

def custom_dataset_context_investigation(mtbp_converted, mtbp):
    """Build the frame used for the context / no-context comparison.

    Keeps ``prompts`` from ``mtbp`` and ``text``, ``signature`` and
    ``test_list`` from ``mtbp_converted``, places them side by side, and adds
    a ``random_signatures`` column holding each signature with its function
    name randomised by ``generate_random_name``.

    Args:
        mtbp_converted: frame providing 'text', 'signature' and 'test_list'.
        mtbp: frame providing 'prompts'.

    Returns:
        The combined frame with the extra ``random_signatures`` column.
    """
    # Restrict each source to the columns of interest.
    converted_view = mtbp_converted[['text', 'signature', 'test_list']]
    prompts_view = mtbp[['prompts']]

    # Column-wise concatenation: prompts first, converted columns after.
    data = pd.concat([prompts_view, converted_view], axis=1)

    # One randomised signature per row, in row order.
    data['random_signatures'] = [
        generate_random_name(signature) for signature in data['signature']
    ]

    return data
    


In [63]:
# Words handed to generation_cut_off as stop words.
STOP_WORDS = ['def', 'if', 'for', 'while']

def context_and_contexless_generation(data, model, early_stopping = None):
    """Generate code prompt by prompt, with and without accumulated context.

    For every problem (row of ``data``) and every prompt of that problem, two
    generations are run:
        1. "context": starts from the real signature and keeps feeding the
           previously cut-off code (comments included) back as input.
        2. "no context": starts from the random signature; after cut-off the
           result is passed through ``remove_context`` before being fed back.
    In addition, each raw generation is kept with only the coarse truncation
    of ``model.post_generation_processing``.

    Args:
        data: frame with 'signature', 'random_signatures' and 'prompts'.
        model: object exposing ``tokenizer``, ``forward``, ``decode_output``
            and ``post_generation_processing``.
        early_stopping: if not None, rows with index greater than this value
            are skipped (rows ``0..early_stopping`` inclusive are processed).

    Returns:
        Tuple ``(codes_with_context, raw_generations_context,
        codes_without_context, raw_generations_no_context)``; each element is
        a list (one entry per problem) of lists (one entry per prompt).
    """
    all_cut_context = []
    all_raw_context = []
    all_cut_no_context = []
    all_raw_no_context = []

    for problem_idx in range(len(data)):
        if early_stopping is not None and problem_idx > early_stopping:
            break

        row = data.iloc[problem_idx]
        # Running inputs: real signature vs. randomised signature.
        running_code = row['signature']
        running_code_random = row['random_signatures']
        problem_prompts = row['prompts']

        cut_context, cut_no_context = [], []
        raw_context, raw_no_context = [], []

        for prompt_idx, prompt in enumerate(problem_prompts):
            # Append the prompt as a comment under the code produced so far.
            text_context = running_code + '\n\t#' + prompt
            text_no_context = running_code_random + '\n\t#' + prompt

            # Tokenize both inputs.
            ids_context = model.tokenizer.encode(text_context, return_tensors='pt')
            ids_no_context = model.tokenizer.encode(text_no_context, return_tensors='pt')

            # Generate for both inputs.
            generated_context = model.forward(ids_context)
            generated_no_context = model.forward(ids_no_context)

            # Decode the first returned sequence of each generation.
            decoded_context = model.decode_output(generated_context[0])
            decoded_no_context = model.decode_output(generated_no_context[0])

            # Structural cut-off; the no-context variant always uses prompt
            # index 0 and then has its context stripped.
            running_code = generation_cut_off(
                gen_code = decoded_context, stop_words=STOP_WORDS, index_prompt=prompt_idx
            )
            running_code_random = generation_cut_off(
                gen_code = decoded_no_context, stop_words=STOP_WORDS, index_prompt=0
            )
            running_code_random = remove_context(running_code_random)

            # Coarse cut-off of the raw generations.
            decoded_context = model.post_generation_processing(decoded_context)
            decoded_no_context = model.post_generation_processing(decoded_no_context)

            cut_context.append(running_code)
            cut_no_context.append(running_code_random)
            raw_no_context.append(decoded_no_context)
            raw_context.append(decoded_context)

        all_cut_context.append(cut_context)
        all_cut_no_context.append(cut_no_context)
        all_raw_context.append(raw_context)
        all_raw_no_context.append(raw_no_context)

    return all_cut_context, all_raw_context, all_cut_no_context, all_raw_no_context

In [64]:
from get_data import *
# Reload both MTBP files (the files on disk are spelled "mtpb").
mtbp_converted = read_json_line_format('data/MTBP/converted_mtpb.jsonl')
mtbp = read_json_line_format('data/MTBP/mtpb.jsonl')

# Combine prompts, text, signatures and tests, and add randomised signatures.
dataset = custom_dataset_context_investigation(mtbp_converted, mtbp)

In [48]:
# Display the first row of the combined dataset, including its random signature.
dataset.iloc[0]

prompts              [Assign the string "{A}" to a variable named "...
text                  Append a string in the middle of another string.
signature                                      def sandwich_string(A):
test_list            [assert sandwich_string('abcde') == ['a', 'b',...
random_signatures                              def uxbwfshjwzjglbh(A):
Name: 0, dtype: object

In [65]:

model = MySantaCoder()
# Unpack in the order context_and_contexless_generation returns its results:
# (cut-off with context, raw with context, cut-off without context, raw without
# context). Each target name must be distinct, otherwise a later value silently
# overwrites an earlier one. early_stopping=2 processes rows 0 through 2.
(
    codes_with_context,
    raw_generations_context,
    codes_without_context,
    raw_generations_no_context,
) = context_and_contexless_generation(data=dataset, model=model, early_stopping=2)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Set

ValueError: too many values to unpack (expected 2)

In [58]:
# Per-prompt cut-off generations (with context) for the problem at index 1.
codes_with_context[1]

['def normalize_integer_list(numbers):\n\t#Define a list of integers named "numbers" with the values {numbers}.\n\tnumbers = [int(x) for x in numbers]',
 'def normalize_integer_list(numbers):\n\t#Calculate the sum of the elements in variable "numbers" and store the result to variable "total".\n\ttotal = sum(numbers)',
 'def normalize_integer_list(numbers):\n\t#Divide each element of the list by the total and multiply by 100, store the result to variable "normalized".\n\tnormalized = []',
 'def normalize_integer_list(numbers):\n\t#Convert each element in variable "normalized" into a formatted string with single decimal point and store the result into "formatted".\n\tformatted = []',
 'def normalize_integer_list(numbers):\n\t#Print the variable "formatted".\n\tformatted = []']

In [59]:
# Per-prompt cut-off generations (without context) for the problem at index 1.
codes_without_context[1]

['def wicupvsftvndphmdqgwdyw(numbers):\n\tnumbers = [int(x) for x in numbers]',
 'def wicupvsftvndphmdqgwdyw(numbers):\n\ttotal = 0',
 'def wicupvsftvndphmdqgwdyw(numbers):\n\tnormalized = []',
 'def wicupvsftvndphmdqgwdyw(numbers):\n\tformatted = ""',
 'def wicupvsftvndphmdqgwdyw(numbers):\n\tformatted = ""']