In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MyTokenizer(Dataset):
    def __init__(self, data, path_to_hub):
        self.tokenizer = AutoTokenizer.from_pretrained(path_to_hub) # Load your tokenizer here with a pre-specified vocabulary
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.data = data # a dataframe
        self.content_name = 'instruction'
        self.target_name = 'code'
        self.list_test = 'test_list'

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        text = self.data.iloc[index][self.content_name]
        code = self.data.iloc[index][self.target_name]
        test_list = self.data.iloc[index][self.list_test]

        inputs = self.tokenizer(text, padding=True)
        label = self.tokenizer.encode(code, padding=True)
        tests = self.tokenizer.encode(test_list, padding=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
            'tests' : torch.tensor(tests, dtype=torch.long)
        }

class MySantaCoder(nn.Module):
    def __init__(self, generation_method, num_sol = 1):
        super(MySantaCoder, self).__init__()
        self.checkpoint = "bigcode/santacoder"
        # self.checkpoint = model_path_to_hub
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = 128

        if generation_method == 'greedySearch':

            self.generation_config = GenerationConfig(
                num_beams = num_sol,
                num_return_sequences = num_sol,
                max_length = self.max_new_tokens,
                eos_token_id=self.model.generation_config.eos_token_id,
                bos_token_id=self.model.generation_config.bos_token_id
                )
        elif generation_method == 'samplingMethod' : 
     
            self.generation_config = GenerationConfig(   
                do_sample = True,  
                num_beams = num_sol,
                num_return_sequences = num_sol,
                top_p = 0.8,
                temperature = 0.95,
                max_length = self.max_new_tokens,
                eos_token_id=self.model.generation_config.eos_token_id,
                bos_token_id=self.model.generation_config.bos_token_id
                )

    def forward(self, input_ids):
        # input_ids = input_ids.unsqueeze(0)
        outputs = self.model.generate(input_ids, self.generation_config)
        return outputs

    def decode_output(self, encoded_output):
        output = self.tokenizer.decode(encoded_output)
        return output

    def post_generation_processing(self,code):
        # split it into list of blocks
        list_blocks = re.split('def |class |assert |print ', code)
        if 'init' in list_blocks[1]:
            fill_word = '\nclass '
        else:
            fill_word = '\ndef '
        # keep only the first block
        result = list_blocks[0] + fill_word + list_blocks[1]
        return result

In [3]:
def eval(test_loader, model, early_stoping = None):
    """
    This function evaluate the model on the test data
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    final_outputs=[]

    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
          ids = data['input_ids'].to(device, dtype = torch.long)
          targets = data['labels'].to(device, dtype = torch.long)
          tests = data['tests'].to(device, dtype = torch.long)
          # forward
          output = model.forward(ids).to(device)
          # postprocessing output
          decoded_output = [model.decode_output(t.cpu().numpy()) for t in output]
          code_generated = [model.post_generation_processing(dec) for dec in decoded_output]

          final_outputs.append(code_generated)

    return final_outputs

def save_generated_data(data, generated_code):
  selected_feature = ['code', 'test_list']
  data = data[selected_feature]
  # add the new feature
  data['generated_code'] = generated_code

  data.to_csv("data/200_greedy_solutions.csv", index=False)
  return data

def dataloading(data, path_to_hub, batch_size, num_workers, g, seed_worker):
    # create dataset out of dataframe
    dataset = MyTokenizer(data, path_to_hub)

    # create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
        generator=g,
        )

    return dataloader

In [4]:
import pandas as pd
import numpy as np
import random

# instanciate the testing loader :
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# setting the seed
g = torch.Generator()
g.manual_seed(0)


# Create dataloader
batch_size = 1
num_workers = 0
testing_data = pd.read_csv('data/mbpp_test.csv')
PATH_TO_HUB = "bigcode/santacoder"


testloader = dataloading(testing_data, PATH_TO_HUB, batch_size, num_workers, g, seed_worker)

In [9]:
model = MySantaCoder()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [None]:
def extract_first_function(input_string):
    # Regular expression pattern to match Python function
    pattern = re.compile(r"(def .*?:.*?return .*?\n)(?=\n|\Z)", re.DOTALL)

    # Find all matches of the pattern in the input string
    matches = pattern.findall(input_string)

    # If no complete function is found, return the original string
    if not matches:
        return input_string

    # Return the first complete function
    return matches[0]

In [9]:
import pandas as pd

def find_max_token(df):
    max_token = 0
    avg = 0
    nm_to_be_generated = 0
    for i in range(len(df)):
        
        text = len(df.iloc[i]['text'])
        code = len(df.iloc[i]['code'])
        nm_to_be_generated += 10 + text
        avg += code + text
        nm_tokens = 1.1 * (text + code)
        if  nm_tokens > max_token:
            max_token = nm_tokens
    
    return max_token, avg/len(df), nm_to_be_generated/len(df)

data = pd.read_csv('data/mbpp_test.csv')
mx, avg, gen = find_max_token(data)


In [28]:
import pandas as pd
greedy_two_data = pd.read_csv('../data/200_greedy_solutions.csv')

# part of the processing
import ast

list_code = ast.literal_eval(greedy_two_data.iloc[0]['generated_code']) 

import re

def extract_first_function(input_string):
    # Regular expression pattern to match Python function
    pattern = re.compile(r"(def .*?:.*?return .*?\n)(?=\n|\Z)", re.DOTALL)

    # Find all matches of the pattern in the input string
    matches = pattern.findall(input_string)

    # If no complete function is found, return the original string
    if not matches:
        return input_string

    # Return the first complete function
    result = matches[0]
    
    return result

# still need to remove '<|endoftext|>' --> Is it a problem cannot remove it idk why


In [29]:
i = 22
new_function = extract_first_function(list_code[i])
print(new_function)
print(new_function.replace("<|endoftext|>",""))
print('============')
print(list_code[i])

def remove_Occ(s,ch):
	"""Write a python function to remove first and last occurrence of a given character from the string."""
	i = 0
	j = len(s) - 1
	while i < j:
		if s[i] == ch:
			i += 1
		elif s[j] == ch:
			j -= 1
		else:
			i += 1
			j -= 1
	return s[i+1:j+1]<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
def remove_Occ(s,ch):
	"""Write a python function to remove first and last occurrence of a given character from the string."""
	i = 0
	j = len(s) - 1
	while i < j:
		if s[i] == ch:
			i += 1
		elif s[j] == ch:
			j -= 1
		else:
			i += 1
			j -= 1
	return s[i+1:j+1]
def remove_Occ(s,ch):
	"""Write a python function to remove first and last occurrence of a given character from the string."""
	i = 0
	j = len(s) - 1
	while i < j:
		if s[i] == ch:
			i += 1
		elif s[j] == ch:
			j -= 1
		else:
			i += 1
			j -= 1
	return s[i+1:j+1]<|endoftext|><|endoftext|><|end

In [20]:
new_function[268:].replace("<|endoftext|>","")

'endoftext|>'

In [3]:
import re

def extract_function_definitions(input_string):
    pattern = r"(def .*?:.*?)(?=^def |\Z)"
    matches = re.findall(pattern, input_string, re.DOTALL | re.MULTILINE)
    return "\n".join(match.strip() for match in matches)


def extract_first_function(input_string):
    # Regular expression pattern to match Python function
    pattern = re.compile(r"(def .*?:.*?return .*?\n)(?=\n|\Z)", re.DOTALL)

    # Find all matches of the pattern in the input string
    matches = pattern.findall(input_string)

    # If no complete function is found, return the original string
    if not matches:
        return input_string

    # Return the first complete function
    return matches[0]

In [8]:
new_function = extract_first_function(list_code[122])

In [9]:
print(new_function)

def remove_Occ(s,ch):
	"""Write a python function to remove first and last occurrence of a given character from the string."""
	i = 0
	j = len(s)-1
	while i < j:
		if s[i] == ch:
			i += 1
		elif s[j] == ch:
			j -= 1
		else:
			i += 1
			j -= 1
	return s[:i] + s[j+1:]

