<h1>Clean Try</h1>

In [2]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from torch.utils.data import Dataset, DataLoader
import re

# Custom dataset
class MyTokenizer(Dataset):
    """Map-style dataset that tokenizes one (instruction, code) row per item.

    Args:
        data: DataFrame holding an 'instruction' column (prompt text) and a
            'code' column (target text).
        path_to_hub: Checkpoint identifier passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, data, path_to_hub):
        tokenizer = AutoTokenizer.from_pretrained(path_to_hub)
        # Reuse the end-of-sequence token as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer
        self.data = data
        self.content_name = 'instruction'
        self.target_name = 'code'

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Returns:
            dict with 'input_ids' and 'attention_mask' (from the instruction)
            and 'labels' (from the code), each a ``torch.long`` tensor.
        """
        row = self.data.iloc[index]
        encoded_prompt = self.tokenizer(row[self.content_name], padding=True)
        encoded_target = self.tokenizer.encode(row[self.target_name], padding=True)

        item = {
            field: torch.tensor(encoded_prompt[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(encoded_target, dtype=torch.long)
        return item
    
# Dataloader
def data_loader(data, batch_size, path_to_hub='bigcode/santacoder'):
    """Wrap ``data`` in a ``MyTokenizer`` dataset and return a shuffling DataLoader.

    Args:
        data: DataFrame with the columns ``MyTokenizer`` reads.
        batch_size: Number of items per batch.
        path_to_hub: Checkpoint identifier used to load the tokenizer.
            ``MyTokenizer`` requires it; the previous code omitted it and
            raised ``TypeError`` on every call.

    Returns:
        A ``DataLoader`` over the dataset with ``shuffle=True`` and
        ``num_workers=0``.
    """
    # NOTE(review): the dataset returns unpadded per-row tensors as far as its
    # code shows; the default collate function stacks tensors, so
    # batch_size > 1 probably needs a padding collate_fn - confirm.
    dataset = MyTokenizer(data, path_to_hub)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# SantaCoder
class MySantaCoder(nn.Module):
    """Causal-LM wrapper that samples code completions and post-processes them.

    Args:
        model_path_to_hub: Checkpoint identifier (e.g. "bigcode/santacoder")
            used to load both the model and its tokenizer.
    """

    def __init__(self, model_path_to_hub):
        super(MySantaCoder, self).__init__()
        self.checkpoint = model_path_to_hub
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = 512
        self.stop_words = ["\nclass", "\ndef", "\nassert", '\n"""', "\nprint", "\n<|"]

        # NOTE(review): `max_length` bounds prompt + generated tokens in the
        # Transformers API, while the attribute name suggests a bound on new
        # tokens only - confirm which is intended before changing it.
        self.generation_config = GenerationConfig(
            do_sample=True,
            top_k=5,
            max_length=self.max_new_tokens,
            eos_token_id=self.model.generation_config.eos_token_id,
            bos_token_id=self.model.generation_config.bos_token_id,
        )

    def forward(self, input):
        """Generate token ids for one tokenized example.

        Args:
            input: Mapping with an 'input_ids' tensor; a leading batch
                dimension is added with ``unsqueeze(0)`` before generation.

        Returns:
            Whatever ``self.model.generate`` returns for that single prompt.
        """
        input_ids = input['input_ids'].unsqueeze(0)
        # Pass the config by keyword so it cannot be mistaken for another
        # positional argument of generate().
        return self.model.generate(input_ids, generation_config=self.generation_config)

    def decode_output(self, encoded_output):
        """Decode the first sequence of ``encoded_output`` back to text."""
        return self.tokenizer.decode(encoded_output[0])

    def post_generation_processing(self, encoded_output):
        """Decode the first sequence and keep only its first code block.

        The text is split on the keywords ``def``, ``class``, ``assert`` and
        ``print`` (each followed by a space). The result is everything before
        the first keyword, a newline, that keyword, and the text up to the
        next keyword.

        The matched keyword is kept as-is (it used to be guessed from the
        substring 'init', which relabelled classes as ``def``). Text without
        any keyword is returned unchanged instead of raising ``IndexError``.
        """
        code = self.tokenizer.decode(encoded_output[0])
        # The capturing group makes re.split keep the matched keyword:
        # [preamble, keyword, block, keyword, block, ...]
        pieces = re.split(r'(def |class |assert |print )', code)
        if len(pieces) < 3:
            return code
        preamble, keyword, first_block = pieces[:3]
        return preamble + '\n' + keyword + first_block

  from .autonotebook import tqdm as notebook_tqdm


<h3>Testing time</h3>

In [None]:
import pandas as pd

# eval
def eval(path_to_hub, path_to_data, path_to_save, early_stop = 3):
    """Generate code for rows of a CSV dataset and save the outputs.

    Args:
        path_to_hub: Checkpoint identifier used for both the model and the
            tokenizer.
        path_to_data: CSV file read with ``pandas.read_csv``.
        path_to_save: Prefix concatenated with "mbpp_generated.csv" to build
            the output path.
        early_stop: Generation stops once a row index greater than this
            value has been processed. Pass ``None`` to process every row.

    Returns:
        List of decoded generations, one per processed row.

    The saved CSV contains only the rows that were actually generated, with
    the generations in a 'Gen_code' column. Previously the full frame was
    assigned a shorter list, which raised ``ValueError`` whenever generation
    stopped early.
    """
    print('Start to instanciate model and data...')
    model = MySantaCoder(path_to_hub)
    data = pd.read_csv(path_to_data)

    mbpp_data = MyTokenizer(
        data=data,
        path_to_hub=path_to_hub
    )
    results = []
    model.eval()
    print('Start code generation...')
    for i in range(len(mbpp_data)):
        output = model(mbpp_data[i])
        results.append(model.decode_output(output))
        if early_stop is not None and i > early_stop:
            break

    # Rows are processed in order, so the generations line up with the first
    # len(results) rows of the frame.
    generated = data.iloc[:len(results)].copy()
    generated['Gen_code'] = results
    print('Save generated data ...')
    generated.to_csv(path_to_save + "mbpp_generated.csv", index=False)

    return results


# Checkpoint identifier handed to eval(), which uses it for both the model and the tokenizer.
path_to_hub = 'bigcode/santacoder'
# CSV file that eval() reads as the evaluation set.
path_to_data = '../data/mbpp_test.csv'
# Prefix that eval() concatenates with "mbpp_generated.csv" when saving.
path_to_save = '../data/'

# Run generation with the default early_stop and keep the decoded outputs.
results = eval(path_to_hub, path_to_data, path_to_save)


In [1]:
import pandas as pd

# Load the reports table; its 'Pass_one' column is parsed in the cells below.
reports = pd.read_csv('../data/reports.csv')

In [13]:
import ast
# The 'Pass_one' cell holds a Python literal stored as text; parse the first row's value.
extracted_list = ast.literal_eval(reports['Pass_one'][0])

In [11]:
extracted_list

[0.0, 0.0, 0.0]

In [15]:
import numpy as np
import ast 
def calculate_pass_percentage(df):
    """Return the percentage of passed tests across all rows of ``df``.

    Args:
        df: DataFrame whose 'Pass_one' column holds, per row, the text of a
            Python list literal of numeric test outcomes (e.g.
            "[0.0, 1.0, 0.0]").

    Returns:
        ``100 * (sum of all outcomes) / (number of outcomes)``, or ``0.0``
        when there are no outcomes at all.
    """
    total_tests = 0
    total_passed = 0
    # Read from the `df` argument; the previous version ignored it and used
    # the global `reports` frame.
    for cell in df['Pass_one']:
        outcomes = ast.literal_eval(cell)
        total_tests += len(outcomes)
        total_passed += sum(outcomes)

    # Avoid ZeroDivisionError on an empty frame or rows with empty lists.
    if total_tests == 0:
        return 0.0
    return (total_passed / total_tests) * 100

In [16]:
# Overall pass percentage for the loaded reports table.
result = calculate_pass_percentage(reports)

In [17]:
result

11.51797603195739

<h1>Calculate Reports</h1>

In [None]:
import pandas as pd
import ast 

# Load the reports table; each row's 'Pass_one' cell is parsed with ast.literal_eval below.
reports = pd.read_csv('../data/reports.csv')

def calculate_pass_percentage(df):
    """Return the percentage of passed tests across all rows of ``df``.

    Args:
        df: DataFrame whose 'Pass_one' column holds, per row, the text of a
            Python list literal of numeric test outcomes (e.g.
            "[0.0, 1.0, 0.0]").

    Returns:
        ``100 * (sum of all outcomes) / (number of outcomes)``, or ``0.0``
        when there are no outcomes at all.
    """
    total_tests = 0
    total_passed = 0
    # Use the frame that was passed in rather than the module-level
    # `reports`, so the function works on any DataFrame.
    for cell in df['Pass_one']:
        outcomes = ast.literal_eval(cell)
        total_tests += len(outcomes)
        total_passed += sum(outcomes)

    # Nothing to average: report 0.0 instead of dividing by zero.
    if total_tests == 0:
        return 0.0
    return (total_passed / total_tests) * 100

# Overall pass percentage for the loaded reports table.
result = calculate_pass_percentage(reports)

<h1>SantaCoder (old version)</h1>

In [19]:
import pandas as pd
import sys
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

class SantaCoder(nn.Module):
    """Earlier wrapper around a causal LM that works directly on prompt strings.

    Args:
        model_path_to_hub: Checkpoint identifier (e.g. "bigcode/santacoder")
            used to load both the model and its tokenizer.
    """

    def __init__(self, model_path_to_hub):
        super(SantaCoder, self).__init__()
        self.checkpoint = model_path_to_hub
        self.santa = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = 150
        self.max_time = 120

    def forward(self, input):
        """Sample a completion for the prompt string ``input`` and decode it.

        Returns:
            The decoded text of the first generated sequence.
        """
        inputs = self.tokenizer.encode(input, return_tensors="pt")
        # NOTE(review): 49152 is hard-coded as both BOS and EOS id;
        # presumably it matches the checkpoint's special token - confirm.
        outputs = self.santa.generate(
            inputs,
            early_stopping=True,
            bos_token_id=49152,
            eos_token_id=49152,
            max_length=self.max_new_tokens,
            do_sample=True,
            max_time=self.max_time,
        )
        return self.tokenizer.decode(outputs[0])

    def generate(self, input):
        """Greedily extend the prompt one token at a time until a block ends.

        Decoding stops as soon as the decoded text contains a blank line
        ("\\n\\n") and does not end with a space, newline, tab or 'd', or
        after ``self.max_new_tokens`` new tokens, whichever comes first. The
        cap replaces a previously unbounded loop.

        Returns:
            The decoded text, with one trailing "def" removed if present.
        """
        input_ids = self.tokenizer.encode(input, return_tensors="pt")
        output_ids = input_ids[0]
        output_code = self.tokenizer.decode(output_ids)
        # Inference only: avoid building an autograd graph at every step.
        with torch.no_grad():
            for _ in range(self.max_new_tokens):
                logits = self.santa(input_ids=output_ids.unsqueeze(0), return_dict=True).logits
                next_token = torch.argmax(logits[:, -1, :], dim=-1)
                output_ids = torch.cat((output_ids, next_token), dim=-1)
                output_code = self.tokenizer.decode(output_ids)
                if '\n\n' in output_code and not output_code.endswith((' ', '\n', '\t', 'd')):
                    break
        # Drop only a literal trailing "def". str.rstrip('def') removes any
        # run of the characters d/e/f, e.g. turning "...else" into "...els".
        if output_code.endswith('def'):
            output_code = output_code[:-len('def')]
        return output_code

class MbppDataset(Dataset):
    """Dataset that tokenizes every (instruction, code) row up front.

    Args:
        df: DataFrame with 'instruction' and 'code' columns.
        model_id: Checkpoint identifier passed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Accepted for compatibility; it is not applied to the
            encodings. NOTE(review): presumably meant as a truncation
            limit - confirm before wiring it in.
    """

    def __init__(self, df, model_id, max_length=1024):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.input_ids = []
        self.labels = []

        for instruction, code in zip(df['instruction'], df['code']):
            # encode(..., return_tensors="pt") already returns a tensor, so
            # it is stored directly, the same way as the labels. Re-wrapping
            # it in torch.tensor() only copied it and emitted a
            # copy-construct warning.
            self.input_ids.append(self.tokenizer.encode(instruction, return_tensors="pt"))
            self.labels.append(self.tokenizer.encode(code, return_tensors="pt"))

    def __len__(self):
        """Return the number of tokenized rows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, labels)`` pair stored for row ``index``."""
        return self.input_ids[index], self.labels[index]

In [None]:
# model = AutoModelForCausalLM.from_pretrained("bigcode/santacoder", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")
# instruction = df.iloc[0]['instruction']
# encoded_instruction = tokenizer.encode(instruction, return_tensors="pt")
# outputs = model.generate(encoded_instruction, max_length=80)


<h1>Post-generation processing attempts</h1>

In [None]:
def extract_first_block(code_str):
    """Return ``code_str`` up to, but not including, the next def/class line.

    The first line is never treated as a boundary, so text that opens with a
    ``def`` or ``class`` keeps that opening line. A boundary is any later
    line whose stripped text starts with "def " or "class ".

    Args:
        code_str: Source text with "\\n" line separators.

    Returns:
        The lines before the first boundary joined with "\\n", or the whole
        input when no boundary exists. The no-boundary case used to repeat
        the second-to-last line and drop the last one, and a single-line
        input raised ``UnboundLocalError``.
    """
    lines = code_str.split('\n')
    for idx in range(1, len(lines)):
        if lines[idx].strip().startswith(("def ", "class ")):
            return '\n'.join(lines[:idx])
    return code_str

def extract_first_block_1(text):
    """Collect lines from the first keyword line until indentation drops.

    Scanning starts at the first line whose left-stripped text begins with
    "class ", "def ", "assert " or "print ". That line fixes the reference
    indentation. Following lines are kept while their indentation is at
    least that value, and scanning stops at the first line indented less.

    Args:
        text: Source text with "\\n" line separators.

    Returns:
        The kept lines joined with "\\n" (empty string if no line starts
        with one of the keywords).
    """
    openers = ("class ", "def ", "assert ", "print ")
    kept = []
    base_indent = None  # stays None until the opening line is found

    for raw in text.split("\n"):
        body = raw.lstrip()
        indent = len(raw) - len(body)

        if base_indent is None:
            # Still looking for the line that opens the block.
            if body.startswith(openers):
                base_indent = indent
                kept.append(raw)
            continue

        if indent < base_indent:
            # First line shallower than the opener: the block is over.
            break
        kept.append(raw)

    return "\n".join(kept)

def extract_the_first_block_2(code):
    """Keep the text before the first keyword plus the first keyword block.

    ``code`` is split on "def ", "class ", "assert " and "print ". The
    result is the leading text, a newline, the first matched keyword, and
    everything up to the next keyword.

    The matched keyword is preserved (it used to be replaced by "def "
    unconditionally), and text containing no keyword is returned unchanged
    instead of raising ``IndexError``.
    """
    # The capturing group makes re.split return the keywords too:
    # [preamble, keyword, block, keyword, block, ...]
    pieces = re.split(r'(def |class |assert |print )', code)
    if len(pieces) < 3:
        return code
    preamble, keyword, first_block = pieces[:3]
    return preamble + '\n' + keyword + first_block

def extract_function(code_str, stop_words):
    """Truncate ``code_str`` at the earliest occurrence of any stop word.

    Args:
        code_str: Text to truncate.
        stop_words: Iterable of substrings that mark the end of the wanted
            text.

    Returns:
        The prefix of ``code_str`` before the first stop word found, or the
        whole string when none of them occurs.
    """
    # Position of every stop word that is actually present (-1 means absent).
    hits = [
        position
        for position in (code_str.find(word) for word in stop_words)
        if position != -1
    ]
    # With no hit at all, cut at the end, i.e. keep everything.
    cut = min(hits, default=len(code_str))
    return code_str[:cut]