<h1>Clean Try <h1>

In [31]:
import pandas as pd
import sys
import re
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer,GenerationConfig
from torch.utils.data import Dataset, DataLoader

class MySantaCoder(nn.Module):
    def __init__(self):
        super(MySantaCoder, self).__init__()
        self.checkpoint = "bigcode/santacoder"
        # self.checkpoint = model_path_to_hub
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = 128
        # self.num_solution_to_generate = 3
        self.stop_words=["\nclass", "\ndef","\nassert", '\n"""', "\nprint", "\n<|"]

        self.generation_config = GenerationConfig( 
            # num_beams = self.num_solution_to_generate,
            # num_return_sequences = self.num_solution_to_generate,
            max_length = self.max_new_tokens,
            eos_token_id=self.model.generation_config.eos_token_id,
            bos_token_id=self.model.generation_config.bos_token_id
            )
    
    def forward(self, input_ids):
        input_ids = input_ids.unsqueeze(0)
        outputs = self.model.generate(input_ids, self.generation_config)
        return outputs
    
    def decode_output(self, encoded_output):
        output = self.tokenizer.decode(encoded_output[0])
        return output
    
    def post_generation_processing(self,code):
        # split it into list of blocks
        list_blocks = re.split('def |class |assert |print ', code)
        if 'init' in list_blocks[1]:
            fill_word = '\nclass '
        else:
            fill_word = '\ndef '
        # keep only the first block
        result = list_blocks[0] + fill_word + list_blocks[1]
        return result
    
class MyTokenizer(Dataset):
    def __init__(self, data):
        self.checkpoint = 'bigcode/santacoder'
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint) # Load your tokenizer here with a pre-specified vocabulary
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.data = data # a dataframe
        self.content_name = 'instruction'
        self.target_name = 'code'
        self.list_test = 'test_list'
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        text = self.data.iloc[index][self.content_name]
        code = self.data.iloc[index][self.target_name]
        test_list = self.data.iloc[index][self.list_test]

        inputs = self.tokenizer(text, padding=True)
        label = self.tokenizer.encode(code, padding=True)
        tests = self.tokenizer.encode(test_list, padding=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
            'tests' : torch.tensor(tests, dtype=torch.long)
        }

<h3>Testing time<h3>

In [32]:
import pandas as pd
data = pd.read_csv('../data/mbpp_prompt.csv')
model = MySantaCoder()
dataset = MyTokenizer(data)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [35]:
output = model.forward(dataset[6]['input_ids'])
tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")

dec = tokenizer.decode(output[0])
print(dec)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


#Write a function to find all words which are at least 4 characters long in a string by using regex.
def find_char_long(text):
    return re.findall(r'\w{4,}', text)

#Write a function to find all words which are at least 4 characters long in a string by using regex.
def find_char_long_2(text):
    return re.findall(r'\w{4,}', text, re.IGNORECASE)

#Write a function to find all words which are at least 4 characters long in a string by using regex


In [37]:
import ast 

def extract_assertions(input_string):
    """ Transform the assertions to be as a list """
    # Using ast.literal_eval to safely parse the string representation of list into an actual list
    extracted_list = ast.literal_eval(input_string)
    
    # Now, ast.literal_eval gives us list of single string, we need to split this string 
    # into individual assertions. We will use newline '\n ' as the separator
    assertions = extracted_list[0].split('\n ')
    
    # Strip leading and trailing white spaces from each assertion
    # Also strip leading and trailing escape characters and quotation marks
    assertions = [assertion.strip().strip('\'') for assertion in assertions]
    
    # string of assertions
    str_assertions = assertions[0].split('assert')
    processed_assertions = []
    
    for i in range(len(str_assertions) - 1):
        processed_assertions.append('assert' + str_assertions[i+1])

    return processed_assertions

def process_assertions(df):
    list_assertions = []
    for i in range(len(df)):
        list_assertions.append(extract_assertions(df.iloc[i]['test_list']))

    df['tests'] = list_assertions
    return df

In [36]:
import pandas as pd

data = pd.read_csv('../data/mbpp_val.csv')

In [38]:
list_assert = extract_assertions(data.iloc[0]['test_list'])

In [39]:
list_assert

['assert find_Min_Sum(12) == 7',
 'assert find_Min_Sum(105) == 15',
 'assert find_Min_Sum(2) == 2']

In [None]:
import pandas as pd

# eval
def eval(path_to_hub, path_to_data, path_to_save, early_stop = 3):

    print('Start to instanciate model and data...')
    # instantiate the model
    model = MySantaCoder(path_to_hub)
    # define the data
    data= pd.read_csv(path_to_data)

    mbpp_data = MyTokenizer(
        data=data,
        path_to_hub=path_to_hub
    )
    results = []
    model.eval()
    print('Start code generation...')
    for i in range(len(mbpp_data)):
        output = model(mbpp_data[i])
        result = model.decode_output(output)
        results.append(result)
        if i > early_stop:
            break
    

    data['Gen_code'] = results
    print('Save generated data ...')
    data.to_csv(path_to_save + "mbpp_generated.csv", index=False)

    return results


path_to_hub = 'bigcode/santacoder'
path_to_data = '../data/mbpp_test.csv'
path_to_save = '../data/'

results = eval(path_to_hub, path_to_data, path_to_save)


In [15]:
import numpy as np
import ast 
def calculate_pass_percentage(df):

    # Get the total number of tests conducted
    total_test = 0
    results =  []
    for i in range(len(df)):
        pass_ones = ast.literal_eval(reports.iloc[i]['Pass_one'])
        total_test += len(pass_ones)
        results.append(sum(pass_ones))

    # Count the total number of passed tests
    total_passed = sum(results)

    # Calculate and return the percentage of passed tests
    return (total_passed / total_test) * 100

<h1>Calculate Reports<h1>

In [None]:
import pandas as pd
import ast 

reports = pd.read_csv('../data/reports.csv')

def calculate_pass_percentage(df):

    # Get the total number of tests conducted
    total_test = 0
    results =  []
    for i in range(len(df)):
        pass_ones = ast.literal_eval(reports.iloc[i]['Pass_one'])
        total_test += len(pass_ones)
        results.append(sum(pass_ones))

    # Count the total number of passed tests
    total_passed = sum(results)

    # Calculate and return the percentage of passed tests
    return (total_passed / total_test) * 100

result = calculate_pass_percentage(reports)

<h1>SantaCoder OLD VERSION<h1>

In [8]:
import pandas as pd
data = pd.read_csv('../data/mbpp_prompt.csv')

In [5]:
dataset = MyTokenizer(data)

In [18]:
dataset[0]['input_ids']

tensor([    2,  4908,   373,   577,   404,  2147,   331,  8160,  6832,  1098,
          404, 16596,   305,    76,    11,   293,     8,   637,   305,    15,
           11,   207,    15,     8,   408,   331,  2250,  6832,  3983,  6832,
         5724,   516,   373,  2503,   305,    76,    11,   293,     8,   319,
         6832,  1061, 25954,   185,   563,  1656,    62,  5586,     7,  5586,
           11,   317,    11,   293,   399])

In [17]:
output = model.forward(dataset[4]['input_ids'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:49152 for open-end generation.


In [24]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")

dec = tokenizer.decode(output[0])

In [25]:
print(dec)

#Write a function to find the number of ways to fill it with 2 x 1 dominoes for the given 3 x n board.
def count_ways(n):
    if n == 0:
        return 1
    elif n == 1:
        return 2
    else:
        return count_ways(n-1) + count_ways(n-2)

#Write a function to find the number of ways to fill it with 2 x 2 dominoes for the given 3 x n board.
def count_ways2(n):
    if n == 0:
        return 1
    elif n == 1:
        return 2
    elif n == 2:
        return 4
    else:
        return count_ways2(n-1) + count_ways2(n-2)

#Write a function to find the number of ways to fill it with 2 x 3 dominoes for the given 3 x n board.
def count_ways3(n):
    if n == 0:
        return 1
    elif n == 1:
        return 2
    elif n == 2:
        return 4
    elif n == 3:
        return 8
    else:
        return count_ways3(n-1) + count_ways3(n-2) + count_ways3(n-3)

#Write a function to find the number of ways to fill it with 2 x 4 dominoes for the given 3 x n board.
def count_ways4(n):
    if

In [None]:
# model = AutoModelForCausalLM.from_pretrained("bigcode/santacoder", trust_remote_code=True)

# instruction = df.iloc[0]['instruction']
# encoded_instruction = tokenizer.encode(instruction, return_tensors="pt")
# outputs = model.generate(encoded_instruction, max_length=80)


<h1>Processing_post generation attempts</h1>

In [None]:
def extract_first_block(code_str):
    lines = code_str.split('\n')
    block = []
    first = False
    for i in range(len(lines) - 1):
        if (lines[i + 1].strip().startswith("def ") or lines[i + 1].strip().startswith("class ")):
            first = True
            break
        block.append(lines[i])
    # append last line
    block.append(lines[i])
    return '\n'.join(block)

def extract_first_block_1(text):
    lines = text.split("\n")
    block_started = False
    extracted_block = []
    indent_level = 0

    for line in lines:
        stripped_line = line.lstrip()  # remove leading spaces

        # check if line starts with class, def, or is a command
        if stripped_line.startswith(("class ", "def ", "assert ", "print ")) or block_started:
            current_indent_level = len(line) - len(stripped_line)

            # If we haven't started a block, this line starts it
            if not block_started:
                block_started = True
                indent_level = current_indent_level
                extracted_block.append(line)

            # If we're within the block
            elif block_started and current_indent_level >= indent_level:
                extracted_block.append(line)

            # If we've hit a line outside of the block
            else:
                break

    return "\n".join(extracted_block)

def extract_the_first_block_2(code):
    list_blocks = re.split('def |class |assert |print ', code)
    result = list_blocks[0] + '\ndef ' + list_blocks[1]
    return result

def extract_function(code_str, stop_words):
    min_stop_word_idx = len(code_str)  # assume stop word at the end
    for stop_word in stop_words:
        stop_word_idx = code_str.find(stop_word)
        # check if stop word exists in the string and its index is smaller
        if stop_word_idx != -1 and stop_word_idx < min_stop_word_idx:
            min_stop_word_idx = stop_word_idx

    # return the part of the string up to the stop word
    return code_str[:min_stop_word_idx]