In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import re
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class MyTokenizer(Dataset):
    def __init__(self, data, path_to_hub):
        self.tokenizer = AutoTokenizer.from_pretrained(path_to_hub) # Load your tokenizer here with a pre-specified vocabulary
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.data = data # a dataframe
        self.content_name = 'instruction'
        self.target_name = 'code'
        self.list_test = 'test_list'

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        text = self.data.iloc[index][self.content_name]
        code = self.data.iloc[index][self.target_name]
        test_list = self.data.iloc[index][self.list_test]

        inputs = self.tokenizer(text, padding=True)
        label = self.tokenizer.encode(code, padding=True)
        tests = self.tokenizer.encode(test_list, padding=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
            'tests' : torch.tensor(tests, dtype=torch.long)
        }

class MySantaCoder(nn.Module):
    def __init__(self):
        super(MySantaCoder, self).__init__()
        self.checkpoint = "bigcode/santacoder"
        # self.checkpoint = model_path_to_hub
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.max_new_tokens = 128
        self.num_solution_to_generate = 200

        self.generation_config = GenerationConfig(
            num_beams = self.num_solution_to_generate,
            num_return_sequences = self.num_solution_to_generate,
            max_length = self.max_new_tokens,
            eos_token_id=self.model.generation_config.eos_token_id,
            bos_token_id=self.model.generation_config.bos_token_id
            )

    def forward(self, input_ids):
        # input_ids = input_ids.unsqueeze(0)
        outputs = self.model.generate(input_ids, self.generation_config)
        return outputs

    def decode_output(self, encoded_output):
        output = self.tokenizer.decode(encoded_output)
        return output

    def post_generation_processing(self,code):
        # split it into list of blocks
        list_blocks = re.split('def |class |assert |print ', code)
        if 'init' in list_blocks[1]:
            fill_word = '\nclass '
        else:
            fill_word = '\ndef '
        # keep only the first block
        result = list_blocks[0] + fill_word + list_blocks[1]
        return result

In [4]:
def eval(test_loader, model, early_stoping = None):
    """
    This function evaluate the model on the test data
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    final_outputs=[]

    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
          ids = data['input_ids'].to(device, dtype = torch.long)
          targets = data['labels'].to(device, dtype = torch.long)
          tests = data['tests'].to(device, dtype = torch.long)
          # forward
          output = model.forward(ids).to(device)
          # postprocessing output
          decoded_output = [model.decode_output(t.cpu().numpy()) for t in output]
          code_generated = [model.post_generation_processing(dec) for dec in decoded_output]

          final_outputs.append(code_generated)

    return final_outputs

def save_generated_data(data, generated_code):
  selected_feature = ['code', 'test_list']
  data = data[selected_feature]
  # add the new feature
  data['generated_code'] = generated_code

  data.to_csv("data/200_solutions.csv", index=False)
  return data

def dataloading(data, path_to_hub, batch_size, num_workers, g, seed_worker):
    # create dataset out of dataframe
    dataset = MyTokenizer(data, path_to_hub)

    # create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        worker_init_fn=seed_worker,
        generator=g,
        )

    return dataloader

In [5]:
import pandas as pd
import numpy as np
import random

# instanciate the testing loader :
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# setting the seed
g = torch.Generator()
g.manual_seed(0)


# Create dataloader
batch_size = 1
num_workers = 0
testing_data = pd.read_csv('data/mbpp_test.csv')
PATH_TO_HUB = "bigcode/santacoder"


testloader = dataloading(testing_data, PATH_TO_HUB, batch_size, num_workers, g, seed_worker)

In [6]:
model = MySantaCoder()
fin_outputs = eval(testloader, model)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 4.00 GiB total capacity; 3.41 GiB already allocated; 0 bytes free; 3.43 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF