<a href="https://colab.research.google.com/github/nicikess/hsg-nlp-course/blob/main/CodeBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References

In [None]:
# Reference implementation: https://github.com/microsoft/CodeBERT/tree/master/CodeBERT/codesearch

SyntaxError: ignored

Imports for project

In [None]:
import torch
import os
import json
import numpy as np
import logging
import torch.nn as nn

from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, Dataset, SequentialSampler)
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss

!pip install transformers
from transformers import (AdamW,
                          RobertaConfig,
                          RobertaModel,
                          RobertaForSequenceClassification,
                          RobertaTokenizer, 
                          get_linear_schedule_with_warmup)

!pip install utils

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Configure the root logger to show DEBUG-and-above records, then keep a
# handle on it for the rest of the notebook.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

Set directories and file names

In [None]:
# Root folder on the mounted Google Drive; the other locations hang off it.
data_dir = '/content/drive/MyDrive/NLP/data/'
output_dir = data_dir + 'output/'

# Data files, given relative to data_dir.
train_file = 'train.jsonl'
eval_file = 'valid.jsonl'

# Model checkpoint folder and its config file, given relative to data_dir.
model_path = 'python_model/'
model_config_path = model_path + 'config.json'

Set seed

In [None]:
# Fix the random seeds so repeated runs draw the same random numbers.
seed_value = 1234
np.random.seed(seed_value) # seed NumPy's global random generator
torch.manual_seed(seed_value) # seed PyTorch's CPU generator; as the cell's last expression its return value is displayed

<torch._C.Generator at 0x7fcda9412670>

Enable GPU

In [None]:
# Report whether PyTorch can see a CUDA device (the cell displays the boolean).
torch.cuda.is_available()

False

In [None]:
# Use the GPU when one is visible, otherwise the CPU. `device` is kept as the
# plain type string ('cuda' or 'cpu'), which `.to(...)` accepts directly.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed the CUDA generator with the same value as the CPU one.
torch.cuda.manual_seed(seed_value)

# Report which device was selected.
print('[LOG] notebook with {} computation enabled'.format(device))

[LOG] notebook with cpu computation enabled


Set hyperparameters

In [None]:
# Samples per batch; used by the DataLoaders in both train() and evaluate().
batch_size = 32
# Learning rate handed to the AdamW optimizer in train().
learning_rate = 2e-5
# Number of full passes over the training data.
num_training_epochs = 10
# The settings below are not wired in: train() currently passes
# num_warmup_steps=0 and eps=1e-8 as literals.
#warmup_steps = 0
#start_epoch = 0
#gradient_accumulation_steps = 1
#adam_epsilon = 1e-8

Load data

In [None]:
class TextDataset(Dataset):
    """Code/docstring pairs read from a JSON-lines file under ``data_dir``.

    Each non-blank line of the file is parsed as JSON and turned into a
    feature object by ``convert_examples_to_features``. Because that step is
    slow, the resulting features are cached next to the input file as
    ``<name>_examples.npy`` and reloaded on later constructions.

    The cache is keyed by the input file name, so the training and validation
    files no longer share one ``examples.npy``. A previously written shared
    ``examples.npy`` is not read any more; rename it after the file it was
    built from to reuse it. The cache is not invalidated automatically:
    delete the ``.npy`` file after changing the JSONL file, the tokenizer or
    the length limits.

    Args:
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        file_path: file name relative to ``data_dir`` (e.g. ``'train.jsonl'``).

    Raises:
        ValueError: if ``file_path`` is not given.
    """

    def __init__(self, tokenizer, file_path=None):
        if file_path is None:
            raise ValueError("file_path is required, e.g. 'train.jsonl'")

        file_path = os.path.join(data_dir, file_path)
        cache_path = os.path.splitext(file_path)[0] + '_examples.npy'

        if os.path.exists(cache_path):
            logger.info("load examples")
            # NOTE(security): allow_pickle=True unpickles arbitrary objects,
            # which can execute code. Only load caches this notebook wrote.
            self.examples = np.load(cache_path, allow_pickle=True)
        else:
            logger.info("convert examples")
            features = []
            with open(file_path) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    features.append(convert_examples_to_features(json.loads(line), tokenizer))

            # Build the object array explicitly so NumPy stores one feature
            # object per slot instead of trying to interpret the list.
            self.examples = np.empty(len(features), dtype=object)
            self.examples[:] = features

            try:
                np.save(cache_path, self.examples, allow_pickle=True)
            except OSError as err:
                # A read-only data folder should not abort the run.
                logger.warning("could not write feature cache %s: %s", cache_path, err)

        # Log the first few training examples as a sanity check.
        if 'train' in file_path:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info("idx: {}".format(idx))
                logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
                logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
                logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
                logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``(code_ids, nl_ids)`` of example ``i`` as two tensors."""
        example = self.examples[i]
        return (torch.tensor(example.code_ids), torch.tensor(example.nl_ids))

Convert raw examples into the features used when loading the data

In [None]:
# Fixed sequence lengths, in tokens and including the start/end markers, that
# convert_examples_to_features truncates and pads the code and the
# natural-language text to.
code_length = 256
nl_length = 128

In [None]:
class InputFeatures:
    """Tokenised form of one code/docstring pair.

    Holds the token strings and token ids of the code and of the
    natural-language text, plus the URL that identifies the example.
    """

    def __init__(self, code_tokens, code_ids, nl_tokens, nl_ids, url):
        # code side
        self.code_tokens, self.code_ids = code_tokens, code_ids
        # natural-language side
        self.nl_tokens, self.nl_ids = nl_tokens, nl_ids
        # identifier used to match a query with its code at evaluation time
        self.url = url

In [None]:
def _tokenize_and_pad(text, tokenizer, max_length):
    """Tokenise ``text``, wrap it in the start/end tokens and pad the ids.

    Returns ``(tokens, ids)``: ``tokens`` is the unpadded token list including
    both special tokens, ``ids`` has exactly ``max_length`` entries.
    """
    # leave room for the two special tokens
    inner_tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = [tokenizer.cls_token] + inner_tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    ids += [tokenizer.pad_token_id] * (max_length - len(ids))
    return tokens, ids


def convert_examples_to_features(js, tokenizer):
    """Turn one parsed JSONL record into an ``InputFeatures`` object.

    Reads ``js['code_tokens']``, ``js['docstring_tokens']`` and ``js['url']``.
    Code is truncated/padded to ``code_length`` ids, the docstring to
    ``nl_length`` ids.
    """
    code_tokens, code_ids = _tokenize_and_pad(' '.join(js['code_tokens']), tokenizer, code_length)
    nl_tokens, nl_ids = _tokenize_and_pad(' '.join(js['docstring_tokens']), tokenizer, nl_length)
    return InputFeatures(code_tokens, code_ids, nl_tokens, nl_ids, js['url'])


Train the model

In [None]:
def train(model, tokenizer):
    """Fine-tune ``model`` on ``train_file`` and evaluate after every epoch.

    Each batch is encoded twice (code and natural language). The loss is
    cross-entropy over the batch's query-by-code dot-product matrix, with the
    matching code on the diagonal as the target class. The average loss is
    logged every 100 steps, and the metrics returned by ``evaluate`` on
    ``eval_file`` are logged after each epoch.

    Args:
        model: module callable as ``model(code_inputs=...)`` and
            ``model(nl_inputs=...)``.
        tokenizer: tokenizer forwarded to ``TextDataset`` and ``evaluate``.
    """
    # Shuffled training batches.
    dataset = TextDataset(tokenizer, train_file)
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

    # Optimizer with a linear decay over all steps and no warm-up.
    total_steps = len(loader) * num_training_epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    logger.info("Training start")

    model.zero_grad()
    model.train()

    loss_fct = CrossEntropyLoss()
    running_loss, running_steps = 0, 0

    for epoch in range(num_training_epochs):
        for step, (code_batch, nl_batch) in enumerate(loader):
            code_inputs = code_batch.to(device)
            nl_inputs = nl_batch.to(device)

            # One vector per code snippet and per query.
            code_vec = model(code_inputs=code_inputs)
            nl_vec = model(nl_inputs=nl_inputs)

            # scores[a, c] = <query a, code c>; the correct code for query a is code a.
            scores = torch.einsum("ab,cb->ac", nl_vec, code_vec)
            targets = torch.arange(code_inputs.size(0), device=scores.device)
            loss = loss_fct(scores, targets)

            # Running average, reset after each report.
            running_loss += loss.item()
            running_steps += 1
            if (step + 1) % 100 == 0:
                logger.info("epoch {} step {} loss {}".format(epoch, step + 1, round(running_loss / running_steps, 5)))
                running_loss, running_steps = 0, 0

            # Backward pass with gradient clipping, then optimizer and schedule update.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # End of epoch: score the model on the validation file.
        results = evaluate(model, tokenizer, eval_file, eval_when_training=True)
        for key, value in results.items():
            logger.info("  %s = %s", key, round(value, 4))

Evaluate

In [None]:
def _mean_reciprocal_rank(query_urls, code_urls, sort_ids, top_k=1000):
    """Mean of 1/rank of each query's own URL within its top ``top_k`` candidates.

    A query whose URL does not appear in its first ``top_k`` candidates
    contributes 0. Returns 0.0 when there are no queries.
    """
    reciprocal_ranks = []
    for url, ranked in zip(query_urls, sort_ids):
        reciprocal = 0
        for position, idx in enumerate(ranked[:top_k], start=1):
            if code_urls[idx] == url:
                reciprocal = 1 / position
                break  # only the best-ranked match counts
        reciprocal_ranks.append(reciprocal)
    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0


def evaluate(model, tokenizer, file_name, eval_when_training=False, codebase_file=None):
    """Compute the retrieval MRR of ``model`` on ``file_name``.

    Every docstring in ``file_name`` is used as a query and ranked against
    every code snippet of the codebase by dot product of the two vectors. A
    query counts as answered at the rank where a snippet with the same URL
    first appears within the top 1000.

    Args:
        model: module callable as ``model(code_inputs=...)`` and
            ``model(nl_inputs=...)``.
        tokenizer: tokenizer forwarded to ``TextDataset``.
        file_name: query file, relative to ``data_dir``.
        eval_when_training: accepted for compatibility with existing callers;
            not used.
        codebase_file: optional file with the candidate code snippets. When
            omitted, the snippets of ``file_name`` are the candidates.

    Returns:
        dict with the single key ``"eval_mrr"`` (float).
    """
    query_dataset = TextDataset(tokenizer, file_name)
    if codebase_file is None or codebase_file == file_name:
        # Same file on both sides: reuse the loaded examples.
        code_dataset = query_dataset
    else:
        code_dataset = TextDataset(tokenizer, codebase_file)

    query_dataloader = DataLoader(query_dataset, sampler=SequentialSampler(query_dataset),
                                  batch_size=batch_size, num_workers=4)
    code_dataloader = DataLoader(code_dataset, sampler=SequentialSampler(code_dataset),
                                 batch_size=batch_size, num_workers=4)

    logger.info("***** Running evaluation *****")
    logger.info("  Num queries = %d", len(query_dataset))
    logger.info("  Num codes = %d", len(code_dataset))
    logger.info("  Batch size = %d", batch_size)

    # Remember the current mode so the caller gets the model back unchanged.
    was_training = model.training
    model.eval()
    nl_vecs = []
    code_vecs = []
    try:
        with torch.no_grad():
            for batch in query_dataloader:
                nl_inputs = batch[1].to(device)
                nl_vecs.append(model(nl_inputs=nl_inputs).cpu().numpy())

            for batch in code_dataloader:
                code_inputs = batch[0].to(device)
                code_vecs.append(model(code_inputs=code_inputs).cpu().numpy())
    finally:
        model.train(was_training)

    code_vecs = np.concatenate(code_vecs, 0)
    nl_vecs = np.concatenate(nl_vecs, 0)

    # scores[q, c] = similarity of query q and code c; sort each row descending.
    scores = np.matmul(nl_vecs, code_vecs.T)
    sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:, ::-1]

    nl_urls = [example.url for example in query_dataset.examples]
    code_urls = [example.url for example in code_dataset.examples]

    result = {
        "eval_mrr": _mean_reciprocal_rank(nl_urls, code_urls, sort_ids)
    }

    return result

In [None]:
class Model(nn.Module):
    """Wraps an encoder so code and natural language share one forward pass.

    ``forward`` accepts either ``code_inputs`` or ``nl_inputs`` (token-id
    tensors); ``code_inputs`` wins when both are given. Positions holding the
    id 1 are masked out of attention (1 is the padding id produced by this
    notebook's tokenizer). The return value is element ``[1]`` of the
    encoder's output, presumably the pooled sentence vector for a
    ``RobertaModel``.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, code_inputs=None, nl_inputs=None):
        token_ids = nl_inputs if code_inputs is None else code_inputs
        attention_mask = token_ids.ne(1)
        return self.encoder(token_ids, attention_mask=attention_mask)[1]

Load model

In [None]:
# The tokenizer comes from the public CodeBERT checkpoint; the config and the
# encoder weights come from the checkpoint folder under data_dir.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
config = RobertaConfig.from_pretrained(os.path.join(data_dir, model_config_path))

# Wrap the encoder in the dual-input Model and move it to the selected device.
model = Model(RobertaModel.from_pretrained(os.path.join(data_dir, model_path)))
model.to(device)

Train model

In [None]:
# Run the fine-tuning loop; progress and evaluation metrics are written to the log.
train(model, tokenizer)

INFO:root:load examples
INFO:root:*** Example ***
INFO:root:idx: 0
INFO:root:code_tokens: ['<s>', 'def', '_split', '_', 'ph', 'yl', 'ogen', 'y', '_(', '_p', '_,', '_level', '_=', '_"', 's', '"', '_)', '_:', '_level', '_=', '_level', '_+', '_"', '__', '"', '_result', '_=', '_p', '_.', '_split', '_(', '_level', '_)', '_return', '_result', '_[', '_0', '_]', '_+', '_level', '_+', '_result', '_[', '_1', '_]', '_.', '_split', '_(', '_"', ';"', '_)', '_[', '_0', '_]', '</s>']
INFO:root:code_ids: 0 9232 3462 1215 3792 4360 11575 219 36 181 2156 672 5457 22 29 113 4839 4832 672 5457 672 2055 22 30529 113 898 5457 181 479 3462 36 672 4839 671 898 646 321 27779 2055 672 2055 898 646 112 27779 479 3462 36 22 42777 4839 646 321 27779 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1