<a href="https://colab.research.google.com/github/nicikess/hsg-nlp-course/blob/main/notebooks/CodeBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References

In [None]:
# Reference implementation: https://github.com/microsoft/CodeBERT/tree/master/CodeBERT/codesearch

Imports for project

In [None]:
!pip install transformers
!pip install utils

import json
import logging
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset, Dataset)
from torch.utils.data import SequentialSampler
from tqdm import tqdm, trange
from transformers import (AdamW,RobertaConfig,RobertaForSequenceClassification,RobertaTokenizer, get_linear_schedule_with_warmup)
from transformers import RobertaModel

In [8]:
# Without any configuration the effective level is WARNING, so the
# `logger.info(...)` progress messages used throughout the notebook would be
# dropped silently. Install a basic handler and enable INFO for this logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# basicConfig is a no-op when the root logger already has handlers,
# so set the level on this logger explicitly as well.
logger.setLevel(logging.INFO)

Set directories and file names

In [4]:
# Locations of the input data and of the training artefacts.
data_dir = '/content/drive/MyDrive/NLP/data/'
train_file = 'train.txt'
dev_file = 'valid.txt'
# Model selection. The checkpoint identifier was previously misspelled
# ('mircrosoft/...'), which cannot be resolved when loading the model.
model_type = 'roberta'
model_name_or_path = 'microsoft/codebert-base'
task_name = 'codesearch'
# Derived from data_dir so the two paths cannot drift apart.
output_dir = data_dir + 'output/'

Set seed

In [9]:
# Use one fixed seed for every random number generator so runs are repeatable.
seed_value = 1234
# NumPy global generator
np.random.seed(seed_value)
# PyTorch generator (the returned Generator object is this cell's output)
torch.manual_seed(seed_value)

<torch._C.Generator at 0x7f64ee27a770>

Enable GPU

In [10]:
# Name of the compute backend as a plain string: 'cuda' when a GPU is
# visible to PyTorch, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed the CUDA generator with the same value used for the CPU generators.
torch.cuda.manual_seed(seed_value)

# Report which backend was selected.
print('[LOG] notebook with {} computation enabled'.format(device))

[LOG] notebook with cpu computation enabled


Set hyperparameters

In [14]:
# --- optimisation schedule ---
num_training_epochs = 8          # full passes over the training set
start_epoch = 0                  # index of the first epoch
batch_size = 32                  # examples per optimisation step
gradient_accumulation_steps = 1  # batches accumulated per optimiser update

# --- optimiser settings ---
learning_rate = 1e-5
adam_epsilon = 1e-8              # numerical-stability term for Adam
warmup_steps = 0                 # scheduler warm-up steps

Load data

In [None]:
class TextDataset(Dataset):
    """Dataset of (code, natural-language) id pairs read from a JSON-lines file.

    Each non-empty line of ``file_path`` is parsed as JSON and converted with
    ``convert_examples_to_features(js, tokenizer)``; the resulting feature
    objects are kept in ``self.examples``.
    """

    def __init__(self, tokenizer, file_path=None):
        """Read and featurise every record in ``file_path``.

        Args:
            tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
            file_path: path of the JSON-lines file to read.
        """
        self.examples = []
        # Explicit encoding keeps the result independent of the platform default.
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines are not valid JSON; skip them instead of crashing.
                    continue
                js = json.loads(line)
                self.examples.append(convert_examples_to_features(js, tokenizer))

        # Debug aid (originally marked "TAKE OUT"): log the first three
        # examples when the file path contains 'train'.
        if 'train' in file_path:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info("idx: {}".format(idx))
                logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
                logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
                logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
                logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))

    def __len__(self):
        """Return the number of featurised examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``(code_ids, nl_ids)`` of example ``i`` as two tensors."""
        example = self.examples[i]
        return (torch.tensor(example.code_ids), torch.tensor(example.nl_ids))

Convert examples to features used in load data

In [None]:
class InputFeatures(object):
    """Tokens and padded ids of one code / natural-language pair."""

    def __init__(self, code_tokens, code_ids, nl_tokens, nl_ids, url):
        self.code_tokens = code_tokens
        self.code_ids = code_ids
        self.nl_tokens = nl_tokens
        self.nl_ids = nl_ids
        self.url = url


def convert_examples_to_features(js,tokenizer,args=None):
    """Tokenise one record into fixed-length code and query id sequences.

    Args:
        js: mapping with ``'code_tokens'`` and ``'docstring_tokens'`` (lists of
            strings) and, optionally, ``'url'``.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        args: optional settings object; ``args.code_length`` and
            ``args.nl_length`` give the padded sequence lengths. When ``args``
            is omitted (or lacks an attribute) 256 and 128 are used.

    Returns:
        InputFeatures with ``code_tokens``, ``code_ids``, ``nl_tokens``,
        ``nl_ids`` and ``url`` (``None`` when the record has no ``'url'``).
        Previously nothing was returned, so callers only ever received ``None``.
    """
    code_length = getattr(args, 'code_length', 256)
    nl_length = getattr(args, 'nl_length', 128)

    def encode(words, max_length):
        # Leave room for the CLS and SEP tokens, then pad up to max_length.
        tokens = tokenizer.tokenize(' '.join(words))[:max_length - 2]
        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids += [tokenizer.pad_token_id] * (max_length - len(ids))
        return tokens, ids

    code_tokens, code_ids = encode(js['code_tokens'], code_length)
    nl_tokens, nl_ids = encode(js['docstring_tokens'], nl_length)

    return InputFeatures(code_tokens, code_ids, nl_tokens, nl_ids, js.get('url'))

Train the model

In [15]:
def train(model, optimizer=None):
    """Fine-tune ``model`` with an in-batch cross-entropy loss over code/query scores.

    Settings are read from the notebook's module-level variables
    (``tokenizer``, ``data_dir``, ``train_file``, ``dev_file``, ``batch_size``,
    ``learning_rate``, ``adam_epsilon``, ``warmup_steps``,
    ``num_training_epochs``, ``device``, ``logger``).

    Args:
        model: callable as ``model(code_inputs=...)`` and
            ``model(nl_inputs=...)``, each returning one vector per input row.
        optimizer: optional optimizer to use. When ``None`` an ``AdamW``
            optimizer over ``model.parameters()`` is created.

    After every epoch ``evaluate`` is run on the validation file and its
    metrics are logged.
    """
    # Local import: only needed to build the settings object evaluate() expects.
    from types import SimpleNamespace

    # Training data, reshuffled every epoch.
    train_dataset = TextDataset(tokenizer, os.path.join(data_dir, train_file))
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

    if optimizer is None:
        # eps was previously written as `1-8` (= -7) instead of 1e-8.
        optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=len(train_dataloader) * num_training_epochs,
    )

    logger.info("Training start")

    model.zero_grad()
    model.train()

    loss_fct = CrossEntropyLoss()
    tr_num, tr_loss = 0, 0

    # Validation settings in the shape evaluate() reads from its `args`.
    # NOTE(review): the validation file doubles as the candidate codebase
    # here because no separate codebase file is configured — confirm.
    eval_path = os.path.join(data_dir, dev_file)
    eval_args = SimpleNamespace(
        eval_batch_size=batch_size,
        codebase_file=eval_path,
        n_gpu=torch.cuda.device_count(),
        device=device,
    )

    for idx in range(num_training_epochs):
        for step, batch in enumerate(train_dataloader):
            # Get inputs
            code_inputs = batch[0].to(device)
            nl_inputs = batch[1].to(device)

            # Get code and nl vectors
            code_vec = model(code_inputs=code_inputs)
            nl_vec = model(nl_inputs=nl_inputs)

            # scores[a, c] = <nl_vec[a], code_vec[c]>; the matching code for
            # query a sits on the diagonal, hence the arange targets.
            scores = torch.einsum("ab,cb->ac", nl_vec, code_vec)
            loss = loss_fct(scores, torch.arange(code_inputs.size(0), device=scores.device))

            # Report the running mean loss every 100 steps, then reset it.
            tr_loss += loss.item()
            tr_num += 1
            if (step + 1) % 100 == 0:
                logger.info("epoch {} step {} loss {}".format(idx, step + 1, round(tr_loss / tr_num, 5)))
                tr_loss = 0
                tr_num = 0

            # Backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # Evaluate after each epoch.
        results = evaluate(eval_args, model, tokenizer, eval_path, eval_when_training=True)
        for key, value in results.items():
            logger.info("  %s = %s", key, round(value, 4))

Evaluate

In [None]:
def evaluate(args, model, tokenizer,file_name,eval_when_training=False):
    """Compute the mean reciprocal rank of queries against a code base.

    Args:
        args: settings object; ``eval_batch_size``, ``codebase_file``,
            ``n_gpu`` and ``device`` are read from it.
        model: callable as ``model(nl_inputs=...)`` and
            ``model(code_inputs=...)``, each returning one vector per row.
        tokenizer: tokenizer forwarded to ``TextDataset``.
        file_name: path of the query file.
        eval_when_training: when ``True`` the model is never wrapped in
            ``DataParallel``.

    Returns:
        ``{"eval_mrr": float}``. A query whose URL is not among its 1000
        highest-scoring candidates contributes 0.

    The model is put into eval mode for the forward passes and switched back
    to train mode before the metric is computed.
    """
    # TextDataset takes (tokenizer, file_path); it has no `args` parameter.
    query_dataset = TextDataset(tokenizer, file_name)
    query_sampler = SequentialSampler(query_dataset)
    query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=args.eval_batch_size,num_workers=4)

    code_dataset = TextDataset(tokenizer, args.codebase_file)
    code_sampler = SequentialSampler(code_dataset)
    code_dataloader = DataLoader(code_dataset, sampler=code_sampler, batch_size=args.eval_batch_size,num_workers=4)

    # multi-gpu evaluate
    if args.n_gpu > 1 and eval_when_training is False:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num queries = %d", len(query_dataset))
    logger.info("  Num codes = %d", len(code_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    code_vecs = []
    nl_vecs = []
    with torch.no_grad():
        # Element 1 of a batch holds the query ids, element 0 the code ids.
        for batch in query_dataloader:
            nl_inputs = batch[1].to(args.device)
            nl_vecs.append(model(nl_inputs=nl_inputs).cpu().numpy())

        for batch in code_dataloader:
            code_inputs = batch[0].to(args.device)
            code_vecs.append(model(code_inputs=code_inputs).cpu().numpy())
    model.train()

    code_vecs = np.concatenate(code_vecs, 0)
    nl_vecs = np.concatenate(nl_vecs, 0)

    # scores[q, c]: dot product of query q with code c; sort descending per query.
    scores = np.matmul(nl_vecs, code_vecs.T)
    sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:,::-1]

    nl_urls = [example.url for example in query_dataset.examples]
    code_urls = [example.url for example in code_dataset.examples]

    ranks = []
    for url, sort_id in zip(nl_urls, sort_ids):
        reciprocal_rank = 0
        # 1-based position of the first candidate with the query's URL.
        for position, idx in enumerate(sort_id[:1000], start=1):
            if code_urls[idx] == url:
                reciprocal_rank = 1 / position
                break  # later candidates cannot change the rank
        ranks.append(reciprocal_rank)

    result = {
        "eval_mrr":float(np.mean(ranks))
    }

    return result

Load model

In [None]:
# Build the tokenizer and the pretrained encoder from the checkpoint named in
# the configuration cell. The notebook keeps its settings in module-level
# variables (model_name_or_path, device); no `args` object is defined.
config = RobertaConfig.from_pretrained(model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)
model = RobertaModel.from_pretrained(model_name_or_path)
# NOTE(review): `Model` is not defined in the visible part of this notebook.
# train()/evaluate() call the model as model(code_inputs=...) and
# model(nl_inputs=...), so the wrapper must accept those keywords — confirm
# where it is defined before running this cell.
model=Model(model)
model.to(device)

Train model

In [None]:
# train() is declared as train(model, optimizer) and loads the training data
# itself; `dataset` and `optimizer` are not defined anywhere in the notebook.
# Passing None for the optimizer is safe because train() creates its own AdamW.
train(model, optimizer=None)