<a href="https://colab.research.google.com/github/nicikess/hsg-nlp-course/blob/main/notebooks/CodeBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: Microsoft CodeBERT, "CodeBERT: A Pre-Trained Model for Programming and Natural Languages".

In [1]:
#https://github.com/microsoft/CodeBERT/tree/master/CodeBERT/codesearch

# 1. Importing the required modules




In [None]:
!pip install transformers
!pip install utils
!pip install wandb

import torch
import os
import json
import numpy as np
import logging
import torch.nn as nn
import torch.utils.data as data_utils

from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss

from torch.utils.data import (DataLoader,
                              RandomSampler,
                              TensorDataset,
                              Dataset,
                              SequentialSampler)

from transformers import (AdamW,
                          RobertaConfig,
                          RobertaModel,
                          RobertaForSequenceClassification,
                          RobertaTokenizer, 
                          get_linear_schedule_with_warmup)

In [3]:
"""
Enable and login to weights & biases with key
"""

#!wandb login

'\nEnable and login to weights & biases with key\n'

In [4]:
# Initialize logging: configure the root logger at DEBUG level and keep a
# handle to it for the rest of the notebook.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

# 2. Set directories and file names, seed and hyperparameters

In [5]:
# Directories and file names used throughout the notebook.
# data_dir is the root folder; output_dir lives directly below it.
data_dir = '/content/drive/MyDrive/NLP/data/'
output_dir = os.path.join(data_dir, 'output/')

# Dataset splits and the retrieval code base (file names relative to data_dir).
train_file = 'train.jsonl'
eval_file = 'valid.jsonl'
test_file = 'test.jsonl'
codebase_file = 'codebase.jsonl'

# Stored model checkpoint folder and its configuration file.
model_path = 'python_model/'
model_config_path = os.path.join(model_path, 'config.json')

In [6]:
# Deterministic seeding: the same value is used for NumPy and for the
# PyTorch CPU generator.
seed_value = 1234

# NumPy global RNG
np.random.seed(seed_value)

# PyTorch CPU RNG (the call returns the generator, which the cell displays)
torch.manual_seed(seed_value)

<torch._C.Generator at 0x7f056b7726d0>

In [7]:
# Select the computation device and seed the CUDA RNG.

# `device` is kept as a plain string ('cuda' or 'cpu'); it is later passed
# to `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Deterministic seed for the GPU generator.
torch.cuda.manual_seed(seed_value)

# Report which device type is in use.
print('[LOG] notebook with {} computation enabled'.format(str(device)))

[LOG] notebook with cuda computation enabled


In [8]:
# Hyperparameters and run description; the same dict is handed to
# Weights & Biases and read by the training / evaluation functions.
config = dict(
    batch_size=32,
    learning_rate=2e-5,
    num_training_epochs=10,
    run_information="not finetuned model with: microsoft/codebert-base as model",
)

In [9]:
# Start a Weights & Biases run that records the configuration defined above.
import wandb

run = wandb.init(
    project="nlp-codebert",
    entity="nicikess",
    config=config,
)

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/content, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/content, universal_newlines=False, shell=None, istream=None)
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 172.28.0.2:9000
DEBUG:urllib3.connectionpool:http://172.28.0.2:9000 "GET /api/sessions?token= HTTP/1.1" 200 410


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.wandb.ai:443
DEBUG:urllib3.connectionpool:https://api.wandb.ai:443 "POST /graphql HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.wandb.ai:443
DEBUG:urllib3.connectionpool:https://api.wandb.ai:443 "POST /graphql HTTP/1.1" 200 None


# 3. Define classes and functions used for training and evaluation

In [10]:
"""
Define features for training and testing
"""

class InputFeatures:
    """Plain record holding the tokenized form of one code/docstring pair.

    Attributes:
        code_tokens: tokens of the code snippet.
        code_ids: vocabulary ids belonging to ``code_tokens``.
        nl_tokens: tokens of the natural-language description.
        nl_ids: vocabulary ids belonging to ``nl_tokens``.
        url: identifier of the example (compared between queries and code
            candidates during evaluation).
    """

    def __init__(self, code_tokens, code_ids, nl_tokens, nl_ids, url):
        # Code side
        self.code_tokens, self.code_ids = code_tokens, code_ids
        # Natural-language side
        self.nl_tokens, self.nl_ids = nl_tokens, nl_ids
        # Example identifier
        self.url = url

In [11]:
"""
Convert json data to feature for training and testing
"""

def convert_examples_to_features(js,tokenizer):
    """Turn one JSON record into an ``InputFeatures`` instance.

    The record's ``code_tokens`` and ``docstring_tokens`` lists are each
    joined with spaces, tokenized, truncated, wrapped in the tokenizer's
    CLS/SEP tokens, converted to ids and right-padded with the pad id.

    Args:
        js: dict with the keys ``code_tokens``, ``docstring_tokens`` and ``url``.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.

    Returns:
        InputFeatures with 256 code ids and 128 natural-language ids.
    """
    # Sequence budgets (in tokens, including the two special tokens).
    # Lengths were chosen based on this notebook:
    # https://github.com/github/CodeSearchNet/blob/master/notebooks/ExploreData.ipynb
    max_code_len, max_nl_len = 256, 128

    def encode(words, max_len):
        # Leave room for CLS and SEP, then pad the id list up to max_len.
        pieces = tokenizer.tokenize(' '.join(words))[:max_len - 2]
        pieces = [tokenizer.cls_token] + pieces + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(pieces)
        ids += [tokenizer.pad_token_id] * (max_len - len(ids))
        return pieces, ids

    code_tokens, code_ids = encode(js['code_tokens'], max_code_len)
    nl_tokens, nl_ids = encode(js['docstring_tokens'], max_nl_len)

    return InputFeatures(code_tokens, code_ids, nl_tokens, nl_ids, js['url'])

In [12]:
# first_run = True  -> the JSONL training data is read from Google Drive and
#                      converted to features.
# first_run = False -> the training features are loaded as an ndarray from
#                      Google Drive, which saves time when training.
# Set first_run = True the first time the notebook is executed.
first_run = False

In [13]:
"""
Dataset used for the training and evaluation process
"""

class TextDataset(Dataset):
    """Dataset of tokenized code / natural-language pairs.

    Examples come from a JSONL file below ``data_dir``. For the training file
    (a file name containing ``'train'``) the features are instead loaded from
    the cached ``examples.npy`` in ``data_dir`` unless ``first_run`` is true.

    Each item is a ``(code_ids, nl_ids)`` pair of tensors.
    """

    def __init__(self, tokenizer, file_path=None):
        """Load or build the list of features.

        Args:
            tokenizer: tokenizer handed to ``convert_examples_to_features``.
            file_path: file name relative to ``data_dir``.
        """
        requested_name = file_path
        file_path = os.path.join(data_dir, file_path)
        # Decide on the name that was passed in, so that a data_dir that
        # happens to contain "train" does not turn every file into a
        # training file.
        is_train = 'train' in requested_name

        if is_train and not first_run:
            # Cached features: the JSONL file is not parsed at all here,
            # because its content would be discarded anyway.
            cache_path = os.path.join(data_dir, 'examples.npy')
            # NOTE(security): allow_pickle=True unpickles arbitrary objects;
            # only load cache files that you created yourself.
            # NOTE(review): the cache presumably holds InputFeatures objects
            # saved by an earlier run - confirm.
            self.examples = list(np.load(cache_path, allow_pickle=True))
        else:
            self.examples = []
            with open(file_path, encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    js = json.loads(line)
                    self.examples.append(convert_examples_to_features(js, tokenizer))

        # Log the first three training examples for a quick sanity check
        if is_train:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info("idx: {}".format(idx))
                logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
                logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
                logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
                logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``(code_ids, nl_ids)`` tensors of example ``i``."""
        example = self.examples[i]
        return (torch.tensor(example.code_ids), torch.tensor(example.nl_ids))

# 3. Finetune pretrained CodeBERT model

In [14]:
def train(model, tokenizer, subsample_step=1000):
  """Fine-tune ``model`` on the training file with an in-batch ranking loss.

  For every batch the natural-language vectors are scored against all code
  vectors of the same batch; cross-entropy pushes each description towards
  the code snippet at the same batch position.

  Args:
    model: module returning one vector per row when called with
      ``code_inputs=`` or ``nl_inputs=``.
    tokenizer: tokenizer handed to ``TextDataset``.
    subsample_step: keep only every ``subsample_step``-th training example.
      The default of 1000 is the reduced set used for quick test runs;
      pass ``None`` (or a value <= 1) to train on the full dataset.
  """
  # Get the training dataset
  train_dataset = TextDataset(tokenizer, train_file)

  # Optionally shrink the training set
  if subsample_step is not None and subsample_step > 1:
    indices = np.arange(start=0, stop=len(train_dataset), step=subsample_step)
    train_dataset = data_utils.Subset(train_dataset, indices)

  num_epochs = config.get("num_training_epochs")
  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.get("batch_size"))

  optimizer = AdamW(model.parameters(), lr=config.get("learning_rate"), eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps=0,
                                              num_training_steps=len(train_dataloader) * num_epochs)
  # The loss module is stateless, so one instance serves every step
  loss_fct = CrossEntropyLoss()

  logger.info("Training start")

  model.zero_grad()
  model.train()

  # Running sum / count for the averaged loss that is reported every 100 steps
  tr_num, tr_loss = 0, 0

  wandb.watch(model)

  for idx in range(num_epochs):
    for step, batch in enumerate(train_dataloader):

      # Get inputs
      code_inputs = batch[0].to(device)
      nl_inputs = batch[1].to(device)

      # Get code and nl vectors
      code_vec = model(code_inputs=code_inputs)
      nl_vec = model(nl_inputs=nl_inputs)

      # scores[a, c] = dot product of description a with code c; the
      # matching pair sits on the diagonal, hence the arange targets
      scores = torch.einsum("ab,cb->ac", nl_vec, code_vec)
      loss = loss_fct(scores, torch.arange(code_inputs.size(0), device=scores.device))

      # Log a plain float instead of the graph-attached tensor
      loss_value = loss.item()
      wandb.log({"loss": loss_value})

      # Report loss
      tr_loss += loss_value
      tr_num += 1
      if (step + 1) % 100 == 0:
        logger.info("epoch {} step {} loss {}".format(idx, step + 1, round(tr_loss / tr_num, 5)))
        tr_loss = 0
        tr_num = 0

      # Backward
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()
      optimizer.zero_grad()
      scheduler.step()

In [15]:
# Release cached GPU memory held by PyTorch's CUDA allocator to save RAM.
torch.cuda.empty_cache()

# 4. Evaluate finetuned model

In [26]:
def evaluate(model, tokenizer):
    """Compute the mean reciprocal rank (MRR) of code retrieval.

    Every example of the test file is used as a query (its natural-language
    ids) and ranked against every entry of the code base file (its code ids)
    by dot product. A query counts as found when a candidate with the same
    ``url`` appears among its 1000 best-scored candidates.

    Args:
        model: module returning one vector per row when called with
            ``code_inputs=`` or ``nl_inputs=``.
        tokenizer: tokenizer handed to ``TextDataset``.

    Returns:
        dict with the single key ``"eval_mrr"`` (float).
    """
    batch_size = config.get("batch_size")

    query_dataset = TextDataset(tokenizer, test_file)
    query_sampler = SequentialSampler(query_dataset)
    query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=batch_size, num_workers=4)

    code_dataset = TextDataset(tokenizer, codebase_file)
    code_sampler = SequentialSampler(code_dataset)
    code_dataloader = DataLoader(code_dataset, sampler=code_sampler, batch_size=batch_size, num_workers=4)

    logger.info("***** Running evaluation *****")
    logger.info("  Num queries = %d", len(query_dataset))
    logger.info("  Num codes = %d", len(code_dataset))
    logger.info("  Batch size = %d", batch_size)

    # Remember the caller's mode so that evaluation does not silently flip a
    # model that was in eval mode into train mode.
    was_training = model.training
    model.eval()
    code_vecs = []
    nl_vecs = []

    try:
        with torch.no_grad():
            logger.info(" Go trough query")
            for batch in tqdm(query_dataloader):
                nl_inputs = batch[1].to(device)
                nl_vecs.append(model(nl_inputs=nl_inputs).cpu().numpy())

            logger.info(" Go trough code")
            for batch in tqdm(code_dataloader):
                code_inputs = batch[0].to(device)
                code_vecs.append(model(code_inputs=code_inputs).cpu().numpy())
    finally:
        model.train(was_training)

    code_vecs = np.concatenate(code_vecs, 0)
    nl_vecs = np.concatenate(nl_vecs, 0)

    # scores[q, c]: similarity of query q and code candidate c
    scores = np.matmul(nl_vecs, code_vecs.T)

    # Candidate indices per query, best score first
    sort_ids = np.argsort(scores, axis=-1, kind='quicksort', order=None)[:, ::-1]

    nl_urls = [example.url for example in query_dataset.examples]
    code_urls = [example.url for example in code_dataset.examples]

    ranks = []
    for url, sort_id in zip(nl_urls, sort_ids):
        reciprocal_rank = 0
        # Only the first hit matters, so stop scanning once it is found
        for rank, idx in enumerate(sort_id[:1000], start=1):
            if code_urls[idx] == url:
                reciprocal_rank = 1 / rank
                break
        ranks.append(reciprocal_rank)

    result = {
        "eval_mrr": float(np.mean(ranks))
    }

    return result

# 5. Define model

In [17]:
class Model(nn.Module):
    """Wraps an encoder so code and natural language share one forward path.

    Args:
        encoder: callable taking ``(input_ids, attention_mask=...)`` and
            returning an indexable output; element ``[1]`` is returned as the
            vector representation (presumably the pooled output when the
            encoder is a Hugging Face ``RobertaModel`` - confirm).
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, code_inputs=None, nl_inputs=None):
        """Encode a batch of token ids.

        Args:
            code_inputs: id tensor of code snippets; takes precedence when
                both arguments are given.
            nl_inputs: id tensor of natural-language descriptions.

        Returns:
            Element ``[1]`` of the encoder output.

        Raises:
            ValueError: if neither ``code_inputs`` nor ``nl_inputs`` is given.
        """
        inputs = code_inputs if code_inputs is not None else nl_inputs
        if inputs is None:
            raise ValueError("Model.forward requires either code_inputs or nl_inputs")
        # Positions holding id 1 are excluded from attention; this matches the
        # padding only if the tokenizer's pad_token_id is 1 (assumed - confirm).
        return self.encoder(inputs, attention_mask=inputs.ne(1))[1]

In [None]:
# Load the tokenizer and the encoder weights, wrap the encoder in `Model`
# and move it to the selected device.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Two encoder sources are possible:
#   pretrained: RobertaModel.from_pretrained("microsoft/codebert-base")
#   finetuned:  the checkpoint stored below data_dir (used here)
model = Model(RobertaModel.from_pretrained(os.path.join(data_dir, model_path)))

model.to(device)

# 6. Train and Evaluate

In [19]:
#train(model, tokenizer)

In [27]:
# Run the retrieval evaluation and log every metric rounded to 4 decimals.
result = evaluate(model, tokenizer)

logger.info("***** Eval results *****")
for key, value in sorted(result.items()):
    logger.info("  %s = %s", key, str(round(value, 4)))

INFO:root:  Num queries = 14918
INFO:root:***** Running evaluation *****
INFO:root:  Num queries = 14918
INFO:root:  Num codes = 43827
INFO:root:  Batch size = 32
INFO:root: Go trough query
100%|██████████| 467/467 [00:55<00:00,  8.37it/s]
INFO:root: Go trough code
100%|██████████| 1370/1370 [05:27<00:00,  4.18it/s]
INFO:root:***** Eval results *****
INFO:root:  eval_mrr = 0.672
