<a href="https://colab.research.google.com/github/nicikess/hsg-nlp-course/blob/main/notebooks/CodeBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference to Microsoft CodeBERT: A Pre-Trained Model for Programming and Natural Languages 

In [1]:
#https://github.com/microsoft/CodeBERT/tree/master/CodeBERT/codesearch

# 1. Importing the required modules




In [2]:
!pip install transformers
!pip install utils
!pip install wandb
!pip install sentencepiece

import torch
import os
import json
import numpy as np
import logging
import torch.nn as nn
import torch.utils.data as data_utils

from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss

from torch.utils.data import (DataLoader,
                              RandomSampler,
                              TensorDataset,
                              Dataset,
                              SequentialSampler)

from transformers import (AdamW,
                          RobertaConfig,
                          RobertaModel,
                          RobertaForSequenceClassification,
                          RobertaTokenizer, 
                          get_linear_schedule_with_warmup)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

In [3]:
"""
Enable and login to weights & biases with key
"""

#!wandb login

'\nEnable and login to weights & biases with key\n'

In [4]:
"""
Initialize Logger
"""

logger = logging.getLogger()
logging.basicConfig(level=logging.DEBUG)

# 2. Set directories and file names, seed and hyperparameters

In [5]:
"""
Set directories and file names
"""

data_dir = '/content/drive/MyDrive/NLP/data/'
output_dir = '/content/drive/MyDrive/NLP/data/output/'

train_file = 'train.jsonl'
test_file = 'test.jsonl'
eval_file = 'valid.jsonl'
codesearchnet_file = 'codesearchnet.jsonl'
codebase_file = 'codebase.jsonl'
model_path = 'python_model/'
model_config_path = 'python_model/config.json'

In [6]:
"""
Init deterministic seed
"""

seed_value = 1234
np.random.seed(seed_value) # set numpy seed
torch.manual_seed(seed_value) # set pytorch seed CPU

<torch._C.Generator at 0x7fce535d3710>

In [7]:
"""
Check if cuda is available and enable cuda
"""

# set cpu or gpu enabled device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type

# init deterministic GPU seed
torch.cuda.manual_seed(seed_value)

# log type of device enabled
print('[LOG] notebook with {} computation enabled'.format(str(device)))

[LOG] notebook with cuda computation enabled


In [8]:
"""
Set configurations for training and weights & biases
"""

config={
"batch_size": 32,
"learning_rate": 2e-5,
"num_training_epochs": 10,
"run_information": "not finetuned model with: microsoft/codebert-base as model"
}

In [9]:
"""
Initialize weights and bias run with defined configurations
"""

#import wandb
#run = wandb.init(project="nlp-codebert", entity="nicikess", config=config)

'\nInitialize weights and bias run with defined configurations\n'

# 3. Define classes and functions used for training and evaluation

In [10]:
"""
Define features for training and testing
"""

class InputFeatures(object):

    def __init__(self,
                 code_tokens,
                 code_ids,
                 nl_tokens,
                 nl_ids,
                 url,

    ):
        self.code_tokens = code_tokens
        self.code_ids = code_ids
        self.nl_tokens = nl_tokens
        self.nl_ids = nl_ids
        self.url=url

In [11]:
"""
Convert json data to feature for training and testing
"""

def convert_examples_to_features(js,tokenizer):

    #Set maximum characters for natural language and code. Length were definied based on this notebook:
    #https://github.com/github/CodeSearchNet/blob/master/notebooks/ExploreData.ipynb

    code_length = 256
    nl_length = 128

    code=' '.join(js['code_tokens'])
    code_tokens=tokenizer.tokenize(code)[:code_length-2]
    code_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token]
    code_ids =  tokenizer.convert_tokens_to_ids(code_tokens)
    padding_length = code_length - len(code_ids)
    code_ids+=[tokenizer.pad_token_id]*padding_length
    
    nl=' '.join(js['docstring_tokens'])
    nl_tokens=tokenizer.tokenize(nl)[:nl_length-2]
    nl_tokens =[tokenizer.cls_token]+nl_tokens+[tokenizer.sep_token]
    nl_ids =  tokenizer.convert_tokens_to_ids(nl_tokens)
    padding_length = nl_length - len(nl_ids)
    nl_ids+=[tokenizer.pad_token_id]*padding_length

    return InputFeatures(code_tokens,code_ids,nl_tokens,nl_ids,js['url'])

In [12]:
"""
If first_run = true the jsonl training data is loaded from Google drive
If first_run = false the data is loaded as ndarray from Google drive to save time when training
When the script is executed for the first time set first_run = true
"""

first_run = False

In [13]:
"""
Dataset used for the training and evaluation process
"""

class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path=None):
        file_path = os.path.join(data_dir,file_path)
        self.examples = []
        self.data=[]
        with open(file_path) as f:
            for line in f:
                line=line.strip()
                js=json.loads(line)
                self.data.append(js)

        if 'train' not in file_path or first_run == True:
          for js in self.data:
            self.examples.append(convert_examples_to_features(js,tokenizer))
        else:
          self.examples = np.load('/content/drive/MyDrive/NLP/data/examples.npy', allow_pickle=True)

        #Print first three examples
        if 'train' in file_path:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info("idx: {}".format(idx))
                logger.info("code_tokens: {}".format([x.replace('\u0120','_') for x in example.code_tokens]))
                logger.info("code_ids: {}".format(' '.join(map(str, example.code_ids))))
                logger.info("nl_tokens: {}".format([x.replace('\u0120','_') for x in example.nl_tokens]))
                logger.info("nl_ids: {}".format(' '.join(map(str, example.nl_ids))))                             
        
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):   
        return (torch.tensor(self.examples[i].code_ids),torch.tensor(self.examples[i].nl_ids))

#3. Finetune pretrained CodeBERT model

In [14]:
def train(model, tokenizer):

  #Get the training dataset
  train_dataset=TextDataset(tokenizer, train_file)

  #Decrease training set for testing
  indices = np.arange(start = 0, stop = len(train_dataset), step = 1000)
  train_dataset = data_utils.Subset(train_dataset, indices)

  train_sampler = RandomSampler(train_dataset)
  train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = config.get("batch_size"))

  optimizer = AdamW(model.parameters(), lr=config.get("learning_rate"), eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(train_dataloader)*config.get("num_training_epochs"))

  logger.info("Training start")

  model.zero_grad()
  model.train()

  tr_num,tr_loss,best_mrr=0,0,0

  wandb.watch(model)

  for idx in range(config.get("num_training_epochs")):
    for step, batch in enumerate(train_dataloader):

      #Get inputs
      code_inputs = batch[0].to(device)    
      nl_inputs = batch[1].to(device)

      #Get code and nl vectors
      code_vec = model(code_inputs=code_inputs)
      nl_vec = model(nl_inputs=nl_inputs)

      #Calculate scores and loss
      scores=torch.einsum("ab,cb->ac",nl_vec,code_vec)
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(scores, torch.arange(code_inputs.size(0), device=scores.device))
      wandb.log({"loss": loss})

      #Report loss
      tr_loss += loss.item()
      tr_num+=1
      if (step+1)% 100==0:
          logger.info("epoch {} step {} loss {}".format(idx,step+1,round(tr_loss/tr_num,5)))
          tr_loss=0
          tr_num=0

      #Backward
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
      optimizer.step()
      optimizer.zero_grad()
      scheduler.step()

In [15]:
"""
Reset cuda GPU to save RAM
"""

torch.cuda.empty_cache()

#4. Evaluate finetuned model

In [16]:
def evaluate(model, tokenizer):
    query_dataset = TextDataset(tokenizer, test_file)

    query_sampler = SequentialSampler(query_dataset)
    query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=config.get("batch_size"),num_workers=4)

    logger.info("  Num queries = %d", len(query_dataset))

    code_dataset = TextDataset(tokenizer, codebase_file)
    code_sampler = SequentialSampler(code_dataset)
    code_dataloader = DataLoader(code_dataset, sampler=code_sampler, batch_size=config.get("batch_size"),num_workers=4)    

    logger.info("***** Running evaluation *****")
    logger.info("  Num queries = %d", len(query_dataset))
    logger.info("  Num codes = %d", len(code_dataset))
    logger.info("  Batch size = %d", config.get("batch_size"))

    
    model.eval() 
    code_vecs=[] 
    nl_vecs=[]

    logger.info(" Go trough query")

    for batch in tqdm(query_dataloader):  
        nl_inputs = batch[1].to(device)
        with torch.no_grad():
            nl_vec = model(nl_inputs=nl_inputs) 
            nl_vecs.append(nl_vec.cpu().numpy()) 

    logger.info(" Go trough code")

    for batch in tqdm(code_dataloader):
        code_inputs = batch[0].to(device)    
        with torch.no_grad():
            code_vec= model(code_inputs=code_inputs)
            code_vecs.append(code_vec.cpu().numpy())

    model.train()    
    code_vecs=np.concatenate(code_vecs,0)
    nl_vecs=np.concatenate(nl_vecs,0)

    scores=np.matmul(nl_vecs,code_vecs.T)
    
    sort_ids=np.argsort(scores, axis=-1, kind='quicksort', order=None)[:,::-1]    
    
    nl_urls=[]
    code_urls=[]
    for example in query_dataset.examples:
        nl_urls.append(example.url)
        
    for example in code_dataset.examples:
        code_urls.append(example.url)
        
    ranks=[]
    for url, sort_id in zip(nl_urls,sort_ids):
        rank=0
        find=False
        for idx in sort_id[:1000]:
            if find is False:
                rank+=1
            if code_urls[idx]==url:
                find=True
        if find:
            ranks.append(1/rank)
        else:
            ranks.append(0)
    
    result = {
        "eval_mrr":float(np.mean(ranks))
    }

    return result

#5. Define model

In [17]:
class Model(nn.Module):   
    def __init__(self, encoder):
        super(Model, self).__init__()
        self.encoder = encoder
      
    def forward(self, code_inputs=None, nl_inputs=None): 
        if code_inputs is not None:
            return self.encoder(code_inputs,attention_mask=code_inputs.ne(1))[1]
        else:
            return self.encoder(nl_inputs,attention_mask=nl_inputs.ne(1))[1]

In [None]:
"""
Load pretrained model
"""

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

#Pretrained model
#model = RobertaModel.from_pretrained("microsoft/codebert-base")

#Finetuned model
model = RobertaModel.from_pretrained(os.path.join(data_dir, model_path)) 

model = Model(model)
model.to(device)

#6. Train and Evaluate

In [19]:
"""
Uncomment to train
"""

#train(model, tokenizer)

'\nUncomment to train\n'

In [20]:
"""
Uncomment to evaluate
"""

#result = evaluate(model, tokenizer)
#for key in sorted(result.keys()):
    #logger.info("  %s = %s", key, str(round(result[key],4)))

'\nUncomment to evaluate\n'

#7. Make Embeddings


In [197]:
"""
Create embeddings of python code from the data
Maybe change the data to the train file to have more embeddings
"""

query_dataset = TextDataset(tokenizer, codebase_file)
query_sampler = SequentialSampler(query_dataset)
query_dataloader = DataLoader(query_dataset, sampler=query_sampler, batch_size=config.get("batch_size"),num_workers=4)    

code_vecs=[] 

for batch in tqdm(query_dataloader):
    code_inputs = batch[0].to(device)    
    with torch.no_grad():
        code_vec= model(code_inputs=code_inputs)
        code_vecs.append(code_vec.cpu().numpy())

#Delete last embedding to have a valid shape
code_vecs = np.delete(code_vecs, -1)

100%|██████████| 1370/1370 [05:44<00:00,  3.98it/s]
  arr = asarray(arr)


In [200]:
"""
Stack embeddings together
"""

vecs = np.vstack(code_vecs)
vecs = torch.from_numpy(vecs)
np.shape(vecs)

torch.Size([43808, 768])

#8. Make a query


In [255]:
"""
Define a natural language query
"""

query = "Create stream for write data to `destination"
query_vec = model(tokenizer(query,return_tensors='pt')['input_ids'].to(device))

In [256]:
np.shape(query_vec)

torch.Size([1, 768])

In [257]:
"""
Calculate score
"""

query_vec = query_vec.to(device)
vecs = vecs.to(device)

scores=torch.einsum("ab,cb->ac",query_vec,vecs)
scores=torch.softmax(scores,-1)

In [259]:
"""
Convert scores
"""

scores = np.squeeze(scores)
np.shape(scores)
scores = scores.cpu().detach().numpy()
np.shape(scores)

torch.Size([43808])

In [262]:
"""
Save top 5 scores
"""

scores = scores.argsort()[-5:][::-1]

In [251]:
scores

array([19790, 28731, 33522, 20832, 39610])

In [263]:
"""
Find index with the highest score
"""

#index = torch.argmax(scores)

'\nFind index with the highest score\n'

In [266]:
"""
Return code of the result with the highest score
"""

query_dataset.data[scores[2]]

{'code': 'def upload_stream(self, destination, *, offset=0):\n        """\n        Create stream for write data to `destination` file.\n\n        :param destination: destination path of file on server side\n        :type destination: :py:class:`str` or :py:class:`pathlib.PurePosixPath`\n\n        :param offset: byte offset for stream start position\n        :type offset: :py:class:`int`\n\n        :rtype: :py:class:`aioftp.DataConnectionThrottleStreamIO`\n        """\n        return self.get_stream(\n            "STOR " + str(destination),\n            "1xx",\n            offset=offset,\n        )',
 'code_tokens': ['def',
  'upload_stream',
  '(',
  'self',
  ',',
  'destination',
  ',',
  '*',
  ',',
  'offset',
  '=',
  '0',
  ')',
  ':',
  'return',
  'self',
  '.',
  'get_stream',
  '(',
  '"STOR "',
  '+',
  'str',
  '(',
  'destination',
  ')',
  ',',
  '"1xx"',
  ',',
  'offset',
  '=',
  'offset',
  ',',
  ')'],
 'docstring': '',
 'docstring_tokens': [],
 'func_name': 'Client.

# 9. Paraphrase result

In [None]:
code_dataset.data[index]

In [None]:
docstring = 'Establish an ssh tunnel for each local host and port that can be used to communicate with the state host.'

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

sentence = "This is something which i cannot understand at all"

text =  "paraphrase: " + sentence + " </s>"

encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")


outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=256,
    do_sample=True,
    top_k=120,
    top_p=0.95,
    early_stopping=True,
    num_return_sequences=5
)

for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(line)