In [None]:
# MODEL

import torch
import torch.nn as nn

class Model(nn.Module):
    """Two linear projections trained with an in-batch cosine-similarity loss.

    Queries and code vectors are each passed through their own linear layer
    (``input_size`` -> ``hidden_size``). In training mode ``forward`` also
    returns a loss built from the pairwise cosine similarities of the batch:
    the mean similarity of matching pairs (query i, code i) is subtracted and
    the mean similarity of all mismatched pairs (query i, code j != i) is added.

    Args:
        batch_size: Expected number of pairs per training batch; sizes the
            cached off-diagonal mask.
        hidden_size: Output width of both projections.
        input_size: Width of the incoming vectors. Defaults to 768, the value
            that used to be hard-coded.
    """

    def __init__(self, batch_size, hidden_size, input_size=768):
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        # Not read by forward(); kept so existing attribute access keeps working.
        self.rotate_list = list(range(-1, batch_size - 1))
        # Registered as a non-persistent buffer: it follows the module across
        # .to()/.cuda() calls (no hard-coded device in forward) while staying
        # out of state_dict(), so checkpoints keep exactly the same keys.
        self.register_buffer(
            'off_diag',
            torch.eye(batch_size).ge(1).logical_not(),
            persistent=False,
        )
        self.relu = nn.ReLU()
        self.qfirst_linear = nn.Linear(input_size, hidden_size)
        self.cfirst_linear = nn.Linear(input_size, hidden_size)
        self.cosine = nn.CosineSimilarity(dim=-1)

    def _contrastive_loss(self, query, code):
        """Return the in-batch loss for projected ``query`` / ``code`` of shape (n, hidden)."""
        n = query.shape[0]
        # similarity[i, j] = cos(query[i], code[j]); expand() creates views, so
        # the two (n, n, hidden) operands are not copied.
        similarity = self.cosine(
            query.unsqueeze(1).expand(n, n, -1),
            code.unsqueeze(0).expand(n, n, -1),
        )
        w = 1  # weight of the matching-pair term
        loss = -similarity.diagonal().sum() * w / n
        if n > 1:  # a single pair has no mismatched partners (avoids 0 / 0)
            if n == self.batch_size:
                mask = self.off_diag.to(similarity.device)
            else:
                # Batch length differs from the configured one: build a
                # matching mask instead of failing.
                mask = torch.eye(n, dtype=torch.bool, device=similarity.device).logical_not()
            loss = loss + similarity.masked_select(mask).sum() / (n * (n - 1))
        return loss

    def forward(self, query, code):
        """Project both inputs and, in training mode, compute the loss.

        Args:
            query: Tensor of shape (n, input_size).
            code: Tensor of shape (n, input_size), row-aligned with ``query``.

        Returns:
            dict with ``'output_query'`` and ``'output_code'`` (the projected
            tensors) and ``'loss'`` (a scalar tensor when ``self.training`` is
            true, otherwise the integer 0).
        """
        query = self.qfirst_linear(query)
        code = self.cfirst_linear(code)
        loss = self._contrastive_loss(query, code) if self.training else 0
        return {'output_query': query, 'output_code': code, 'loss': loss}

class Model_deep(nn.Module):
    """Deeper variant of the two-tower projection model.

    Each tower is a first linear layer (``input_size`` -> ``hidden_size``)
    followed by ``depth`` blocks of ReLU + linear (``hidden_size`` ->
    ``hidden_size``). In training mode ``forward`` also returns the in-batch
    cosine loss: minus the mean similarity of matching pairs plus the mean
    similarity of mismatched pairs.

    The extra layers are held in ``nn.ModuleList`` containers. With plain
    Python lists they were invisible to ``parameters()`` and ``state_dict()``,
    so they were neither optimised nor saved. Checkpoints written by the
    list-based version therefore lack the ``qlinear_layers.*`` /
    ``clinear_layers.*`` entries and need ``strict=False`` to load.

    Args:
        batch_size: Expected number of pairs per training batch; sizes the
            cached off-diagonal mask.
        hidden_size: Width of every layer after the first projection.
        depth: Number of additional ReLU + linear blocks per tower.
        input_size: Width of the incoming vectors. Defaults to 768, the value
            that used to be hard-coded.
    """

    def __init__(self, batch_size, hidden_size, depth, input_size=768):
        super(Model_deep, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        # Not read by forward(); kept so existing attribute access keeps working.
        self.rotate_list = list(range(-1, batch_size - 1))
        # Non-persistent buffer: moves with the module, stays out of state_dict().
        self.register_buffer(
            'off_diag',
            torch.eye(batch_size).ge(1).logical_not(),
            persistent=False,
        )
        self.relu = nn.ReLU()
        self.qfirst_linear = nn.Linear(input_size, hidden_size)
        self.cfirst_linear = nn.Linear(input_size, hidden_size)
        # ModuleList registers the layers as sub-modules, which also makes the
        # former hand-written to() override unnecessary.
        self.qlinear_layers = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(depth)]
        )
        self.clinear_layers = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(depth)]
        )
        self.cosine = nn.CosineSimilarity(dim=-1)

    def _encode(self, x, first_linear, layers):
        """Apply ``first_linear`` and then ReLU + linear for every layer in ``layers``."""
        x = first_linear(x)
        for lin in layers:
            x = lin(self.relu(x))
        return x

    def _contrastive_loss(self, query, code):
        """Return the in-batch loss for encoded ``query`` / ``code`` of shape (n, hidden)."""
        n = query.shape[0]
        # similarity[i, j] = cos(query[i], code[j]); expand() avoids copies.
        similarity = self.cosine(
            query.unsqueeze(1).expand(n, n, -1),
            code.unsqueeze(0).expand(n, n, -1),
        )
        w = 1  # weight of the matching-pair term
        loss = -similarity.diagonal().sum() * w / n
        if n > 1:  # a single pair has no mismatched partners (avoids 0 / 0)
            if n == self.batch_size:
                mask = self.off_diag.to(similarity.device)
            else:
                mask = torch.eye(n, dtype=torch.bool, device=similarity.device).logical_not()
            loss = loss + similarity.masked_select(mask).sum() / (n * (n - 1))
        return loss

    def forward(self, query, code):
        """Encode both inputs and, in training mode, compute the loss.

        Args:
            query: Tensor of shape (n, input_size).
            code: Tensor of shape (n, input_size), row-aligned with ``query``.

        Returns:
            dict with ``'output_query'`` and ``'output_code'`` (the encoded
            tensors) and ``'loss'`` (a scalar tensor when ``self.training`` is
            true, otherwise the integer 0).
        """
        query = self._encode(query, self.qfirst_linear, self.qlinear_layers)
        code = self._encode(code, self.cfirst_linear, self.clinear_layers)
        loss = self._contrastive_loss(query, code) if self.training else 0
        return {'output_query': query, 'output_code': code, 'loss': loss}

In [None]:
# TRAINING / EVALUATION
import math
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from transformers import DataCollatorWithPadding
from transformers import AdamW
from transformers import get_scheduler
import os

# File name of the stored encodings. train() and test() look for a file with
# this name under ./data/train, ./data/validation and ./data/test and read it
# with torch.FloatStorage.from_file.
encodings_name = 'average_pooling_outputs_v2'

def train(batch_size=512, 
          num_epochs=3, 
          learning_rate = 5e-5, 
          warmup_percentage=1, 
          show_progress=True, 
          validate=True,
          shuffle=True,
          hidden=128,
          depth=0,
          name='model'):
    """Train a ``Model`` on the stored training encodings with early stopping.

    After every epoch the model is scored on the validation split via
    ``test``. An epoch that improves the best MRR is checkpointed to
    ``./<name>/MRR(<mrr>)_epoch(<epoch>)``; the first epoch that does not
    improve it ends training.

    Args:
        batch_size: Pairs per batch; incomplete final batches are dropped.
        num_epochs: Maximum number of epochs.
        learning_rate: Peak learning rate handed to AdamW.
        warmup_percentage: Feeds the warm-up step count (see the NOTE below).
        show_progress: Show a tqdm bar over the training batches.
        validate: Currently not consulted; validation always runs because the
            stopping rule depends on it.
        shuffle: Draw training samples in random order (the default) or
            sequentially.
        hidden: Output width of the model's projections.
        depth: Currently not consulted; ``Model`` is always built. Switch the
            constructor below by hand to try ``Model_deep``.
        name: Directory (relative to ``./``) that receives the checkpoints.

    Returns:
        ``(best_mrr, best_epoch)`` with a 1-based epoch number.

    Raises:
        ValueError: If the training data yields no complete batch.
    """
    # Fall back to the CPU instead of leaving query/code undefined when CUDA
    # is missing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    hidden_size = 768  # width of each stored encoding vector

    train_file = './data/train/{}'.format(encodings_name)
    num_train = 412178

    # The file is read as num_train * 2 * hidden_size floats and viewed as
    # (sample, 2, hidden_size); index 0 is fed to the model as the query and
    # index 1 as the code.
    samples = torch.FloatTensor(
        torch.FloatStorage.from_file(train_file, shared=False, size=num_train * 2 * hidden_size)
    ).reshape(num_train, 2, hidden_size)
    train_loader = DataLoader(samples, shuffle=shuffle, batch_size=batch_size, num_workers=0, drop_last=True)
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        raise ValueError('batch_size={} leaves no complete training batch'.format(batch_size))

    model = Model(batch_size, hidden) #If testing deeper model, use Model_deep
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * steps_per_epoch
    # NOTE(review): this rounds the warm-up length to a multiple of batch_size,
    # which looks like a leftover from counting samples rather than steps.
    # Kept as-is - confirm the intended schedule.
    num_warmup_steps = round(num_epochs * steps_per_epoch * warmup_percentage / (batch_size * 100)) * batch_size
    lr_scheduler = get_scheduler("linear", 
                                 optimizer = optimizer, 
                                 num_warmup_steps = num_warmup_steps, 
                                 num_training_steps = num_training_steps)
    losses = []

    # Number of batches between progress lines (about a tenth of an epoch);
    # never 0, so the modulo below cannot divide by zero on tiny loaders.
    log_every = max(1, round(steps_per_epoch / 10))

    best_mrr = 0
    best_epoch = 1
    for epoch in range(num_epochs):
        model.train()
        for i, batch in enumerate(tqdm(train_loader, disable=not show_progress)):
            optimizer.zero_grad()

            query = batch[:,0,:].to(device)
            code = batch[:,1,:].to(device)

            output = model(query, code)

            loss = output['loss']
            losses.append(loss.item())
            loss.backward()

            optimizer.step()
            # The schedule only takes effect if it is advanced once per
            # optimizer step; it was previously created but never stepped.
            lr_scheduler.step()

            if i % log_every == 1:
                # Average over the same window that is summed.
                recent = losses[-log_every:]
                progress = round(i/steps_per_epoch, 1)*100
                print(f'Epoch ({progress - 10} - {progress} %): {epoch+1} \t Average loss: {sum(recent)/len(recent)}')
        epoch_losses = losses[-steps_per_epoch:]
        print(f'Epoch: {epoch+1} \t Average loss: {sum(epoch_losses)/len(epoch_losses)}')

        mrr = test(model=model, hidden=hidden, split='validation', show_progress=False, name=name)
        if mrr > best_mrr:
            best_mrr = mrr
            best_epoch = epoch+1
            # Make sure the target directory exists (e.g. with the default name).
            os.makedirs('./{}'.format(name), exist_ok=True)
            torch.save(model.state_dict(), './{}/MRR({})_epoch({})'.format(name, best_mrr, epoch+1))
        else:
            print(f"Don't save epoch {epoch+1}, it had a worse MRR")
            break
        
    plt.plot(losses)
    return best_mrr, best_epoch

In [None]:
# TRAINING / EVALUATION
import math
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm

def _pairwise_cosine(query, code):
    """Return the (n, n) matrix whose entry [i, j] is cos(query[i], code[j])."""
    query = nn.functional.normalize(query, dim=-1)
    code = nn.functional.normalize(code, dim=-1)
    return query @ code.t()


def _reciprocal_ranks(similarity):
    """Return 1 / rank of the diagonal entry within each row of ``similarity``.

    Rank 1 means the matching candidate has the highest similarity in its row.
    """
    # The first argsort orders the candidates; the second turns that order
    # into the 0-based rank of every candidate.
    ranks = similarity.argsort(dim=-1, descending=True).argsort(dim=-1, descending=False)
    return 1 / (ranks.diagonal() + 1)


def test(model=None, split='test', show_progress=True, eval_size=512, hidden=128, depth=1, name='model'):
    """Compute the mean reciprocal rank (MRR) of a model on one data split.

    The split is read in consecutive batches of ``eval_size`` pairs (an
    incomplete last batch is dropped). Within a batch every query is ranked
    against all ``eval_size`` code vectors by cosine similarity, and the
    reciprocal rank of its own code vector is recorded.

    Every pair of a batch now takes part in the ranking. Previously only the
    first 512 pairs were used (and evaluated twice), so ``eval_size`` values
    above 512 silently dropped samples and values below 512 crashed.

    Args:
        model: Model to evaluate. When ``None``, a ``Model(eval_size, hidden)``
            is created and its weights are loaded from ``./<name>``.
        split: One of ``'train'``, ``'validation'`` or ``'test'``.
        show_progress: Show a tqdm bar over the batches.
        eval_size: Number of pairs (and therefore candidates) per batch.
        hidden: Projection width used when the model is built here.
        depth: Currently not consulted.
        name: Checkpoint path relative to ``./``; only read when ``model`` is
            ``None``.

    Returns:
        The MRR over all evaluated pairs, rounded to four decimals.

    Raises:
        ValueError: If the split yields no complete batch of ``eval_size``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if model is None:
        model = Model(eval_size, hidden)
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        model.load_state_dict(torch.load('./{}'.format(name), map_location=device))
    model = model.to(device)
    model.eval()

    hidden_size = 768  # width of each stored encoding vector

    filenames = {'train': './data/train/{}'.format(encodings_name), 
                 'validation': './data/validation/{}'.format(encodings_name), 
                 'test': './data/test/{}'.format(encodings_name)}
    num_samples = {'train': 412178, 'validation': 23107, 'test': 22176}

    samples = torch.FloatTensor(
        torch.FloatStorage.from_file(filenames[split], shared=False, size=num_samples[split] * 2 * hidden_size)
    ).reshape(num_samples[split], 2, hidden_size)
    loader = DataLoader(samples, batch_size=eval_size, num_workers=0, drop_last=True)

    MRRs = []
    with torch.no_grad():
        for batch in tqdm(loader, disable=not show_progress):
            query = batch[:,0,:].to(device)
            code = batch[:,1,:].to(device)

            output = model(query, code)

            cos = _pairwise_cosine(output['output_query'], output['output_code'])
            MRRs += _reciprocal_ranks(cos).tolist()

    if not MRRs:
        raise ValueError("split '{}' has no complete batch of {} samples".format(split, eval_size))

    mrr = round(sum(MRRs)/len(MRRs), 4)
    # Histogram of the individual ranks (1 / reciprocal rank).
    plt.hist([1/m for m in MRRs], bins=128)
    print('MRR: {}, Rank: {}'.format(mrr, 1/mrr if mrr else float('inf')))
    return mrr

# Release cached GPU memory before starting the sweep.
torch.cuda.empty_cache()

# Settings for the sweep. batch_size, learning_rate and depth are overwritten
# by the loop variables below; depths is not referenced in this cell.
batch_size=512
learning_rate = 5e-5
shuffle=True
warmup_percentage=0.0
num_epochs=20
depths=1
depth=0

dir_name = 'BERT_VS_CodeBERTa'
# makedirs(exist_ok=True) replaces the check-then-mkdir pair and also creates
# missing parent directories.
os.makedirs(f'./{dir_name}', exist_ok=True)

for hidden in [512]:
    for learning_rate in [5e-6]:
        for batch_size in [512]:
            # NOTE(review): train() as written always builds Model, so depth
            # only changes the output directory name here.
            for depth in [1,2,3]:

                # Directory that collects this configuration's checkpoints.
                run_dir = '{}/depth({})_hidden({})_shuffle({})_batchsize({})_lnrate({})_wrmup({})'.format(dir_name,
                                                                                                          depth, 
                                                                                                          hidden, 
                                                                                                          shuffle, 
                                                                                                          batch_size, 
                                                                                                          learning_rate, 
                                                                                                          warmup_percentage)
                os.makedirs(f'./{run_dir}', exist_ok=True)

                print('{}\nTRAINING: {}\n{}'.format('-'*100,run_dir,'-'*100))
                best_mrr, epoch = train(batch_size=batch_size, 
                                        hidden=hidden, 
                                        depth = depth, 
                                        num_epochs=num_epochs, 
                                        learning_rate = learning_rate, 
                                        shuffle=shuffle, 
                                        warmup_percentage=warmup_percentage,
                                        name=run_dir,
                                        show_progress=True)

                torch.cuda.empty_cache()
                # Path of the checkpoint train() wrote for its best epoch.
                name = './{}/MRR({})_epoch({})'.format(run_dir, best_mrr, epoch)
                if os.path.exists(name):
                    test(show_progress=True, eval_size=512, hidden=hidden, depth=depth, name=name)
                else:
                    # train() saves nothing when no epoch beats an MRR of 0.
                    print('No checkpoint found at {}, skipping test evaluation'.format(name))

In [None]:
# Re-evaluate one saved checkpoint on the test split.
torch.cuda.empty_cache()

# Values that identify the run directory of the checkpoint.
dir_name = 'BERT_VS_CodeBERTa'
depth, hidden, batch_size = 0, 512, 512
learning_rate = 5e-6
shuffle = True
warmup_percentage = 0.0

# Validation MRR and epoch that appear in the checkpoint's file name.
mrr, epoch = 0.1121, 7

# Same field order as the directory-name template used when training.
config_fields = (dir_name, depth, hidden, shuffle, batch_size, learning_rate, warmup_percentage)
model_config = '{}/depth({})_hidden({})_shuffle({})_batchsize({})_lnrate({})_wrmup({})'.format(*config_fields)

name = '{}/MRR({})_epoch({})'.format(model_config, mrr, epoch)

# From here on mrr holds the value returned by test(), not the one used to
# build the file name above.
mrr = test(name=name, hidden=hidden, eval_size=1000, show_progress=True)