# Load dependencies and set up the environment

In [None]:
# Colab shell command: install pinned versions of the TPU client, torch and a matching torch_xla wheel.
# NOTE(review): the wheel URL sits under wheels/cuda/112 although the notebook later selects a TPU — confirm this is the intended build.
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/cuda/112/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl --force-reinstall 

In [None]:
# Colab shell command: install Hugging Face transformers (AutoTokenizer / AutoModel are imported from it below).
# NOTE(review): unlike the install above, this one is not version-pinned; consider pinning for reproducibility.
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import psutil 
import matplotlib.pyplot as plt
import pdb
from sklearn.metrics import f1_score
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met
from google.colab import drive
import gc
# Mount Google Drive at /content/drive; the data paths used below live under /content/drive/MyDrive/CloneData.
drive.mount('/content/drive')

ModuleNotFoundError: ignored

In [None]:

# ---- Run configuration ----
CHUNK_SIZE = 100000  # data points per saved dataset file
MAX_TOKEN_DIM = 512  # token positions per snippet; controls padding and input to classifier
LOAD_DATA = True  # True: load previously saved dataset files; False: rebuild them from the raw text files
PCA_DIM = 128  # intended number of PCA components; keep in sync with the q passed to torch.pca_lowrank

# ---- Device selection: CPU by default, CUDA when available, an XLA (TPU) device wins if one exists ----
device = torch.device('cpu')
if torch.cuda.is_available():
  # The previous torch.cuda.device(device) call only built a context manager that was
  # never entered, so it had no effect and has been dropped.
  device = torch.device('cuda')
try:
  device = xm.xla_device()  # leaves `device` untouched when this raises
  print(device)
  print("USING TPU")
except Exception:
  # Covers torch_xla failing to import (xm undefined -> NameError) as well as no XLA
  # device being reachable. A bare `except:` would also have swallowed KeyboardInterrupt.
  print("NOT USING TPU")
print(torch.cuda.is_available())
print(device)
if torch.cuda.is_available():
  torch.set_default_tensor_type('torch.cuda.FloatTensor')
  print("using cuda")

# ---- Pretrained CodeBERTa tokenizer and encoder ----
tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
codebert = AutoModel.from_pretrained("huggingface/CodeBERTa-small-v1")
torch.set_printoptions(precision=7)


xla:1
USING TPU
False
xla:1


Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def load_data(path : str = "/content/drive/MyDrive/CloneData/data.jsonl") -> dict:
  """Read a JSON-lines file of code snippets and index them by id.

  Every non-blank line must be a JSON object with an "idx" key (snippet id) and a
  "func" key (the snippet's source code).

  Args:
    path: JSON-lines file to read. Defaults to the dataset location on Drive, so
      existing `load_data()` calls behave as before.

  Returns:
    dict mapping each snippet's "idx" value to its "func" string,
    e.g. {"0": "hello world"}.

  Raises:
    OSError: the file cannot be opened.
    json.JSONDecodeError: a non-blank line is not valid JSON.
    KeyError: an object lacks "idx" or "func".
  """
  idx_to_function = dict() #id num -> code snippet
  # `with` closes the handle even if parsing fails (it used to be left open).
  with open(path) as f:
    for entry in f:
      if not entry.strip(): #tolerate blank lines, e.g. a trailing newline at end of file
        continue
      snippet = json.loads(entry)
      idx_to_function[snippet["idx"]] = snippet["func"]
  return idx_to_function

In [None]:
def pairify_file(lines : list, idx_to_function : dict) -> list:
  """Turn "id1 id2 label" lines into (code1, code2, label) training examples.

  Args:
    lines: text lines whose first three whitespace-separated fields are two
      snippet ids and a numeric label. Blank lines are skipped.
    idx_to_function: mapping from snippet id to source code.

  Returns:
    list of (code1, code2, label) tuples with the label converted to float,
    e.g. [("hello world", "hi wurld", 1.0), ("bye world", "EEEEEEE", 0.0)].

  Raises:
    KeyError: an id is missing from idx_to_function.
    IndexError: a non-blank line has fewer than three fields.
    ValueError: the label is not numeric.
  """
  examples = []
  for line in lines:
    # split() handles tabs, repeated spaces and the trailing newline in one go; the
    # previous replace("\t", " ").split(" ") produced empty fields on double spaces.
    fields = line.split()
    if not fields:
      continue
    x, y, label = fields[0], fields[1], fields[2]
    examples.append((idx_to_function[x], idx_to_function[y], float(label))) #float label for pytorch
  return examples


In [None]:
def split_and_label_data(idx_to_function : dict): #convert pairs to useful training examples
  """Load the train / test / validation pair files and resolve ids to code.

  Args:
    idx_to_function: mapping from snippet id to source code.

  Returns:
    3-tuple (train, test, valid); each element is the list produced by
    pairify_file for the corresponding file. Note the order: test comes
    before valid.
  """
  split_paths = [
    "/content/drive/MyDrive/CloneData/train.txt",
    "/content/drive/MyDrive/CloneData/test.txt",
    "/content/drive/MyDrive/CloneData/valid.txt",
  ]
  splits = []
  for path in split_paths:
    # `with` closes each file; open(x).readlines() used to leave all three open.
    with open(path) as f:
      splits.append(pairify_file(f.readlines(), idx_to_function))
  return tuple(splits)


# Pre-calculate embeddings

In [None]:
def embed(x : str) -> "torch.Tensor | None":
  """Embed one code snippet with CodeBERTa and flatten the result to 1-D.

  Args:
    x: source code of a single snippet.

  Returns:
    The encoder's first output flattened to one dimension (its length grows with
    the number of tokens), or None when the snippet has 510 or more tokens and is
    therefore treated as too long for the model.
  """
  with torch.no_grad():
    code_tokens = tokenizer.tokenize(x)

    if len(code_tokens) >= 510: #confirm tokens aren't too big for model
      return None
    tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]

    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Build the input on whatever device the encoder currently lives on. The global
    # `device` was used before, but codebert is only moved there inside train(), so
    # calling embed() earlier mixed devices.
    input_ids = torch.tensor(tokens_ids, device = codebert.device)[None,:]
    context_embeddings = codebert(input_ids)[0]

    return torch.flatten(context_embeddings) #flattened embedding vector

In [None]:
def _pad_centered(vec, target_length : int):
  """Zero-pad a 1-D tensor on both sides so its length is exactly target_length."""
  total = target_length - vec.size()[0]
  left = total // 2
  # Give the remainder to the right side so an odd `total` still reaches target_length
  # (padding int(total/2) on both sides came up one element short in that case).
  return torch.nn.functional.pad(vec, (left, total - left))


def embed_data(data : list) -> list:
  """Replace the two programs of every (prog1, prog2, label) item by padded embeddings.

  Items where either program is rejected by embed() (too many tokens) are dropped.

  Args:
    data: iterable of (prog1, prog2, label). Programs are normally plain strings;
      a program wrapped in a sequence is unwrapped with [0] as before.

  Returns:
    list of (embedding1, embedding2, label); every embedding is a 1-D tensor of
    length MAX_TOKEN_DIM * 768, zero-padded around the real values.
  """
  # 768 is the per-token width hard-coded by the original code; the flattened batch
  # width printed by the training cell (393216 = 512 * 768) agrees with it.
  target_length = MAX_TOKEN_DIM * 768
  embedded_data = []

  for x, y, label in data:
    # Indexing a str with [0] yields only its first character, so string snippets
    # must be embedded whole.
    emb_x = embed(x if isinstance(x, str) else x[0])
    emb_y = embed(y if isinstance(y, str) else y[0])
    if emb_x is None or emb_y is None: #check code isn't too long
      continue
    embedded_data.append((_pad_centered(emb_x, target_length), _pad_centered(emb_y, target_length), label))
  return embedded_data

In [None]:
class CloneDataset(Dataset): #dataset
  """Dataset of (x, y, label) triples held as tensors on a single device.

  Each of x, y and labels may be a tensor or a Python list (of tensors or of
  numbers). Lists are converted to one tensor first, because a plain list has
  no .to() method and previously made construction fail.
  """

  def __init__(self, x, y, labels, target_device = None):
    """Store the three aligned collections on target_device.

    Args:
      x, y: first / second snippet of every pair (tensor or list of tensors).
      labels: one label per pair (tensor or list of numbers).
      target_device: where to keep the data; defaults to the notebook-level
        `device`, which is what the class always used before.
    """
    assert len(x) == len(y) and len(y) == len(labels) #make sure all the same size
    if target_device is None:
      target_device = device
    self.x = self._as_tensor(x).to(target_device)
    self.y = self._as_tensor(y).to(target_device)
    self.labels = self._as_tensor(labels).to(target_device)
    self.length = len(x)

  @staticmethod
  def _as_tensor(values):
    """Return values as one tensor: tensors pass through, lists of tensors are stacked."""
    if torch.is_tensor(values):
      return values
    if len(values) > 0 and all(torch.is_tensor(v) for v in values):
      return torch.stack(list(values))
    return torch.tensor(values)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx], self.labels[idx]

  def __len__(self):
    return self.length

In [None]:

#test_data = embed_data(test_data)
#validation_data = embed_data(validation_data)

In [None]:
def tokenize(code : str) -> "torch.Tensor | None":
  """Convert one code snippet into a fixed-length row of 512 token ids.

  Args:
    code: source code of a single snippet.

  Returns:
    Tensor of shape (1, 512) on `device` laid out as
    [cls] + code tokens + padding + [sep], or None when the snippet has 510 or
    more tokens.
  """
  code_tokens = tokenizer.tokenize(code)

  if len(code_tokens) >= 510: #confirm tokens aren't too big for model
    return None
  code_tokens += [tokenizer.pad_token] * (510 - len(code_tokens)) #pad out to 510 which becomes 512
  # A stray trailing `tokens` on this line used to make the whole cell a SyntaxError.
  # NOTE(review): the separator is appended after the padding rather than right
  # after the code; kept as-is, confirm this is the layout the model should see.
  tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]

  tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
  return torch.tensor(tokens_ids, device = device)[None,:]

SyntaxError: ignored

In [None]:
def build_dataset(data : list):
  """Tokenize every (code1, code2, label) example and pack the survivors into a CloneDataset.

  Pairs where either snippet is rejected by tokenize() (too many tokens) are
  dropped. Progress is printed every 250 examples.

  Args:
    data: iterable of (code1, code2, label) tuples.

  Returns:
    CloneDataset whose x / y tensors stack the per-snippet token rows and whose
    labels tensor holds the matching labels.

  Raises:
    ValueError: no pair in `data` survived tokenization.
  """
  x_list = []
  y_list = []
  label_list = []
  for i, (x, y, l) in enumerate(data):
    if i % 250 == 0:
      print(f"on data point {i}")
    x_tokens = tokenize(x)
    y_tokens = tokenize(y)
    if x_tokens is None or y_tokens is None: #confirm both seqs work
      continue
    x_list.append(x_tokens)
    y_list.append(y_tokens)
    label_list.append(l)

  if not label_list:
    raise ValueError("no pair in `data` fits the token limit; nothing to build a dataset from")

  # CloneDataset calls .to(...) on its inputs, so hand it tensors rather than lists.
  return CloneDataset(torch.stack(x_list), torch.stack(y_list), torch.tensor(label_list))

In [None]:
class Classifier(nn.Module):
    """Feed-forward clone classifier over a pair of flattened CodeBERT embeddings.

    Each input row has MAX_TOKEN_DIM * 768 features. Both inputs pass through the
    same `reduction` layer followed by ReLU, are concatenated, and then go through
    ff1 -> sigmoid -> batch norm -> ff2. The result is one un-squashed score per
    pair: no final sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()

        embedding_width = MAX_TOKEN_DIM * 768
        reduced_width = 512
        hidden_width = 256

        # Layers are created in the same order as before so default initialisation
        # draws from the random generator in the same sequence.
        self.reduction = nn.Linear(embedding_width, reduced_width)
        self.ff1 = nn.Linear(2 * reduced_width, hidden_width)
        self.ff2 = nn.Linear(hidden_width, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.batchNorm1 = nn.BatchNorm1d(hidden_width, affine = False)

        # Xavier-normal re-initialisation of the weights, in the original order.
        for layer in (self.ff1, self.ff2, self.reduction):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x, y):
        """Score a batch of pairs; x and y are (batch, MAX_TOKEN_DIM * 768) inputs."""
        x_reduced = self.relu(self.reduction(x))
        y_reduced = self.relu(self.reduction(y))
        pair = torch.cat((x_reduced, y_reduced), 1)

        hidden = self.ff1(pair)
        hidden = self.sigmoid(hidden)
        hidden = self.batchNorm1(hidden)
        return self.ff2(hidden)


In [None]:
# Show the configured chunk size (data points per saved dataset file).
print(CHUNK_SIZE)

100000


In [None]:

# Scratch check: write a small tensor to "test.pt" with torch.save and read it back with torch.load.
torch.save((torch.tensor([1,2,3]) ), "test.pt")
torch.load("test.pt")

tensor([1, 2, 3])

In [None]:

hector = False  # debug switch: when True only the first 100000 training pairs are processed
train_data = []
if not LOAD_DATA:
  # Build path: tokenize the raw pairs and save them in CHUNK_SIZE-sized dataset files.
  idx_to_function = load_data()
  train_data, test_data, validation_data = split_and_label_data(idx_to_function)
  if hector:
    train_data = train_data[:100000]
  i = 0
  while i * CHUNK_SIZE < len(train_data):
    relevant_data = train_data[i * CHUNK_SIZE: min(len(train_data), (i+1) * CHUNK_SIZE)] #one chunk at a time
    print(i * CHUNK_SIZE, min(len(train_data), (i+1) * CHUNK_SIZE), len(relevant_data))
    print(f"building dataset {i}")
    dataset = build_dataset(relevant_data) #get and save data

    print(f"saving dataset{i}")
    # NOTE(review): chunks are written to the working directory, while the load
    # branch below reads from .../CloneData/id_data/ on Drive — confirm how the
    # files are meant to get there.
    torch.save(dataset, f"train_data_{i}.pt")
    i += 1
else:
  # Load path: read the previously saved chunks and concatenate them.
  data_loaders = []
  for i in range(10): #41 mins
    print(f"loading file {i}")
    start = datetime.now()
    print(device)
    # The file name used to be formatted with the literal 0, so chunk 0 was loaded
    # ten times; it now follows the loop index.
    # torch.load unpickles arbitrary objects: only load files you produced yourself.
    dl = torch.load(f"/content/drive/MyDrive/CloneData/id_data/train_data_{i}.pt", map_location = "cpu")

    data_loaders.append(dl)
    end = datetime.now()
    print(f"took {(end-start)} s")
  train_data = torch.utils.data.ConcatDataset(data_loaders)


loading file 0
xla:1
took 0:00:06.518861 s
loading file 1
xla:1
took 0:00:05.642586 s
loading file 2
xla:1
took 0:00:05.775386 s
loading file 3
xla:1
took 0:00:05.841307 s
loading file 4
xla:1
took 0:00:05.502941 s
loading file 5
xla:1
took 0:00:06.163734 s
loading file 6
xla:1
took 0:00:05.856707 s
loading file 7
xla:1
took 0:00:05.740783 s
loading file 8
xla:1
took 0:00:05.774251 s
loading file 9
xla:1
took 0:00:05.779758 s


In [None]:
# Mini-batch loader over the training data, in stored order, dropping a final short batch.
# BATCH SIZES MUST BE MULTIPLES OF 128
trainLoader = DataLoader(
  train_data,
  batch_size = 256,
  shuffle = False,
  drop_last = True,
)
# Loader whose single batch contains the whole training set.
all_train = DataLoader(
  train_data,
  batch_size = len(train_data),
  shuffle = False,
  drop_last = True,
)


In [None]:
# Sanity check: iterate all_train (one batch holding the whole dataset) and print the type of each batch element.
# NOTE(review): this materialises the entire dataset as a single batch, which can be slow and memory-hungry.
for x,y,l in all_train:
  print(type(x),type(y), type(l))

<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [None]:
#torch.save(train_data, "train_data_0.pt")

In [None]:
#del train_data
#train_data = torch.load("train_data.pt")
#print(train_data, "Fdsfasd")

In [None]:
class F1(nn.Module):
    """F1 score of rounded predictions against binary labels.

    Computes tp / (tp + 0.5 * (fp + fn)). A true positive is a position where the
    rounded prediction and the label are both 1; positions where both are 0 are
    true negatives and do not enter the score (they used to be counted as true
    positives).
    """

    def __init__(self):
        super(F1, self).__init__()

    def forward(self, label, pred):
        """Return the F1 score as a float, or 0.0 when there are no positives at all.

        Args:
          label: tensor of 0/1 labels.
          pred: tensor with the same number of elements; values are rounded to
            0/1 with torch.round, so they should already be probabilities.
        """
        with torch.no_grad():
            # Flatten both sides first so (batch, 1) predictions and (batch,) labels
            # line up element by element instead of broadcasting to (batch, batch).
            predicted = torch.round(pred).reshape(-1)
            actual = label.reshape(-1)
            diff = predicted - actual

            tp_count = float(((diff == 0) & (actual == 1)).sum())
            fp_count = float((diff == 1).sum())
            fn_count = float((diff == -1).sum())

            denom = tp_count + .5 * (fp_count + fn_count)
            if denom == 0:
                return 0.0
            return tp_count / denom

# Train model

In [None]:
def _encode_flat(token_ids):
  """Encode a batch of token ids with CodeBERTa and flatten every example to one row.

  The reshape drops the middle axis of a (batch, 1, seq) batch. Runs without
  gradient tracking, so the encoder itself is not trained.
  """
  with torch.no_grad():
    token_ids = torch.reshape(token_ids, (token_ids.shape[0], token_ids.shape[2])).to(device)
    # NOTE(review): no attention mask is passed, so padding positions take part in attention.
    return torch.flatten(codebert(token_ids)[0], start_dim = 1)


def _fit_pca(features, rank):
  """Return (column mean, basis) of a rank-`rank` PCA of `features`; basis is (n_features, rank)."""
  with torch.no_grad():
    mean = features.mean(dim = 0, keepdim = True)
    _, _, basis = torch.pca_lowrank(features, q = rank, center = True, niter = 2)
    return mean, basis


def _project_onto_pca(features, mean, basis):
  """Project rows onto the PCA subspace and map them back to the original feature width."""
  with torch.no_grad():
    scores = torch.matmul(features - mean, basis)  # (batch, rank)
    return torch.matmul(scores, basis.transpose(0, 1)) + mean


def train():
  """Train a fresh Classifier on CodeBERTa embeddings of the pairs in trainLoader.

  Per batch: encode both snippets, restrict the embeddings to the PCA subspace
  fitted on the very first batch, take one SGD step on the BCE-with-logits loss,
  and update the epoch's running TP/FP/FN counts.

  Fixes relative to the previous version:
    * The PCA step multiplied the batch element-wise by the transposed basis,
      whose shape does not match, and raised a RuntimeError. It is now a matrix
      projection. The result is mapped back to the original width so it still
      fits Classifier.reduction.
      NOTE(review): if the 128-dim scores should be fed directly, the
      classifier's first layer has to shrink accordingly.
    * The optimizer is created after the model is moved to `device`, as the
      torch.optim documentation asks; Module.to() may replace the parameter
      objects, which would leave the optimizer updating stale ones.
    * Predictions are thresholded at logit 0 (probability 0.5). The model
      returns logits, so rounding them directly gave wrong classes.
    * Timings use total_seconds() and the loss timing is no longer negated.

  Returns:
    (loss_history, f1_history, min_grad_history, max_grad_history).
    loss_history and f1_history hold Python floats; the two gradient histories
    hold detached 0-dim tensors with the min / max of ff1's weight gradient.
    f1 is cumulative within an epoch and is only recorded once its denominator
    is non-zero.
  """
  epochs = 2
  max_batches_per_epoch = 21  # same cap as the former `if j > 20: break`
  torch.cuda.empty_cache()

  model = Classifier()
  print(model.ff1.weight.device)
  print(device)
  model.to(device)
  codebert.to(device)
  criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.SGD(model.parameters(), lr = 0.1)  # built after the move to `device`

  loss_history = []
  f1_history = []
  max_grad_history = []
  min_grad_history = []
  pca_mean, pca_basis = None, None

  for epoch in range(epochs):
    torch.cuda.empty_cache()
    tp_count = 0 #running counts for this epoch's f1 score
    fp_count = 0
    fn_count = 0

    for j, (x, y, label) in enumerate(trainLoader):
      if j >= max_batches_per_epoch:
        break
      torch.cuda.empty_cache()
      gc.collect()

      # ---- embeddings ----
      embed_start = datetime.now()
      embedded_x = _encode_flat(x)
      embedded_y = _encode_flat(y)
      embed_end = datetime.now()
      label = label.to(device)

      if pca_basis is None:
        # Fit once, on the x side of the first batch, and reuse it afterwards.
        pca_mean, pca_basis = _fit_pca(embedded_x, PCA_DIM)
        print(f"fitted PCA basis {tuple(pca_basis.shape)}")
      embedded_x = _project_onto_pca(embedded_x, pca_mean, pca_basis)
      embedded_y = _project_onto_pca(embedded_y, pca_mean, pca_basis)

      # ---- optimisation step ----
      model_start = datetime.now()
      optimizer.zero_grad()
      pred = model(embedded_x, embedded_y)

      loss_start = datetime.now()
      loss = criterion(torch.flatten(pred), torch.flatten(label).float())
      loss.backward()
      optimizer.step()
      min_grad_history.append(torch.min(model.ff1.weight.grad).detach())
      max_grad_history.append(torch.max(model.ff1.weight.grad).detach())
      xm.mark_step()
      loss_end = datetime.now()

      model_delta_t = loss_end - model_start
      embed_delta_t = embed_end - embed_start
      loss_delta_t = loss_end - loss_start

      # ---- reporting and scoring ----
      with torch.no_grad():
        score_start = datetime.now()
        loss_value = loss.item()
        print("time per model iteration {} s".format(model_delta_t.total_seconds()))
        print("time per embed iteration {} s".format(embed_delta_t.total_seconds()))
        print("time per loss iteration {} s".format(loss_delta_t.total_seconds()))
        loss_history.append(loss_value)
        print(f"iteation {j} of epoch {epoch+1}")
        print(f"loss is {loss_value}")

        # Score on the CPU in one vectorised pass instead of a per-element Python loop.
        logits = pred.detach().cpu().flatten()
        targets = label.detach().cpu().flatten()
        print(torch.min(logits), torch.max(logits))

        predicted_positive = logits > 0  # sigmoid(logit) > 0.5
        tp_count += int((predicted_positive & (targets == 1)).sum())
        fp_count += int((predicted_positive & (targets == 0)).sum())
        fn_count += int((~predicted_positive & (targets == 1)).sum())

        denom = tp_count + .5 * (fp_count + fn_count)
        if denom != 0: #don't get 0 for denom of f1
          f1_history.append(tp_count / denom)
          score_end = datetime.now()
          print(f"{(score_end-score_start)} s for scoring")
  return (loss_history, f1_history, min_grad_history, max_grad_history)

In [None]:
# Run training and keep the four per-iteration histories it returns for the plots in the analysis section.
loss_history, f1_history, min_grad_history,max_grad_history = train()

cpu
xla:1
before PCA
got v
reshaped v
torch.Size([256, 393216]) torch.Size([128, 393216])
mult by v


RuntimeError: ignored

In [None]:
# Print torch_xla's memory info for the selected device (needs torch_xla to be importable).
print(torch_xla.core.xla_model.get_memory_info(device))

In [None]:
# Diagnostic: print the PCA singular values of the x-side embeddings for the first few batches.
i = 0
for x,y,label in trainLoader:
  if i > 5:
    break
  with torch.no_grad():
    # make Batch size X 1 X 512 into Batch size X 512
    x = x.reshape(x.shape[0], x.shape[2])
    y = y.reshape(y.shape[0], y.shape[2])
    x = x.to(device)
    y = y.to(device)
    label = label.to(device)

    # Encode both sides (only the x side is analysed below).
    embed_start = datetime.now()
    embedded_x = codebert(x)[0]
    embedded_y = codebert(y)[0]
    embed_end = datetime.now()

    # One flat row per example, then a low-rank PCA with the default rank.
    embedded_x = embedded_x.flatten(start_dim = 1)
    embedded_y = embedded_y.flatten(start_dim = 1)
    U, S, V = torch.pca_lowrank(embedded_x, q = None, center = True, niter = 2)
    print(S)
    i += 1

# Analyze results

In [None]:
def _scatter_history(values, title, y_label):
  """Scatter-plot one per-iteration history against its own iteration index.

  Args:
    values: sequence of Python numbers or 0-dim tensors (converted with float()).
    title: figure title.
    y_label: label of the y axis.
  """
  series = [float(v) for v in values]
  # The x axis is sized from the series itself; the gradient plots used to borrow
  # len(loss_history), which only works while both lists have the same length.
  plt.scatter(list(range(len(series))), series)
  plt.title(title)
  plt.xlabel("Iterations")
  plt.ylabel(y_label)
  plt.show()


_scatter_history(f1_history, "F1 score", "F1")
_scatter_history(loss_history, "Loss", "Loss")
_scatter_history(min_grad_history, "Min Grad", "Min Grad")
_scatter_history(max_grad_history, "Max Grad", "Max Grad")