ISSUES:
- what to set the max sequence length to
- batch size: out-of-memory issues

NEXT STEPS:
- migrate to pytorch lightning
  - wandb or tensorboard for logging metrics

In [None]:
!pip install --upgrade transformers
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import pandas as pd

import torch
import torch.nn as nn  
from torch.utils.data import Dataset, DataLoader 
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# triplet

## dataset

In [None]:
# Download the triplet dataset from Google Drive by file ID; it is saved as triplet_data.csv.
!gdown 1eww8pxYnbxbZwZh2dyJg7zoc2sli37iv

Downloading...
From: https://drive.google.com/uc?id=1eww8pxYnbxbZwZh2dyJg7zoc2sli37iv
To: /content/triplet_data.csv
100% 158M/158M [00:01<00:00, 104MB/s]


In [None]:
class TripletDataset(Dataset):
  """Triplets of (anchor, positive, negative) posts, tokenized on access.

  Every post is a (text, code) pair that is passed to the
  "UWB-AIR/MQDD-pretrained" tokenizer as a sequence pair.

  Args:
    csv_file: path to a CSV containing the columns BodyText1/BodyCode1
      (anchor), BodyText2/BodyCode2 (positive) and BodyText3/BodyCode3
      (negative).
    max_length: length every encoded sequence is padded/truncated to.
      Defaults to 500, the value that was previously hard-coded.
  """

  # Column-name prefix of each triplet member, in the order they are returned.
  _ROLES = ("anchor", "positive", "negative")

  def __init__(self, csv_file, max_length=500):
    column_names = {
      'BodyText1': 'anchor-text',
      'BodyCode1': 'anchor-code',
      'BodyText2': 'positive-text',
      'BodyCode2': 'positive-code',
      'BodyText3': 'negative-text',
      'BodyCode3': 'negative-code',
    }
    # rename() returns a new frame, so no in-place mutation is needed
    self.df = pd.read_csv(csv_file).rename(columns=column_names)

    self.max_length = max_length

    # tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained("UWB-AIR/MQDD-pretrained")

  def __len__(self):
    """Number of triplets (rows) in the CSV."""
    return len(self.df.index)

  def _cell_text(self, idx, column):
    """Return the cell as a string; a missing value becomes "" rather than "nan"."""
    value = self.df.at[idx, column]
    return "" if pd.isna(value) else str(value)

  def _encode(self, idx, role):
    """Tokenize the (text, code) pair of one triplet member of row ``idx``."""
    encoded = self.tokenizer(
      self._cell_text(idx, f"{role}-text"),
      self._cell_text(idx, f"{role}-code"),
      padding='max_length',
      truncation=True,
      max_length=self.max_length,
      return_tensors="pt",
    )

    # return_tensors="pt" gives every tensor a leading batch dimension of 1.
    # Drop exactly that dimension from every returned tensor so the DataLoader
    # can stack samples into (batch, max_length).
    for key in list(encoded.keys()):
      encoded[key] = encoded[key].squeeze(0)

    return encoded

  def __getitem__(self, idx):
    """Return the encoded (anchor, positive, negative) inputs of row ``idx``."""
    return tuple(self._encode(idx, role) for role in self._ROLES)

In [None]:
# Build the triplet dataset from the downloaded CSV (this also loads the tokenizer).
csv_file = "triplet_data.csv"
dataset = TripletDataset(csv_file)

Downloading (…)okenizer_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# split dataset into 70% train / 15% validation / 15% test; the seeded
# generator makes the split identical on every run
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
  dataset, [0.7, 0.15, 0.15], generator=split_generator)

# dataloader
batch_size = 1
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
# only the training data is shuffled; evaluation metrics are averages over the
# whole split, so their sample order does not need to be randomized
val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)

## model

In [None]:
class TripletNet(nn.Module):
  """Embeds all three members of a triplet with one shared pretrained encoder."""

  def __init__(self):
    super().__init__()

    # a single encoder instance is reused for anchor, positive and negative
    self.model = AutoModel.from_pretrained("UWB-AIR/MQDD-pretrained")

  def _embed(self, inputs):
    """Run the encoder on one set of keyword inputs and return element [1] of its output."""
    return self.model(**inputs)[1]

  def forward(self, anchor, positive, negative):
    """Return the embeddings in (anchor, positive, negative) order."""
    return self._embed(anchor), self._embed(positive), self._embed(negative)
    
    

In [None]:
model = TripletNet().to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

## train

In [None]:
# a triplet is correct when its anchor is strictly closer to the positive than to the negative
def num_correct(anchor, positive, negative):
  """Count the correctly ordered triplets in a batch of embeddings.

  Returns a tensor holding the number of rows for which
  pairwise_distance(anchor, positive) - pairwise_distance(anchor, negative) < 0.
  """
  to_positive = F.pairwise_distance(anchor, positive)
  to_negative = F.pairwise_distance(anchor, negative)

  is_correct = (to_positive - to_negative) < 0

  return is_correct.sum()

def train(dataloader, model, triplet_loss, optimizer):
  """Run one training epoch of the triplet model.

  Args:
    dataloader: yields (anchor, positive, negative) batches of encoder inputs.
    model: maps the three inputs to three embedding tensors.
    triplet_loss: loss called as triplet_loss(anchor, positive, negative).
    optimizer: optimizer over the model's parameters.

  Prints the loss and the number of processed samples every 10th batch.
  """
  size = len(dataloader.dataset)
  seen = 0  # samples processed so far, correct for any batch size

  # Switch to training mode explicitly: models returned by from_pretrained
  # start out in evaluation mode.
  model.train()

  for batch, (anchor, positive, negative) in enumerate(dataloader):
    anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

    anchor_emb, positive_emb, negative_emb = model(anchor, positive, negative)
    loss = triplet_loss(anchor_emb, positive_emb, negative_emb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    seen += len(anchor_emb)

    if batch % 10 == 0:
      print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

def eval(dataloader, model, triplet_loss):
  """Evaluate the triplet model and print its accuracy and average loss.

  Accuracy is the fraction of triplets counted by num_correct; the loss is
  averaged over batches. Note that this name shadows Python's built-in eval.

  Args:
    dataloader: yields (anchor, positive, negative) batches of encoder inputs.
    model: maps the three inputs to three embedding tensors.
    triplet_loss: loss called as triplet_loss(anchor, positive, negative).
  """
  size = len(dataloader.dataset)
  num_batches = len(dataloader)

  eval_loss, eval_correct = 0.0, 0

  model.eval()  # evaluation mode (e.g. no dropout) while measuring
  with torch.no_grad():
    for anchor, positive, negative in dataloader:
      anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

      anchor_emb, positive_emb, negative_emb = model(anchor, positive, negative)
      eval_loss += triplet_loss(anchor_emb, positive_emb, negative_emb).item()

      # .item() keeps the running total a plain Python int; accumulating the
      # integer tensor itself makes the later true division fail
      eval_correct += num_correct(anchor_emb, positive_emb, negative_emb).item()

  eval_loss /= num_batches
  accuracy = eval_correct / size

  # report the accuracy over the whole split, not the last batch's count
  print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {eval_loss:>8f} \n")

In [None]:
# Objective and optimizer for fine-tuning on triplets.
triplet_loss = nn.TripletMarginLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

num_epochs = 1

# Each epoch is one full pass over the training split followed by an
# evaluation on the validation split.
for i in range(num_epochs):
  train(train_dataloader, model, triplet_loss, optimizer)
  eval(val_dataloader, model, triplet_loss)

loss: 5.959546  [    1/35000]
loss: 1.048268  [   11/35000]
loss: 1.020672  [   21/35000]
loss: 0.665118  [   31/35000]
loss: 1.173087  [   41/35000]
loss: 0.414057  [   51/35000]
loss: 1.035926  [   61/35000]
loss: 1.193086  [   71/35000]
loss: 0.464754  [   81/35000]
loss: 0.830224  [   91/35000]
loss: 0.776767  [  101/35000]
loss: 0.581901  [  111/35000]
loss: 0.201887  [  121/35000]
loss: 0.188289  [  131/35000]
loss: 0.272629  [  141/35000]
loss: 0.000000  [  151/35000]
loss: 0.000000  [  161/35000]
loss: 0.000000  [  171/35000]
loss: 0.000000  [  181/35000]
loss: 0.000000  [  191/35000]
loss: 0.000000  [  201/35000]
loss: 0.000000  [  211/35000]
loss: 0.000000  [  221/35000]
loss: 0.000000  [  231/35000]
loss: 0.000000  [  241/35000]
loss: 0.000000  [  251/35000]
loss: 1.946865  [  261/35000]
loss: 0.000000  [  271/35000]
loss: 0.243235  [  281/35000]
loss: 0.000000  [  291/35000]
loss: 0.000000  [  301/35000]
loss: 0.000000  [  311/35000]
loss: 0.000000  [  321/35000]
loss: 0.00

KeyboardInterrupt: ignored

In [None]:
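# model.model.save_pretrained("triplet_tuned")  # save_pretrained belongs to the wrapped Hugging Face encoder (TripletNet.model), not to the nn.Module wrapper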

# classifier model

## dataset

In [None]:
# Download the classifier dataset from Google Drive by file ID; it is saved as processed_dupls_and_non_dupls_100k.csv.
!gdown 1Z3TFJR-v4S-Kk9-OoeqZ-SKrVYKpq-NK

Downloading...
From: https://drive.google.com/uc?id=1Z3TFJR-v4S-Kk9-OoeqZ-SKrVYKpq-NK
To: /content/processed_dupls_and_non_dupls_100k.csv
100% 209M/209M [00:02<00:00, 96.0MB/s]


In [None]:
# Load the downloaded CSV into a DataFrame so its contents can be inspected.
csv_file = "processed_dupls_and_non_dupls_100k.csv"
df = pd.read_csv(csv_file)

In [None]:
# Preview the first two rows and the available columns.
df.head(2)

Unnamed: 0.1,Unnamed: 0,Id1,Title1,Tags1,Id2,Title2,Tags2,Label,BodyText1,BodyCode1,BodyText2,BodyCode2
0,1,75780318.0,Reason about auxuliary constructors in scala,<java><scala>,28577,Globalization architecture,<c#><architecture><localization><globalization>,0,I have class methods to define auxiliary const...,"class Person:\n def __init__(self, name, ag...",I need to store products for an e-commerce sol...,"nvarchar(MAX)\n<cultures>\n <culture code=""..."
1,2,75780300.0,field not getting focus js/html,<javascript><html>,37721077,Multitrigger does not work,<c#><xaml><multitrigger>,0,\nim trying to get focus on field after i inp...,"<td><input class=""reference"" type=""tex...",I have a problem with my MultiTrigger and I do...,<MultiTrigger>\n <MultiTrigger.Conditions>\...


In [None]:
# Show the complete BodyText1 value of row 0.
df.at[0, "BodyText1"]

In [None]:
class ClassifierDataset(Dataset):
  """Labelled pairs of posts, tokenized on access.

  Every post is a (text, code) pair that is passed to the
  "UWB-AIR/MQDD-pretrained" tokenizer as a sequence pair.

  Args:
    csv_file: path to a CSV containing the columns BodyText1, BodyCode1,
      BodyText2, BodyCode2 and Label.
    max_length: length every encoded sequence is padded/truncated to.
      Defaults to 500, the value that was previously hard-coded.
  """

  def __init__(self, csv_file, max_length=500):
    # read csv_file into df
    self.df = pd.read_csv(csv_file)

    # TODO: shuffle df?

    self.max_length = max_length

    # tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained("UWB-AIR/MQDD-pretrained")

  def __len__(self):
    """Number of labelled pairs (rows) in the CSV."""
    return len(self.df.index)

  def _cell_text(self, idx, column):
    """Return the cell as a string; a missing value becomes "" rather than "nan"."""
    value = self.df.at[idx, column]
    return "" if pd.isna(value) else str(value)

  def _encode(self, idx, suffix):
    """Tokenize the (BodyText, BodyCode) pair with the given column suffix ("1" or "2")."""
    encoded = self.tokenizer(
      self._cell_text(idx, f"BodyText{suffix}"),
      self._cell_text(idx, f"BodyCode{suffix}"),
      padding='max_length',
      truncation=True,
      max_length=self.max_length,
      return_tensors="pt",
    )

    # return_tensors="pt" gives every tensor a leading batch dimension of 1.
    # Drop exactly that dimension from every returned tensor so the DataLoader
    # can stack samples into (batch, max_length).
    for key in list(encoded.keys()):
      encoded[key] = encoded[key].squeeze(0)

    return encoded

  def __getitem__(self, idx):
    """Return (input_1, input_2, label) for row ``idx``; label is a Python int."""
    input_1 = self._encode(idx, "1")
    input_2 = self._encode(idx, "2")

    label = int(self.df.at[idx, "Label"])

    return input_1, input_2, label

In [None]:
# Build the classifier dataset from the downloaded CSV; this rebinds `dataset`,
# replacing the triplet dataset created earlier in the notebook.
csv_file = "processed_dupls_and_non_dupls_100k.csv"
dataset = ClassifierDataset(csv_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# split dataset into 70% train / 15% validation / 15% test; the seeded
# generator makes the split identical on every run
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
  dataset, [0.7, 0.15, 0.15], generator=split_generator)

# dataloader
batch_size = 1
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
# only the training data is shuffled; evaluation metrics are averages over the
# whole split, so their sample order does not need to be randomized
val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)

## model

In [None]:
class Classifier(nn.Module):
  """Two-class classifier over a pair of inputs encoded by one shared pretrained encoder."""

  def __init__(self):
    super().__init__()

    self.model = AutoModel.from_pretrained("UWB-AIR/MQDD-pretrained")

    # can make more complicated
    # the head sees both embeddings side by side, hence twice the hidden size
    embedding_dim = self.model.config.hidden_size
    self.classifier = nn.Linear(2 * embedding_dim, 2)

  def forward(self, x1, x2):
    """Return the logits for the pair (x1, x2).

    Both inputs are unpacked as keyword arguments to the encoder; element [1]
    of each encoder output is concatenated along dimension 1 and passed to the
    linear head.
    """
    pooled = [self.model(**inputs)[1] for inputs in (x1, x2)]

    return self.classifier(torch.cat(pooled, dim=1))

In [None]:
model = Classifier().to(device)

## train

In [None]:
def train(dataloader, model, criterion, optimizer):
  """Run one training epoch of the classifier.

  Args:
    dataloader: yields (X1, X2, y) batches: two encoder inputs and the labels.
    model: maps (X1, X2) to logits.
    criterion: loss called as criterion(logits, y).
    optimizer: optimizer over the model's parameters.

  Prints the loss and the number of processed samples every 10th batch.
  """
  size = len(dataloader.dataset)
  seen = 0  # samples processed so far, correct for any batch size

  # Switch to training mode explicitly: models returned by from_pretrained
  # start out in evaluation mode.
  model.train()

  for batch, (X1, X2, y) in enumerate(dataloader):
    X1, X2, y = X1.to(device), X2.to(device), y.to(device)

    logits = model(X1, X2)

    loss = criterion(logits, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    seen += len(y)

    if batch % 10 == 0:
      print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def eval(dataloader, model, criterion):
  """Evaluate the classifier and print its accuracy and average loss.

  Accuracy is the fraction of samples whose argmax logit equals the label;
  the loss is averaged over batches. Note that this name shadows Python's
  built-in eval.

  Args:
    dataloader: yields (X1, X2, y) batches: two encoder inputs and the labels.
    model: maps (X1, X2) to logits.
    criterion: loss called as criterion(logits, y).
  """
  size = len(dataloader.dataset)
  num_batches = len(dataloader)

  eval_loss, eval_correct = 0.0, 0.0

  model.eval()  # evaluation mode (e.g. no dropout) while measuring
  with torch.no_grad():
    for X1, X2, y in dataloader:
      X1, X2, y = X1.to(device), X2.to(device), y.to(device)

      logits = model(X1, X2)

      eval_loss += criterion(logits, y).item()
      eval_correct += (logits.argmax(1) == y).type(torch.float).sum().item()

  eval_loss /= num_batches
  accuracy = eval_correct / size

  # report the accuracy over the whole split, not the last batch's count
  print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {eval_loss:>8f} \n")

In [None]:
# Objective and optimizer for training the classifier.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

num_epochs = 1

# Each epoch is one full pass over the training split followed by an
# evaluation on the validation split.
for i in range(num_epochs):
  train(train_dataloader, model, criterion, optimizer)
  # eval expects (dataloader, model, criterion): pass the classifier and its
  # cross-entropy loss, not the loss object and the earlier triplet loss
  eval(val_dataloader, model, criterion)

loss: 1.154698  [    1/70000]
loss: 0.882938  [   11/70000]
loss: 0.331018  [   21/70000]
loss: 0.060428  [   31/70000]
loss: 0.171646  [   41/70000]
loss: 0.373716  [   51/70000]
loss: 2.586619  [   61/70000]
loss: 0.586937  [   71/70000]
loss: 0.848163  [   81/70000]
loss: 1.667763  [   91/70000]
loss: 2.696003  [  101/70000]
loss: 0.139439  [  111/70000]
loss: 0.182101  [  121/70000]
loss: 1.568899  [  131/70000]
loss: 0.311636  [  141/70000]
loss: 0.124036  [  151/70000]
loss: 0.475539  [  161/70000]
loss: 0.423039  [  171/70000]
loss: 0.188192  [  181/70000]
loss: 0.058588  [  191/70000]
loss: 0.506734  [  201/70000]
loss: 0.019191  [  211/70000]
loss: 0.981314  [  221/70000]
loss: 1.589475  [  231/70000]
loss: 1.050990  [  241/70000]
loss: 0.762181  [  251/70000]
loss: 0.536045  [  261/70000]
loss: 1.706626  [  271/70000]
loss: 0.849604  [  281/70000]
loss: 2.052474  [  291/70000]
loss: 1.717387  [  301/70000]
loss: 0.117434  [  311/70000]
loss: 0.026247  [  321/70000]
loss: 0.60

KeyboardInterrupt: ignored