In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# and the saved model from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install required libraries (no version is pinned, so the newest transformers release is installed)
!pip install transformers

In [None]:
# download data
#!wget https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
#!tar -xf paws_wiki_labeled_final.tar.gz

--2021-10-28 19:45:25--  https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.197.128, 64.233.191.128, 209.85.145.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.197.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4687157 (4.5M) [application/gzip]
Saving to: ‘paws_wiki_labeled_final.tar.gz’


2021-10-28 19:45:25 (132 MB/s) - ‘paws_wiki_labeled_final.tar.gz’ saved [4687157/4687157]



In [None]:
#import dependencies
import os
import copy
import time
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

import torch
import transformers
import tokenizers

In [None]:
def get_msr_data(paraphrase_data_path, data_part):
  """Read one split of the MRPC (Microsoft Research Paraphrase) corpus.

  The file is read as tab-separated text with a header row. Column 0 holds
  the label (its first character is parsed as an int) and columns 3 and 4
  hold the two sentences.

  Args:
    paraphrase_data_path: directory that contains the corpus files.
    data_part: file name of the split inside that directory.

  Returns:
    A tuple (sentence_1_data, sentence_2_data, targets) of three parallel
    lists: first sentences, second sentences and integer labels.

  Raises:
    IndexError: if a non-blank data row has fewer than five columns.
    ValueError: if a label does not start with a digit.
  """
  with open(os.path.join(paraphrase_data_path, data_part), 'r') as f:
    data = f.read()

  # Drop the header row and skip blank lines. Slicing with [1:-1] would
  # silently lose the last sample whenever the file has no trailing newline.
  rows = [line.split('\t') for line in data.split('\n')[1:] if line.strip()]

  sentence_1_data = [row[3] for row in rows]
  sentence_2_data = [row[4] for row in rows]
  targets = [int(row[0][0]) for row in rows]
  return sentence_1_data, sentence_2_data, targets

In [None]:
def my_collate(batch):
  """Collate a batch with torch's default collate after dropping None samples."""
  kept = [sample for sample in batch if sample is not None]
  return torch.utils.data.dataloader.default_collate(kept)

In [None]:
class BERTMSRDataset(torch.utils.data.Dataset):
  """Map-style dataset that returns one encoded sentence pair per index.

  Relies on the module-level `TOKENIZER` and `MAX_LEN` being defined before
  the first item is requested.

  Args:
    sent_1: list of first sentences.
    sent_2: list of second sentences, parallel to `sent_1`.
    targets: list of numeric labels, parallel to `sent_1`.
  """

  def __init__(self, sent_1, sent_2, targets):
    self.sent_1 = sent_1
    self.sent_2 = sent_2
    self.targets = targets
    self.sent_len = len(sent_1)

  def __len__(self):
    return self.sent_len

  def __getitem__(self, idx):
    """Encode the pair at `idx` and return it as a dict of tensors."""
    target = torch.FloatTensor([self.targets[idx]])
    # Collapse runs of whitespace and lowercase both sentences.
    s1 = " ".join(self.sent_1[idx].split()).lower()
    s2 = " ".join(self.sent_2[idx].split()).lower()

    # truncation=True is required together with max_length: without it a
    # pair longer than MAX_LEN is not shortened, the tensors end up with
    # different lengths and the default collate cannot stack the batch.
    inputs = TOKENIZER.encode_plus(
        s1, s2,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )

    ids = torch.LongTensor(inputs['input_ids']) # padded token ids
    token_type_ids = torch.LongTensor(inputs['token_type_ids']) # segment ids separating the two sentences
    mask = torch.LongTensor(inputs['attention_mask']) # attention mask returned by the tokenizer

    # "sent" keeps the raw (unnormalised) pair for inspection.
    return {
        "ids": ids,
        "token_type_ids": token_type_ids,
        "mask": mask,
        "target": target,
        "sent": self.sent_1[idx] + '[SEP]' + self.sent_2[idx]
    }

In [None]:
class BERTModel(torch.nn.Module):
  """Pretrained BERT encoder followed by dropout and a one-unit sigmoid head."""

  def __init__(self, conf):
    super().__init__()
    # Pretrained 'bert-base-uncased' weights, configured by `conf`.
    self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', config=conf)
    # Dropout applied to the pooled output before the head.
    self.drop = torch.nn.Dropout(0.2)
    # Single output unit on top of the encoder's hidden size.
    self.out = torch.nn.Linear(self.bert.config.hidden_size, 1)

    self.criterion = torch.nn.BCELoss()

  def forward(self, ids, token_type_ids, mask, labels=None):
    """Return (probabilities, loss); loss is the int 0 when `labels` is None."""
    encoded = self.bert(input_ids=ids, token_type_ids=token_type_ids, attention_mask=mask)
    pooled = self.drop(encoded['pooler_output'])
    probs = torch.sigmoid(self.out(pooled))
    if labels is None:
      return probs, 0
    # Binary cross-entropy between the sigmoid outputs and the labels.
    return probs, self.criterion(probs, labels)

In [None]:
def evaluate_model(bert):
  """Evaluate `bert` on the module-level `dl_test` loader.

  Puts the model in eval mode, runs every batch with gradients disabled,
  thresholds the outputs at 0.5 and prints the per-step loss, the accuracy
  and the confusion matrix.

  Returns:
    (acc, conf_matrix): accuracy and the sklearn confusion matrix.
  """
  bert.eval()
  targ, pred = [], []
  for c, inputs in enumerate(dl_test, start=1):
    targ.extend(inputs['target'].detach().cpu().numpy().squeeze(1))
    with torch.no_grad():
      outputs, loss = bert(
          inputs['ids'].to(device),
          inputs['token_type_ids'].to(device),
          inputs['mask'].to(device),
          labels=inputs['target'].to(device),
      )
    probs = outputs.detach().cpu().squeeze(1).numpy()
    pred.extend((probs > 0.5).astype(int))
    print(f'step: {c}, loss: {loss}')

  targ = np.array([int(t) for t in targ])
  pred = np.array(pred)
  n_correct = np.sum(targ == pred)
  acc = n_correct/len(targ)
  conf_matrix = confusion_matrix(targ, pred)

  print('Accuracy:', acc)
  print('Confusion Matrix:')
  print(conf_matrix)

  return acc, conf_matrix

In [None]:
def train_model(model, optimizer, scheduler):
  """Train `model` for one epoch over the module-level `dataloaders['train']`.

  Reads the module-level `dataloaders` and `device`. For every batch it runs
  a forward/backward pass, clips the gradient norm to 1.0, then steps the
  optimizer and the scheduler exactly once. The per-step loss is printed
  during the epoch and the epoch loss/accuracy once at the end.

  Args:
    model: module whose forward returns (outputs, loss) when given labels.
    optimizer: optimizer over the model's parameters.
    scheduler: learning-rate scheduler that is stepped once per batch.

  Returns:
    The same `model` object after the epoch.
  """
  model.train()

  running_loss = 0.0
  running_corrects = 0
  n_seen = 0
  train_loader = dataloaders['train']
  # len() of the loader is the real number of batches; `size // BS + 1`
  # overcounts by one when the dataset size is a multiple of the batch size.
  total_steps = len(train_loader)
  for step, inputs in enumerate(train_loader):
    ids = inputs['ids'].to(device)
    mask = inputs['mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    labels = inputs['target'].to(device)
    # `inputs` is a dict, so len(inputs) is its number of keys; the batch
    # size has to come from a tensor.
    batch_size = labels.size(0)

    optimizer.zero_grad()

    with torch.set_grad_enabled(True):
      outputs, loss = model(ids, token_type_ids, mask, labels)
      loss.backward()
      # Clip before the update; clipping after optimizer.step() has no effect.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      # One scheduler step per optimizer step (previously stepped twice).
      scheduler.step()

    step_loss = loss.item()
    preds = outputs.detach().cpu().numpy() > 0.5
    print('step: {}/{} Loss: {:.4f}'.format(step+1, total_steps, step_loss))

    # statistics
    running_loss += step_loss * batch_size
    running_corrects += np.sum(preds == labels.detach().cpu().numpy())
    n_seen += batch_size

  # Epoch-level statistics, reported once after the last batch. Dividing by
  # the number of samples actually seen stays correct even if the collate
  # function dropped samples; max() guards against an empty loader.
  denom = max(n_seen, 1)
  epoch_loss = running_loss / denom
  epoch_acc = running_corrects / denom

  print('Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
  return model

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Directory holding the MRPC files that get_msr_data reads.
paraphrase_data_path = '/content/drive/MyDrive/msr_paraphrase'
# PAWS paths: in this notebook they are only referenced from commented-out code.
pawsx_data_paths_train = ['/content/final/train.tsv']#, '/content/final/dev.tsv']
pawsx_data_paths_test = '/content/final/test.tsv'

# Training hyper-parameters.
EPOCHS = 4         # number of passes of the training loop
BS = 64            # batch size given to both DataLoaders
WARMUP_STEPS = 0   # warm-up steps given to the linear LR schedule
MAX_LEN = 128 # max_length (in tokens) used when encoding and padding a sentence pair
LR = 2e-5          # learning rate given to the optimizer
# Tokenizer and model share the 'bert-base-uncased' checkpoint.
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model_config = transformers.BertConfig.from_pretrained('bert-base-uncased')
# NOTE(review): BERTModel.forward only reads 'pooler_output', so the hidden
# states requested here are not consumed by the code in this notebook.
model_config.output_hidden_states = True
bert = BERTModel(model_config).to(device)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the MRPC train and test splits as parallel lists:
# first sentences, second sentences and integer labels.
train_1, train_2, train_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_train.txt')
test_1, test_2, test_targets = get_msr_data(paraphrase_data_path, 'msr_paraphrase_test.txt') # get sentences and targets
# Disabled alternative: load the PAWS TSV files with pandas instead.
#df_train = pd.concat([pd.read_csv(path, sep='\t') for path in pawsx_data_paths_train])
#df_test = pd.read_csv(pawsx_data_paths_test, sep='\t')

In [None]:
# Seed NumPy in the main process. The previous `worker_init_fn=np.random.seed(0)`
# did only this: the call ran immediately and its return value (None) was
# what the DataLoader received as worker_init_fn.
np.random.seed(0)

def _seed_worker(worker_id):
  """Seed NumPy inside a DataLoader worker from that worker's torch seed."""
  np.random.seed(torch.initial_seed() % 2**32)

# Dedicated generator so the training shuffle order is reproducible.
_shuffle_rng = torch.Generator()
_shuffle_rng.manual_seed(0)

ds_train = BERTMSRDataset(train_1, train_2, train_targets)
dl_train = torch.utils.data.DataLoader(
    ds_train,
    batch_size=BS,
    shuffle=True,
    num_workers=2,
    worker_init_fn=_seed_worker,
    generator=_shuffle_rng,
    collate_fn=my_collate,
) # create torch dataloader

ds_test = BERTMSRDataset(test_1, test_2, test_targets)
# Evaluation metrics do not depend on sample order, so the test loader is not shuffled.
dl_test = torch.utils.data.DataLoader(
    ds_test,
    batch_size=BS,
    shuffle=False,
    num_workers=2,
    worker_init_fn=_seed_worker,
    collate_fn=my_collate,
)

dataloaders = {'train': dl_train, 'val': dl_test}
dataset_sizes = {'train': len(ds_train), 'val': len(ds_test)}

In [None]:
# initialize AdamW optimizer and learning rate scheduler
# torch.optim.AdamW replaces the deprecated transformers AdamW; eps=1e-6
# keeps the epsilon that the transformers implementation used by default.
optimizer = torch.optim.AdamW(bert.parameters(), lr=LR, weight_decay=1e-5, eps=1e-6)

# The schedule must span the real number of optimizer steps. len(dl_train)
# counts the last partial batch, which `len(train_1) // BS` left out, so the
# learning rate no longer reaches zero before training ends.
scheduler = transformers.optimization.get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=WARMUP_STEPS,
      num_training_steps=len(dl_train) * EPOCHS
    )

In [None]:
# train the model and keep the weights of the best-scoring epoch
# NOTE(review): the "best" epoch is chosen with evaluate_model, which scores
# on dl_test, so the reported best accuracy is selected on the test split.
best_acc = 0
# Start from the current weights so `best_model` is always bound, even when
# no epoch scores above 0 or EPOCHS is 0.
best_model = copy.deepcopy(bert.state_dict())
for epoch in range(EPOCHS):
  print('Epoch:', epoch+1)
  bert = train_model(bert, optimizer, scheduler)
  acc, conf_matrix = evaluate_model(bert)
  if acc > best_acc:
    best_acc = acc
    best_model = copy.deepcopy(bert.state_dict())
bert.load_state_dict(best_model)

Epoch: 1
step: 1/64 Loss: 0.6164
Loss: 0.0008 Acc: 0.0110
step: 2/64 Loss: 0.7041
Loss: 0.0016 Acc: 0.0199
step: 3/64 Loss: 0.6375
Loss: 0.0024 Acc: 0.0307
step: 4/64 Loss: 0.6866
Loss: 0.0032 Acc: 0.0402
step: 5/64 Loss: 0.6388
Loss: 0.0040 Acc: 0.0508
step: 6/64 Loss: 0.5983
Loss: 0.0048 Acc: 0.0621
step: 7/64 Loss: 0.5927
Loss: 0.0055 Acc: 0.0738
step: 8/64 Loss: 0.6044
Loss: 0.0062 Acc: 0.0849
step: 9/64 Loss: 0.6249
Loss: 0.0070 Acc: 0.0959
step: 10/64 Loss: 0.5884
Loss: 0.0077 Acc: 0.1072
step: 11/64 Loss: 0.7027
Loss: 0.0086 Acc: 0.1163
step: 12/64 Loss: 0.6596
Loss: 0.0094 Acc: 0.1266
step: 13/64 Loss: 0.6238
Loss: 0.0102 Acc: 0.1374
step: 14/64 Loss: 0.6864
Loss: 0.0110 Acc: 0.1465
step: 15/64 Loss: 0.6456
Loss: 0.0118 Acc: 0.1573
step: 16/64 Loss: 0.6573
Loss: 0.0126 Acc: 0.1678
step: 17/64 Loss: 0.5522
Loss: 0.0133 Acc: 0.1801
step: 18/64 Loss: 0.6475
Loss: 0.0141 Acc: 0.1901
step: 19/64 Loss: 0.7079
Loss: 0.0149 Acc: 0.1997
step: 20/64 Loss: 0.6108
Loss: 0.0157 Acc: 0.2103


<All keys matched successfully>

In [None]:
best_acc

0.7820289855072464

In [None]:
# save/load model to/from file
# The save branch is commented out, so this cell only LOADS a pickled model
# and rebinds `bert` to it, replacing the model object `bert` referred to
# before this cell ran.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load files you created or otherwise trust.

#with open('/content/drive/MyDrive/text_similarity/bert.mdl', 'wb') as f:
#  pickle.dump(bert.to('cpu'), f)
with open('/content/drive/MyDrive/text_similarity/bert.mdl', 'rb') as f:
  bert = pickle.load(f)
# Move the loaded model to the device selected in the configuration cell.
bert = bert.to(device)

In [None]:
# use your own sentences
s1 = 'Any trip to Italy should include a visit to Tuscany to sample the region\'s exquisite wines'
s2 = 'Be sure to make time for a Tuscan wine-tasting experience when visiting Italy.'

# truncation=True keeps the encoded pair within MAX_LEN even for long input.
inputs = TOKENIZER.encode_plus(s1, s2, add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True)

# unsqueeze(0) adds the batch dimension the model expects.
ids = torch.LongTensor(inputs['input_ids']).unsqueeze(0).to(device)
token_type_ids = torch.LongTensor(inputs['token_type_ids']).unsqueeze(0).to(device)
mask = torch.LongTensor(inputs['attention_mask']).unsqueeze(0).to(device)

# Switch off dropout and gradient tracking so the prediction is deterministic.
bert.eval()
with torch.no_grad():
  outputs, loss = bert(ids, token_type_ids, mask)

confidence = outputs.cpu().detach().squeeze(0).numpy()[0]
print('Sentences are similar:', confidence > 0.5)
print('Confidence:', confidence)

Sentences are similar: True
Confidence: 0.559029


In [None]:
# Second example: an unrelated sentence pair.
s1 = 'Any trip to Italy should include a visit to Tuscany to sample the region\'s exquisite wines'
s2 = 'My name is Nikita.'

# truncation=True keeps the encoded pair within MAX_LEN even for long input.
inputs = TOKENIZER.encode_plus(s1, s2, add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True)

# unsqueeze(0) adds the batch dimension the model expects.
ids = torch.LongTensor(inputs['input_ids']).unsqueeze(0).to(device)
token_type_ids = torch.LongTensor(inputs['token_type_ids']).unsqueeze(0).to(device)
mask = torch.LongTensor(inputs['attention_mask']).unsqueeze(0).to(device)

# Switch off dropout and gradient tracking so the prediction is deterministic.
bert.eval()
with torch.no_grad():
  outputs, loss = bert(ids, token_type_ids, mask)

confidence = outputs.cpu().detach().squeeze(0).numpy()[0]
print('Sentences are similar:', confidence > 0.5)
print('Confidence:', confidence)

Sentences are similar: False
Confidence: 0.25596383


In [None]:
# Disabled debugging snippet kept inside a string literal, so none of it runs:
# it takes the first batch from dl_train and passes it through `bert` without
# moving the tensors to `device`.
'''for inputs in dl_train:
  break
ids = inputs['ids']
mask = inputs['mask']
token_type_ids = inputs['token_type_ids']
labels = inputs['target']

#bert = transformers.BertModel.from_pretrained('bert-base-uncased', config=model_config)
out, loss = bert(ids, token_type_ids, mask, labels)'''

In [None]:
# Disabled PAWS dataset class kept inside a string literal, so it is never
# defined or executed. Notes for anyone re-enabling it:
# - __getitem__ has no return statement on the path where len(ids) > MAX_LEN,
#   so it would return None for such samples;
# - `rand_num` is assigned but never used.
'''class BERTPAWSDataset:
  def __init__(self, df_train):
    self.sent_1 = list(df_train['sentence1'].values)
    self.sent_2 = list(df_train['sentence2'].values)
    self.labels = list(df_train['label'].values)
    self.sent_len = len(self.sent_1)

  def __len__(self):
    return self.sent_len

  def __getitem__(self, idx):

    rand_num = np.random.uniform()

    s1 = " ".join(self.sent_1[idx].split()).lower()
    s2 = " ".join(self.sent_2[idx].split()).lower()
    target = torch.FloatTensor([self.labels[idx]])

    inputs = TOKENIZER.encode_plus(s1, s2, add_special_tokens=True, max_length=MAX_LEN, padding='max_length')

    ids = torch.LongTensor(inputs['input_ids'])
    token_type_ids = torch.LongTensor(inputs['token_type_ids'])
    mask = torch.LongTensor(inputs['attention_mask'])
    if len(ids) <= MAX_LEN:
      return {
          "ids": ids,
          "token_type_ids": token_type_ids,
          "mask": mask,
          "target": target,
          "sent": self.sent_1[idx] + '[SEP]' + self.sent_2[idx]
      }'''