In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import transformers as trans
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
tokenizer = trans.BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 32
epochs = 2 # Change to 15
enc_lr = 0.00001
lr = 0.0005

device

device(type='cpu')

In [4]:
# Load datasets
train = pd.read_json('sciad_train.json')
dev = pd.read_json('sciad_dev.json')
test = pd.read_json('sciad_test.json')
train

Unnamed: 0,acronym,expansion,id,tokens
0,20,secrecy rate,TR-0,"[In, summary, ,, it, is, evident, that, their,..."
1,6,markov geographic model,TR-1,"[The, main, objective, of, DDE, -, MGM, is, co..."
2,10,graph convolution networks,TR-2,"[Especially, ,, there, is, the, smaller, desce..."
3,23,autonomic computing,TR-3,"[We, end, the, talk, inviting, the, community,..."
4,9,convolutional neural network,TR-4,"[We, start, with, "", AlexNet, "", as, our, base..."
...,...,...,...,...
36547,18,resource description framework,TR-50025,"[In, addition, ,, the, approach, in, chebotko2..."
36548,38,large deviation,TR-50026,"[In, this, section, ,, we, investigate, how, t..."
36549,9,neural image caption,TR-50027,"[In, this, paper, ,, we, have, presented, CNet..."
36550,24,hard decision,TR-50029,"[The, optimum, decoding, rule, is, given, byOb..."


In [5]:
# Create custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, txt, labels):
        self.labels = labels
        self.text = txt

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        sample = {"Text": text, "Expansion": label}
        return sample

In [6]:
# Add <start> and <end> tokens to tokens column
for i in range(len(train['acronym'])):
  chars = [s[0] for s in train.iloc[i, 1].split()]
  acronym = ' '.join(chars)
  acronym = acronym.upper().replace(" ", "")
  sent = ' '.join(train.iloc[i, 3])
  if sent.find(acronym) != -1:
    index = sent.find(acronym)
    sent_w_tokens = sent[:index] + '<start>' + sent[index:index+len(acronym)] + '<end>' + sent[index+len(acronym):]
    train.iloc[i, 3] = sent_w_tokens
train

Unnamed: 0,acronym,expansion,id,tokens
0,20,secrecy rate,TR-0,"In summary , it is evident that their complexi..."
1,6,markov geographic model,TR-1,The main objective of DDE - <start>MGM<end> is...
2,10,graph convolution networks,TR-2,"Especially , there is the smaller descent of S..."
3,23,autonomic computing,TR-3,We end the talk inviting the community to join...
4,9,convolutional neural network,TR-4,"We start with "" AlexNet "" as our base <start>C..."
...,...,...,...,...
36547,18,resource description framework,TR-50025,"In addition , the approach in chebotko2009sema..."
36548,38,large deviation,TR-50026,"In this section , we investigate how the stoch..."
36549,9,neural image caption,TR-50027,"In this paper , we have presented CNet - <star..."
36550,24,hard decision,TR-50029,The optimum decoding rule is given byObserve t...


In [7]:
# Store expansions for use as labels
train_labels = []
test_labels = []
for i in range(len(train['acronym'])):
  train_labels.append(''.join(train.iloc[i,1]))

for i in range(len(dev['acronym'])):
  test_labels.append(''.join(dev.iloc[i,1]))

In [8]:
# Prepare input for BERT encoder
train_inputs = []
test_inputs = []
for i in range(len(train['acronym'])):
  train_text = '[CLS]' + ''.join(train.iloc[i, 1]) + '[SEP]' + ''.join(train.iloc[i, 3]) + '[SEP]'
  train_inputs.append(train_text)

for i in range(len(dev['acronym'])):
  test_text = '[CLS]' + ''.join(dev.iloc[i, 1]) + '[SEP]' + ''.join(dev.iloc[i, 3]) + '[SEP]'
  test_inputs.append(test_text)

In [9]:
print(train_labels[6])
print(train_inputs[6])

computed tomography
[CLS]computed tomography[SEP]The database is constructed from 915 clinical <start>CT<end> volumes consisting of head and neck images .[SEP]


In [10]:
# Instantiate Dataset and DataLoader objects
train_dataset = CustomTextDataset(train_inputs, train_labels)
test_dataset = CustomTextDataset(test_inputs, test_labels)

train_dl = DataLoader(train_dataset, batch_size=batch_size)
test_dl = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)

In [11]:
train_labels[6]

'computed tomography'

In [12]:
# Create custom model class
class CustomModel(torch.nn.Module):
  def __init__(self):
    super(CustomModel, self).__init__()
    self.bert = trans.BertModel.from_pretrained('bert-base-uncased')
    embedding_dim = self.bert.config.to_dict()['hidden_size']
    self.dropout1 = torch.nn.Dropout(0.2)
    self.dense1 = torch.nn.Linear(embedding_dim, 128)
    self.relu = torch.nn.ReLU()
    self.dropout2 = torch.nn.Dropout(0.1)
    self.dense2 = torch.nn.Linear(128, 1)
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, encoding):
    sequence_output, pooled_output = self.bert(encoding.input_ids, attention_mask=encoding.attention_mask) #get [CLS] representation w/ outputs[0][:,0,:]
    dropout1_output = self.dropout1(sequence_output)
    dense1_output = self.dense1(dropout1_output)
    relu_output = self.relu(dense1_output)
    dropout2_output = self.dropout2(relu_output)
    dense2_output = self.dense2(dropout2_output)
    sigmoid_output = self.sigmoid(dense2_output)
    return sigmoid_output

In [13]:
# Instantiate model and optimizer
model = CustomModel().to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# To calculate accuracy
def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    correct = (max_preds.squeeze(1)==y).float()
    return correct.sum() / len(y)

In [None]:
# Train model
for epoch in range(epochs):
  epoch_loss = 0
  epoch_acc = 0
  for idx, batch in enumerate(train_dl):
    data = batch['Text']
    targets = batch['Expansion']
    optimizer.zero_grad()
    torch.cuda.empty_cache()   
    train_encoding = tokenizer.batch_encode_plus(train_inputs, padding=True, truncation=True, return_tensors='pt')
    outputs = model(train_encoding)
    loss = criterion(outputs, targets)
    acc = categorical_accuracy(outputs, targets)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  train_loss = epoch_loss / len(train_dl)
  train_acc = epoch_acc / len(train_dl)

In [None]:
# Test model
test_encoding = tokenizer.encode_plus(test_inputs, truncation=True, return_tensors='pt')
test_label_encoding = tokenizer.encode_plus(test_labels, return_tensors='pt')

In [None]:
# Display metrics (Precision, Recall, F1)