In [0]:
!pip install torch
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.8MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 12.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 27.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |██████████

In [0]:
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import BertModel, BertConfig, BertTokenizer



# Read the training split from the project data directory.
feature_processor = SquadV2Processor()
examples = feature_processor.get_train_examples(
    '/content/drive/My Drive/cis530project/cis530project/data'
)

# Tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Settings for turning examples into model inputs; return_dataset="pt"
# asks for a PyTorch dataset alongside the feature list.
conversion_kwargs = dict(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=128,
    is_training=True,
    return_dataset="pt",
    threads=1,
)
features, dataset = squad_convert_examples_to_features(**conversion_kwargs)

100%|██████████| 442/442 [00:51<00:00,  8.59it/s]
convert squad examples to features: 100%|██████████| 130319/130319 [19:02<00:00, 114.10it/s]
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 825956.71it/s]


In [0]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
from torch.utils.data import DataLoader


# Shuffled mini-batches of 6 feature rows used for optimisation.
train_loader = DataLoader(dataset, batch_size=6, shuffle=True)
# One-example-at-a-time shuffled loader over the SAME dataset object; the
# training loop draws from it to print sample predictions.
dev_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [0]:
import torch

from torch import nn
from transformers import BertModel, BertConfig, BertTokenizer

class BERT_SQUAD(nn.Module):
    """BERT encoder followed by a small feed-forward head for answer-span prediction.

    The head maps every token representation (768 features) to two scores:
    channel 0 scores the token as the answer start, channel 1 as the answer
    end. Training treats the sequence positions as classes of a
    cross-entropy problem, once for the start and once for the end.
    """

    def __init__(self):
        super(BERT_SQUAD, self).__init__()

        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

        # Per-token head: 768 -> 256 -> 2 (start score, end score).
        self.fc_layers = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 2)
        )

        self.criterion = nn.CrossEntropyLoss()

    def _span_logits(self, c_q_pairs, attention_mask, token_type_ids):
        """Run the encoder and head; return (start_scores, end_scores), each (batch, seq)."""
        bert_encoded = self.bert_model(
            input_ids=c_q_pairs,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )[0]

        fc_output = self.fc_layers(bert_encoded)
        # Integer indexing already drops the channel axis. No squeeze here:
        # squeezing would also drop the sequence axis when seq length is 1.
        return fc_output[:, :, 0], fc_output[:, :, 1]

    def forward(self, c_q_pairs, attention_mask, token_type_ids, start_indices, end_indices):
        """Compute the training loss for a batch.

        Args:
            c_q_pairs: token-id tensor passed to BERT as ``input_ids``.
            attention_mask: attention mask passed to BERT.
            token_type_ids: segment ids passed to BERT.
            start_indices: gold start position per example, shape (batch,).
            end_indices: gold end position per example, shape (batch,).

        Returns:
            Scalar tensor ``2 * start_loss + end_loss``.
        """
        start_outputs, end_outputs = self._span_logits(c_q_pairs, attention_mask, token_type_ids)

        # Gold positions outside the sequence are clamped onto a valid index
        # so they remain legal class labels for the cross-entropy loss.
        last_position = start_outputs.shape[1] - 1
        start_indices = start_indices.clamp(0, last_position)
        end_indices = end_indices.clamp(0, last_position)

        start_loss = self.criterion(start_outputs, start_indices)
        end_loss = self.criterion(end_outputs, end_indices)

        # NOTE(review): the start loss is weighted twice as much as the end
        # loss; kept as-is -- confirm this weighting is intentional.
        return 2*start_loss + end_loss

    def predict(self, c_q_pairs, attention_mask, token_type_ids):
        """Return the highest-scoring ``[start, end]`` token indices per example.

        Start and end are chosen independently, so ``start > end`` is
        possible and must be handled by the caller. Runs without gradient
        tracking; the train/eval mode of the module is not changed here.

        Returns:
            A list with one ``[start, end]`` pair of Python ints per example.
        """
        with torch.no_grad():
            start_outputs, end_outputs = self._span_logits(c_q_pairs, attention_mask, token_type_ids)
            _, s_ind = start_outputs.max(1)
            _, e_ind = end_outputs.max(1)

        return [[start, end] for start, end in zip(s_ind.tolist(), e_ind.tolist())]


In [0]:
# Build the QA model, then move its parameters onto the selected device.
bs = BERT_SQUAD()
bs = bs.to(device)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [0]:
# Number of full passes over train_loader.
num_epochs = 2

# Adam over every model parameter (encoder and head) with a 3e-5 step size.
learning_rate = 3e-5
optimizer = torch.optim.Adam(params=bs.parameters(), lr=learning_rate)

In [0]:
# Fine-tune the model. Every 50 batches, print the gold and predicted answer
# span for two randomly drawn examples as a qualitative sanity check.
for epoch in range(num_epochs):
  bs.train()

  for i, batch in enumerate(train_loader):
    c_q_pairs = batch[0].to(device)
    attention_mask = batch[1].to(device)
    token_type_ids = batch[2].to(device)
    start_ind, end_ind = batch[3].to(device), batch[4].to(device)

    optimizer.zero_grad()
    loss = bs(c_q_pairs, attention_mask, token_type_ids, start_ind, end_ind)
    print('loss on batch {} : {}'.format(epoch * len(train_loader) + i, loss.item()))
    loss.backward()
    optimizer.step()

    if i % 50 == 0:
      print('-----------------------------------------------------')
      print('Results on two random questions from training set : ')
      bs.eval()
      # No gradients are needed for the preview. Separate names are used so
      # the batch index `i` and the training-batch tensors are not overwritten.
      with torch.no_grad():
        for _ in range(2):
          # A fresh iterator over the shuffled loader yields a random example.
          sample = next(iter(dev_loader))
          sample_ids = sample[0].to(device)
          sample_mask = sample[1].to(device)
          sample_types = sample[2].to(device)
          s_ind = sample[3].item()
          e_ind = sample[4].item()

          indices = bs.predict(sample_ids, sample_mask, sample_types)
          start, end = indices[0][0], indices[0][1]

          # Convert ids to tokens once and reuse for all three printouts.
          tokens = tokenizer.convert_ids_to_tokens(sample_ids.view(-1).tolist())
          print('Context: {} \n'.format(
              tokenizer.convert_tokens_to_string(tokens)
              ))
          print('Answer: {} \n'.format(
              tokenizer.convert_tokens_to_string(tokens[s_ind:e_ind+1]) if s_ind <= e_ind else None
          ))
          print('Predicted answer: {} \n'.format(
              tokenizer.convert_tokens_to_string(tokens[start:end+1]) if start <= end else None
          ))

      print('-----------------------------------------------------')
      # Back to training mode for the next optimisation step.
      bs.train()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
loss on batch 39680 : 2.630323886871338
loss on batch 39681 : 2.5735714435577393
loss on batch 39682 : 0.7751349806785583
loss on batch 39683 : 4.280669689178467
loss on batch 39684 : 0.7787647247314453
loss on batch 39685 : 2.7492737770080566
loss on batch 39686 : 2.1514482498168945
loss on batch 39687 : 2.980196714401245
loss on batch 39688 : 4.2395524978637695
loss on batch 39689 : 2.4418370723724365
loss on batch 39690 : 0.5335222482681274
loss on batch 39691 : 1.8358917236328125
loss on batch 39692 : 2.9945356845855713
loss on batch 39693 : 1.1925079822540283
loss on batch 39694 : 3.342665910720825
loss on batch 39695 : 2.116452693939209
loss on batch 39696 : 1.4687660932540894
loss on batch 39697 : 1.361781358718872
loss on batch 39698 : 3.126960039138794
loss on batch 39699 : 1.2204577922821045
loss on batch 39700 : 0.6937004327774048
loss on batch 39701 : 2.3293395042419434
loss on batch 39702 : 0.9821524620056152

In [0]:
# Persist only the model's parameters (state_dict), not the module object.
trained_weights = bs.state_dict()
torch.save(trained_weights, '/content/drive/My Drive/cis530project/bert-squad.pt')

In [0]:
# Rebuild the architecture and restore the fine-tuned weights.
# map_location='cpu' lets a checkpoint saved from a GPU load on a runtime
# without CUDA; the model is moved to the target device later.
bs1 = BERT_SQUAD()
bs1.load_state_dict(
    torch.load('/content/drive/My Drive/cis530project/bert-squad.pt', map_location='cpu')
)

<All keys matched successfully>

In [0]:
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import BertModel, BertConfig, BertTokenizer



# Read the dev split from the same project data directory.
dev_data_dir = '/content/drive/My Drive/cis530project/cis530project/data'
feature_processor = SquadV2Processor()
examples = feature_processor.get_dev_examples(dev_data_dir)

# Tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)


100%|██████████| 35/35 [00:03<00:00,  9.34it/s]


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
import json
# Predict an answer string for every dev example and write the
# {question id: answer} mapping to a JSON file.
bs1 = bs1.to(device)
bs1.eval()

outputs = dict()

# Inference only: disable gradient tracking to avoid building autograd graphs.
with torch.no_grad():
  for i, example in enumerate(examples):
    # Encode (question, context) as one sequence, capped at 512 tokens.
    tokenized = tokenizer.encode_plus(example.question_text,
                                      example.context_text,
                                      max_length=512,
                                      return_tensors='pt')
    c_q_pairs = tokenized['input_ids'].to(device)
    token_type_ids = tokenized['token_type_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)

    indices = bs1.predict(c_q_pairs, attention_mask, token_type_ids)
    start, end = indices[0][0], indices[0][1]

    if start <= end:
      tokens = tokenizer.convert_ids_to_tokens(c_q_pairs.view(-1).tolist())
      answer = tokenizer.convert_tokens_to_string(tokens[start:end+1])
    else:
      # Start after end: emit an empty answer.
      answer = ""
    # A span that includes the [CLS] token is also written as an empty answer.
    if '[CLS]' in answer:
      answer = ""
    outputs[example.qas_id] = answer

    if i % 100 == 0:
      print('done with example : {}'.format(i))

with open('bert1-dev-preds.json', 'w') as f:
  json.dump(outputs, f)

done with example : 0
done with example : 100
done with example : 200
done with example : 300
done with example : 400
done with example : 500
done with example : 600
done with example : 700
done with example : 800
done with example : 900
done with example : 1000
done with example : 1100
done with example : 1200
done with example : 1300
done with example : 1400
done with example : 1500
done with example : 1600
done with example : 1700
done with example : 1800
done with example : 1900
done with example : 2000
done with example : 2100
done with example : 2200
done with example : 2300
done with example : 2400
done with example : 2500
done with example : 2600
done with example : 2700
done with example : 2800
done with example : 2900
done with example : 3000
done with example : 3100
done with example : 3200
done with example : 3300
done with example : 3400
done with example : 3500
done with example : 3600
done with example : 3700
done with example : 3800
done with example : 3900
done with ex