# Fine-Tuning Jean Michel Tine

> Indented block



In [1]:
import os
import requests
import json
from pathlib import Path

In [2]:
!mkdir squad
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json  -O squad/train-v2.0.json
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json -O squad/test-v2.0.json

--2023-03-26 17:18:25--  https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21805612 (21M) [text/plain]
Saving to: ‘squad/train-v2.0.json’


2023-03-26 17:18:26 (180 MB/s) - ‘squad/train-v2.0.json’ saved [21805612/21805612]

--2023-03-26 17:18:26--  https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3370853 (3.2M) [text/plain]
Saving to: ‘squad/dev-v2.0.j

---
# Get and Prepare Data

## Download SQuAD data

In [3]:

%%capture
!pip install transformers

## Read

In [4]:
def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa['answers']:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [5]:
train_contexts, train_questions, train_answers = read_squad('/content/squad/train-v2.0.json')
test_contexts, test_questions, test_answers = read_squad('/content/squad/test-v2.0.json')

## Prepare

In [6]:
def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    # this means the answer is off by 'n' tokens
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

In [7]:
add_end_idx(train_answers, train_contexts)
add_end_idx(test_answers, test_contexts)

## Encode

In [8]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary. [SEP] what is in front of the notre dame main building? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
print(train_encodings)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    count = 0
    
    for i in range(len(answers)):
        if 'answer_start' not in answers[i] or 'answer_end' not in answers[i]:
            # skip entries without answer start or end positions
            continue
        
        start_pos = encodings.char_to_token(i, answers[i]['answer_start'])
        end_pos = encodings.char_to_token(i, answers[i]['answer_end'])
        
        if start_pos is None:
            # answer passage has been truncated
            start_pos = tokenizer.model_max_length
            
        if end_pos is None:
            # 'char_to_token' function points to space after correct token, so subtract 1
            end_pos = encodings.char_to_token(i, answers[i]['answer_end'] - 1)
            
            if end_pos is None:
                # answer passage has been truncated
                count += 1
                end_pos = tokenizer.model_max_length
        
        start_positions.append(start_pos)
        end_positions.append(end_pos)
        
    print(count)
    
    # update the data in the encodings dictionary
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(test_encodings, test_answers)

1
0


---

# PyTorch Fine-tuning

In [13]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        if idx >= len(self):
           raise IndexError("Index out of range")
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
test_dataset = SquadDataset(test_encodings)

In [None]:
#train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
#test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [14]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [15]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [16]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # set model to train mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 2320/2320 [29:18<00:00,  1.32it/s, loss=1.47]
Epoch 1: 100%|██████████| 2320/2320 [29:16<00:00,  1.32it/s, loss=1.33]
Epoch 2: 100%|██████████| 2320/2320 [29:15<00:00,  1.32it/s, loss=0.742]


## Save Model

In [18]:
model_path = "/content/gdrive/My Drive/{models/distilbert-custom.h5}"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/gdrive/My Drive/{models/distilbert-custom.h5}/tokenizer_config.json',
 '/content/gdrive/My Drive/{models/distilbert-custom.h5}/special_tokens_map.json',
 '/content/gdrive/My Drive/{models/distilbert-custom.h5}/vocab.txt',
 '/content/gdrive/My Drive/{models/distilbert-custom.h5}/added_tokens.json',
 '/content/gdrive/My Drive/{models/distilbert-custom.h5}/tokenizer.json')

In [19]:
test_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [20]:
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
test_loader = DataLoader(test_dataset, batch_size=16)

acc = []

# initialize loop for progress bar
loop = tqdm(test_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions,
                        end_positions=end_positions)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# calculate average accuracy in total
acc = sum(acc)/len(acc)

  0%|          | 0/1116 [00:00<?, ?it/s]


ValueError: ignored

In [None]:
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end



NameError: ignored