In [1]:
#%% For Colab only: mount Google Drive at /content/gdrive (the dataset and model paths in the config cell live under this mount point).
from google.colab import drive
drive.mount('/content/gdrive')
#%%

Mounted at /content/gdrive


In [1]:
# !pip install datasets
# !pip install transformers
# !pip install torchcontrib
from typing import List, Optional

import numpy as np
import regex as re
import torch
import torch.nn.functional as F
import transformers
from datasets import load_dataset
from torch import BoolTensor, FloatTensor, LongTensor
from torch import nn
from torchcontrib.optim import SWA
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, BertPreTrainedModel, get_linear_schedule_with_warmup, AdamW

In [2]:
# Punctuation marks the tagger can predict, ordered by code point.
tags = sorted('.?!,;:-—…')
# Punctuation marks are numbered from 1 in sorted order ...
tag2id = dict(zip(tags, range(1, len(tags) + 1)))
# ... a plain space is class 0 and the empty string maps to the -100 sentinel.
tag2id.update({' ': 0, '': -100})
# Reverse lookup: label id -> tag string.
id2tag = {label_id: mark for mark, label_id in tag2id.items()}
# Module-level tokenizer loaded from the 'distilbert-base-uncased' checkpoint; PunctuationDataset.view uses it to map token ids back to token strings.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
class PunctuationDataset(torch.utils.data.Dataset):
    """
    Map-style dataset of tokenized text windows with optional per-token
    punctuation labels.

    ``labels`` is optional: without it the dataset can still be indexed and
    measured (e.g. for inference); items then simply carry no ``'labels'`` key.
    """
    def __init__(self, input_ids:LongTensor, attention_mask:FloatTensor, labels:Optional[LongTensor] = None) -> None:
        """
        :param input_ids: token ids, indexable by example
        :param attention_mask: attention mask aligned with input_ids
                               (presumably 0 marks padding -- confirm against preprocessing)
        :param labels: true label ids aligned with input_ids, optional
        :return None
        """
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        """
        :param idx: example index
        :return: dict with 'input_ids' (long) and 'attention_mask' (float32),
                 plus 'labels' (long) when the dataset was built with labels
        """
        item = {'input_ids': torch.as_tensor(self.input_ids[idx],dtype=torch.long),
                'attention_mask': torch.as_tensor(self.attention_mask[idx],dtype=torch.float32)}
        # The key is omitted (rather than set to None) for unlabeled data so the
        # item remains a dict of tensors only.
        if self.labels is not None:
            item['labels'] = torch.as_tensor(self.labels[idx],dtype=torch.long)
        return item

    def view(self,idx:int)->str:
        """
        :param idx(int): example index
        :return: the example as readable text: every token immediately followed
                 by its tag (looked up in the module-level ``id2tag``), tokens
                 separated by single spaces; tokens only when there are no labels
        """
        tokens = tokenizer.convert_ids_to_tokens(self.input_ids[idx])
        if self.labels is None:
            return ' '.join(tokens)
        marks = [id2tag[label] for label in self.labels[idx].tolist()]
        return ' '.join(token + mark for token, mark in zip(tokens, marks))

    def __len__(self)->int:
        """:return: number of examples (falls back to input_ids when unlabeled)"""
        if self.labels is not None:
            return len(self.labels)
        return len(self.input_ids)

In [6]:
class config:
  """Hyper-parameters and file locations for the run, stored as instance attributes."""
  def __init__(self):
    # Insertion order matters only insofar as it fixes the order of __dict__.
    settings = {
      'max_len': 128,
      'overlap': 126,
      'train_batch_size': 4,
      'dev_batch_size': 4,
      'gpu_device': 'cpu',  # 'cuda:0' #
      'freeze_epochs': 20,
      'freeze_lr': 1e-4,
      'unfreeze_epochs': 20,
      'unfreeze_layers': 6,
      'unfreeze_lr': 1e-5,
      'base_model_path': 'distilbert-base-uncased',
      'train_dataset': '/content/gdrive/MyDrive/ASR/ted_talks_processed.train.pt',
      'dev_dataset': '/content/gdrive/MyDrive/ASR/ted_talks_processed.dev.pt',
      'alpha': 0.8,
      'hidden_dropout_prob': 0.3,
      'embedding_dim': 768,
      'num_labels': 10,
      'hidden_dim': 128,
      'self_adjusting': True,
      'square_denominator': False,
      'use_crf': False,
      'model_name': 'bertcrf',
      'model_path': "/content/gdrive/MyDrive/ASR/logs/models/",
    }
    for name, value in settings.items():
      setattr(self, name, value)
# Rebind the name `config`: from here on it is a PretrainedConfig built from the instance attributes above, no longer the class itself.
config = transformers.configuration_utils.PretrainedConfig.from_dict(config().__dict__)

In [5]:
# Use the configured device only when CUDA is available; otherwise fall back to CPU.
device = torch.device(config.gpu_device) if torch.cuda.is_available() else torch.device('cpu')
# NOTE(review): the loaded object is sliced with [:] and unpacked as keyword arguments, so it
# presumably yields a mapping with input_ids / attention_mask / labels keys -- confirm against
# the script that wrote the .pt file.
train_dataset=PunctuationDataset(**torch.load(config.train_dataset,map_location=device)[:])
# dev_dataset=PunctuationDataset(**torch.load(config.dev_dataset,map_location=device)[:])
# Show one example (indexed from the end of the dataset) as readable text.
train_dataset.view(-1000)

"[CLS]  turns  activism  into  terrorism  if  it  causes  a  loss  of  profits. now  most  people  never  even  heard  about  this  law, including  members  of  congress. less  than  one  percent  were  in  the  room  when  it  passed  the  house. the  rest  were  outside  at  a  new  memorial. they  were  praising  dr. king  as  his  style  of  activism  was  branded  as  terrorism  if  done  in  the  name  of  animals  or  the  environment. supporters  say  laws  like  this  are  needed  for  the  ex  ##tre  ##mist  ##s: the  van  ##dal  ##s, the  arson  ##ists, the  radicals. but  right  now, companies  like  trans  ##cana  ##da  are  briefing  police  in  presentations  like  this  one  about  how  to  prose  ##cute  non  ##vio  ##lent  protesters  as  terrorists. the  fbi  '  s  training  documents  on  eco- [SEP] "

In [7]:
# Batch the training set; no shuffle argument is passed here.
train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=config.train_batch_size, num_workers=4)
# dev_dataloader=torch.utils.data.DataLoader(dev_dataset, batch_size=config.dev_batch_size, num_workers=2)
# Sanity check: each batch is a dict of tensors; print the shape stored under every key.
{x:y.shape for x,y in next(iter(train_dataloader)).items()} #(batch_size, seq_len)

{'attention_mask': torch.Size([4, 128]),
 'input_ids': torch.Size([4, 128]),
 'labels': torch.Size([4, 128])}

In [158]:
#b_s = 4, s_l = 128, h_d = 768
# bert=transformers.BertModel.from_pretrained(config.base_model_path)
# input_ids, attention_mask, labels = next(iter(train_dataloader)).values() # (batch_size, seq_len) * 3
# bo=bert(input_ids, attention_mask) # => last_hidden_state (b_s, s_l, h_d 768), pooler_output (b_s, h_d)
# dropout = nn.Dropout(config.hidden_dropout_prob) # (b_s, s_l, h_d)
# sequence_output=dropout(bo[0])
# fcl = nn.Linear(config.embedding_dim, config.num_labels)
# fcl_output=fcl(sequence_output) #(b_s, s_l, h_d) -> (b_s, s_l, num_labels)
# dice_loss=DiceLoss(self_adjusting=True,alpha=1)(fcl_output,labels)
# bert(next)

tensor(0.9624, grad_fn=<MeanBackward0>)

tensor(0.9624, grad_fn=<MeanBackward0>)

In [148]:
# dice_loss(fcl_output,labels,attention_mask)
#b_s,s_l,n_l
# pred_soft=torch.softmax(fcl_output,-1)#,torch.softmax(fcl_output[0,0,:],-1) #apply softmax to each token
# target_one_hot=F.one_hot(labels,num_classes=config.num_labels)#,labels[0,:5] (b_s, s_l) -> (b_s, s_l, n_l)
# pred_factor=((1-pred_soft) ** config.alpha) if config.self_adjusting else 1
# pred_prod=pred_factor*pred_soft*target_one_hot
# sum(pred_prod,0).shape
# smooth = 1e-8
# intersection=torch.sum(pred_prod,1)
# NOTE(review): `intersection` and `pred_prod` are only assigned in commented-out lines of this cell, so this expression presumably relies on leftover kernel state -- it will not survive Restart & Run All as written.
intersection.shape,pred_prod.shape
# cardinality =torch.sum(pred_factor*pred_soft + target_one_hot, 1)
# dice_score=1-2*(intersection+smooth)/(cardinality+smooth)
# dice_score[0,:]
# weight=[0,0,0,0,0,0,0,1,0,0]
# (dice_score[:2,:]*torch.tensor(torch.tensor(weight))).shape
### torch.gather(pred_soft[0,:5],-1,index=labels[0,:5].unsqueeze(-1)) #returns the probability for the most likely example is this really needed?
# target_one_hot.shape
# labels.unsqueeze(2).shape,labels.shape
# pred_soft[0,:5].shape

(torch.Size([4, 10]), torch.Size([4, 128, 10]))

In [161]:
def validate(
        h: torch.Tensor,
        labels: Optional[torch.LongTensor] = None,
        mask: Optional[torch.ByteTensor] = None) -> None:
    """
    Check that a score tensor and its optional companions have compatible shapes.

    :param h: score tensor; must be 3-dimensional with a last dimension equal
              to the module-level ``config.num_labels``
    :param labels: optional tensor whose shape must equal the first two dimensions of h
    :param mask: optional tensor whose shape must equal the first two dimensions of h
    :raises ValueError: on the first violated condition (h is checked first,
                        then labels, then mask)
    """
    if h.dim() != 3:
        raise ValueError(f'h must have dimension of 3, got {h.dim()}')
    last_dim = h.size(2)
    if last_dim != config.num_labels:
        raise ValueError(
            f'expected last dimension of h is {config.num_labels}, '
            f'got {last_dim}')
    leading = h.shape[:2]
    # labels and mask obey the same rule, so run one check over both.
    for name, companion in (('labels', labels), ('mask', mask)):
        if companion is None:
            continue
        if leading != companion.shape:
            raise ValueError(
                f'the first two dimensions of h and {name} must match, '
                f'got {tuple(leading)} and {tuple(companion.shape)}')
# NOTE(review): `fcl_output` and `labels` are only assigned in commented-out scratch lines of this notebook, so this call presumably relies on leftover kernel state -- confirm before Restart & Run All.
validate(fcl_output,labels) #why must x be the fcl output?

In [149]:
class DiceLoss(nn.Module):
    r"""
    Creates a criterion that optimizes a multi-class Self-adjusting Dice Loss
    ("Dice Loss for Data-imbalanced NLP Tasks" paper)

    The loss is computed per (example, class) pair by summing over the
    sequence dimension, and then reduced.

    Args:
        smooth (float): constant added to both the numerator and the
            denominator of the Dice ratio for smoothing purposes
        self_adjusting (bool): if True, every probability ``p`` is scaled by
            ``(1 - p) ** alpha`` to push down the weight of easy examples
        reduction (str): ``"mean"``, ``"sum"`` or ``"none"``/``None``
        alpha (float): exponent of the self-adjusting factor
        weight (number or sequence): multiplier of the loss; a scalar, or one
            value per class (broadcast against the ``(batch, class)`` loss)
    """
    def __init__(self,
                 smooth: Optional[float] = 1e-8,
                 self_adjusting: Optional[bool] = False,
                 reduction: Optional[str] = "mean",
                 alpha: float = 1.0,
                 weight=1, #int or list
                 ) -> None:
        super(DiceLoss, self).__init__()
        self.reduction = reduction
        self.self_adjusting = self_adjusting
        self.alpha = alpha
        self.smooth = smooth
        self.weight=weight
    def forward(self,
                pred: torch.Tensor,
                target: torch.Tensor,
                num_classes: int = 10,
                ) -> torch.Tensor:
        """
        :param pred: unnormalised scores (batch_size, seq_len, num_classes)
        :param target: integer class ids (batch_size, seq_len); positions with
                       a negative id (e.g. a -100 sentinel) are left out of the loss
        :param num_classes: size of the class dimension used for one-hot encoding
        :return: scalar for "mean"/"sum", (batch_size, num_classes) tensor for "none"/None
        :raises NotImplementedError: for an unknown ``reduction``
        """
        pred_soft = torch.softmax(pred,-1) # probabilities along the class dimension
        # F.one_hot rejects negative ids: clamp them to a valid class for the
        # encoding and zero those positions out again through `valid`.
        valid = (target >= 0).unsqueeze(-1) # (b_s, s_l, 1)
        target_one_hot = F.one_hot(target.clamp(min=0), num_classes=num_classes).to(pred_soft.dtype) * valid # (b_s, s_l, n_l)
        if self.self_adjusting:
            weighted_pred = ((1 - pred_soft) ** self.alpha) * pred_soft
        else:
            weighted_pred = pred_soft
        weighted_pred = weighted_pred * valid
        intersection = torch.sum(weighted_pred * target_one_hot, 1) # (b_s,s_l,n_l)->(b_s,n_l)
        cardinality = torch.sum(weighted_pred + target_one_hot, 1)  # (b_s,s_l,n_l)->(b_s,n_l)
        dice_coefficient = 2. * (intersection + self.smooth) / (cardinality + self.smooth)
        # Build the weight on pred's device/dtype so list weights also work off-CPU.
        class_weight = torch.as_tensor(self.weight, dtype=pred_soft.dtype, device=pred_soft.device)
        # The weight scales the loss (1 - dice), not just the Dice ratio:
        # a zero-weight class therefore contributes 0 instead of a constant 1.
        dice_loss = (1. - dice_coefficient) * class_weight
        if self.reduction == "mean":
            return dice_loss.mean()
        elif self.reduction == "sum":
            return dice_loss.sum()
        elif self.reduction == "none" or self.reduction is None:
            return dice_loss
        else:
            raise NotImplementedError(f"Reduction `{self.reduction}` is not supported.")
    def __str__(self):
        return f"Dice Loss smooth:{self.smooth}"



class DiceCRF(nn.Module):
    """
    Linear-chain conditional random field over per-token label scores.

    Learns a label-to-label transition matrix plus start and end transition
    vectors. ``forward`` returns the log-likelihood of a label sequence given
    the scores ``h``; ``viterbi_decode`` returns the highest-scoring sequence.
    All tensors are batch-first: (batch_size, seq_len, ...).
    """
    def __init__(
        self, num_labels: int, pad_idx: Optional[int] = None,
    ) -> None:
        """
        :param num_labels: number of labels
        :param pad_idx: padding index. default None
        :return None
        :raises ValueError: if num_labels is smaller than 1
        """
        if num_labels < 1: raise ValueError("invalid number of labels: {0}".format(num_labels))
        super().__init__()
        self.num_labels = num_labels
        self.trans_matrix = nn.Parameter(torch.empty(num_labels, num_labels))
        self.start_trans = nn.Parameter(torch.empty(num_labels))
        self.end_trans = nn.Parameter(torch.empty(num_labels))
        self._initialize_parameters(pad_idx)
    def forward(
        self, h: FloatTensor, labels: LongTensor, mask: BoolTensor, reduction: str = 'sum',
    ) -> FloatTensor:
        """
        :param h: hidden matrix (batch_size, seq_len, num_labels)
        :param labels: answer labels of each sequence
                       in mini batch (batch_size, seq_len)
        :param mask: mask tensor of each sequence
                     in mini batch (batch_size, seq_len); any dtype whose
                     non-zero entries mark real tokens (bool, uint8, float)
        :param reduction: 'none' (batch_size), 'sum', 'mean', or 'token_mean'
                          (sum divided by the number of unmasked tokens)
        :return: The log-likelihood, reduced as requested
        """
        self._validate(h, labels=labels, mask=mask)
        # The dataset in this file yields a float attention mask, but
        # torch.where and the length-based indexing below need a bool mask.
        mask = mask.to(device=h.device, dtype=torch.bool)
        log_numerator = self._compute_numerator_log_likelihood(h, labels, mask)
        log_denominator = self._compute_denominator_log_likelihood(h, mask)
        llh=log_numerator - log_denominator
        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean', f"unsupported reduction: {reduction}"
        return llh.sum() / mask.float().sum()
    def _validate(
            self,
            h: torch.Tensor,
            labels: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        """
        check the shapes of the inputs
        :param h: must be 3-dimensional with last dimension num_labels
        :param labels: if given, its shape must equal the first two dimensions of h
        :param mask: if given, its shape must equal the first two dimensions of h
        :raises ValueError: on the first violated condition
        """
        if h.dim() != 3:
            raise ValueError(f'h must have dimension of 3, got {h.dim()}')
        if h.size(2) != self.num_labels:
            raise ValueError(
                f'expected last dimension of h is {self.num_labels}, '
                f'got {h.size(2)}')
        if labels is not None:
            if h.shape[:2] != labels.shape:
                raise ValueError(
                    'the first two dimensions of h and labels must match, '
                    f'got {tuple(h.shape[:2])} and {tuple(labels.shape)}')
        if mask is not None:
            if h.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of h and mask must match, '
                    f'got {tuple(h.shape[:2])} and {tuple(mask.shape)}')
    def viterbi_decode(self, h: FloatTensor, mask: BoolTensor) -> List[List[int]]:
        """
        decode labels using viterbi algorithm
        :param h: hidden matrix (batch_size, seq_len, num_labels)
        :param mask: mask tensor of each sequence
                     in mini batch (batch_size, seq_len); non-zero marks real tokens
        :return: labels of each sequence in mini batch
        """
        batch_size, seq_len, _ = h.size()
        # prepare the sequence lengths in each sequence
        # (cast first so a float mask still produces integer lengths)
        seq_lens = mask.to(dtype=torch.bool).sum(dim=1)
        # In mini batch, prepare the score
        # from the start sequence to the first label
        score = [self.start_trans.data + h[:, 0]]
        path = []
        for t in range(1, seq_len):
            # extract the score of previous sequence
            # (batch_size, num_labels, 1)
            previous_score = score[t - 1].view(batch_size, -1, 1)
            # extract the score of hidden matrix of sequence
            # (batch_size, 1, num_labels)
            h_t = h[:, t].view(batch_size, 1, -1)
            # extract the score in transition
            # from label of t-1 sequence to label of sequence of t
            # self.trans_matrix has the score of the transition
            # from sequence A to sequence B
            # (batch_size, num_labels, num_labels)
            score_t = previous_score + self.trans_matrix + h_t
            # keep the maximum value
            # and point where maximum value of each sequence
            # (batch_size, num_labels)
            best_score, best_path = score_t.max(1)
            score.append(best_score)
            path.append(best_path)
        # predict labels of mini batch
        best_paths = [
            self._viterbi_compute_best_path(i, seq_lens, score, path)
            for i in range(batch_size)
        ]
        return best_paths
    def _viterbi_compute_best_path(
        self,
        batch_idx: int,
        seq_lens: torch.LongTensor,
        score: List[FloatTensor],
        path: List[torch.LongTensor],
    ) -> List[int]:
        """
        return labels using viterbi algorithm
        :param batch_idx: index of batch
        :param seq_lens: sequence lengths in mini batch (batch_size)
        :param score: transition scores of length max sequence size
                      in mini batch [(batch_size, num_labels)]
        :param path: transition paths of length max sequence size
                     in mini batch [(batch_size, num_labels)]
        :return: labels of batch_idx-th sequence
        """
        # plain int so it can index / slice the python lists below
        seq_end_idx = int(seq_lens[batch_idx]) - 1
        # extract label of end sequence
        _, best_last_label = (score[seq_end_idx][batch_idx] + self.end_trans).max(0)
        best_labels = [int(best_last_label)]
        # predict labels from back using viterbi algorithm
        for p in reversed(path[:seq_end_idx]):
            best_last_label = p[batch_idx][best_labels[0]]
            best_labels.insert(0, int(best_last_label))
        return best_labels
    def _compute_denominator_log_likelihood(self, h: FloatTensor, mask: BoolTensor):
        """
        compute the denominator term for the log-likelihood
        compute the partition function in log-space using the forward-algorithm.
        :param h: hidden matrix (batch_size, seq_len, num_labels)
        :param mask: bool mask tensor of each sequence
                     in mini batch (batch_size, seq_len)
        :return: The score of denominator term for the log-likelihood
        """
        device = h.device
        batch_size, seq_len, _ = h.size()
        # (num_labels, num_labels) -> (1, num_labels, num_labels)
        trans = self.trans_matrix.unsqueeze(0)
        # add the score from beginning to each label
        # and the first score of each label
        score = self.start_trans + h[:, 0]
        # iterate through processing for the number of words in the mini batch
        for t in range(1, seq_len):
            # (batch_size, self.num_labels, 1)
            before_score = score.unsqueeze(2)
            # prepare t-th mask of sequences in each sequence
            # (batch_size, 1)
            mask_t = mask[:, t].unsqueeze(1)
            mask_t = mask_t.to(device)
            # prepare the transition probability of the t-th sequence label
            # in each sequence
            # (batch_size, 1, num_labels)
            h_t = h[:, t].unsqueeze(1)
            # calculate t-th scores in each sequence
            # (batch_size, num_labels)
            score_t = before_score + h_t + trans
            score_t = torch.logsumexp(score_t, 1)
            # update scores only where the t-th token is real
            # (batch_size, num_labels)
            score = torch.where(mask_t, score_t, score)
        # add the end score of each label
        score += self.end_trans
        # return the log-likelihood of all data in mini batch
        return torch.logsumexp(score, 1)
    def _compute_numerator_log_likelihood(
        self, h: FloatTensor, y: LongTensor, mask: BoolTensor
    ) -> FloatTensor:
        """
        compute the numerator term for the log-likelihood
        :param h: hidden matrix (batch_size, seq_len, num_labels)
        :param y: answer labels of each sequence
                  in mini batch (batch_size, seq_len)
        :param mask: bool mask tensor of each sequence
                     in mini batch (batch_size, seq_len)
        :return: The score of numerator term for the log-likelihood
        """
        batch_size, seq_len, _ = h.size()
        h_unsqueezed = h.unsqueeze(-1)
        trans = self.trans_matrix.unsqueeze(-1)
        arange_b = torch.arange(batch_size)
        # emission scores of steps 0..seq_len-2 and the transitions between
        # consecutive steps; the last step is added separately below
        calc_range = seq_len - 1
        score = self.start_trans[y[:, 0]] + sum(
            [self._calc_trans_score_for_num_llh(
                h_unsqueezed, y, trans, mask, t, arange_b
            ) for t in range(calc_range)])
        # extract end label number of each sequence in mini batch
        # (batch_size)
        last_mask_index = mask.sum(1) - 1
        last_labels = y[arange_b, last_mask_index]
        each_last_score = h[arange_b, -1, last_labels] * mask[:, -1]
        # Add the score of the sequences of the maximum length in mini batch
        # Add the scores from the last tag of each sequence to EOS
        score += each_last_score + self.end_trans[last_labels]
        return score
    def _calc_trans_score_for_num_llh(
        self,
        h: FloatTensor,
        y: LongTensor,
        trans: FloatTensor,
        mask: BoolTensor,
        t: int,
        arange_b: FloatTensor,
    ) -> torch.Tensor:
        """
        calculate transition score for computing numerator llh
        :param h: hidden matrix with a trailing singleton dimension
                  (batch_size, seq_len, num_labels, 1)
        :param y: answer labels of each sequence
                  in mini batch (batch_size, seq_len)
        :param trans: transition score with a trailing singleton dimension
        :param mask: mask tensor of each sequence
                     in mini batch (batch_size, seq_len)
        :param t: index of hidden, transition, and mask matrices
        :param arange_b: torch.arange(batch_size)
        :return: per-sequence score contribution of step t (batch_size)
        """
        device = h.device
        mask_t = mask[:, t]
        mask_t = mask_t.to(device)
        mask_t1 = mask[:, t + 1]
        mask_t1 = mask_t1.to(device)
        # extract the score of the t-th label
        # (batch_size)
        h_t = h[arange_b, t, y[:, t]].squeeze(1)
        # extract the transition score from t-th label to t+1 label
        # (batch_size)
        trans_t = trans[y[:, t], y[:, t + 1]].squeeze(1)
        # add the score of t and the transition score into t+1
        # (batch_size)
        return h_t * mask_t + trans_t * mask_t1
    def _initialize_parameters(self, pad_idx: Optional[int]) -> None:
        """
        initialize transition parameters
        :param: pad_idx: if not None, transitions from/to the padding label
                         (other than pad -> pad) are set to -10000
        :return: None
        """
        nn.init.uniform_(self.trans_matrix, -0.1, 0.1)
        nn.init.uniform_(self.start_trans, -0.1, 0.1)
        nn.init.uniform_(self.end_trans, -0.1, 0.1)
        if pad_idx is not None:
            # In-place writes into leaf Parameters that require grad are
            # rejected by autograd unless gradient tracking is switched off.
            with torch.no_grad():
                self.start_trans[pad_idx] = -10000.0
                self.trans_matrix[pad_idx, :] = -10000.0
                self.trans_matrix[:, pad_idx] = -10000.0
                self.trans_matrix[pad_idx, pad_idx] = 0.0


In [None]:
### engine.py
# from tqdm import tqdm
def train_step(trainer,batch):
    """
    Run one optimisation step on a single batch.

    Relies on the notebook-level globals ``model``, ``optimizer``, ``scheduler``
    and ``device``.

    :param trainer: not used in the body (presumably the engine object that
                    invokes this callback -- confirm against the caller)
    :param batch: dict of tensors (as produced by PunctuationDataset) or a
                  sequence of tensors; every tensor is moved to ``device``
    :return: the batch loss as a python float
    """
    model.train()
    optimizer.zero_grad()
    # Iterating a dict yields its keys, so dict batches need their values moved.
    if isinstance(batch, dict):
        batch = {key: value.to(device) for key, value in batch.items()}
    else:
        batch=[_data.to(device) for _data in batch]
    # The model receives the whole batch and must return a (_, loss) pair.
    _, loss = model(batch)
    loss.backward()
    optimizer.step()
    scheduler.step()
    return loss.item()

def train_fn(data_loader, model, optimizer, device, scheduler):
    """
    Train ``model`` for one pass over ``data_loader``.

    :param data_loader: sized iterable of batches; each batch is a dict of
                        tensors or a sequence of tensors
    :param model: called with the whole batch, must return a (_, loss) pair
    :param optimizer: stepped once per batch
    :param device: device every batch tensor is moved to
    :param scheduler: stepped once per batch, after the optimizer
    :return: mean batch loss over the pass
    """
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        # Iterating a dict yields its keys, so dict batches need their values moved.
        if isinstance(data, dict):
            data = {key: value.to(device) for key, value in data.items()}
        else:
            data=[_data.to(device) for _data in data]
        optimizer.zero_grad()
        _, loss = model(data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    """
    Compute the mean loss of ``model`` over ``data_loader`` without training.

    :param data_loader: sized iterable of batches; each batch is a dict of
                        tensors or a sequence of tensors
    :param model: called with the whole batch, must return a (_, loss) pair
    :param device: device every batch tensor is moved to
    :return: mean batch loss over the pass
    """
    model.eval()
    final_loss = 0
    # Evaluation needs no gradients; disabling autograd avoids building graphs.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            # Iterating a dict yields its keys, so dict batches need their values moved.
            if isinstance(data, dict):
                data = {key: value.to(device) for key, value in data.items()}
            else:
                data=[_data.to(device) for _data in data]
            punct, loss = model(data)
            final_loss += loss.item()
    return final_loss / len(data_loader)


In [None]:
# NOTE(review): this cell cannot run on a fresh kernel as written; its recorded output ends with
# "ValueError: too many values to unpack (expected 2)". Issues visible from this notebook:
#   * BertCRFModel, optimizer_parameters, Engine, Events, Precision, Recall, Fbeta and
#     create_supervised_evaluator are neither defined nor imported in the visible cells
#     (the last six look like pytorch-ignite names -- confirm and import them explicitly).
#   * the config cell sets lowercase attributes (embedding_dim, hidden_dim, use_crf, freeze_lr,
#     train_batch_size, unfreeze_epochs); the UPPERCASE names read below are not set there.
#   * PunctuationDataset defines no `tensors` attribute, and the `/` below yields a float step count.
#   * log_metrics reads metrics["acc"], but val_metrics only has "precision", "recall" and "F1".
#   * dev_dataloader is only created in a commented-out line of the dataloader cell.
model = BertCRFModel(num_punct=10, embedding_dim=config.EMBEDDING_DIM, hidden_dim=config.HIDDEN_DIM, use_crf=config.USE_CRF)
# Freeze every parameter of the wrapped BERT encoder.
for i,param in enumerate(model.bert.parameters()):
    param.requires_grad = False
model.to(device)
optimizer = AdamW(optimizer_parameters, lr=config.FREEZE_LEARNING_RATE)
num_train_steps = train_dataset.tensors[0].size()[0] / config.TRAIN_BATCH_SIZE * config.UNFREEZE_EPOCHS
# Linear decay of the learning rate over num_train_steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)
# optimizer = SWA(base_opt)

# train_step is the per-batch callback driven by the engine.
trainer = Engine(train_step)
val_metrics = {
    "precision": Precision(),
    "recall": Recall(),
#     "Dice": DiceCoefficient(cm=),
    "F1": Fbeta(1),
}
evaluator = create_supervised_evaluator(model, metrics=val_metrics)
def log_metrics(engine, title):
    # Prints the trainer's current epoch together with the evaluator's "acc" metric.
    print("Epoch: {} - {} accuracy: {:.2f}"
           .format(trainer.state.epoch, title, engine.state.metrics["acc"]))

# After every training epoch, evaluate on the train and dev loaders and log the result.
@trainer.on(Events.EPOCH_COMPLETED)
def evaluate(trainer):
    with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "train"):
        evaluator.run(train_dataloader)

    with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "dev"):
        evaluator.run(dev_dataloader)

trainer.run(train_dataloader, max_epochs=100)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

Current run is terminating due to exception: too many values to unpack (expected 2).
Engine run is terminating due to exception: too many values to unpack (expected 2).
Engine run is terminating due to exception: too many values to unpack (expected 2).


ValueError: too many values to unpack (expected 2)

In [None]:
# NOTE(review): leftover IPython `??` source lookup used while developing; it is not valid plain Python and can be removed from the finished notebook.
torch.utils.data.DataLoader??