In [29]:
import numpy as np
from transformers import AdamW, DistilBertTokenizerFast
from torchcontrib.optim import SWA
from transformers import get_linear_schedule_with_warmup
# from .utils.logger import get_logger
import subprocess
import datetime
import torch
# from .config import *
# from .engine import *
# from .model import EntityModel
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator, Engine
from ignite.metrics import Precision, Recall, DiceCoefficient, Fbeta

In [27]:
class config:
    MAX_LEN = 128
    OVERLAP = 126
    TRAIN_BATCH_SIZE = 64
    DEV_BATCH_SIZE = 64
    GPU_DEVICE = 'cuda:3'#'cpu'#
    FREEZE_EPOCHS = 20
    FREEZE_LEARNING_RATE = 1e-4
    UNFREEZE_EPOCHS = 20
    UNFROZEN_LAYERS = 6
    UNFREEZE_LEARNING_RATE = 1e-5
    BASE_MODEL_PATH = 'distilbert-base-uncased'
    DATA_DIR = '/home/nxingyu/project/data/'
    TRAIN_DATASET = DATA_DIR+'ted_talks_processed.dev.pt'
    DEV_DATASET = DATA_DIR+'ted_talks_processed.dev.pt'
    ALPHA = 0.8
    hidden_dropout_prob = 0.3
    EMBEDDING_DIM = 768
    HIDDEN_DIM = 128
    SELF_ADJUSTING=True
    SQUARE_DENOMINATOR=False
    USE_CRF=False
    MODEL_NAME = 'bertcrf'
    MODEL_PATH = "/home/nxingyu/project/logs/models/"


    TOKENIZER = DistilBertTokenizerFast.from_pretrained(
        BASE_MODEL_PATH,
    )


param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
]

In [22]:
import torch
import transformers
from torch import nn
import torch.nn.functional as F
from typing import Optional
### model.py
class DiceLoss(nn.Module):
    r"""
    Creates a criterion that optimizes a multi-class Self-adjusting Dice Loss
    ("Dice Loss for Data-imbalanced NLP Tasks" paper)
    Args:
        alpha (float): a factor to push down the weight of easy examples
        gamma (float): a factor added to both the nominator and the denominator for smoothing purposes
    """
    def __init__(self,
                 smooth: Optional[float] = 1e-8,
                 square_denominator: Optional[bool] = False,
                 self_adjusting: Optional[bool] = False,
                #  with_logits: Optional[bool] = True,
                 reduction: Optional[str] = "mean",
                 alpha: float = 1.0,
                 ignore_index: int = -100,
                 weight=1,
                 ) -> None:
        super(DiceLoss, self).__init__()
        self.ignore_index = ignore_index
        self.reduction = reduction
        self.self_adjusting = self_adjusting
        self.alpha = alpha
        self.smooth = smooth
        self.square_denominator = square_denominator
        self.weight=weight
    
    def forward(self,
                input: torch.Tensor,
                target: torch.Tensor,
                mask: Optional[torch.Tensor] = None,
                num_classes: int = 10,
                ) -> torch.Tensor:

        input_soft = torch.softmax(input[target!=-100],dim=1)
        target_one_hot=F.one_hot(target[target!=-100],num_classes=num_classes)
        input_factor = ((1-input_soft) ** self.alpha) if self.self_adjusting else 1
        if mask is not None:
            mask = mask.view(-1).float()
            input_soft = input_soft * mask
            target_one_hot = target_one_hot * mask
        
        intersection = torch.sum(input_factor*input_soft * target_one_hot, 0)
        cardinality = torch.sum(input_factor*torch.square(input_soft,) + torch.square(target_one_hot,), 0) if self.square_denominator else torch.sum(input_factor*input_soft + target_one_hot, 0)
        dice_score = 1. - 2. * intersection / (cardinality + self.smooth) * self.weight
        if self.reduction == "mean":
            return dice_score.mean()
        elif self.reduction == "sum":
            return dice_score.sum()
        elif self.reduction == "none" or self.reduction is None:
            return dice_score
        else:
            raise NotImplementedError(f"Reduction `{self.reduction}` is not supported.")
    def __str__(self):
        return f"Dice Loss smooth:{self.smooth}"

def loss_fn(output, target, mask, num_labels, weight=None):
    lfn = DiceLoss(square_denominator=config.SQUARE_DENOMINATOR,self_adjusting=config.SELF_ADJUSTING,alpha=config.ALPHA,weight=weight)
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(-100).type_as(target)
    )
    loss = lfn(active_logits, active_labels,num_classes=num_labels)
    return loss


class BertCRFModel(nn.Module):
    def __init__(self,num_punct, embedding_dim, hidden_dim, use_lstm=False, use_crf=True, logger=None):
        super(BertCRFModel, self).__init__()
        self.num_punct=num_punct
        self.embedding_dim=embedding_dim
        self.hidden_dim=hidden_dim
        #self.use_lstm = use_lstm
        self.use_crf = use_crf
        self.logger=logger
        self.bert = transformers.BertModel.from_pretrained(
            config.BASE_MODEL_PATH
        )
        self.bert_drop_1 = nn.Dropout(config.hidden_dropout_prob)
        #if self.use_lstm:
        #    self.lstm=nn.LSTM(embedding_dim, hidden_dim//2, num_layers=1, bidirectional=True)
        #    self.out_punct = nn.Linear(self.hidden_dim, self.num_punct)
        #else:
        self.out_punct = nn.Linear(self.embedding_dim, self.num_punct)
        if self.use_crf:
            self.crf= DiceCRF(self.num_punct)
    
    def forward(self, data):
        o1 = self.bert(
                data[0],				#ids
                attention_mask=data[1],	#mask,
        )[0]
        sequence_output = self.bert_drop_1(o1)
        #self.logger.info('bert output shape: {}'.format(sequence_output.shape))
        #if self.use_lstm:
        #    sequence_output=self.lstm(sequence_output)[0]
        #    self.logger.info('lstm output shape: {}'.format(sequence_output.shape))
        punct = self.out_punct(sequence_output)
        #self.logger.info('punct shape: {}'.format(punct.shape))
        if self.use_crf:
            loss= -1*self.crf(punct, data[2], data[1])
        else:
            loss = loss_fn(punct, data[2], data[1], self.num_punct,1)
        #loss = (loss_tag + loss_pos) / 2

        return punct, loss

In [69]:
### engine.py
# from tqdm import tqdm
def train_step(trainer,batch):
    model.train()
    optimizer.zero_grad()
    batch=[_data.to(device) for _data in batch]
    _, loss = model(batch)
    loss.backward()
    optimizer.step()
    scheduler.step()
    return loss.item()

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        if torch.cuda.is_available(): data=[_data.to(device) for _data in data]
        optimizer.zero_grad()
        _, loss = model(data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        if torch.cuda.is_available(): data=[_data.to(device) for _data in data]
        punct, loss = model(data)
        final_loss += loss.item()
    return final_loss / len(data_loader)


In [59]:
device = torch.device(config.GPU_DEVICE if torch.cuda.is_available() else 'cpu')
train_dataset=torch.load(config.TRAIN_DATASET)
dev_dataset=torch.load(config.DEV_DATASET)
train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
dev_dataloader=torch.utils.data.DataLoader(dev_dataset, batch_size=config.DEV_BATCH_SIZE, num_workers=2)

In [76]:
train_dataset.map(lambda ex:{'input_ids':ex[0], 'attention_mask':ex[1], 'labels':ex[2]})
# next(iter(train_dataset))
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
# next(iter(train_dataloader))

AttributeError: 'TensorDataset' object has no attribute 'map'

In [70]:
model = BertCRFModel(num_punct=10, embedding_dim=config.EMBEDDING_DIM, hidden_dim=config.HIDDEN_DIM, use_crf=config.USE_CRF)
for i,param in enumerate(model.bert.parameters()):
    param.requires_grad = False
model.to(device)
optimizer = AdamW(optimizer_parameters, lr=config.FREEZE_LEARNING_RATE)
num_train_steps = train_dataset.tensors[0].size()[0] / config.TRAIN_BATCH_SIZE * config.UNFREEZE_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)
# optimizer = SWA(base_opt)

trainer = Engine(train_step)
val_metrics = {
    "precision": Precision(),
    "recall": Recall(),
#     "Dice": DiceCoefficient(cm=),
    "F1": Fbeta(1),
}
evaluator = create_supervised_evaluator(model, metrics=val_metrics)
def log_metrics(engine, title):
    print("Epoch: {} - {} accuracy: {:.2f}"
           .format(trainer.state.epoch, title, engine.state.metrics["acc"]))

@trainer.on(Events.EPOCH_COMPLETED)
def evaluate(trainer):
    with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "train"):
        evaluator.run(train_dataloader)

    with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "dev"):
        evaluator.run(dev_dataloader)

trainer.run(train_dataloader, max_epochs=100)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

Current run is terminating due to exception: too many values to unpack (expected 2).
Engine run is terminating due to exception: too many values to unpack (expected 2).
Engine run is terminating due to exception: too many values to unpack (expected 2).


ValueError: too many values to unpack (expected 2)

In [58]:
torch.utils.data.DataLoader??