In [None]:
import os
# Route outbound HTTP(S) traffic through a fixed, host-specific proxy.
os.environ['HTTP_PROXY']="http://proxe.shands.ufl.edu:3128"
os.environ['HTTPS_PROXY']="http://proxe.shands.ufl.edu:3128"
# Restrict the GPUs visible to this process; set here, before torch is imported in a later cell.
# Later cells select torch.device("cuda:4"), i.e. the last entry of this list.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3,4'

In [None]:
import sys

# Put a local ClinicalTransformerNER checkout (repo root and src/) on the import path.
# Presumably this is where the `transformer_ner` and `common_utils` packages imported
# in the next cell live -- confirm against the checkout.
sys.path.append("/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/")
sys.path.append("/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/")

In [None]:
import torch
from pathlib import Path
from transformer_ner.transfomer_log import TransformerNERLogger
from transformer_ner.model import MegatronNerModel, BertConfig
from transformer_ner.task import train, predict, evaluate, _output_bio, set_seed, set_up_eval_tool, load_model
from transformers import BertTokenizer
from transformer_ner.data_utils import (NEXT_GUARD, NEXT_TOKEN,
                                        TransformerNerDataProcessor,
                                        batch_to_model_inputs,
                                        convert_features_to_tensors,
                                        ner_data_loader,
                                        transformer_convert_data_to_features)
from common_utils.common_io import json_dump, json_load, output_bio

In [None]:
# Parameter-name fragments for the bias terms that remain trainable in the
# BitFit runs below. The values (not the keys) are what set_freeze() uses:
# it keeps a parameter trainable when one of them is a substring of its name.
BIAS_TERMS_DICT = {
    'intermediate': 'intermediate.dense.bias',
    'key': 'attention.self.key.bias',
    'query': 'attention.self.query.bias',
    'value': 'attention.self.value.bias',
    'output': 'output.dense.bias',
    'output_layernorm': 'output.LayerNorm.bias',
    'attention_layernorm': 'attention.output.LayerNorm.bias',
    # NOTE: 'bias' is a substring of every value above, so under substring
    # matching this single entry already selects all of them.
    'all': 'bias',
}

# Name fragment for the task head; parameters whose name contains it are also kept trainable.
classifier_name = "classifier"

In [None]:
# Local directory of the pretrained model; the tokenizer, config and model weights
# are loaded from it with from_pretrained() in the cells below.
plm = "/home/alexgre/projects/transformer_pretrained_models/gatortron-syn-345m_deid_vocab/"

In [None]:
# Name fragments of the parameters set_freeze() leaves trainable: every bias
# fragment listed in BIAS_TERMS_DICT, the pooler bias, and the classifier head.
trainale_params = list(BIAS_TERMS_DICT.values()) + ['pooler.dense.bias'] + [classifier_name]


def set_freeze(model, trainable_keys=None):
    """Freeze ``model`` except for parameters whose name matches a trainable key.

    Every parameter reported by ``model.named_parameters()`` gets its
    ``requires_grad`` flag set: ``True`` when at least one key is a substring
    of the parameter name, ``False`` otherwise. The model is modified in place.

    Args:
        model: a ``torch.nn.Module``.
        trainable_keys: optional iterable of name fragments (a single string is
            treated as one fragment). When omitted, the notebook-level
            ``trainale_params`` list is used, which keeps the original behavior.
    """
    if trainable_keys is None:
        # Looked up at call time so later edits to the notebook global are honored.
        trainable_keys = trainale_params
    if isinstance(trainable_keys, str):
        # A bare string would otherwise be iterated character by character.
        trainable_keys = (trainable_keys,)
    keys = tuple(trainable_keys)

    for name, param in model.named_parameters():
        param.requires_grad = any(key in name for key in keys)


def set_unfreeze(model):
    """Mark every parameter of ``model`` as trainable again (in place)."""
    for weight in model.parameters():
        weight.requires_grad_(True)
            

def count_param(model):
    """Print the total and trainable parameter counts of ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    The trainable share is printed as a percentage with two decimals.
    A model without parameters reports 0.0% instead of dividing by zero.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        None; the summary is written to stdout.
    """
    pytorch_total_params = 0
    pytorch_train_params = 0
    for p in model.parameters():
        size = p.numel()
        pytorch_total_params += size
        if p.requires_grad:
            pytorch_train_params += size

    # Scale to percent *before* rounding: rounding the ratio and then multiplying
    # by 100 reintroduces float noise (e.g. 0.07 * 100 == 7.000000000000001).
    if pytorch_total_params:
        pre = round(100.0 * pytorch_train_params / pytorch_total_params, 2)
    else:
        pre = 0.0

    print(f"""
    total number parameters: {pytorch_total_params}
    total trainable number parameters: {pytorch_train_params}
    precetage: {pre}%
    """)
            
# set_freeze(model)
# set_unfreeze(model)

In [None]:
# with torch.no_grad():
#     model.classifier.weight.copy_(model1.classifier.weight)

In [None]:
class Args:
    """Plain attribute container passed as ``args`` to the training, prediction
    and feature-conversion functions used in this notebook.

    Only a handful of settings are constructor parameters; everything else is a
    fixed default that callers override by assigning to the attribute afterwards
    (e.g. ``args.device``, ``args.learning_rate``, ``args.num_labels``).

    Args:
        model_type: model family identifier (the notebook uses ``"megatron"``).
        pretrained_model: path or name of the base model. Ignored when
            ``resume_from_model`` is given, which then takes its place.
        do_train: stored as ``self.do_train``.
        do_predict: stored as ``self.do_predict``.
        new_model_dir: output directory for the trained model. May be ``None``,
            in which case ``predict_output_file`` is ``None`` as well.
        resume_from_model: optional model location that replaces
            ``pretrained_model`` for the model, config and tokenizer names.
        data_dir: directory holding the dataset files.
        logger_file: file name handed to ``TransformerNERLogger``.
    """

    def __init__(
        self, model_type, pretrained_model,
        do_train=True, do_predict=True,
        new_model_dir=None, resume_from_model=None,
        data_dir=None, logger_file="log.txt"
    ):
        # --- model / tokenizer identity ---
        self.model_type = model_type
        self.pretrained_model = pretrained_model if resume_from_model is None else resume_from_model
        self.config_name = self.pretrained_model
        self.tokenizer_name = self.pretrained_model
        self.do_lower_case = True
        self.overwrite_model_dir = True

        # --- data and output locations ---
        self.data_dir = data_dir
        self.data_has_offset_information = False
        self.new_model_dir = new_model_dir
        # new_model_dir defaults to None; Path(None) raises TypeError, so only
        # derive the prediction path when an output directory was actually given.
        self.predict_output_file = (
            Path(new_model_dir) / "predicted.txt" if new_model_dir is not None else None
        )
        self.overwrite_output_dir = True

        # --- run control ---
        self.max_seq_length = 512
        self.do_train = do_train
        self.do_predict = do_predict
        self.model_selection_scoring = "strict-f_score-1"

        # --- optimisation hyper-parameters ---
        self.train_batch_size = 4
        self.eval_batch_size = 32
        self.learning_rate = 2e-5
        self.min_lr = 5e-6
        self.seed = 42
        self.logger = TransformerNERLogger(
            logger_level="i",
            logger_file=logger_file).get_logger()
        self.num_train_epochs = 30
        self.gradient_accumulation_steps = 1
        self.do_warmup = True
        # Label maps are filled in by the caller once the dataset labels are known.
        self.label2idx = None
        self.idx2label = None
        self.max_num_checkpoints = 2
        self.warmup_ratio = 0.1
        self.weight_decay = 0.01
        self.adam_epsilon = 0.00000001
        self.max_grad_norm = 1.0
        self.log_file = None
        self.log_lvl = None
        self.fp16 = False
        self.local_rank = -1
        # Default device; the notebook overrides this with a CUDA device after construction.
        self.device = "cpu"
        self.train_steps = 2000
        self.early_stop = 3
        self.progress_bar = False
        self.save_model_core = True

        # --- model head / loss options ---
        self.use_crf = False
        self.focal_loss = False
        self.focal_loss_gamma = 2
        self.resume_from_model = resume_from_model
        self.use_biaffine = False
        self.mlp_dim = 128
        self.mlp_layers = 0

        # --- adversarial training ---
        self.adversarial_training = False
        self.adversarial_training_method = None  # None, "fgm", "pgd", "freelb"
        self.adversarial_training_conf = None

In [None]:
# 2012 i2b2 data.
# NOTE: the variable is still called `i2b2_2010`, but the active path below points
# at the 2012 i2b2 preprocessed BIO data; the 2010 path is the commented-out line.

# i2b2_2010 = "/home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2010_i2b2/"
i2b2_2010 = "/home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/"

# Stage-1 (BitFit) run configuration. do_predict is off here; prediction on the
# test split is done explicitly in a later cell.
args1 = Args(
    model_type="megatron", 
    pretrained_model=None, 
    new_model_dir="./i2b2_2012_megatron_bitfit_stage1", 
    data_dir=i2b2_2010,
    do_predict = False,
    logger_file="i2b2_2012_megatron_bitfit_stage1.txt" 
)

# Override the "cpu" default set by Args.
args1.device = torch.device("cuda:4")


# Load the tokenizer from the pretrained model directory and register NEXT_TOKEN as an
# additional special token. new_dim is the tokenizer length afterwards and is used
# in later cells to resize the model's token embeddings.
tokenizer = BertTokenizer.from_pretrained(plm, do_lower_case=args1.do_lower_case, add_prefix_space=True)
tokenizer.add_special_tokens({"additional_special_tokens": [NEXT_TOKEN]})
new_dim = len(tokenizer)

set_seed(args1.seed)

# Shared data processor; it is reused by every later cell to read train/dev/test examples.
ner_data_processor = TransformerNerDataProcessor()
ner_data_processor.set_data_dir(args1.data_dir)
ner_data_processor.set_logger(args1.logger)
#     ner_data_processor.offset_info_available()

# Label inventory and both direction lookups (label -> index, index -> label).
labels, label2idx = ner_data_processor.get_labels(default=args1.model_type)
num_labels = len(label2idx)
idx2label = {v: k for k, v in label2idx.items()}

args1.num_labels = num_labels
args1.label2idx = label2idx
args1.idx2label = idx2label



# Convert the train and dev examples into model input features.
train_examples = ner_data_processor.get_train_examples()
train_features = transformer_convert_data_to_features(args1,
                                                      input_examples=train_examples,
                                                      label2idx=label2idx,
                                                      tokenizer=tokenizer,
                                                      max_seq_len=args1.max_seq_length)

dev_examples = ner_data_processor.get_dev_examples()
dev_features = transformer_convert_data_to_features(args1,
                                                    input_examples=dev_examples,
                                                    label2idx=label2idx,
                                                    tokenizer=tokenizer,
                                                    max_seq_len=args1.max_seq_length)

args1.eval_tool = set_up_eval_tool(args1)

In [None]:
# bitfit
# Stage 1: build the NER model from the pretrained weights and train only the
# parameters selected by set_freeze() (bias terms, pooler bias, classifier head).
conf_bitfit = BertConfig.from_pretrained(plm, num_labels=num_labels)
model_bitfit = MegatronNerModel.from_pretrained(plm, config=conf_bitfit)
# Resize the token embeddings to the tokenizer length after NEXT_TOKEN was added.
model_bitfit.resize_token_embeddings(new_dim)
conf_bitfit.vocab_size = new_dim
args1.config = model_bitfit.config
args1.tokenizer = tokenizer
# Freeze, then print how many parameters remain trainable.
set_freeze(model_bitfit)
count_param(model_bitfit)
model_bitfit.to(args1.device)
# start training
train(args1, model_bitfit, train_features, dev_features)

In [None]:
# Stage-1 prediction on the test split, using the tokenizer and label map stored on args1.
test_example = ner_data_processor.get_test_examples()
test_features = transformer_convert_data_to_features(args1,
                                                     input_examples=test_example,
                                                     label2idx=label2idx,
                                                     tokenizer=args1.tokenizer,
                                                     max_seq_len=args1.max_seq_length)

# Reload the model through load_model(args1) rather than reusing model_bitfit.
# Presumably this reads the checkpoint saved under args1.new_model_dir -- confirm in transformer_ner.task.
model = load_model(args1)
model.to(args1.device)

# Predict and write the result in BIO format. The evaluation cell reads
# <new_model_dir>/predicted.txt, which matches args1.predict_output_file.
predictions = predict(args1, model, test_features)
_output_bio(args1, test_example, predictions)

In [None]:
# eval: score the stage-1 predictions (-f2) against the 2012 i2b2 test file (-f1)
# using the BIO evaluation script from the ClinicalTransformerNER checkout.
! python3 "/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/eval_scripts/new_bio_eval.py"\
    -f1 /home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/test.txt \
    -f2 "./i2b2_2012_megatron_bitfit_stage1/predicted.txt"

In [None]:
# Stage 2: full fine-tuning that starts from the pretrained weights again but
# reuses only the classifier head learned in stage 1.
# load bitfit model and get classifier only
temp_model = load_model(args1)

# NOTE(review): the log file name has a double underscore ("i2b2_2012__megatron..."),
# unlike the other runs in this notebook.
args2 = Args(
    model_type="megatron", 
    pretrained_model=None, 
    new_model_dir="./i2b2_2012_megatron_bitfit_stage2", 
    data_dir=i2b2_2010,
    do_predict = True,
    logger_file="i2b2_2012__megatron_bitfit_stage2.txt" 
)
args2.device = torch.device("cuda:4")
args2.num_labels = num_labels
args2.label2idx = label2idx
args2.idx2label = idx2label
args2.eval_tool = set_up_eval_tool(args2)
args2.learning_rate = 1e-5
# NOTE(review): set_seed() is not called in this cell; whether train() applies
# args.seed itself is not visible here.
args2.seed = 13

# Rebuild train/dev features (same data, tokenizer and label map as stage 1).
train_examples = ner_data_processor.get_train_examples()
train_features = transformer_convert_data_to_features(args2,
                                                      input_examples=train_examples,
                                                      label2idx=label2idx,
                                                      tokenizer=tokenizer,
                                                      max_seq_len=args2.max_seq_length)

dev_examples = ner_data_processor.get_dev_examples()
dev_features = transformer_convert_data_to_features(args2,
                                                    input_examples=dev_examples,
                                                    label2idx=label2idx,
                                                    tokenizer=tokenizer,
                                                    max_seq_len=args2.max_seq_length)

# full FT
# Fresh model from the pretrained weights; no set_freeze() call, so all parameters train.
conf_bitfit_ft = BertConfig.from_pretrained(plm, num_labels=num_labels)
model_bitfit_ft = MegatronNerModel.from_pretrained(plm, config=conf_bitfit_ft)
model_bitfit_ft.resize_token_embeddings(new_dim)
conf_bitfit_ft.vocab_size = new_dim
args2.config = model_bitfit_ft.config
args2.tokenizer = tokenizer

# Copy the stage-1 classifier weight and bias into the fresh model; no_grad keeps
# the in-place copies out of autograd.
with torch.no_grad():
    model_bitfit_ft.classifier.weight.copy_(temp_model.classifier.weight)
    model_bitfit_ft.classifier.bias.copy_(temp_model.classifier.bias)

count_param(model_bitfit_ft)    

model_bitfit_ft.to(args2.device)
# start training
train(args2, model_bitfit_ft, train_features, dev_features)

In [None]:
# Stage-2 prediction on the test split, using the tokenizer and label map stored on args2.
test_example = ner_data_processor.get_test_examples()
test_features = transformer_convert_data_to_features(args2,
                                                     input_examples=test_example,
                                                     label2idx=label2idx,
                                                     tokenizer=args2.tokenizer,
                                                     max_seq_len=args2.max_seq_length)

# Reload via load_model(args2) instead of reusing the in-memory training model.
# Presumably this reads the checkpoint saved under args2.new_model_dir -- confirm in transformer_ner.task.
model = load_model(args2)
model.to(args2.device)

# Predict and write the result in BIO format for the evaluation cell.
predictions = predict(args2, model, test_features)
_output_bio(args2, test_example, predictions)

In [None]:
# eval: score the stage-2 predictions (-f2) against the 2012 i2b2 test file (-f1).
! python3 "/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/eval_scripts/new_bio_eval.py"\
    -f1 /home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/test.txt \
    -f2 "./i2b2_2012_megatron_bitfit_stage2/predicted.txt"

In [None]:
# bitfit reserve
# Variant of stage 2: instead of re-initialising from the pretrained weights and
# copying only the classifier, training continues from the whole model returned by
# load_model(args1) (the stage-1 run).
# NOTE: this rebinds `args2`; the stage-2 configuration from the earlier cell is replaced.

args2 = Args(
    model_type="megatron", 
    pretrained_model=None, 
    new_model_dir="./i2b2_2012_megatron_bitfit_stage2_reserve", 
    data_dir=i2b2_2010,
    do_predict = True,
    logger_file="i2b2_2012_megatron_bitfit_stage2_reserve.txt" 
)
args2.device = torch.device("cuda:4")
args2.num_labels = num_labels
args2.label2idx = label2idx
args2.idx2label = idx2label
args2.eval_tool = set_up_eval_tool(args2)
args2.learning_rate = 1e-5
# NOTE(review): set_seed() is not called in this cell; whether train() applies
# args.seed itself is not visible here.
args2.seed = 13

# Rebuild train/dev features (same data, tokenizer and label map as before).
train_examples = ner_data_processor.get_train_examples()
train_features = transformer_convert_data_to_features(args2,
                                                      input_examples=train_examples,
                                                      label2idx=label2idx,
                                                      tokenizer=tokenizer,
                                                      max_seq_len=args2.max_seq_length)

dev_examples = ner_data_processor.get_dev_examples()
dev_features = transformer_convert_data_to_features(args2,
                                                    input_examples=dev_examples,
                                                    label2idx=label2idx,
                                                    tokenizer=tokenizer,
                                                    max_seq_len=args2.max_seq_length)

# full FT
# NOTE(review): conf_bitfit_ft is loaded here but not used afterwards in this cell;
# the config attached to args2 comes from the loaded model instead.
conf_bitfit_ft = BertConfig.from_pretrained(args1.new_model_dir)
model_bitfit_ft = load_model(args1)
# model_bitfit_ft.resize_token_embeddings(new_dim)
# conf_bitfit_ft.vocab_size = new_dim
args2.config = model_bitfit_ft.config
args2.tokenizer = tokenizer

# No set_freeze() call here; the printed counts show what is actually trainable.
count_param(model_bitfit_ft)    

model_bitfit_ft.to(args2.device)
# start training
train(args2, model_bitfit_ft, train_features, dev_features)

In [None]:
# Dump the full run configuration (debugging aid), then predict on the test split.
print(args2.__dict__)
test_example = ner_data_processor.get_test_examples()
test_features = transformer_convert_data_to_features(args2,
                                                     input_examples=test_example,
                                                     label2idx=label2idx,
                                                     tokenizer=args2.tokenizer,
                                                     max_seq_len=args2.max_seq_length)

# Reload via load_model(args2) instead of reusing the in-memory training model.
# Presumably this reads the checkpoint saved under args2.new_model_dir -- confirm in transformer_ner.task.
model = load_model(args2)
model.to(args2.device)

# Predict and write the result in BIO format for the evaluation cell.
predictions = predict(args2, model, test_features)
_output_bio(args2, test_example, predictions)

In [None]:
# eval: score the "stage2_reserve" predictions (-f2) against the 2012 i2b2 test file (-f1).
! python3 "/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/eval_scripts/new_bio_eval.py"\
    -f1 /home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/test.txt \
    -f2 "./i2b2_2012_megatron_bitfit_stage2_reserve/predicted.txt"

In [None]:
# only FT
# Baseline: full fine-tuning directly from the pretrained weights, with no BitFit
# stage and no set_freeze() call.

# temp_model = load_model(args1)

args3 = Args(
    model_type="megatron", 
    pretrained_model=plm, 
    new_model_dir="./i2b2_2012_megatron_ft", 
    data_dir=i2b2_2010,
    do_predict = True,
    logger_file="i2b2_2012_megatron_ft.txt" 
)
args3.device = torch.device("cuda:4")
args3.num_labels = num_labels
args3.label2idx = label2idx
args3.idx2label = idx2label
args3.eval_tool = set_up_eval_tool(args3)
# These two assignments repeat the Args defaults (2e-5 and 42).
args3.learning_rate = 2e-5
args3.seed = 42

# Rebuild train/dev features (same data, tokenizer and label map as before).
train_examples = ner_data_processor.get_train_examples()
train_features = transformer_convert_data_to_features(args3,
                                                      input_examples=train_examples,
                                                      label2idx=label2idx,
                                                      tokenizer=tokenizer,
                                                      max_seq_len=args3.max_seq_length)

dev_examples = ner_data_processor.get_dev_examples()
dev_features = transformer_convert_data_to_features(args3,
                                                    input_examples=dev_examples,
                                                    label2idx=label2idx,
                                                    tokenizer=tokenizer,
                                                    max_seq_len=args3.max_seq_length)

# full FT
conf_ft = BertConfig.from_pretrained(plm, num_labels=num_labels)
model_ft = MegatronNerModel.from_pretrained(plm, config=conf_ft)
# Resize the token embeddings to the tokenizer length after NEXT_TOKEN was added.
model_ft.resize_token_embeddings(new_dim)
conf_ft.vocab_size = new_dim
args3.config = model_ft.config
args3.tokenizer = tokenizer

count_param(model_ft)    

model_ft.to(args3.device)
# start training
train(args3, model_ft, train_features, dev_features)

In [None]:
# print(args3.__dict__)
# Prediction on the test split for the full fine-tuning baseline.
test_example = ner_data_processor.get_test_examples()
test_features = transformer_convert_data_to_features(args3,
                                                     input_examples=test_example,
                                                     label2idx=label2idx,
                                                     tokenizer=args3.tokenizer,
                                                     max_seq_len=args3.max_seq_length)

# Reload via load_model(args3) instead of reusing the in-memory training model.
# Presumably this reads the checkpoint saved under args3.new_model_dir -- confirm in transformer_ner.task.
model = load_model(args3)
model.to(args3.device)

# Predict and write the result in BIO format for the evaluation cell.
predictions = predict(args3, model, test_features)
_output_bio(args3, test_example, predictions)

In [None]:
# eval: score the full fine-tuning predictions (-f2) against the 2012 i2b2 test file (-f1).
! python3 "/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/eval_scripts/new_bio_eval.py"\
    -f1 /home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/test.txt \
    -f2 "./i2b2_2012_megatron_ft/predicted.txt"

In [None]:
import os
import random
import warnings
from pathlib import Path
import traceback

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm import tqdm, trange

from torch.optim.lr_scheduler import LambdaLR

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, min_lr=1e-6, last_epoch=-1):
    """Linear warm-up followed by linear decay that bottoms out at ``min_lr``.

    During the first ``num_warmup_steps`` steps the learning rate rises linearly
    from 0 to each parameter group's base learning rate. Afterwards it decays
    linearly towards 0 at ``num_training_steps`` but is never allowed to drop
    below ``min_lr``.

    Args:
        optimizer: the wrapped optimizer.
        num_warmup_steps: number of warm-up steps.
        num_training_steps: total number of scheduler steps planned.
        min_lr: absolute learning-rate floor used instead of 0. If it exceeds a
            group's base learning rate, the floor is capped at that base rate.
        last_epoch: passed through to ``LambdaLR``.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` instance.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def make_lr_lambda(base_lr):
        # LambdaLR multiplies the group's base LR by the value returned here, so the
        # absolute floor has to be expressed as a fraction of that base LR.
        # Comparing min_lr directly with the decay factor would give an effective
        # floor of base_lr * min_lr, i.e. practically zero.
        floor_factor = min(1.0, min_lr / base_lr) if base_lr > 0 else 0.0

        def lr_lambda(current_step: int):
            if current_step < num_warmup_steps:
                return float(current_step) / warmup_span
            return max(floor_factor, float(num_training_steps - current_step) / decay_span)

        return lr_lambda

    # One lambda per parameter group, since groups may have different base rates.
    # 'initial_lr' is present when resuming; otherwise 'lr' is still the base rate.
    lr_lambdas = [
        make_lr_lambda(group.get('initial_lr', group['lr']))
        for group in optimizer.param_groups
    ]
    return LambdaLR(optimizer, lr_lambdas, last_epoch)


def train_mod(args, model, train_features, dev_features, unfreeze_epoch=5):
    """NER model training on train dataset; select model based on performance on dev dataset

    Staged variant of the training loop: the model is first frozen with
    ``set_freeze`` (only the selected bias/classifier parameters train), and at
    the start of epoch ``unfreeze_epoch`` every parameter is made trainable with
    ``set_unfreeze``. The trainable-parameter summary is printed every epoch.

    Args:
        args: run configuration (an ``Args`` instance in this notebook); supplies
            batch size, optimizer settings, logger, device, output directory, etc.
        model: the NER model; trained in place. Its forward call is expected to
            return a 3-tuple whose last element is the loss.
        train_features: training features for ``ner_data_loader``.
        dev_features: development features passed to ``evaluate``.
        unfreeze_epoch: zero-based epoch index at which all parameters are
            unfrozen. ``None`` keeps the model frozen for the whole run.

    Returns:
        None. ``evaluate`` is given the output directory and the running best score.
    """

    def _evaluate_and_log(epoch, global_step, tr_loss, best_score):
        # Run dev evaluation, log a summary, and return the updated best score.
        best_score, eval_loss = evaluate(
            args, model, new_model_dir, dev_features, epoch, global_step, best_score)
        args.logger.info("""
                Global step: {}; 
                Epoch: {}; 
                average_train_loss: {:.4f}; 
                eval_loss: {:.4f}; 
                current best score: {:.4f}""".format(
            # max(1, ...) avoids a ZeroDivisionError when no optimizer step has happened yet
            global_step, epoch + 1, round(tr_loss / max(1, global_step), 4), eval_loss, best_score))
        return best_score

    # create data loader
    data_loader = ner_data_loader(train_features, batch_size=args.train_batch_size, task='train', auto=True)
    # total training step counts
    t_total = len(data_loader) // args.gradient_accumulation_steps * args.num_train_epochs

    # parameters for optimization: no weight decay on biases and LayerNorm weights
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    # fp16 training uses PyTorch's native AMP (torch.cuda.amp)
    scaler = None
    autocast = None
    if args.fp16:
        try:
            autocast = torch.cuda.amp.autocast
            scaler = torch.cuda.amp.GradScaler()
        except Exception:
            raise ImportError("You need to update to PyTorch 1.6, the current PyTorch version is {}"
                              .format(torch.__version__))

    # training linear warm-up setup
    scheduler = None
    if args.do_warmup:
        warmup_steps = int(args.warmup_ratio * t_total)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, min_lr=args.min_lr, num_training_steps=t_total)

    args.logger.info("***** Running training *****")
    args.logger.info("  Num data points = {}".format(len(data_loader)))
    args.logger.info("  Num Epochs = {}".format(args.num_train_epochs))
    args.logger.info("  Instantaneous batch size per GPU = {}".format(args.train_batch_size))
    args.logger.info("  Gradient Accumulation steps = {}".format(args.gradient_accumulation_steps))
    args.logger.info("  Total optimization steps = {}".format(t_total))
    args.logger.info("  Training steps (number of steps between two evaluation on dev) = {}".format(
        args.train_steps * args.gradient_accumulation_steps))
    args.logger.info("******************************")

    # create directory to save model
    new_model_dir = Path(args.new_model_dir)
    new_model_dir.mkdir(parents=True, exist_ok=True)
    # save label2idx json in new model directory
    json_dump(args.label2idx, new_model_dir / "label2idx.json")

    # save base model name to a base_model_name.txt
    with open(new_model_dir / "base_model_name.txt", "w") as f:
        f.write('model_type: {}\nbase_model: {}\nconfig: {}\ntokenizer: {}'.format(
            args.model_type, args.pretrained_model, args.config_name, args.tokenizer_name))

    global_step = 0
    tr_loss = .0
    best_score, epcoh_best_score = .0, .0
    early_stop_flag = 0

    # phase 1: train only the parameters selected by set_freeze
    set_freeze(model)
    model.zero_grad()

    epoch_iter = trange(int(args.num_train_epochs), desc="Epoch", disable=not args.progress_bar, position=1, leave=False)
    for epoch in epoch_iter:
        # phase 2: from this epoch on, every parameter is trainable
        if unfreeze_epoch is not None and epoch == unfreeze_epoch:
            set_unfreeze(model)
        count_param(model)
        batch_iter = tqdm(iterable=data_loader, desc='Batch', disable=not args.progress_bar)
        for step, batch in enumerate(batch_iter):
            model.train()

            batch = tuple(b.to(args.device) for b in batch)
            train_inputs = batch_to_model_inputs(batch, args.model_type)

            if args.fp16:
                with autocast():
                    _, _, loss = model(**train_inputs)
            else:
                _, _, loss = model(**train_inputs)

            loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()

            if args.fp16:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            # apply ADVERSARIAL TRAINING
            # NOTE(review): adversarial_train / adversarial_trainer are not defined or
            # imported in the visible notebook cells; enabling
            # args.adversarial_training will fail here unless they are provided elsewhere.
            if args.adversarial_training:
                adversarial_train(args, adversarial_trainer, model, train_inputs)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()

                if args.do_warmup:
                    scheduler.step()

                model.zero_grad()
                global_step += 1

                # using training step
                # Checked only right after an optimizer step: with gradient accumulation
                # global_step stays constant across several micro-batches, and checking on
                # every micro-batch would repeat the same dev evaluation.
                if args.train_steps > 0 and (global_step + 1) % args.train_steps == 0:
                    # the current implementation will skip the all evaluations in the first epoch
                    best_score = _evaluate_and_log(epoch, global_step, tr_loss, best_score)

        # default model select method using strict F1-score with beta=1; evaluate model after each epoch on dev
        if args.train_steps <= 0 or epoch == 0:
            best_score = _evaluate_and_log(epoch, global_step, tr_loss, best_score)

        # early stop check: count consecutive epochs without a new best score
        if epcoh_best_score < best_score:
            epcoh_best_score = best_score
            early_stop_flag = 0
        else:
            early_stop_flag += 1

        if 0 < args.early_stop <= early_stop_flag:
            # Logger.warn is a deprecated alias of Logger.warning
            args.logger.warning('Early stop activated; performance not improve anymore.')
            break

In [None]:
# Configuration and model for the staged run driven by train_mod() in the next cell
# (frozen BitFit-style phase first, then all parameters).
# NOTE: this rebinds `args3`; the full fine-tuning configuration from the earlier cell is replaced.
args3 = Args(
    model_type="megatron", 
    pretrained_model=plm, 
    new_model_dir="./i2b2_2012_megatron_ft_stage_bitfit", 
    data_dir=i2b2_2010,
    do_predict = True,
    logger_file="i2b2_2012_megatron_ft_stage_bitfit.txt" 
)
args3.device = torch.device("cuda:4")
args3.num_labels = num_labels
args3.label2idx = label2idx
args3.idx2label = idx2label
args3.eval_tool = set_up_eval_tool(args3)
args3.learning_rate = 1e-5
# NOTE(review): set_seed() is not called in this cell, and train_mod() does not call it either.
args3.seed = 13

# Rebuild train/dev features (same data, tokenizer and label map as before).
train_examples = ner_data_processor.get_train_examples()
train_features = transformer_convert_data_to_features(args3,
                                                      input_examples=train_examples,
                                                      label2idx=label2idx,
                                                      tokenizer=tokenizer,
                                                      max_seq_len=args3.max_seq_length)

dev_examples = ner_data_processor.get_dev_examples()
dev_features = transformer_convert_data_to_features(args3,
                                                    input_examples=dev_examples,
                                                    label2idx=label2idx,
                                                    tokenizer=tokenizer,
                                                    max_seq_len=args3.max_seq_length)

# full FT
# Fresh model from the pretrained weights; freezing is handled inside train_mod().
conf_ft = BertConfig.from_pretrained(plm, num_labels=num_labels)
model_ft = MegatronNerModel.from_pretrained(plm, config=conf_ft)
model_ft.resize_token_embeddings(new_dim)
conf_ft.vocab_size = new_dim
args3.config = model_ft.config
args3.tokenizer = tokenizer 

model_ft.to(args3.device)
# Printed before any freezing, so this shows the model's full parameter count.
count_param(model_ft)   

In [None]:
# start training
# Uses the notebook-local train_mod() (freeze first, unfreeze at a fixed epoch)
# instead of the library train() used by the earlier runs.
train_mod(args3, model_ft, train_features, dev_features)

In [None]:
# Prediction on the test split for the staged (train_mod) run.
test_example = ner_data_processor.get_test_examples()
test_features = transformer_convert_data_to_features(args3,
                                                     input_examples=test_example,
                                                     label2idx=label2idx,
                                                     tokenizer=args3.tokenizer,
                                                     max_seq_len=args3.max_seq_length)

# Reload via load_model(args3) instead of reusing the in-memory training model.
# Presumably this reads the checkpoint saved under args3.new_model_dir -- confirm in transformer_ner.task.
model = load_model(args3)
model.to(args3.device)

# Predict and write the result in BIO format for the evaluation cell.
predictions = predict(args3, model, test_features)
_output_bio(args3, test_example, predictions)

In [None]:
# eval: score the staged-run predictions (-f2) against the 2012 i2b2 test file (-f1).
! python3 "/home/alexgre/workspace/py3/nlp/ClinicalTransformerNER/src/eval_scripts/new_bio_eval.py"\
    -f1 /home/alexgre/shared_data/challenge_datasets/ner/clinical_ner_datasets/2012_i2b2/preprocessed_bio_data/test.txt \
    -f2 "./i2b2_2012_megatron_ft_stage_bitfit/predicted.txt"