In [8]:
import transformers
import torch
import numpy as np
import wandb
import time

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

fp16 = True
if fp16:
    from apex import amp

model_name = "albert-large-v2"

In [9]:
# Load the dataset
tokenizer = transformers.AlbertTokenizer.from_pretrained(model_name)

def load_dataset(examples, processor):
    output_mode = output_modes[task]
    label_list = processor.get_labels()
    features = convert_examples_to_features(examples, 
                                            tokenizer, 
                                            label_list=label_list,
                                            max_length=128,
                                            pad_on_left=False,
                                            output_mode=output_mode,
                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                            pad_token_segment_id=0)
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    labels = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(input_ids, attention_mask, token_type_ids, labels)

# Process dataset
task = 'mrpc'
input_file = "C:\\Users\\jbetk\\Documents\\data\\ml\\text_similarity\\MSRParaphraseCorpus"
processor = processors[task]()
train_dataset = load_dataset(processor.get_train_examples(input_file), processor)
val_dataset = load_dataset(processor.get_dev_examples(input_file), processor)

In [10]:
# Load model
config = transformers.AlbertConfig.from_pretrained(model_name)
model = transformers.AlbertForSequenceClassification.from_pretrained(model_name, config=config)
device = torch.device("cuda")
cpu = torch.device("cpu")

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
#print("Parameters: ", optimizer_grouped_parameters)
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 
                                                         num_warmup_steps=0, num_training_steps=len(train_dataset))

# Shift model to cuda & enable fp16 if applicable.
model.to(device)
if fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    
# Initialize w&b logger
do_wandb = False
if do_wandb:
    wandb.init(project="nonint-transformers-torch",\
               name="albert_sem_comp_msrpc",\
               config={"dataset": "msrpc"})
    # There's something bugged about this, but it doesnt really seem to do much anyways. Apparently it enables some 
    # sort of gradient exploration map.
    #wandb.watch(model)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [15]:
xfer_times = []
forward_times = []
backward_times = []
opt_times = []
sched_times = []

def compute_accuracy(_pred, _true):
    return np.sum(_pred == _true) / _pred.shape[0]

def clear_timers():
    xfer_times.clear()
    forward_times.clear()
    backward_times.clear()
    opt_times.clear()
    sched_times.clear()

def train_epoch(_model, _optimizer, _scheduler, _device, _dataloader, _logging_steps, _fp16=False):
    clear_timers()
    
    _epoch_iterator = tqdm(_dataloader, desc="Iteration")
    _steps = 0
    _tr_loss, _logging_loss = 0, 0
    _accuracy_accum, _accuracy_last = 0, 0
    _model.train()
    for _step, _batch in enumerate(_epoch_iterator):
        __s = time.time()
        _batch = tuple(_t.to(_device) for _t in _batch)
        _inputs = {"input_ids": _batch[0], 
                   "attention_mask": _batch[1], 
                   "token_type_ids": _batch[2], 
                   "labels": _batch[3]}
        xfer_times.append(time.time() - __s)
        
        __s = time.time()
        _outputs = _model(**_inputs)
        forward_times.append(time.time() - __s)
        
        _loss = _outputs[0]
        
        backward_time = 0
        __s = time.time()
        if fp16:
            with amp.scale_loss(_loss, _optimizer) as _scaled_loss:
                _scaled_loss.backward()
                backward_time = time.time() - __s
        else:
            _loss.backward()
            backward_time = time.time() - __s
        backward_times.append(backward_time)
        
        _tr_loss += _loss.item()
        _accuracy_accum += compute_accuracy(np.argmax(_outputs[1].detach().cpu().numpy(), axis=-1), _batch[3].cpu().numpy())
        
        if _fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(_optimizer), 1)
        else:
            torch.nn.utils.clip_grad_norm_(_model.parameters(), 1)
        
        __s = time.time()
        _optimizer.step()
        opt_times.append(time.time() - __s)
        __s = time.time()
        _scheduler.step()
        sched_times.append(time.time() - __s)
        _model.zero_grad()
        _steps += 1
        
        # Log
        if _steps % _logging_steps == 0:
            _loss_scalar = (_tr_loss - _logging_loss) / _logging_steps
            _accuracy_scalar = (_accuracy_accum - _accuracy_last) / _logging_steps
            _logging_loss = _tr_loss
            _accuracy_last = _accuracy_accum
            _logs = {}
            _logs["loss"] = _loss_scalar
            _logs["accuracy"] = _accuracy_scalar
            _logs["learning_rate"] = _scheduler.get_lr()[0]
            #print(json.dumps({**_logs, **{"step": _steps}}))
            if do_wandb:
                wandb.log(_logs)
    
def check_validation(_model, _device, _val_dataloader):
    with torch.no_grad():
        _val_iterator = tqdm(_val_dataloader, desc="Validation iteration")
        _loss = 0
        _accuracy = 0
        for _step, _batch in enumerate(_val_iterator):
            _batch = tuple(_t.to(device) for _t in _batch)
            _inputs = {"input_ids": _batch[0], 
                       "attention_mask": _batch[1], 
                       "token_type_ids": _batch[2], 
                       "labels": _batch[3]}
            _outputs = model(**_inputs)
            _loss += _outputs[0].item()
            _accuracy += compute_accuracy(np.argmax(_outputs[1].cpu().numpy(), axis=-1), _batch[3].cpu().numpy())
        _loss_computed = _loss/len(_val_dataloader)
        _acc_computed = _accuracy/len(_val_dataloader)
        print("Validation loss %f, accuracy=%f" % (_loss_computed, _acc_computed))
        if do_wandb:
            wandb.log({'val_loss': _loss_computed, 'val_accuracy': _acc_computed})

In [16]:
LOGGING_STEPS = 5
EPOCHS = 1
BATCH_SIZE = 24

print("***** Running training *****")
print("  Num examples = %d" % (len(train_dataset)))
print("  Num Epochs = %d" % (EPOCHS))
print("  Total optimization steps = %d" % (len(train_dataset)))

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

model.zero_grad()
for _ in range(EPOCHS):
    train_epoch(model, optimizer, scheduler, device, train_dataloader, LOGGING_STEPS, _fp16=True)
    check_validation(model, device, val_dataloader)


***** Running training *****
  Num examples = 4076
  Num Epochs = 1
  Total optimization steps = 4076
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0


Validation loss 0.377003, accuracy=0.834987


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=170.0, style=ProgressStyle(description_wi…



HBox(children=(FloatProgress(value=0.0, description='Validation iteration', max=72.0, style=ProgressStyle(desc…

In [None]:
from scipy.special import softmax

def infer(_model, _sentence1, _sentence2):
    features = [tokenizer.encode_plus(text=_sentence1, text_pair=_sentence2, max_length=128, pad_to_max_length=True)]
    _inputs = {"input_ids": torch.tensor([f['input_ids'] for f in features], dtype=torch.long).to(device), 
               "attention_mask": torch.tensor([f['attention_mask'] for f in features], dtype=torch.long).to(device), 
               "token_type_ids": torch.tensor([f['token_type_ids'] for f in features], dtype=torch.long).to(device)}
    with torch.no_grad():
        _outputs = model(**_inputs)
        return softmax(_outputs[0].cpu().numpy())
    
print(infer(model, "The man and the woman went to the store", "The woman and the man went to the store"))
print(infer(model, "The man and the woman went to the store", "The man walked with his wife"))
print(infer(model, "The man and the woman went to the store", "The man trudged along with his daughter"))

In [None]:
import os

# Save the model 
output_dir = os.path.join("c:/Users/jbetk/Documents/data/ml/saved_models", "semantic_comparison_pytorch")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))


In [None]:
# Save to torchscript
dummy_input = [
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
]
__config = transformers.AlbertConfig.from_pretrained(output_dir, torchscript=True)
__model = transformers.AlbertForSequenceClassification.from_pretrained(output_dir, config=__config)
__model.eval()
#model(*dummy_input)
traced_model = torch.jit.trace(__model, dummy_input)
torch.jit.save(traced_model, os.path.join(output_dir, "torchscript_out.pt"))

In [None]:
import os

# Save the model 
output_dir = os.path.join("c:/Users/jbetk/Documents/data/ml/saved_models", "semantic_comparison_pytorch")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))


In [None]:
# Save to torchscript
dummy_input = [
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
]
__config = transformers.AlbertConfig.from_pretrained(output_dir, torchscript=True)
__model = transformers.AlbertForSequenceClassification.from_pretrained(output_dir, config=__config)
__model.eval()
#model(*dummy_input)
traced_model = torch.jit.trace(__model, dummy_input)
torch.jit.save(traced_model, os.path.join(output_dir, "torchscript_out.pt"))