In [1]:
import transformers
import torch
import numpy as np
import scipy as sp
import wandb
import time
import orjson

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

fp16 = True
if fp16:
    from apex import amp

model_name = "albert-large-v2"

In [2]:
# Load a sentiment review dataset.
def split_inputs_and_outputs(data_map):
    return [torch.tensor(np.asarray(data_map['input_id']), dtype=torch.long),
           torch.tensor(np.asarray(data_map['attention_mask']), dtype=torch.float),
           torch.tensor(np.asarray(data_map['token_type_id']), dtype=torch.long)],\
           torch.tensor(np.asarray(data_map['label']), dtype=torch.float)

def split_inputs_and_outputs_distil(data_map):
    return [torch.tensor(np.asarray(data_map['input_id']), dtype=torch.long),
           torch.tensor(np.asarray(data_map['attention_mask']), dtype=torch.float)],\
           torch.tensor(np.asarray(data_map['label']), dtype=torch.float)

def split_inputs_and_outputs_gpt2(data_map):
    return torch.tensor(np.asarray(data_map['input_id']), dtype=torch.long),\
           torch.tensor(np.asarray(data_map['label']), dtype=torch.float)

def load_data(train_filename, val_filename, ldr_fn):
    training_data = orjson.loads(open(train_filename, "rb").read())
    train_x, train_y = ldr_fn(training_data)
    val_data = orjson.loads(open(val_filename, "rb").read())
    val_x, val_y = ldr_fn(val_data)
    return TensorDataset(train_x[0], train_x[1], train_x[2], train_y), TensorDataset(val_x[0], val_x[1], val_x[2], val_y)

# Load data.
train_dataset_path = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/outputs/albert/processed.json"
val_dataset_path = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/outputs/albert/validation.json"
train_dataset, val_dataset = load_data(train_dataset_path, val_dataset_path, split_inputs_and_outputs)

# Use regression loss
num_labels = 1

In [3]:
# Load a semantic comparison dataset (MSRPC)
tokenizer = transformers.AlbertTokenizer.from_pretrained(model_name)

def load_msrp_dataset(examples, processor):
    output_mode = output_modes[task]
    label_list = processor.get_labels()
    features = convert_examples_to_features(examples, 
                                            tokenizer, 
                                            label_list=label_list,
                                            max_length=128,
                                            pad_on_left=False,
                                            output_mode=output_mode,
                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                            pad_token_segment_id=0)
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    labels = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(input_ids, attention_mask, token_type_ids, labels)

# Process dataset
task = 'mrpc'
input_file = "C:\\Users\\jbetk\\Documents\\data\\ml\\text_similarity\\MSRParaphraseCorpus"
processor = processors[task]()
train_dataset = load_dataset(processor.get_train_examples(input_file), processor)
val_dataset = load_msrp_dataset(processor.get_dev_examples(input_file), processor)

num_labels=2

KeyboardInterrupt: 

In [3]:
# Load model
config = transformers.AlbertConfig.from_pretrained(model_name)
config.num_labels = num_labels
model = transformers.AlbertForSequenceClassification.from_pretrained(model_name, config=config)
device = torch.device("cuda")
cpu = torch.device("cpu")

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
#print("Parameters: ", optimizer_grouped_parameters)
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 
                                                         num_warmup_steps=0, num_training_steps=len(train_dataset))

# Shift model to cuda & enable fp16 if applicable.
model.to(device)
if fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    
# Initialize w&b logger
do_wandb = True
if do_wandb:
    wandb.init(project="nonint-transformers-torch",\
               name="albert_sentiment_analysis_torch",\
               config={"dataset": "sent_amazon_yelp"})

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


wandb: Wandb version 0.8.29 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [5]:
xfer_times = []
forward_times = []
backward_times = []
opt_times = []
sched_times = []

def compute_accuracy(_pred, _true):
    if num_labels > 0:
        return np.sum(_pred == _true) / _pred.shape[0]
    else:
        return 0 # there is no accuracy with MSE.

def clear_timers():
    xfer_times.clear()
    forward_times.clear()
    backward_times.clear()
    opt_times.clear()
    sched_times.clear()

def train_epoch(_model, _optimizer, _scheduler, _device, _dataloader, _logging_steps, _fp16=False):
    clear_timers()
    
    _epoch_iterator = tqdm(_dataloader, desc="Iteration")
    _steps = 0
    _tr_loss, _logging_loss = 0, 0
    _accuracy_accum, _accuracy_last = 0, 0
    _model.train()
    for _step, _batch in enumerate(_epoch_iterator):
        __s = time.time()
        _batch = tuple(_t.to(_device) for _t in _batch)
        _inputs = {"input_ids": _batch[0], 
                   "attention_mask": _batch[1], 
                   "token_type_ids": _batch[2], 
                   "labels": _batch[3]}
        xfer_times.append(time.time() - __s)
        
        __s = time.time()
        _outputs = _model(**_inputs)
        forward_times.append(time.time() - __s)
        
        _loss = _outputs[0]
        
        backward_time = 0
        __s = time.time()
        if fp16:
            with amp.scale_loss(_loss, _optimizer) as _scaled_loss:
                _scaled_loss.backward()
                backward_time = time.time() - __s
        else:
            _loss.backward()
            backward_time = time.time() - __s
        backward_times.append(backward_time)
        
        _tr_loss += _loss.item()
        _logits_softmax = sp.special.softmax(_outputs[1].detach().cpu().softmax(-1).numpy(), axis=-1)
        _accuracy_accum += compute_accuracy(np.argmax(_logits_softmax, axis=-1), _batch[3].cpu().numpy())
        
        if _fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(_optimizer), 1)
        else:
            torch.nn.utils.clip_grad_norm_(_model.parameters(), 1)
        
        __s = time.time()
        _optimizer.step()
        opt_times.append(time.time() - __s)
        __s = time.time()
        _scheduler.step()
        sched_times.append(time.time() - __s)
        _model.zero_grad()
        _steps += 1
        
        # Log
        if _steps % _logging_steps == 0:
            _loss_scalar = (_tr_loss - _logging_loss) / _logging_steps
            _accuracy_scalar = (_accuracy_accum - _accuracy_last) / _logging_steps
            _logging_loss = _tr_loss
            _accuracy_last = _accuracy_accum
            _logs = {}
            _logs["loss"] = _loss_scalar
            _logs["accuracy"] = _accuracy_scalar
            _logs["learning_rate"] = _scheduler.get_lr()[0]
            #print(json.dumps({**_logs, **{"step": _steps}}))
            if do_wandb:
                wandb.log(_logs)
    
def check_validation(_model, _device, _val_dataloader):
    with torch.no_grad():
        _val_iterator = tqdm(_val_dataloader, desc="Validation iteration")
        _loss = 0
        _accuracy = 0
        for _step, _batch in enumerate(_val_iterator):
            _batch = tuple(_t.to(device) for _t in _batch)
            _inputs = {"input_ids": _batch[0], 
                       "attention_mask": _batch[1], 
                       "token_type_ids": _batch[2], 
                       "labels": _batch[3]}
            _outputs = model(**_inputs)
            _loss += _outputs[0].item()
            _logits_softmax = sp.special.softmax(_outputs[1].detach().cpu().softmax(-1).numpy(), axis=-1)
            _accuracy += compute_accuracy(np.argmax(_logits_softmax, axis=-1), _batch[3].cpu().numpy())
        _loss_computed = _loss/len(_val_dataloader)
        _acc_computed = _accuracy/len(_val_dataloader)
        print("Validation loss %f, accuracy=%f" % (_loss_computed, _acc_computed))
        if do_wandb:
            wandb.log({'val_loss': _loss_computed, 'val_accuracy': _acc_computed})

LOGGING_STEPS = 5
EPOCHS = 2
BATCH_SIZE = 24

print("***** Running training *****")
print("  Num examples = %d" % (len(train_dataset)))
print("  Num Epochs = %d" % (EPOCHS))
print("  Total optimization steps = %d" % (len(train_dataset)))

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

model.zero_grad()
for _ in range(EPOCHS):
    train_epoch(model, optimizer, scheduler, device, train_dataloader, LOGGING_STEPS, _fp16=True)
    check_validation(model, device, val_dataloader)


***** Running training *****
  Num examples = 2625860
  Num Epochs = 2
  Total optimization steps = 2625860
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 512.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 256.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 128.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 64.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping ste

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=109411.0, style=ProgressStyle(description…

wandb: Wandb version 0.8.29 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
requests_with_retry encountered retryable exception: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)). args: ('https://api.wandb.ai/files/neonbjb/nonint-transformers-torch/tcl01e2d/file_stream',), kwargs: {'json': {'files': {'wandb-history.jsonl': {'offset': 9352, 'content': ['{"loss": 0.36578323841094973, "accuracy": 0.0, "learning_rate": 1.9643773849329365e-05, "_runtime": 20612.412152528763, "_timestamp": 1583732541.9383655, "_step": 9352}\n', '{"loss": 0.40464792847633363, "accuracy": 0.0, "learning_rate": 1.9643735766567906e-05, "_runtime": 20614.615302562714, "_timestamp": 1583732544.1415155, "_step": 9353}\n', '{"loss": 0.5400665819644928, "accuracy": 0.0, "learning_rate": 1.964369768380645e-05, "_runtime": 20616.81369972229, "_timestamp": 1583732546.3399127, "_step": 9354}\n', '{"loss": 0

HBox(children=(FloatProgress(value=0.0, description='Validation iteration', max=167.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=109411.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Validation iteration', max=167.0, style=ProgressStyle(des…

In [8]:
from scipy.special import softmax

def infer(_model, _sentence1, _sentence2=None, softmax=True):
    features = [tokenizer.encode_plus(text=_sentence1, text_pair=_sentence2, max_length=128, pad_to_max_length=True)]
    _inputs = {"input_ids": torch.tensor([f['input_ids'] for f in features], dtype=torch.long).to(device), 
               "attention_mask": torch.tensor([f['attention_mask'] for f in features], dtype=torch.long).to(device), 
               "token_type_ids": torch.tensor([f['token_type_ids'] for f in features], dtype=torch.long).to(device)}
    with torch.no_grad():
        _outputs = model(**_inputs)
        if softmax: 
            return softmax(_outputs[0].cpu().numpy())
        else:
            return
    
print(infer(model, "The man and the woman went to the store", softmax=False))
print(infer(model, "I love it!", softmax=False))
print(infer(model, "I hated it.", softmax=False))

[[1.]]
[[1.]]
[[1.]]


In [7]:
import os

tokenizer = transformers.AlbertTokenizer.from_pretrained(model_name)

# Save the model 
output_dir = os.path.join("c:/Users/jbetk/Documents/data/ml/saved_models", "sentiment_analysis_albert_pytorch")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))


In [None]:
# Save to torchscript
dummy_input = [
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
    torch.zeros(1, 128, dtype=torch.long),
]
__config = transformers.AlbertConfig.from_pretrained(output_dir, torchscript=True)
__model = transformers.AlbertForSequenceClassification.from_pretrained(output_dir, config=__config)
__model.eval()
#model(*dummy_input)
traced_model = torch.jit.trace(__model, dummy_input)
torch.jit.save(traced_model, os.path.join(output_dir, "torchscript_out.pt"))