In [1]:
import orjson
import transformers
import torch
import numpy as np
import wandb
import time

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm

# Toggle for mixed-precision training. When enabled, apex's `amp` module is imported here; later cells use
# it to wrap the model/optimizer and to scale the loss during the backward pass.
fp16 = True
if fp16:
    from apex import amp

# Name of the pretrained checkpoint passed to the `from_pretrained` loaders for the tokenizer, config and model.
model_name = "xlnet-base-cased"

In [2]:
def load_dataset(input_file):
    """Load a pre-tokenized JSON file and build one TensorDataset per group of sequences.

    The root JSON value is a list of lists of features. Each inner list groups sequences of like length
    so that they can be batched together. Every feature is a dictionary holding a 'text' encoding
    ('input_ids', 'attention_mask', 'token_type_ids') and a 'title' encoding ('input_ids'); the goal of
    the model is to predict the 'title' given the 'text'.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        A tuple `(datasets, count)` where `datasets` is a list of TensorDatasets, one per inner list,
        each yielding `(input_ids, attention_mask, token_type_ids, title_input_ids)`, and `count` is the
        total number of features across all groups.
    """
    # Read through a context manager so the file handle is closed deterministically instead of being
    # left for the garbage collector.
    with open(input_file, "rb") as json_file:
        data = orjson.loads(json_file.read())

    datasets = []
    count = 0
    for features in data:
        # Text and title are processed out together; the model trainer does the rest of the work.
        input_ids = torch.tensor([f['text']['input_ids'] for f in features], dtype=torch.long)
        attention_mask = torch.tensor([f['text']['attention_mask'] for f in features], dtype=torch.float)
        token_type_ids = torch.tensor([f['text']['token_type_ids'] for f in features], dtype=torch.long)
        title_input_ids = torch.tensor([f['title']['input_ids'] for f in features], dtype=torch.long)
        datasets.append(TensorDataset(input_ids, attention_mask, token_type_ids, title_input_ids))
        count += len(features)
    return datasets, count

# Process dataset
# Both splits are read from the same hard-coded local folder. `load_dataset` returns a list of TensorDatasets
# (one per group of like-length sequences) together with the total number of examples across all groups.
# NOTE(review): the folder is an absolute, machine-specific Windows path — adjust before running elsewhere.
input_folder = "C:\\Users\\jbetk\\Documents\\data\\ml\\title_prediction\\outputs\\"
train_datasets, total_train_data_sz = load_dataset(input_folder + "processed.json")
val_datasets, total_val_data_sz = load_dataset(input_folder + "validation.json")

In [3]:
EPOCHS = 1
BATCH_SIZE = 4

# Load model
tokenizer = transformers.XLNetTokenizer.from_pretrained(model_name)
config = transformers.XLNetConfig.from_pretrained(model_name)
model = transformers.XLNetLMHeadModel.from_pretrained(model_name, config=config)
device = torch.device("cuda")
cpu = torch.device("cpu")

# Parameters are split into "decay" and "no decay" groups by name. Both groups currently use a weight decay
# of 0, so the split has no effect until the first group's value is raised.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)

# The scheduler is stepped once per *batch*, so its horizon must be the number of batches, not the number of
# examples. Each dataset is iterated by its own DataLoader without dropping the last partial batch, hence the
# per-dataset ceiling division.
train_steps_per_epoch = sum((len(ds) + BATCH_SIZE - 1) // BATCH_SIZE for ds in train_datasets)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=EPOCHS * train_steps_per_epoch,
)

# Shift model to cuda & enable fp16 if applicable.
model.to(device)
if fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# Initialize w&b logger
do_wandb = True
if do_wandb:
    wandb.init(
        project="nonint-transformers-torch",
        name="xlnet_title_prediction_allin",
        config={"dataset": "title_pred"},
    )
    # wandb.watch(model) is intentionally left disabled: it appeared to be buggy in this setup and is only
    # needed for wandb's gradient exploration view.
    #wandb.watch(model)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


wandb: Wandb version 0.8.29 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [4]:
# Wall-clock timings (in seconds) collected per training step, one list per phase of the step.
preprocess_times = []
xfer_times = []
forward_times = []
backward_times = []
opt_times = []
sched_times = []

def clear_timers():
    """Empty every timing list in place so a new epoch starts with fresh measurements.

    The lists are cleared rather than rebound so that other code holding a reference to them keeps seeing
    the same objects. `preprocess_times` is included: it is appended to on every training step just like
    the other timers.
    """
    for _timer in (preprocess_times, xfer_times, forward_times, backward_times, opt_times, sched_times):
        _timer.clear()

def prepare_inputs(_batched_inputs, _max_title_len, _device):
    """Extend a batch of text encodings with mask slots for the title and build the model's keyword inputs.

    Args:
        _batched_inputs: Indexable batch of four tensors: text input ids, text attention mask, text token
            type ids (each 2-D, `(batch, seq_len)`) and the title input ids, which are passed through
            unchanged as the labels.
        _max_title_len: Number of mask tokens appended to every sequence, i.e. the number of positions the
            model is asked to predict. Presumably this must equal the width of the labels tensor — verify
            against the data preparation.
        _device: Device every resulting tensor is moved to.

    Returns:
        A dict with the keys "input_ids", "attention_mask", "token_type_ids", "target_mapping" and
        "labels", all on `_device`.

    Side effects:
        Appends the time spent on the device transfer to the module-level `xfer_times` list. Reads the
        mask token id from the module-level `tokenizer`.
    """
    with torch.no_grad():
        _input_ids = _batched_inputs[0]
        _attention_masks = _batched_inputs[1]
        _token_types = _batched_inputs[2]

        _batch_sz = _input_ids.shape[0]
        _seq_len = _input_ids.shape[-1]
        _title_shape = (_batch_sz, _max_title_len)

        # For the input_ids, append on _max_title_len masks: these are the slots to be predicted.
        _input_ids = torch.cat(
            [_input_ids, torch.full(_title_shape, tokenizer.mask_token_id, dtype=torch.long)], dim=-1)
        # For the attention mask, just add all 1s because this is not padding.
        _attention_masks = torch.cat([_attention_masks, torch.ones(_title_shape, dtype=torch.float)], dim=-1)
        # For token type IDs, also all 1s since this is the "second sentence".
        _token_types = torch.cat([_token_types, torch.ones(_title_shape, dtype=torch.long)], dim=-1)

        # Target mapping shared by all inputs: prediction i reads from the i-th appended mask slot, i.e.
        # position _seq_len + i. That is an identity block placed after the text columns, written in one
        # vectorised assignment (broadcast over the batch) instead of a Python loop per element.
        _target_mapping = torch.zeros((_batch_sz, _max_title_len, _seq_len + _max_title_len), dtype=torch.float)
        _target_mapping[:, :, _seq_len:] = torch.eye(_max_title_len, dtype=torch.float)

        _inputs = {"input_ids": _input_ids,
                   "attention_mask": _attention_masks,
                   "token_type_ids": _token_types,
                   "target_mapping": _target_mapping,
                   "labels": _batched_inputs[3]}

        # Don't forget to send all these tensors to the device; only this transfer is timed.
        _xfer_start = time.time()
        _inputs = {_key: _tensor.to(_device) for _key, _tensor in _inputs.items()}
        xfer_times.append(time.time() - _xfer_start)

    return _inputs

def train_epoch(_model, _optimizer, _scheduler, _device, _dataloader, _max_title_len, _fp16):
    """Train `_model` for one pass over `_dataloader`, taking an optimizer and scheduler step per batch.

    Args:
        _model: Model to train; called with the keyword inputs built by `prepare_inputs`. Its first
            output is used as the loss.
        _optimizer: Optimizer stepped after every batch.
        _scheduler: Learning-rate scheduler stepped after every optimizer step.
        _device: Device the batch tensors are moved to.
        _dataloader: Iterable of batches in the layout expected by `prepare_inputs`.
        _max_title_len: Number of mask slots appended to each sequence (forwarded to `prepare_inputs`).
        _fp16: When true, the loss is scaled through apex amp and gradients are clipped on amp's master
            parameters; otherwise plain backward and clipping on the model parameters are used.

    Side effects:
        Resets and then fills the module-level timing lists, and logs the per-step loss and learning
        rate to wandb when the module-level `do_wandb` flag is set.
    """
    clear_timers()

    _epoch_iterator = tqdm(_dataloader, desc="Iteration")
    _model.train()

    for _batch in _epoch_iterator:
        _started = time.time()
        _inputs = prepare_inputs(_batch, _max_title_len, _device)
        preprocess_times.append(time.time() - _started)

        # Forward. Call the module itself (not .forward) so registered hooks run, and index the loss so
        # that any additional model outputs beyond (loss, logits) do not break the unpacking.
        _started = time.time()
        _loss = _model(**_inputs)[0]
        forward_times.append(time.time() - _started)

        # Backwards. Follow the `_fp16` argument rather than the notebook-level flag, so that this branch
        # always agrees with the clipping branch below.
        _started = time.time()
        if _fp16:
            with amp.scale_loss(_loss, _optimizer) as _scaled_loss:
                _scaled_loss.backward()
                # Measured inside the context so that amp's work on exit is not counted as backward time.
                _backward_time = time.time() - _started
        else:
            _loss.backward()
            _backward_time = time.time() - _started
        backward_times.append(_backward_time)

        # Update weights; gradients are clipped to a maximum norm of 1 first.
        _clip_params = amp.master_params(_optimizer) if _fp16 else _model.parameters()
        torch.nn.utils.clip_grad_norm_(_clip_params, 1)
        _started = time.time()
        _optimizer.step()
        opt_times.append(time.time() - _started)
        _started = time.time()
        _scheduler.step()
        sched_times.append(time.time() - _started)

        _model.zero_grad()

        # Always log. The learning rate is read from the optimizer, which holds the value the scheduler
        # has just set for the first parameter group.
        _logs = {
            "loss": _loss.item(),
            "learning_rate": _optimizer.param_groups[0]["lr"],
        }
        if do_wandb:
            wandb.log(_logs)


def validate_epoch(_model, _device, _dataloader, _max_title_len):
    """Compute the summed loss of `_model` over `_dataloader` without updating any weights.

    Args:
        _model: Model to evaluate; called with the keyword inputs built by `prepare_inputs`. Its first
            output is used as the loss.
        _device: Device the batch tensors are moved to.
        _dataloader: Iterable of batches in the layout expected by `prepare_inputs`.
        _max_title_len: Number of mask slots appended to each sequence (forwarded to `prepare_inputs`).

    Returns:
        A tuple `(total_loss, steps)`: the sum of the per-batch losses and the number of batches seen.
        Both are 0 when the dataloader yields nothing.
    """
    _epoch_iterator = tqdm(_dataloader, desc="Iteration")
    _steps = 0
    _total_loss = 0

    # Validate in evaluation mode so that train-only layers such as dropout do not perturb the loss, then
    # put the model back into whatever mode the caller had it in.
    _was_training = _model.training
    _model.eval()
    try:
        for _batch in _epoch_iterator:
            _inputs = prepare_inputs(_batch, _max_title_len, _device)
            with torch.no_grad():
                # Index the loss so additional model outputs do not break the unpacking.
                _loss = _model(**_inputs)[0]

            _steps += 1
            _total_loss += _loss.item()
    finally:
        _model.train(_was_training)
    return _total_loss, _steps

print("***** Running training *****")

def full_validate(max_title_len=64):
    """Run validation over every validation dataset and report the mean per-batch loss.

    Each entry of the module-level `val_datasets` is wrapped in its own DataLoader (batch size
    `BATCH_SIZE`) and evaluated with `validate_epoch`; the summed losses and batch counts are combined
    into a single average, which is printed and, when `do_wandb` is set, logged to wandb as "val_loss".

    Args:
        max_title_len: Number of mask slots appended to each sequence, forwarded to `validate_epoch`.
            Defaults to 64, the value used throughout this notebook.

    Returns:
        None. When no validation batch was processed, nothing is logged (there is no mean to report).
    """
    print("Running validation..")
    combined_val_steps, combined_val_loss = 0, 0
    for val_dataset in val_datasets:
        val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
        bucket_loss, bucket_steps = validate_epoch(model, device, val_dataloader, max_title_len)
        combined_val_steps += bucket_steps
        combined_val_loss += bucket_loss

    # Without any batches the average is undefined; skip reporting instead of dividing by zero.
    if combined_val_steps == 0:
        print("No validation batches found; skipping validation logging.")
        return

    mean_val_loss = combined_val_loss / combined_val_steps
    if do_wandb:
        wandb.log({"val_loss": mean_val_loss})
    print("Validation loss averaged over %i steps: %f" % (int(combined_val_steps), mean_val_loss))

# Start from clean gradients before the first optimizer step.
model.zero_grad()
for _ in range(EPOCHS):
    # `train_datasets` is a list of TensorDatasets (one per group of like-length sequences); each group is
    # trained on in turn with its own DataLoader.
    for train_dataset in train_datasets:
        # A full validation pass over all validation datasets runs before every training group.
        full_validate()
        # Examples are shuffled within the current group only.
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
        
        # 64 is passed as `_max_title_len`, the number of mask slots appended to each sequence.
        train_epoch(model, optimizer, scheduler, device, train_dataloader, 64, fp16)
    

***** Running training *****
Running validation..




















HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8.0, style=ProgressStyle(description_widt…

RuntimeError: CUDA out of memory. Tried to allocate 1.12 GiB (GPU 0; 11.00 GiB total capacity; 5.59 GiB already allocated; 674.74 MiB free; 8.04 GiB reserved in total by PyTorch)

In [None]:
import os

# Save the model, tokenizer and training state so that training can be resumed or the model reloaded.
# NOTE(review): the output location is an absolute, machine-specific path — adjust before running elsewhere.
output_dir = os.path.join("c:/Users/jbetk/Documents/data/ml/saved_models", "xlnet_title_generation")
# exist_ok avoids the check-then-create race of testing for the directory first.
os.makedirs(output_dir, exist_ok=True)

# Take care of distributed/parallel training: unwrap the underlying model if it sits behind `.module`.
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Save the raw state dict from the same unwrapped model, so its keys match what `save_pretrained` wrote
# and carry no wrapper prefix.
torch.save(model_to_save.state_dict(), os.path.join(output_dir, "model.pt"))
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

print("Save completed.")
