In [1]:
from lr.models.transformers.util import load_and_cache_examples
from torch.utils.data import TensorDataset
import logging
import os
import torch
import random
import numpy as np


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def set_seed(seed, n_gpu):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Value passed to Python's ``random``, NumPy and PyTorch.
        n_gpu: Number of GPUs in use; the CUDA generators are seeded only
            when this is greater than zero.
    """
    # The three CPU-side generators take the seed the same way.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


# Fix the seeds up front; n_gpu=0 skips the CUDA generators.
set_seed(42, n_gpu=0)

### Params

In [3]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

# Settings handed to load_and_cache_examples in the next cells.
# NOTE(review): a later cell rebinds `hyperparams` with max_seq_length=128
# while this one uses 150 — confirm which value is intended.
hyperparams = {
    "local_rank": -1,
    "max_seq_length": 150,
    "overwrite_cache": False,
    "cached_path": "data/toy/",
    "train_path": "data/toy/train.csv",
    "dev_path": "data/toy/dev.csv",
}

# Tokenizer and sequence classifier are loaded from the same checkpoint name;
# the classifier is configured for three labels.
pretrained_weights = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertForSequenceClassification.from_pretrained(
    pretrained_weights,
    num_labels=3,
)

## Creating features

In [4]:
# Build the training dataset from the settings in `hyperparams` using `tokenizer`.
# NOTE(review): the helper name suggests features are cached (presumably under
# hyperparams["cached_path"]) — confirm in lr.models.transformers.util.
train_dataset = load_and_cache_examples(hyperparams, tokenizer)

In [5]:
# Same helper as for the training set, called with evaluate=True.
# NOTE(review): presumably this selects hyperparams["dev_path"] — confirm in the helper.
dev_dataset = load_and_cache_examples(hyperparams, tokenizer, evaluate=True)

## Train

In [6]:
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm, trange

# params
# Objects this cell expects from earlier cells. The self-assignments change
# nothing; they only raise NameError right away if a prerequisite cell was
# not run.

train_dataset = train_dataset
model = model
tokenizer = tokenizer

# Training configuration for this cell; it replaces the smaller `hyperparams`
# dict defined in an earlier cell.
# NOTE(review): max_seq_length is 128 here but 150 in the earlier dict that
# was passed to load_and_cache_examples — confirm which value is intended.
# NOTE(review): n_gpu is 1 although no_cuda is True and device is "cpu" —
# confirm whether n_gpu should be 0 for a CPU run.
hyperparams = {"local_rank": -1,
               "max_seq_length": 128,
               "overwrite_cache": False,
               "cached_path":"data/toy/",
               "train_path": "data/toy/train.csv",
               "dev_path":"data/toy/dev.csv",
               "num_train_epochs":3.0,
               "per_gpu_train_batch_size":8,
               "per_gpu_eval_batch_size":8,
               "gradient_accumulation_steps": 1,
               "learning_rate":5e-5,
               "weight_decay":0.0,
               "adam_epsilon": 1e-8,
               "max_grad_norm": 1.0,
               "max_steps": -1,
               "warmup_steps": 0,
               "save_steps": 500,
               "no_cuda":True,
               "n_gpu":1,
               "model_name_or_path":"bert",
               "output_dir":"bert",
               "random_state": 1234,
               "fp16":False,
               # apex opt levels are spelled with the letter "O" ("O0".."O3"),
               # not the digit zero.
               "fp16_opt_level":"O1",
               "device":"cpu",
               "model_type": "bert"}




               
# script
# Pull the settings used below out of `hyperparams` into plain variables.

# Hardware / distribution
local_rank = hyperparams["local_rank"]
n_gpu = hyperparams["n_gpu"]
device = hyperparams["device"]

# Batching and schedule length
per_gpu_train_batch_size = hyperparams["per_gpu_train_batch_size"]
gradient_accumulation_steps = hyperparams["gradient_accumulation_steps"]
num_train_epochs = hyperparams["num_train_epochs"]
max_steps = hyperparams["max_steps"]
warmup_steps = hyperparams["warmup_steps"]

# Optimizer
learning_rate = hyperparams["learning_rate"]
weight_decay = hyperparams["weight_decay"]
adam_epsilon = hyperparams["adam_epsilon"]

# Mixed precision
fp16 = hyperparams["fp16"]
fp16_opt_level = hyperparams["fp16_opt_level"]

# Model identity, reproducibility and output location
model_type = hyperparams["model_type"]
seed = hyperparams["random_state"]
output_dir = hyperparams["output_dir"]

# Saved optimizer / scheduler state is looked up next to the model.
model_name_or_path = hyperparams["model_name_or_path"]
opt_path, sche_path = (
    os.path.join(model_name_or_path, state_file)
    for state_file in ("optimizer.pt", "scheduler.pt")
)



# Batch size per DataLoader step: the per-GPU size times the number of GPUs,
# with a floor of one device.
train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)

# local_rank == -1 means no distributed training: shuffle with RandomSampler.
# Otherwise use a DistributedSampler for the dataset.
if local_rank == -1:
    train_sampler = RandomSampler(train_dataset)
else:
    # Imported here because only distributed runs need it. The sampler must be
    # bound to `train_sampler`; the DataLoader below reads that name.
    from torch.utils.data.distributed import DistributedSampler
    train_sampler = DistributedSampler(train_dataset)


train_dataloader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=train_batch_size)

# Optimizer updates per pass over the data (batches grouped by accumulation).
updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps

# A positive max_steps fixes the total number of updates and derives the
# epoch count from it; otherwise the total follows from the epoch count.
if max_steps > 0:
    t_total = max_steps
    num_train_epochs = max_steps // updates_per_epoch + 1
else:
    t_total = updates_per_epoch * num_train_epochs
    
    
# The training loop moves every batch to `device`, so the model has to live on
# the same device. Do it before the optimizer is built so the optimizer and any
# restored state refer to parameters on the right device. With device "cpu"
# this is a no-op.
model.to(device)

# Prepare optimizer and schedule (linear warmup and decay)
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]

optimizer_grouped_parameters = [
    {
        # Everything else is decayed with the configured weight_decay.
        "params": [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]


optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate,
                  eps=adam_epsilon)

# Learning-rate schedule with `warmup_steps` warmup steps over `t_total` steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=t_total)

# Restore optimizer and scheduler state, but only when both files exist.
if all(os.path.isfile(state_path) for state_path in (opt_path, sche_path)):
    optimizer.load_state_dict(torch.load(opt_path))
    scheduler.load_state_dict(torch.load(sche_path))

# Optional mixed-precision training through NVIDIA apex.
if fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

# Both wrappers below have to come after the apex fp16 initialization.

# Multi-GPU training on a single process.
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Distributed training, one process per local_rank.
if local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
        find_unused_parameters=True,
    )
    
# Train!
# The world size is only queried when running distributed (local_rank != -1).
world_size = torch.distributed.get_world_size() if local_rank != -1 else 1
total_train_batch_size = train_batch_size * gradient_accumulation_steps * world_size

# Summary of the run, sent to the root logger at INFO level.
logging.info("***** Running training *****")
logging.info("  Num examples = %d", len(train_dataset))
logging.info("  Num Epochs = %d", num_train_epochs)
logging.info("  Instantaneous batch size per GPU = %d", per_gpu_train_batch_size)
logging.info(
    "  Total train batch size (w. parallel, distributed & accumulation) = %d",
    total_train_batch_size,
)
logging.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
logging.info("  Total optimization steps = %d", t_total)

# Progress counters; they stay at zero unless a checkpoint is being resumed.
global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0

# Check if continuing training from a checkpoint.
# A membership test is used instead of `find(...) > 0`, which was False for
# paths that *start* with "checkpoints" (find returns 0 there).
if os.path.exists(model_name_or_path) and "checkpoints" in model_name_or_path:
    try:
        # Set global_step to the global step of the last saved checkpoint,
        # read from the path: the text after the last "-" up to the next "/".
        global_step = int(model_name_or_path.split("-")[-1].split("/")[0])
    except ValueError:
        # The path carries no numeric step suffix: keep the counters at zero.
        logging.info(
            "  No global step found in %s, starting from step 0",
            model_name_or_path)
    else:
        updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps
        epochs_trained = global_step // updates_per_epoch
        steps_trained_in_current_epoch = global_step % updates_per_epoch

        logging.info(
            "  Continuing training from checkpoint, will skip to saved global_step")
        logging.info("  Continuing training from epoch %d", epochs_trained)
        logging.info("  Continuing training from global step %d", global_step)
        logging.info(
            "  Will skip the first %d steps in the first epoch",
            steps_trained_in_current_epoch)

tr_loss = logging_loss = 0.0
model.zero_grad()
set_seed(seed, n_gpu=n_gpu)  # Added here for reproducibility
train_iterator = trange(
    epochs_trained,
    int(num_train_epochs),
    desc="Epoch",
    disable=local_rank not in (-1, 0),
)

# Both loops end in `break`, so exactly one batch goes through the model
# (after any skipped steps); no backward pass or optimizer step is run here.
for _ in train_iterator:
    epoch_iterator = tqdm(
        train_dataloader,
        desc="Iteration",
        disable=local_rank not in (-1, 0),
    )
    for step, batch in enumerate(epoch_iterator):
        # Skip past any already trained steps if resuming training
        if steps_trained_in_current_epoch > 0:
            steps_trained_in_current_epoch -= 1
            continue

        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = dict(input_ids=batch[0],
                      attention_mask=batch[1],
                      labels=batch[3])
        # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
        if model_type != "distilbert":
            if model_type in ("bert", "xlnet", "albert"):
                inputs["token_type_ids"] = batch[2]
            else:
                inputs["token_type_ids"] = None

        outputs = model(**inputs)
        loss = outputs[0]  # the first output is used as the loss
        break
    break



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/100 [00:00<?, ?it/s][A

In [7]:
# Show the loss of the single forward pass from the previous cell as a plain number.
loss.item()

1.177870512008667

In [8]:
# if os.path.exists("example.log"):
#     os.remove("example.log")
    