In [1]:
import os
# Configure CUDA through environment variables; both are set before torch is
# imported in the next cell.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",  # specify which GPU(s) to be used
    }
)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and enable mode 2, which re-imports
# modules before each cell runs so edits to local .py files are picked up.
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import shutil
import random

import numpy as np
import torch
from seqeval.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from tensorboardX import SummaryWriter
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from utils_seq_labeling import (
    convert_examples_to_features,
    get_labels,
    read_examples_from_file,
)
from modeling_layoutlm import LayoutLMForTokenClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (
    WEIGHTS_NAME,
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
)
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
from transformers import (
    DistilBertConfig,
    DistilBertForTokenClassification,
    DistilBertTokenizer,
)

# Logger for this notebook/module.
logger = logging.getLogger(__name__)

# Flat tuple of every key found in the `pretrained_config_archive_map` of the
# BERT, RoBERTa and DistilBERT config classes, in that order.
ALL_MODELS = tuple(
    shortcut_name
    for config_cls in (BertConfig, RobertaConfig, DistilBertConfig)
    for shortcut_name in config_cls.pretrained_config_archive_map.keys()
)

In [10]:
from modeling_layoutlm_2 import LayoutLMForTokenClassification

In [4]:
# Registry mapping a model-type key to its
# (config class, token-classification model class, tokenizer class) triple.
# The "layoutlm" entry reuses the BERT config and tokenizer classes.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForTokenClassification, BertTokenizer),
    roberta=(RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
    distilbert=(
        DistilBertConfig,
        DistilBertForTokenClassification,
        DistilBertTokenizer,
    ),
    layoutlm=(BertConfig, LayoutLMForTokenClassification, BertTokenizer),
)

def set_seed(args):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        args: object exposing ``seed`` (the seed value) and ``n_gpu``
            (number of GPUs in use).

    The CUDA generators are seeded as well, but only when ``args.n_gpu``
    is greater than zero.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if args.n_gpu <= 0:
        return
    torch.cuda.manual_seed_all(seed)

In [5]:
from run_seq_labeling import load_and_cache_examples

In [6]:
# Declare the script-style options, then parse an empty argument list so that
# every option simply takes its default value inside the notebook.
parser = argparse.ArgumentParser()

# --- "Required" parameters -------------------------------------------------
# Labelled as required, but each one is registered with required=False and a
# default so that no command line is needed.
parser.add_argument(
    "--data_dir",
    type=str,
    required=False,
    default='data_xt',
    help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
)
parser.add_argument(
    "--model_type",
    type=str,
    required=False,
    default='layoutlm',
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
# Alternative defaults tried for --model_name_or_path:
#   'pretrained/layoutlm-base-uncased', 'pretrained/model'
parser.add_argument(
    "--model_name_or_path",
    type=str,
    required=False,
    default='pretrained/layoutlm-large-uncased',
    help="Path to pre-trained model or shortcut name selected in the list: "
    + ", ".join(ALL_MODELS),
)
parser.add_argument(
    "--output_dir",
    type=str,
    required=False,
    default='pretrained/model',
    help="The output directory where the model predictions and checkpoints will be written.",
)

# --- Other parameters: paths and names ---------------------------------------
parser.add_argument(
    "--labels",
    type=str,
    default="data_xt/labels.txt",
    help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
)
parser.add_argument(
    "--config_name",
    type=str,
    default="",
    help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
    "--tokenizer_name",
    type=str,
    default="",
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--cache_dir",
    type=str,
    default="",
    help="Where do you want to store the pre-trained models downloaded from s3",
)

# --- Other parameters: tokenization and batching -----------------------------
parser.add_argument(
    "--max_seq_length",
    type=int,
    default=512,
    help="The maximum total input sequence length after tokenization. Sequences longer "
    "than this will be truncated, sequences shorter will be padded.",
)
# NOTE: store_true combined with default=True means this option is always True.
parser.add_argument(
    "--do_lower_case",
    default=True,
    action="store_true",
    help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
    "--per_gpu_eval_batch_size",
    type=int,
    default=2,
    help="Batch size per GPU/CPU for evaluation.",
)

# --- Other parameters: runtime switches --------------------------------------
parser.add_argument(
    "--no_cuda",
    action="store_true",
    help="Avoid using CUDA when available",
)
parser.add_argument(
    "--overwrite_output_dir",
    action="store_true",
    help="Overwrite the content of the output directory",
)
parser.add_argument(
    "--overwrite_cache",
    action="store_true",
    help="Overwrite the cached training and evaluation sets",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="random seed for initialization",
)

# --- Other parameters: mixed precision and distributed -----------------------
# NOTE: store_true combined with default=True means this option is always True
# after parsing.
parser.add_argument(
    "--fp16",
    default=True,
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
    "--fp16_opt_level",
    default="O1",
    type=str,
    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    "See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument(
    "--local_rank",
    default=-1,
    type=int,
    help="For distributed training: local_rank",
)

# An explicit empty argument list: nothing is read from sys.argv.
args = parser.parse_args([])

In [8]:
# Notebook-specific settings stored on `args`: training hyper-parameters that
# the parser does not declare, followed by overrides of some parsed defaults.
# Plain attribute assignment is used because every name is a literal.

# Batching and schedule length.
# (An earlier `12 * 2` assignment to per_gpu_train_batch_size was overwritten
# on the very next line, so only the effective value is kept.)
args.per_gpu_train_batch_size = 2 * 1
args.max_steps = -1  # values <= 0 make the epoch count drive the schedule
args.gradient_accumulation_steps = 1
args.num_train_epochs = 100.0

# Optimizer and scheduler.
args.weight_decay = 0.0
args.learning_rate = 5e-5
args.adam_epsilon = 1e-8
args.warmup_steps = 0
args.max_grad_norm = 1.0

# Logging / checkpoint cadence.
args.logging_steps = 50
args.save_steps = 50

# Overrides of values already produced by the parser.
args.output_dir = 'output'
args.max_seq_length = 512
args.local_rank = -1
args.n_gpu = 1
args.no_cuda = True
args.fp16 = False
args.do_lower_case = True
args.labels = 'data_xt/labels.txt'
args.data_dir = 'data_xt'

In [6]:
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    # Non-distributed mode: use the first GPU only when CUDA is both
    # available and not disabled through args.no_cuda; otherwise use the CPU.
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    )
    if device.type == "cuda":
        torch.cuda.set_device(device)
        args.n_gpu = torch.cuda.device_count()
    else:
        # CPU fallback (no_cuda requested, or no CUDA device available).
        # torch.cuda.set_device() must not be called with a CPU device, so it
        # is skipped and the GPU count is recorded as zero.
        args.n_gpu = 0
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
# Expose the chosen device on args for the cells that follow.
args.device = device

# Setup logging
# The log file lives inside args.output_dir; create that directory first,
# because opening the log file fails when its parent directory is missing.
os.makedirs(args.output_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(args.output_dir, "train.log"),
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    # INFO when local_rank is -1 or 0, WARN for every other rank.
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
# Emitted at WARNING level so it is recorded under either log level above.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    args.local_rank,
    device,
    args.n_gpu,
    bool(args.local_rank != -1),
    args.fp16,
)

# Seed all RNGs before anything random happens
set_seed(args)

# Prepare CONLL-2003 task
labels = get_labels(args.labels)
num_labels = len(labels)
# Padding positions are labelled with CrossEntropyLoss's default ignore_index,
# so that only real label ids contribute to the loss later
pad_token_label_id = CrossEntropyLoss().ignore_index

# Load pretrained model and tokenizer
if args.local_rank not in (-1, 0):
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# An empty config_name / tokenizer_name falls back to model_name_or_path, and
# an empty cache_dir is passed on as None.
config = config_class.from_pretrained(
    args.config_name or args.model_name_or_path,
    num_labels=num_labels,
    cache_dir=args.cache_dir or None,
)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=args.cache_dir or None,
)
model = model_class.from_pretrained(
    args.model_name_or_path,
    from_tf=".ckpt" in args.model_name_or_path,
    config=config,
    cache_dir=args.cache_dir or None,
)

if args.local_rank == 0:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

# Trailing semicolon suppresses the notebook's echo of the returned module.
model.to(args.device);

In [7]:
# Build (or load from cache) the dataset for the "train" split.
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")

- Show some batch elements

In [12]:
# Look at the first item yielded by the dataset only (the loop breaks after
# one pass) and print the largest and smallest value of bbox[:, 2] - bbox[:, 0]
# (presumably x1 - x0, i.e. the box width -- confirm against the feature code).
for step, batch in enumerate(train_dataset):
    # Only the "layoutlm" model type keeps the fifth tensor; others keep four.
    batch = tuple(
        t.to(args.device)
        for t in (batch if args.model_type == "layoutlm" else batch[:4])
    )

    inputs = dict(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
    if args.model_type == "layoutlm":
        inputs["bbox"] = batch[4]
    if args.model_type != "distilbert":
        # XLM and RoBERTa don't use segment_ids
        uses_segment_ids = args.model_type in ["bert", "xlnet", "layoutlm"]
        inputs["token_type_ids"] = batch[2] if uses_segment_ids else None

    bbox = inputs['bbox']
    box_widths = bbox[:, 2] - bbox[:, 0]
    print(box_widths.max(), box_widths.min())
    break

tensor(168) tensor(0)


In [6]:
# Read the examples of the "train" split from args.data_dir for inspection.
mode="train"
examples = read_examples_from_file(args.data_dir, mode)

In [7]:
# Display the first three words and the first three labels of the first example.
tuple(getattr(examples[0], field)[:3] for field in ("words", "labels"))

(['HARDOUIN', 'Edwige', 'PERMIS'],
 ['B-IDENTITY_GROUP', 'E-IDENTITY_GROUP', 'B-IDENTITY_GROUP'])

- ### Train the model

In [18]:
""" Train the model """
# A TensorBoard writer is created only when local_rank is -1 or 0.
if args.local_rank in (-1, 0):
    tb_writer = SummaryWriter()

# Batch size of this process: the per-GPU size times the GPU count (at least 1).
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

# Random sampling when local_rank is -1, a distributed sampler otherwise.
if args.local_rank == -1:
    train_sampler = RandomSampler(train_dataset)
else:
    train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
)

# Optimizer updates performed during one pass over the dataloader.
updates_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps

if args.max_steps > 0:
    # A positive max_steps fixes the total and derives the epoch count from it.
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // updates_per_epoch + 1
else:
    # Otherwise the epoch count determines the total number of updates.
    t_total = updates_per_epoch * args.num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay)
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Split the parameters into the two groups in a single pass, preserving the
# order in which model.named_parameters() yields them.
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {"params": decayed_params, "weight_decay": args.weight_decay},
    {"params": undecayed_params, "weight_decay": 0.0},
]
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=args.learning_rate,
    eps=args.adam_epsilon,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args.warmup_steps,
    num_training_steps=t_total,
)
# The training cells guard every `amp.*` call with `args.fp16` alone, while apex
# is only imported here when CUDA is enabled. Switch fp16 off explicitly when
# CUDA is disabled so those cells can never reach an un-imported `amp`.
if args.fp16 and args.no_cuda:
    logger.warning(
        "fp16 was requested but CUDA is disabled (no_cuda=True); training in fp32 instead."
    )
    args.fp16 = False

if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
        )
    # Wrap model and optimizer for mixed precision at the requested opt level.
    model, optimizer = amp.initialize(
        model, optimizer, opt_level=args.fp16_opt_level
    )

# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        find_unused_parameters=True,
    )

In [19]:
# Train!
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", args.num_train_epochs)
logger.info(
    "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
)
logger.info(
    "  Total train batch size (w. parallel, distributed & accumulation) = %d",
    args.train_batch_size
    * args.gradient_accumulation_steps
    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info("  Total optimization steps = %d", t_total)

global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()

- ### Step-by-step training

In [20]:
set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

In [104]:
# Progress bar over one pass of the training dataloader; it is disabled unless
# local_rank is -1 or 0. enumerate() pairs each batch with its step index.
progress_bar = tqdm(
    train_dataloader,
    desc="Iteration",
    disable=args.local_rank not in (-1, 0),
)
epoch_iterator = enumerate(progress_bar)




Iteration:   0%|          | 0/201 [00:00<?, ?it/s][A[A[A

In [105]:
# Pull 70 batches from the iterator, keeping only the last one drawn, then
# report its step index and the number of tensors it contains.
for _skipped in range(70):
    step, batch = next(epoch_iterator)
print(step, len(batch))



Iteration:   0%|          | 1/201 [00:00<02:04,  1.61it/s][A[A

Iteration:  31%|███▏      | 63/201 [00:00<01:00,  2.29it/s][A[A

69 5


In [113]:
# Draw the next batch and report its step index and tensor count.
step, batch = next(epoch_iterator)
print(step, len(batch))

# Only the "layoutlm" model type keeps the fifth tensor; others keep four.
batch = tuple(
    t.to(args.device)
    for t in (batch if args.model_type == "layoutlm" else batch[:4])
)

inputs = dict(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
if args.model_type == "layoutlm":
    inputs["bbox"] = batch[4]
if args.model_type != "distilbert":
    # XLM and RoBERTa don't use segment_ids
    uses_segment_ids = args.model_type in ["bert", "xlnet", "layoutlm"]
    inputs["token_type_ids"] = batch[2] if uses_segment_ids else None

# Largest and smallest value of bbox[..., 2] - bbox[..., 0] over the batch
# (presumably x1 - x0, i.e. the box width -- confirm against the feature code).
bbox = inputs['bbox']
box_widths = bbox[:, :, 2] - bbox[:, :, 0]
box_widths.max(), box_widths.min()



Iteration:  36%|███▋      | 73/201 [06:32<2:39:58, 74.99s/it][A[A

73 5


(tensor(662), tensor(0))

In [None]:
# Run one manual training step on the `batch` / `step` drawn by the cells
# above: forward pass, backward pass, gradient clipping and parameter update.
model.train()

# Only the "layoutlm" model type keeps the fifth tensor; others keep four.
if args.model_type != "layoutlm":
    batch = batch[:4]
batch = tuple(t.to(args.device) for t in batch)
inputs = {
    "input_ids": batch[0],
    "attention_mask": batch[1],
    "labels": batch[3],
}
if args.model_type == "layoutlm":
    inputs["bbox"] = batch[4]
if args.model_type != "distilbert":
    inputs["token_type_ids"] = (
        batch[2]
        if args.model_type in ["bert", "xlnet", "layoutlm"]
        else None
    )  # XLM and RoBERTa don't use segment_ids

outputs = model(**inputs)
loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

if args.n_gpu > 1:
    loss = loss.mean()  # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
    # Scale so that the accumulated gradient matches one full-batch update.
    loss = loss / args.gradient_accumulation_steps

if args.fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
else:
    loss.backward()

tr_loss += loss.item()
# Update parameters only once every gradient_accumulation_steps batches.
if (step + 1) % args.gradient_accumulation_steps == 0:
    if args.fp16:
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer), args.max_grad_norm
        )
    else:
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), args.max_grad_norm
        )

    # The optimizer must step before the scheduler (PyTorch >= 1.1); the
    # reverse order skips the first value of the learning-rate schedule.
    # This matches the order used in the full training loop.
    optimizer.step()
    scheduler.step()  # Update learning rate schedule
    model.zero_grad()
    global_step += 1

print(tr_loss)

- ### Full training run

In [None]:
# Full training run: iterate over int(args.num_train_epochs) epochs and, within
# each, over every batch of train_dataloader. Progress bars are disabled unless
# local_rank is -1 or 0.
train_iterator = trange(
    int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)

set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

for _ in train_iterator:
    epoch_iterator = tqdm(
        train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]
    )
    for step, batch in enumerate(epoch_iterator):
        model.train()
        # Only the "layoutlm" model type keeps the fifth tensor; others keep four.
        if args.model_type != "layoutlm":
            batch = batch[:4]
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[3],
        }
        if args.model_type == "layoutlm":
            inputs["bbox"] = batch[4]
        if args.model_type != "distilbert":
            inputs["token_type_ids"] = (
                batch[2]
                if args.model_type in ["bert", "xlnet", "layoutlm"]
                else None
            )  # XLM and RoBERTa don't use segment_ids

        outputs = model(**inputs)
        loss = outputs[
            0
        ]  # model outputs are always tuple in pytorch-transformers (see doc)

        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            # Scale so that the accumulated gradient matches one full-batch update.
            loss = loss / args.gradient_accumulation_steps

        # Backward pass; with fp16 the loss is scaled through apex amp first.
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        # Update parameters only once every gradient_accumulation_steps batches.
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # Clip the gradient norm to args.max_grad_norm before stepping.
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.max_grad_norm
                )
            else:
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.max_grad_norm
                )

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

        # A positive max_steps stops training once global_step exceeds it;
        # the inner loop is left first, then the epoch loop below.
        if args.max_steps > 0 and global_step > args.max_steps:
            epoch_iterator.close()
            break
    if args.max_steps > 0 and global_step > args.max_steps:
        train_iterator.close()
        break

# The TensorBoard writer exists only when local_rank is -1 or 0.
if args.local_rank in [-1, 0]:
    tb_writer.close()