In [1]:
!pip install  pytorch_transformers

Collecting pytorch_transformers
  Using cached https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl
Collecting sentencepiece (from pytorch_transformers)
  Using cached https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl
Collecting sacremoses (from pytorch_transformers)
Collecting joblib (from sacremoses->pytorch_transformers)
  Using cached https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl
[31mfastai 1.0.55 requires nvidia-ml-py3, which is not installed.[0m
[31mthinc 6.12.1 has requirement msgpack<0.6.0,>=0.5.6, but you'll have msgpack 0.6.0 which is incompatible.[0m
Installing collected packages: sentencepiece, joblib, sacremoses, pytorch-transformers
Successfully installed joblib-0.13.2

In [2]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import sys
import time
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange


from pytorch_transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from pytorch_transformers import AdamW, WarmupLinearSchedule

from utils import (convert_examples_to_features,
                        output_modes, processors)


# Emit INFO-level records to the cell output; this also surfaces the
# pytorch_transformers loader messages.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the training code in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# Path of the JSON run configuration. Its parsed contents become the `args`
# dict that the rest of the notebook reads its settings from.
config_json = '/home/ec2-user/SageMaker/transformers-classification/config-bert.json'
print("using config " + config_json)

with open(config_json, 'r') as config_file:
    args = json.loads(config_file.read())

print(args)

using config /home/ec2-user/SageMaker/transformers-classification/config-bert.json
{'data_dir': '../data/', 'model_type': 'bert', 'model_name': 'bert-base-uncased', 'task_name': 'binary', 'output_dir': '../outputs_bert/', 'cache_dir': 'cache/', 'do_train': True, 'do_eval': True, 'fp16': False, 'fp16_opt_level': 'O1', 'max_seq_length': 512, 'output_mode': 'classification', 'train_batch_size': 48, 'eval_batch_size': 12, 'gradient_accumulation_steps': 1, 'num_train_epochs': 8, 'weight_decay': 0, 'learning_rate': 4e-05, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'max_grad_norm': 1.0, 'logging_steps': 50, 'evaluate_during_training': False, 'save_steps': 1000, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'reprocess_input_data': False, 'notes': 'first batch of compliance data'}


In [13]:
# Lookup from the `model_type` setting to the (config, model, tokenizer)
# classes that implement that architecture.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForSequenceClassification, BertTokenizer),
    xlnet=(XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    xlm=(XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    roberta=(RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
)



In [20]:
def load_datasets(args, tokenizer, evaluate=False):
    """Build a ``TensorDataset`` of tokenised features for the configured task.

    Args:
        args (dict): run configuration. Keys read here: ``'output_mode'``,
            ``'task_name'``, ``'data_dir'``, ``'max_seq_length'`` and
            ``'model_type'``.
        tokenizer: tokenizer handed to ``convert_examples_to_features``; its
            ``cls_token`` and ``sep_token`` attributes are read.
        evaluate (bool): when True the processor's dev examples are used,
            otherwise its train examples.

    Returns:
        TensorDataset: tensors in the order
        ``(input_ids, input_mask, segment_ids, label_ids)``.
    """
    output_mode = args['output_mode']
    task = args['task_name']
    # XLNet is the only architecture that needs the special layout below.
    is_xlnet = args['model_type'] in ['xlnet']

    processor = processors[task]()
    label_list = processor.get_labels()
    if evaluate:
        examples = processor.get_dev_examples(args['data_dir'])
    else:
        examples = processor.get_train_examples(args['data_dir'])

    features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
            cls_token_at_end=is_xlnet,                   # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            pad_on_left=is_xlnet,                        # pad on the left for xlnet
            pad_token_segment_id=4 if is_xlnet else 0)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

    # Class indices must be integers, but regression targets are real-valued;
    # casting those to long would silently truncate them.
    # NOTE(review): assumes the features builder emits float labels when
    # output_mode == 'regression' -- confirm against utils.
    label_dtype = torch.float if output_mode == 'regression' else torch.long
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [22]:
def _save_model(model, output_dir):
    """Write the model weights and config to ``output_dir``, creating it if needed."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Unwrap DataParallel so the underlying model's save_pretrained is used.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)


def _train(args):
    """Fine-tune a pretrained sequence-classification model on the training set.

    Args:
        args (dict): run configuration. Keys read directly here:
            ``'model_type'``, ``'model_name'``, ``'task_name'``,
            ``'train_batch_size'``, ``'gradient_accumulation_steps'``,
            ``'num_train_epochs'``, ``'weight_decay'``, ``'learning_rate'``,
            ``'adam_epsilon'``, ``'warmup_steps'``, ``'fp16'``,
            ``'max_grad_norm'``, ``'logging_steps'``, ``'save_steps'`` and
            ``'output_dir'``. ``load_datasets`` reads further keys.

    Returns:
        tuple: ``(global_step, mean_loss)`` where ``global_step`` is the number
        of optimizer updates performed and ``mean_loss`` is the accumulated
        training loss divided by that count (``0.0`` if no update happened).

    Side effects:
        Writes ``checkpoint-<step>`` directories under ``args['output_dir']``
        every ``save_steps`` updates, and the final model plus tokenizer into
        ``args['output_dir']`` itself when training finishes.
    """
    # initialization
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

    config = config_class.from_pretrained(args['model_name'], num_labels=2, finetuning_task=args['task_name'])
    tokenizer = tokenizer_class.from_pretrained(args['model_name'])
    # Hand the config to the model explicitly; otherwise num_labels and
    # finetuning_task set above are ignored and library defaults are used.
    model = model_class.from_pretrained(args['model_name'], config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model)
    model.to(device)

    logger.info("Loading  dataset")
    train_dataset = load_datasets(args, tokenizer, False)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    print("len(train_dataloader) "+ str(len(train_dataloader)))
    accumulation_steps = args['gradient_accumulation_steps']
    t_total = len(train_dataloader) // accumulation_steps * args['num_train_epochs']

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")

    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc);
            # .mean() collapses the per-GPU losses returned under DataParallel.
            loss = outputs[0].mean()
            print("\r%f" % loss.item(), end='')

            if accumulation_steps > 1:
                loss = loss / accumulation_steps

            if args['fp16']:
                # NOTE(review): `amp` is neither imported nor initialised
                # (amp.initialize) in the cells visible here, so fp16=True will
                # fail with a NameError unless it is defined elsewhere.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % accumulation_steps == 0:
                # Clip once per update, on the fully accumulated gradient,
                # rather than on every partial micro-batch gradient.
                if args['fp16']:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

                # The optimizer must step before the scheduler so the first
                # value of the learning-rate schedule is not skipped.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    logger.info("  step %d: mean loss over last %d steps = %f",
                                global_step, args['logging_steps'],
                                (tr_loss - logging_loss) / args['logging_steps'])
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    _save_model(model, output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    # Always persist the final weights: the model is local to this function,
    # so a run shorter than save_steps would otherwise leave nothing on disk.
    _save_model(model, args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])
    logger.info("Saving final model to %s", args['output_dir'])

    # Guard against a run in which no optimizer update was performed.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss

In [23]:
# Override the epoch count loaded from the config file for this run.
args.update(num_train_epochs=1)

In [24]:
# Point the run at the dataset directory, replacing the value from the config file.
args.update(data_dir='/home/ec2-user/SageMaker/transformers-classification/data')

In [25]:
# Run training with the settings in `args`; the returned
# (global_step, mean training loss) tuple is shown as the cell output.
_train(args)

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ec2-user/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "binary",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.c

Let's use 4 GPUs!




  0%|          | 0/42528 [00:00<?, ?it/s][A[A

  0%|          | 1/42528 [00:08<104:23:21,  8.84s/it][A[A

  2%|▏         | 1001/42528 [00:09<71:21:19,  6.19s/it][A[A

 15%|█▌        | 6501/42528 [00:09<43:20:01,  4.33s/it][A[A

 23%|██▎       | 9940/42528 [00:10<27:26:17,  3.03s/it][A[A

 26%|██▋       | 11167/42528 [00:10<18:29:02,  2.12s/it][A[A

 36%|███▋      | 15501/42528 [00:12<11:09:07,  1.49s/it][A[A

 38%|███▊      | 16271/42528 [00:16<7:35:41,  1.04s/it] [A[A

 40%|███▉      | 16816/42528 [00:16<5:12:28,  1.37it/s][A[A

 42%|████▏     | 18001/42528 [00:17<3:28:39,  1.96it/s][A[A

 44%|████▎     | 18514/42528 [00:17<2:23:06,  2.80it/s][A[A

 47%|████▋     | 20001/42528 [00:19<1:34:07,  3.99it/s][A[A

 51%|█████     | 21501/42528 [00:21<1:01:38,  5.68it/s][A[A

 74%|███████▍  | 31501/42528 [00:21<22:37,  8.12it/s]  [A[A

 79%|███████▉  | 33533/42528 [00:23<12:58, 11.56it/s][A[A

 87%|████████▋ | 37001/42528 [00:24<05:34, 16.50it/s][A[A

 90%|██

len(train_dataloader) 886


HBox(children=(IntProgress(value=0, description='Iteration', max=886, style=ProgressStyle(description_width='i…



0.000406



Epoch: 100%|██████████| 1/1 [07:48<00:00, 468.23s/it][A[A

(886, 0.02185620322232119)