In [1]:
import os
import random
import logging
import numpy as np
import pandas as pd

# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from tqdm import trange

import pprint
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

from nltk import sent_tokenize, word_tokenize

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# The GPU id to use, usually either "0" or "1"
os.environ["CUDA_VISIBLE_DEVICES"]="6,7"
os.environ["TFHUB_CACHE_DIR"]="tfhub_modules"

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

pp = pprint.PrettyPrinter(indent=4)

In [41]:
local_rank = -1
max_seq_length = 200
bert_model = "bert-base-uncased"
do_lower_case = True
num_labels = 7
gradient_accumulation_steps = 1
train_batch_size = 32
eval_batch_size = 8
test_batch_size = 8
learning_rate = 5e-5
num_train_epochs = 3.0
warmup_proportion = 0.1
output_dir = "bert"
do_train = True
do_eval = True
fp16 = True
loss_scale = 0

In [3]:
if local_rank == -1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

seed = 20190104

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

In [4]:
logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, bool(local_rank != -1), fp16))

01/25/2019 10:01:10 - INFO - __main__ -   device: cuda, n_gpu: 2, distributed training: False, 16-bits training: True


## Tokenizer Comparison - NLTK Tokenizer vs BertTokenizer

In [5]:
sentences = [
    "I love Kung Fu Panda and chicken tikka masala!",
    "Divide each difficulty into as many parts as is feasible and necessary to resolve it.",
    "It is not enough to have a good mind; the main thing is to use it well.",
]

### NLTK Tokenizer

In [6]:
nltk_tokenized_text = [word_tokenize(sentence.lower()) for sentence in sentences]
pp.pprint(nltk_tokenized_text)

[   [   'i',
        'love',
        'kung',
        'fu',
        'panda',
        'and',
        'chicken',
        'tikka',
        'masala',
        '!'],
    [   'divide',
        'each',
        'difficulty',
        'into',
        'as',
        'many',
        'parts',
        'as',
        'is',
        'feasible',
        'and',
        'necessary',
        'to',
        'resolve',
        'it',
        '.'],
    [   'it',
        'is',
        'not',
        'enough',
        'to',
        'have',
        'a',
        'good',
        'mind',
        ';',
        'the',
        'main',
        'thing',
        'is',
        'to',
        'use',
        'it',
        'well',
        '.']]


### BertTokenizer

In [7]:
bert_tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
bert_tokenized_text = [bert_tokenizer.tokenize(sentence) for sentence in sentences]
pp.pprint(bert_tokenized_text)

01/25/2019 10:01:13 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/david/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


[   [   'i',
        'love',
        'kung',
        'fu',
        'panda',
        'and',
        'chicken',
        'ti',
        '##kka',
        'mas',
        '##ala',
        '!'],
    [   'divide',
        'each',
        'difficulty',
        'into',
        'as',
        'many',
        'parts',
        'as',
        'is',
        'feasible',
        'and',
        'necessary',
        'to',
        'resolve',
        'it',
        '.'],
    [   'it',
        'is',
        'not',
        'enough',
        'to',
        'have',
        'a',
        'good',
        'mind',
        ';',
        'the',
        'main',
        'thing',
        'is',
        'to',
        'use',
        'it',
        'well',
        '.']]


## Data Preprocessing

In [8]:
label_list = ["anger", "sadness", "joy"]
label_map = {label : i for i, label in enumerate(label_list)}
logger.info(label_map)

01/25/2019 10:01:13 - INFO - __main__ -   {'anger': 0, 'sadness': 1, 'joy': 2}


In [9]:
class ISEARDataset(object):
    FILENAME = "data/isear_databank.csv"
    RANDOM_STATE = 41
  
    def get_labels(self):
        return ["anger", "disgust", "fear", "guilt", "joy", "sadness", "shame"]
  
    def get_label_map(self):
        return {label : i for i, label in enumerate(self.get_labels())}

    def __init__(self, n_items=0):
        data = pd.read_csv(self.FILENAME)

        if n_items > 0:
            data = data.iloc[0:n_items,:]

        data["text"] = data["SIT"]
        data["label"] = data["Field1"]

        for label in self.get_labels():
            data.loc[data["label"] == label, "label_int"] = self.get_label_map()[label]
            
        self.data = data[["text", "label", "label_int"]]

In [10]:
dataset = ISEARDataset()

train_dataset, test_dataset = train_test_split(dataset.data, test_size=0.3, random_state=dataset.RANDOM_STATE, stratify=dataset.data.label)
train_dataset, eval_dataset = train_test_split(train_dataset, test_size=0.2, random_state=dataset.RANDOM_STATE, stratify=train_dataset.label)

In [11]:
for (row_index, row) in train_dataset.iterrows():
    logger.info("row index:{}".format(row_index))
    logger.info("text:{}".format(row.text))
    logger.info("label:{}".format(row.label))
    logger.info("label_int:{}".format(row.label_int))
    break

01/25/2019 10:01:13 - INFO - __main__ -   row index:6800
01/25/2019 10:01:13 - INFO - __main__ -   text:The time when my sister had her first baby I was so happy and á
joyous because she stayed for two days after marriage before she á
had a child.
01/25/2019 10:01:13 - INFO - __main__ -   label:joy
01/25/2019 10:01:13 - INFO - __main__ -   label_int:4.0


In [12]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        
def convert_dataset_to_features(dataset, label_map, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    index = 0
    for (guid, row) in dataset.iterrows():
        tokens = tokenizer.tokenize(row.text)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[row.label]
        if index < 5:
            logger.info("*** row ***")
            logger.info("guid: %s" % (guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (row.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
        index += 1
        
    return features


def _truncate_seq_pair(tokens, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens) > len(tokens_b):
            tokens.pop()
        else:
            tokens_b.pop()

In [13]:
train_features = convert_dataset_to_features(train_dataset, dataset.get_label_map(), max_seq_length, bert_tokenizer)

01/25/2019 10:01:13 - INFO - __main__ -   *** row ***
01/25/2019 10:01:13 - INFO - __main__ -   guid: 6800
01/25/2019 10:01:13 - INFO - __main__ -   tokens: [CLS] the time when my sister had her first baby i was so happy and a joy ##ous because she stayed for two days after marriage before she a had a child . [SEP]
01/25/2019 10:01:13 - INFO - __main__ -   input_ids: 101 1996 2051 2043 2026 2905 2018 2014 2034 3336 1045 2001 2061 3407 1998 1037 6569 3560 2138 2016 4370 2005 2048 2420 2044 3510 2077 2016 1037 2018 1037 2775 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/25/2019 10:01:13 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 

01/25/2019 10:01:13 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/25/2019 10:01:13 - INFO - __main__ -   label: shame (id = 6)


In [14]:
eval_features = convert_dataset_to_features(eval_dataset, dataset.get_label_map(), max_seq_length, bert_tokenizer)

01/25/2019 10:01:15 - INFO - __main__ -   *** row ***
01/25/2019 10:01:15 - INFO - __main__ -   guid: 3305
01/25/2019 10:01:15 - INFO - __main__ -   tokens: [CLS] when i decided to leave my steady , secure employment to come to a university . because i didn ' t know if i could cope with all the a requirements of study and also being older i didn ' t know if i a would enjoy mixing with younger people , also financial a ins ##ec ##urity . [SEP]
01/25/2019 10:01:15 - INFO - __main__ -   input_ids: 101 2043 1045 2787 2000 2681 2026 6706 1010 5851 6107 2000 2272 2000 1037 2118 1012 2138 1045 2134 1005 1056 2113 2065 1045 2071 11997 2007 2035 1996 1037 5918 1997 2817 1998 2036 2108 3080 1045 2134 1005 1056 2113 2065 1045 1037 2052 5959 6809 2007 3920 2111 1010 2036 3361 1037 16021 8586 25137 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

01/25/2019 10:01:15 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/25/2019 10:01:15 - INFO - __main__ -   label: sadness (id = 5)


In [15]:
test_features = convert_dataset_to_features(test_dataset, dataset.get_label_map(), max_seq_length, bert_tokenizer)

01/25/2019 10:01:16 - INFO - __main__ -   *** row ***
01/25/2019 10:01:16 - INFO - __main__ -   guid: 3559
01/25/2019 10:01:16 - INFO - __main__ -   tokens: [CLS] a person i know who tells lies and so pretending to be better a than she is . [SEP]
01/25/2019 10:01:16 - INFO - __main__ -   input_ids: 101 1037 2711 1045 2113 2040 4136 3658 1998 2061 12097 2000 2022 2488 1037 2084 2016 2003 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/25/2019 10:01:16 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

01/25/2019 10:01:16 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/25/2019 10:01:16 - INFO - __main__ -   label: sadness (id = 5)


In [16]:
# Prepare model
model = BertForSequenceClassification.from_pretrained(
    bert_model,
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank),
    num_labels = num_labels
)
model.to(device)

if fp16:
    model.half()
        
if local_rank != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

01/25/2019 10:01:19 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /home/david/.pytorch_pretrained_bert/distributed_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/25/2019 10:01:19 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/david/.pytorch_pretrained_bert/distributed_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp7dfjdmjw
01/25/2019 10:01:23 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers":

In [17]:
param_optimizer = list(model.named_parameters())

In [18]:
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

train_batch_size = int(train_batch_size / gradient_accumulation_steps)
logger.info("train_batch_size = {}".format(train_batch_size))
logger.info("num train_dataset = {}".format(len(train_dataset)))
logger.info("gradient_accumulation_steps = {}".format(gradient_accumulation_steps))
logger.info("num_train_epochs = {}".format(num_train_epochs))

num_train_steps = int(
    len(train_dataset) / train_batch_size / gradient_accumulation_steps * num_train_epochs
)
logger.info("num_train_steps = {}".format(num_train_steps))

t_total = num_train_steps

if local_rank != -1:
    t_total = t_total // torch.distributed.get_world_size()

logger.info("t_total = {}".format(t_total))

01/25/2019 10:01:36 - INFO - __main__ -   train_batch_size = 32
01/25/2019 10:01:36 - INFO - __main__ -   num train_dataset = 4292
01/25/2019 10:01:36 - INFO - __main__ -   gradient_accumulation_steps = 1
01/25/2019 10:01:36 - INFO - __main__ -   num_train_epochs = 3.0
01/25/2019 10:01:36 - INFO - __main__ -   num_train_steps = 402
01/25/2019 10:01:36 - INFO - __main__ -   t_total = 402


In [19]:
if fp16:
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)

else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

In [20]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0
    
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)
train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
train_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
train_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
train_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

01/25/2019 10:01:36 - INFO - __main__ -   ***** Running training *****
01/25/2019 10:01:36 - INFO - __main__ -     Num examples = 4292
01/25/2019 10:01:36 - INFO - __main__ -     Batch size = 32
01/25/2019 10:01:36 - INFO - __main__ -     Num steps = 402


In [21]:
train_input_ids

tensor([[ 101, 1996, 2051,  ...,    0,    0,    0],
        [ 101, 2043, 1045,  ...,    0,    0,    0],
        [ 101, 2387, 2026,  ...,    0,    0,    0],
        ...,
        [ 101, 1037, 2154,  ...,    0,    0,    0],
        [ 101, 2019, 8875,  ...,    0,    0,    0],
        [ 101, 2043, 2026,  ...,    0,    0,    0]])

In [22]:
train_input_ids[0]

tensor([ 101, 1996, 2051, 2043, 2026, 2905, 2018, 2014, 2034, 3336, 1045, 2001,
        2061, 3407, 1998, 1037, 6569, 3560, 2138, 2016, 4370, 2005, 2048, 2420,
        2044, 3510, 2077, 2016, 1037, 2018, 1037, 2775, 1012,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [23]:
train_input_ids.shape

torch.Size([4292, 200])

In [24]:
train_input_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [25]:
train_input_mask[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
train_input_mask.shape

torch.Size([4292, 200])

In [27]:
train_segment_ids

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [28]:
train_segment_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
train_segment_ids.shape

torch.Size([4292, 200])

In [30]:
train_label_ids

tensor([4, 4, 1,  ..., 6, 3, 5])

In [31]:
train_label_ids[0]

tensor(4)

In [32]:
train_label_ids.shape

torch.Size([4292])

In [33]:
train_data = TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_label_ids)
if local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

In [34]:
train_sampler

<torch.utils.data.sampler.RandomSampler at 0x7fc418f84278>

In [35]:
logger.info("num_train_epochs = {}".format(num_train_epochs))

01/25/2019 10:01:36 - INFO - __main__ -   num_train_epochs = 3.0


In [36]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

model.train()
for _ in trange(int(num_train_epochs), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        train_input_ids, train_input_mask, train_segment_ids, train_label_ids = batch
        loss = model(train_input_ids, train_segment_ids, train_input_mask, train_label_ids)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
            
        if fp16:
            optimizer.backward(loss)
        else:
            loss.backward()
            
        tr_loss += loss.item()
        nb_tr_examples += train_input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = learning_rate * warmup_linear(global_step/t_total, warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration', max=135, style=ProgressStyle(description_width='i…




Grad overflow on iteration 0
Using dynamic loss scale of 4294967296

Grad overflow on iteration 1
Using dynamic loss scale of 2147483648.0

Grad overflow on iteration 2
Using dynamic loss scale of 1073741824.0

Grad overflow on iteration 3
Using dynamic loss scale of 536870912.0

Grad overflow on iteration 4
Using dynamic loss scale of 268435456.0

Grad overflow on iteration 5
Using dynamic loss scale of 134217728.0

Grad overflow on iteration 6
Using dynamic loss scale of 67108864.0

Grad overflow on iteration 7
Using dynamic loss scale of 33554432.0

Grad overflow on iteration 8
Using dynamic loss scale of 16777216.0

Grad overflow on iteration 9
Using dynamic loss scale of 8388608.0

Grad overflow on iteration 10
Using dynamic loss scale of 4194304.0

Grad overflow on iteration 11
Using dynamic loss scale of 2097152.0

Grad overflow on iteration 12
Using dynamic loss scale of 1048576.0

Grad overflow on iteration 13
Using dynamic loss scale of 524288.0

Grad overflow on iteration 1

Epoch:  33%|███▎      | 1/3 [01:27<02:55, 87.64s/it]

HBox(children=(IntProgress(value=0, description='Iteration', max=135, style=ProgressStyle(description_width='i…

Epoch:  67%|██████▋   | 2/3 [02:50<01:26, 86.24s/it]

HBox(children=(IntProgress(value=0, description='Iteration', max=135, style=ProgressStyle(description_width='i…

Epoch: 100%|██████████| 3/3 [04:13<00:00, 85.32s/it]


In [37]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(output_dir, "pytorch_model.bin")
torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned
model_state_dict = torch.load(output_model_file)
model = BertForSequenceClassification.from_pretrained(bert_model, state_dict=model_state_dict, num_labels=num_labels)
model.to(device)

01/25/2019 10:05:51 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /home/david/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/25/2019 10:05:51 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/david/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpq1gwvo6r
01/25/2019 10:05:55 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=T

In [39]:
if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", eval_batch_size)
    eval_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    eval_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    eval_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    eval_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_segment_ids, eval_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    loss = tr_loss/nb_tr_steps if do_train else None
    result = {'eval_loss': eval_loss,
              'eval_accuracy': eval_accuracy,
              'global_step': global_step,
              'loss': loss}

    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

01/25/2019 10:09:21 - INFO - __main__ -   ***** Running evaluation *****
01/25/2019 10:09:21 - INFO - __main__ -     Num examples = 1074
01/25/2019 10:09:21 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=135, style=ProgressStyle(description_width='…

01/25/2019 10:09:34 - INFO - __main__ -   ***** Eval results *****
01/25/2019 10:09:34 - INFO - __main__ -     eval_accuracy = 0.6983240223463687
01/25/2019 10:09:34 - INFO - __main__ -     eval_loss = 1.0040278805626763
01/25/2019 10:09:34 - INFO - __main__ -     global_step = 405
01/25/2019 10:09:34 - INFO - __main__ -     loss = 0.3331154152199074





In [42]:
logger.info("***** Running testing *****")
logger.info("  Num examples = %d", len(test_dataset))
logger.info("  Batch size = %d", test_batch_size)
test_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
test_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
test_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
test_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
test_data = TensorDataset(test_input_ids, test_input_mask, test_segment_ids, test_label_ids)
# Run prediction for full data
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=test_batch_size)

model.eval()
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0

for input_ids, input_mask, segment_ids, label_ids in tqdm(test_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    tmp_test_accuracy = accuracy(logits, label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples
loss = tr_loss/nb_tr_steps if do_train else None
result = {'test_loss': test_loss,
          'test_accuracy': test_accuracy,
          'global_step': global_step,
          'loss': loss}

output_test_file = os.path.join(output_dir, "test_results.txt")
with open(output_test_file, "w") as writer:
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

01/25/2019 10:13:07 - INFO - __main__ -   ***** Running testing *****
01/25/2019 10:13:07 - INFO - __main__ -     Num examples = 2300
01/25/2019 10:13:07 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=288, style=ProgressStyle(description_width='…

01/25/2019 10:13:34 - INFO - __main__ -   ***** Eval results *****
01/25/2019 10:13:34 - INFO - __main__ -     global_step = 405
01/25/2019 10:13:34 - INFO - __main__ -     loss = 0.3331154152199074
01/25/2019 10:13:34 - INFO - __main__ -     test_accuracy = 0.7043478260869566
01/25/2019 10:13:34 - INFO - __main__ -     test_loss = 0.9534986084844503



