In [1]:
# All splits are tab-separated files stored in the same aclImdb directory.
_ACLIMDB_DIR = "data/text_classification/aclImdb"

TRAIN_PATH = _ACLIMDB_DIR + "/train.tsv"
# NOTE(review): the dev and test splits both read test.tsv -- confirm this is intended.
DEV_PATH = _ACLIMDB_DIR + "/test.tsv"
TEST_PATH = _ACLIMDB_DIR + "/test.tsv"

In [3]:
import csv
import os
import sys

import numpy as np
from torchtext.data import TabularDataset, Field, BucketIterator

# Raise the csv module's per-field size cap to the largest value sys exposes
# (presumably because some text fields exceed the default limit).
csv.field_size_limit(sys.maxsize)

# Both columns are declared non-sequential; the label column additionally
# skips torchtext's vocabulary (use_vocab=False).
label = Field(sequential=False, use_vocab=False)
text = Field(sequential=False)


def _load_tsv_split(path):
    """Read one TSV split whose columns are, in order, `label` and `text`."""
    return TabularDataset(path=path, format='tsv', fields=[('label', label), ('text', text)])


train_data = _load_tsv_split(TRAIN_PATH)
dev_data = _load_tsv_split(DEV_PATH)
test_data = _load_tsv_split(TEST_PATH)

In [1]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/95/68/84de54aea460eb5b2e90bf47a429aacc1ce97ff052ec40874ea38ae2331d/pytorch_pretrained_bert-0.4.0-py3-none-any.whl (45kB)
[K    100% |████████████████████████████████| 51kB 1.4MB/s ta 0:00:011
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.4.0


In [4]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [13]:
# Class names. A label's index in this list is used as its integer class id
# when examples are encoded, and the list length sets the model's num_labels.
labels = ["neg", "pos"]

Available models are `bert-base-uncased`, `bert-large-uncased`, `bert-base-cased`, `bert-base-multilingual` and `bert-base-chinese`.

In [14]:
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Name of the pretrained BERT checkpoint; the same name is reused when the
# classification model is loaded.
BERT_MODEL = "bert-base-uncased"

# do_lower_case=True is paired here with the "uncased" checkpoint chosen above.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

12/28/2018 11:09:14 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/yvespeirsman/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [15]:
from pytorch_pretrained_bert.modeling import BertForSequenceClassification

# Load the pretrained checkpoint as a sequence classifier with one output per label.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels = len(labels))
# Move the model to the selected device; as the cell's last expression this
# also displays the module tree.
model.to(device)

12/28/2018 11:09:15 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/yvespeirsman/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
12/28/2018 11:09:15 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/yvespeirsman/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/8z/9pkh6rsn20x7_5xxvbg04rcm0000gn/T/tmp1tyu6w8a
12/28/2018 11:09:21 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_head

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertInterme

In [16]:
from pytorch_pretrained_bert.optimization import BertAdam

# Fine-tuning hyperparameters.
TRAIN_BATCH_SIZE = 32
GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 3
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1

# Total number of optimizer updates planned for the run; the fractional
# remainder is truncated by int().
num_train_steps = int(len(train_data) / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
t_total = num_train_steps

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


def _skips_weight_decay(param_name):
    """True when the parameter name contains one of the `no_decay` substrings."""
    return any(marker in param_name for marker in no_decay)


# Two parameter groups selected by name: matching parameters get a weight
# decay of 0.0, all remaining parameters get 0.01.
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in param_optimizer if not _skips_weight_decay(name)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in param_optimizer if _skips_weight_decay(name)],
        'weight_decay': 0.0,
    },
]

optimizer = BertAdam(
    optimizer_grouped_parameters,
    LEARNING_RATE,
    warmup=WARMUP_PROPORTION,
    t_total=t_total,
)

In [11]:
import logging

# Configure root logging: INFO level, "<timestamp> - <level> - <logger name> -   <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger used by the rest of the notebook, named after the current module.
logger = logging.getLogger(__name__)


class InputFeatures(object):
    """Plain container for the encoded inputs and label id of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        # Values are stored as given; nothing is copied or validated.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Encode every example as a fixed-length `InputFeatures` record.

    Each example's text is tokenized, cut so the two marker tokens still fit,
    wrapped in "[CLS]" ... "[SEP]", mapped to ids and right-padded with zeros
    up to `max_seq_length`. The first five examples are logged at INFO level.

    Args:
        examples: iterable of objects exposing `.text` and `.label`.
        label_list: labels; each label's position becomes its integer id.
        max_seq_length: required length of every id / mask / segment list.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A list holding one `InputFeatures` per example, in input order.

    Raises:
        KeyError: if an example's label does not occur in `label_list`.
        AssertionError: if an encoded list does not end up `max_seq_length` long.
    """
    label_to_id = dict((name, idx) for idx, name in enumerate(label_list))
    # Two positions are reserved for the [CLS] and [SEP] markers.
    text_budget = max_seq_length - 2

    features = []
    for position, example in enumerate(examples):
        # Slicing leaves texts that already fit the budget unchanged.
        tokens = ["[CLS]"] + tokenizer.tokenize(example.text)[:text_budget] + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Zero padding is appended to the ids, the attention mask (1 = real
        # token, 0 = padding) and the all-zero segment ids alike.
        zeros = [0] * (max_seq_length - len(token_ids))
        input_mask = [1] * len(token_ids) + zeros
        segment_ids = [0] * len(tokens) + zeros
        input_ids = token_ids + zeros

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_to_id[example.label]

        if position < 5:
            logger.info("*** Example ***")
            logger.info("tokens: %s" % " ".join(map(str, tokens)))
            logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
            logger.info("input_mask: %s" % " ".join(map(str, input_mask)))
            logger.info("segment_ids: %s" % " ".join(map(str, segment_ids)))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_id,
            )
        )
    return features
            

In [17]:
# Every encoded example is cut or padded to this many positions, [CLS]/[SEP] included.
MAX_SEQ_LENGTH = 100

train_features = convert_examples_to_features(
    examples=train_data,
    label_list=labels,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
)

12/28/2018 11:09:27 - INFO - __main__ -   *** Example ***
12/28/2018 11:09:27 - INFO - __main__ -   tokens: [CLS] bro ##m ##well high is a cartoon comedy . it ran at the same time as some other programs about school life , such as " teachers " . my 35 years in the teaching profession lead me to believe that bro ##m ##well high ' s satire is much closer to reality than is " teachers " . the scramble to survive financially , the insight ##ful students who can see right through their pathetic teachers ' po ##mp , the pet ##tine ##ss of the whole situation , all remind me of the schools i knew [SEP]
12/28/2018 11:09:27 - INFO - __main__ -   input_ids: 101 22953 2213 4381 2152 2003 1037 9476 4038 1012 2009 2743 2012 1996 2168 2051 2004 2070 2060 3454 2055 2082 2166 1010 2107 2004 1000 5089 1000 1012 2026 3486 2086 1999 1996 4252 9518 2599 2033 2000 2903 2008 22953 2213 4381 2152 1005 1055 18312 2003 2172 3553 2000 4507 2084 2003 1000 5089 1000 1012 1996 25740 2000 5788 13732 1010 1996 12369

12/28/2018 11:09:27 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/28/2018 11:09:27 - INFO - __main__ -   label: pos (id = 1)


{'neg': 0, 'pos': 1}


In [20]:
def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp-up followed by linear decay.

    Args:
        x: training progress value.
        warmup: progress threshold below which the ramp-up branch applies.

    Returns:
        `x / warmup` while `x < warmup`, otherwise `1.0 - x`.
    """
    return x / warmup if x < warmup else 1.0 - x

def accuracy(out, labels):
    """Count how many rows of `out` have their arg-max at the gold label.

    Despite its name this returns the NUMBER of correct predictions, not a
    ratio; callers divide by the number of examples themselves.

    Requires numpy to be imported as `np` at the top of the notebook.

    Args:
        out: 2-D array-like of per-class scores, one row per example.
        labels: 1-D array-like of integer class ids, one per row of `out`.

    Returns:
        The count of rows whose highest-scoring column equals the label.
    """
    predictions = np.argmax(out, axis=1)
    return np.sum(predictions == labels)

In [22]:
from tqdm import tqdm_notebook as tqdm
from tqdm import trange

OUTPUT_DIR = "/tmp/"
EVAL_BATCH_SIZE = TRAIN_BATCH_SIZE
N_GPU = 1

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


# Number of optimizer updates performed so far.
global_step = 0

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_data))
logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
logger.info("  Num steps = %d", num_train_steps)

# Pack the encoded examples into tensors. The result is bound to a new name
# (`train_dataset`) so `train_data` keeps referring to the raw examples and
# the earlier cells that read it can be re-run safely.
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

model.train()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    # Running statistics, reset at the start of every epoch.
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        # Called with labels, the model returns the loss.
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        if N_GPU > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # BertAdam was built with `warmup` and `t_total`, so it applies the
            # warmup/decay schedule itself on every step(). Rescaling
            # param_group['lr'] here as well would apply the schedule twice.
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(OUTPUT_DIR, "pytorch_model.bin")
torch.save(model_to_save.state_dict(), output_model_file)


12/28/2018 11:23:24 - INFO - __main__ -   ***** Running training *****
12/28/2018 11:23:24 - INFO - __main__ -     Num examples = 25000
12/28/2018 11:23:24 - INFO - __main__ -     Batch size = 32
12/28/2018 11:23:24 - INFO - __main__ -     Num steps = 2343
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration', max=782), HTML(value='')))




KeyboardInterrupt: 




In [None]:

# Load a trained model that you have fine-tuned
# map_location lets a checkpoint saved on one device be restored on another
# (e.g. trained on GPU, evaluated on a CPU-only machine).
model_state_dict = torch.load(output_model_file, map_location=device)
# num_labels is passed explicitly so the classifier matches the one that was trained.
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL, state_dict=model_state_dict, num_labels=len(labels))
model.to(device)

# Encode the dev split exactly like the training split.
eval_examples = dev_data
eval_features = convert_examples_to_features(
    eval_examples, labels, MAX_SEQ_LENGTH, tokenizer)
logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", EVAL_BATCH_SIZE)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # One forward pass yields the logits; the batch loss is derived from
        # them with cross-entropy instead of running the model a second time.
        logits = model(input_ids, segment_ids, input_mask)
        tmp_eval_loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)), label_ids.view(-1))

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    # accuracy() returns the number of correct predictions in the batch.
    tmp_eval_accuracy = accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

# Mean loss per batch, and fraction of correctly classified examples.
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples

result = {'eval_loss': eval_loss,
          'eval_accuracy': eval_accuracy,
          'global_step': global_step,
          'loss': tr_loss/nb_tr_steps}

# Write the metrics to a text file and echo them to the log, sorted by key.
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
