In [None]:
import sys
sys.path.append("../../")
import os
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.common.timer import Timer
import torch
import torch.nn as nn
import numpy as np
from pytorch_transformers import (WEIGHTS_NAME, XLNetConfig,XLNetForSequenceClassification,XLNetTokenizer)
from tqdm import tqdm, trange
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
)
from pytorch_transformers import AdamW, WarmupLinearSchedule

In [26]:
# --- Data -------------------------------------------------------------------
DATA_FOLDER = "../../../temp"  # folder handed to load_pandas_df
TRAIN_SIZE = 0.6               # train fraction passed to train_test_split
LABEL_COL = "genre"            # column that is label-encoded as the target
TEXT_COL = "sentence1"         # column whose text is tokenized

# --- Tokenization -----------------------------------------------------------
max_seq_length = 384           # every example is truncated/padded to this length

# --- Optimisation -----------------------------------------------------------
train_batch_size = 8
gradient_accumulation_steps = 1  # micro-batches per optimizer step
num_train_epoch = 3
learning_rate = 5e-5
weight_decay = 0.0
adam_epsilon = 1e-8
warmup_steps = 0                 # warm-up steps given to WarmupLinearSchedule
max_grad_norm = 1.0              # threshold given to clip_grad_norm_

# --- Evaluation -------------------------------------------------------------
per_gpu_eval_batch_size = 32     # batch size of the evaluation DataLoader

In [25]:
def get_features(examples, labels, label_list, max_seq_length,
                                 tokenizer, output_mode,
                                 cls_token_at_end=False, pad_on_left=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=1, pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """Turn single-sentence examples into fixed-length model inputs.

    Each text is tokenized, cut to at most ``max_seq_length - 2`` tokens,
    terminated with ``sep_token``, given a ``cls_token`` and padded to
    exactly ``max_seq_length`` entries.

    Args:
        examples: Iterable of texts; each one is passed to ``tokenizer.tokenize``.
        labels: Indexable of labels, aligned with ``examples`` by position.
        label_list: Not used by this function.
        max_seq_length: Length of every returned sequence.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        output_mode: Not used by this function.
        cls_token_at_end: If True the layout is ``A + sep + cls``,
            otherwise ``cls + A + sep``.
        pad_on_left: If True padding is prepended, otherwise appended.
        cls_token, sep_token: Special token strings to insert.
        pad_token: Id used to pad ``input_ids``.
        sequence_a_segment_id: Segment id of the text tokens and ``sep_token``.
        sequence_b_segment_id: Not used by this function.
        cls_token_segment_id: Segment id of ``cls_token``.
        pad_token_segment_id: Segment id of padding positions.
        mask_padding_with_zero: If True the mask is 1 on real tokens and 0 on
            padding; if False the two values are swapped.

    Returns:
        A list with one dict per example, holding the keys ``"input_ids"``,
        ``"input_mask"``, ``"segment_ids"`` and ``"label_id"``.
    """
    real_flag = 1 if mask_padding_with_zero else 0
    pad_flag = 0 if mask_padding_with_zero else 1
    # Two positions are reserved for the separator and classification tokens.
    token_budget = max_seq_length - 2

    features = []
    for position, text in enumerate(examples):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > token_budget:
            pieces = pieces[:token_budget]

        body = pieces + [sep_token]
        body_segments = [sequence_a_segment_id] * len(body)

        if cls_token_at_end:
            tokens = body + [cls_token]
            segment_ids = body_segments + [cls_token_segment_id]
        else:
            tokens = [cls_token] + body
            segment_ids = [cls_token_segment_id] + body_segments

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [real_flag] * len(input_ids)

        # Build the three padding runs once, then attach them on the chosen side.
        missing = max_seq_length - len(input_ids)
        id_fill = [pad_token] * missing
        mask_fill = [pad_flag] * missing
        segment_fill = [pad_token_segment_id] * missing
        if pad_on_left:
            input_ids = id_fill + input_ids
            input_mask = mask_fill + input_mask
            segment_ids = segment_fill + segment_ids
        else:
            input_ids = input_ids + id_fill
            input_mask = input_mask + mask_fill
            segment_ids = segment_ids + segment_fill

        for sequence in (input_ids, input_mask, segment_ids):
            assert len(sequence) == max_seq_length

        features.append({
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            "label_id": labels[position],
        })
    return features

In [27]:
# Load the "train" split, then keep only the rows labelled "neutral".
# NOTE(review): the original comment says this yields unique sentences —
# presumably each sentence is repeated once per gold label; confirm.
df = load_pandas_df(DATA_FOLDER, "train")
df = df.loc[df["gold_label"].eq("neutral")]

In [28]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; the seed is fixed so the
# split is reproducible.
df_train, df_test = train_test_split(df, train_size=TRAIN_SIZE, random_state=0)

# Learn the label -> integer mapping on the training rows only and reuse it
# for the test rows.
label_encoder = LabelEncoder().fit(df_train[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])

# Every class known to the encoder was seen in the training labels.
num_labels = len(label_encoder.classes_)



In [30]:
label_list = label_encoder.classes_
output_mode = "classification"
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Keep `df_train` as a DataFrame and store the texts under their own name:
# rebinding `df_train` to a list made this cell fail when run a second time
# (a list cannot be indexed with TEXT_COL).
train_texts = df_train[TEXT_COL].tolist()

features = get_features(train_texts, labels_train, label_list, max_seq_length, tokenizer, output_mode,
    cls_token_at_end=True,            # xlnet has a cls token at the end
    cls_token=tokenizer.cls_token,
    sep_token=tokenizer.sep_token,
    cls_token_segment_id=2,
    pad_on_left=True,                 # pad on the left for xlnet
    # Pad with the tokenizer's own pad id instead of get_features' default of
    # 0, which is not guaranteed to be the pad id of this vocabulary.
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=4)

In [21]:
# Stack each per-example field into one int64 tensor (one row per example),
# in the column order the training and evaluation loops index batches by.
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor([feature[field] for feature in features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)

train_dataset = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_label_ids,
)

In [34]:
# Configure the classification head for the number of classes in the data.
config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=num_labels)
# Load the pretrained checkpoint. Calling the constructor with only a config
# creates a randomly initialised network, so fine-tuning would start from
# scratch instead of from the pretrained weights.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_GPUS = torch.cuda.device_count()
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (dropout): Dropout(p=0.1)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout

In [35]:

def evaluate(model, tokenizer, prefix="", eval_dataset=None):
    """Score `model` on a dataset and return its accuracy and mean loss.

    Args:
        model: Model called with the keyword arguments `input_ids`,
            `attention_mask`, `token_type_ids` and `labels`; the first two
            elements of its output are read as (loss, logits).
        tokenizer: Unused; kept so existing calls keep working.
        prefix: Unused; kept so existing calls keep working.
        eval_dataset: Dataset whose items are
            `(input_ids, input_mask, segment_ids, label_ids)` tensors.
            Defaults to the module-level `train_dataset`, which is the data
            this function evaluated on before the parameter existed.

    Returns:
        dict with `"acc"` (fraction of examples whose arg-max logit equals the
        label) and `"eval_loss"` (loss averaged over batches).

    Raises:
        ValueError: If the dataset produces no batches.
    """
    if eval_dataset is None:
        eval_dataset = train_dataset
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=SequentialSampler(eval_dataset),
        batch_size=per_gpu_eval_batch_size,
    )

    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once at the end; growing an
    # array with np.append copies everything gathered so far on every batch.
    logits_per_batch = []
    labels_per_batch = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}
            tmp_eval_loss, logits = model(**inputs)[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logits_per_batch.append(logits.detach().cpu().numpy())
            labels_per_batch.append(inputs['labels'].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("evaluate() received an empty dataset")

    preds = np.argmax(np.concatenate(logits_per_batch, axis=0), axis=1)
    out_label_ids = np.concatenate(labels_per_batch, axis=0)

    # The previous `results.update(<numpy scalar>)` raised TypeError and the
    # averaged loss was thrown away; return both metrics under explicit keys.
    return {
        "acc": float((preds == out_label_ids).mean()),
        "eval_loss": eval_loss / nb_eval_steps,
    }

def train(model, train_dataset):
    """Fine-tune `model` on `train_dataset` with AdamW and a linear schedule.

    Hyper-parameters are read from module-level names: `train_batch_size`,
    `gradient_accumulation_steps`, `num_train_epoch`, `learning_rate`,
    `weight_decay`, `adam_epsilon`, `warmup_steps` and `max_grad_norm`.
    Batches are moved to the module-level `device`.

    Args:
        model: Model called with the keyword arguments `input_ids`,
            `attention_mask`, `token_type_ids` and `labels`; the first element
            of its output is used as the loss.
        train_dataset: Dataset whose items are
            `(input_ids, input_mask, segment_ids, label_ids)` tensors.

    Returns:
        `(global_step, mean_loss)`: the number of optimizer updates performed
        and the accumulated loss divided by that number (0.0 if no update was
        performed).
    """
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=train_batch_size,
    )
    # Total number of optimizer updates; the schedule decays over this span.
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epoch

    # Biases and layer-norm weights are excluded from weight decay. The model
    # printout names its layer-norm sub-modules `layer_norm`, which
    # 'LayerNorm.weight' alone does not match.
    no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(int(num_train_epoch), desc="Epoch", disable=False)
    print("starting training")
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        model.train()
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2],
                            labels=batch[3])
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            # Scale so the accumulated gradient is an average over micro-batches.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                # Clip the fully accumulated gradient once per update rather
                # than each partial gradient.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                # The optimizer must step before the scheduler; the reverse
                # order skips the first value of the learning-rate schedule.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

    # The former `global_step > max_steps` early exit was removed: max_steps
    # was always t_total, which global_step can reach but never exceed.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss

In [36]:
# Fine-tune the model; returns the number of optimizer updates and the mean loss.
global_step, tr_loss = train(model=model, train_dataset=train_dataset)

# Drop the model reference and release cached GPU memory.
# NOTE(review): the trained model is discarded without being evaluated or
# saved, and re-running this cell needs the model cell re-run first — confirm
# this is intended.
del model
torch.cuda.empty_cache()







Epoch:   0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[A






Iteration:   0%|          | 0/9818 [00:00<?, ?it/s][A[A[A[A[A[A[A

env: CUDA_LAUNCH_BLOCKING=1
starting training









Iteration:   0%|          | 1/9818 [00:03<8:35:06,  3.15s/it][A[A[A[A[A[A[A






Iteration:   0%|          | 2/9818 [00:06<8:33:19,  3.14s/it][A[A[A[A[A[A[A






Iteration:   0%|          | 3/9818 [00:09<8:24:35,  3.08s/it][A[A[A[A[A[A[A






Iteration:   0%|          | 4/9818 [00:12<8:30:02,  3.12s/it][A[A[A[A[A[A[A






Iteration:   0%|          | 5/9818 [00:15<8:30:04,  3.12s/it][A[A[A[A[A[A[A

KeyboardInterrupt: 

In [None]:
%env