In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lr.text_processing.util import pre_process_nli_df
from lr.training.util import get_positive_labels, filter_df_by_label
from transformers import glue_processors as processors
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
from IPython.display import display, HTML

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class NLIProcessor(DataProcessor):
    """Processor for any NLI data frame stored as a csv file
    (columns = premise | hypothesis | label)."""

    def read_and_clean_csv(self, path):
        """Load the csv at `path` and return a cleaned data frame.

        Rows containing missing values are dropped, the frame is passed
        through `filter_df_by_label`, the index is reset to 0..n-1 and
        `pre_process_nli_df` is applied.
        """
        df = pd.read_csv(path)
        df = filter_df_by_label(df.dropna()).reset_index(drop=True)
        # The return value is ignored, so this presumably edits `df`
        # in place -- TODO confirm against lr.text_processing.util.
        pre_process_nli_df(df)
        return df

    def get_train_examples(self, path):
        """Return the list of `InputExample`s for the training csv at `path`."""
        return self._create_examples(self.read_and_clean_csv(path), "train")

    def get_dev_examples(self, path):
        """Return the list of `InputExample`s for the dev csv at `path`."""
        return self._create_examples(self.read_and_clean_csv(path), "dev")

    def get_labels(self):
        """Return the label names; their order defines the ids in `get_label_map`."""
        return ["contradiction", "entailment", "neutral"]

    def get_label_map(self):
        """Return a dict mapping each label name to its position in `get_labels()`."""
        return {label: i for i, label in enumerate(self.get_labels())}

    def _create_examples(self, df, set_type):
        """Creates examples for the training and dev sets.

        Args:
            df: data frame with `premise`, `hypothesis` and `label` columns.
            set_type: prefix used in each example guid (e.g. "train", "dev").

        Returns:
            One `InputExample` per row, in row order, with guid
            "<set_type>-<index label of the row>".
        """
        # Walk the index and columns directly instead of `df.loc[i]` over
        # `range(n)`: the label-based lookup fails on any frame whose index
        # is not exactly 0..n-1 and builds a throw-away Series per row.
        rows = zip(df.index, df["premise"], df["hypothesis"], df["label"])
        return [InputExample(guid="{}-{}".format(set_type, idx),
                             text_a=premise,
                             text_b=hypothesis,
                             label=label)
                for idx, premise, hypothesis, label in rows]

In [3]:
# Locations of the toy train/dev splits (relative to the working directory).
train_path = "data/toy/train.csv"
dev_path = "data/toy/dev.csv"

# Read and clean both csv files and turn their rows into InputExample lists.
processor = NLIProcessor()
train_examples = processor.get_train_examples(train_path)
dev_examples = processor.get_dev_examples(dev_path)

In [4]:
# Display the first training example (guid, label, premise as text_a, hypothesis as text_b).
train_examples[0]

{
  "guid": "train-0",
  "label": "entailment",
  "text_a": "a couple playing with a little boy on the beach",
  "text_b": "a couple are playing with a young child outside"
}

In [5]:
# Display the first dev example (guid, label, premise as text_a, hypothesis as text_b).
dev_examples[0]

{
  "guid": "dev-0",
  "label": "neutral",
  "text_a": "an excited smiling woman stands at a red railing as she holds a boombox to one side",
  "text_b": "a tall human stanindg"
}

In [6]:
# Mapping from label name to integer id, in the order given by processor.get_labels().
label_map = processor.get_label_map()
label_map

{'contradiction': 0, 'entailment': 1, 'neutral': 2}

## draft

In [7]:
import logging
import os  # NOTE(review): not used in the visible cells -- confirm it is needed
# Module-level logger; convert_examples_to_features writes its progress messages to it.
logger = logging.getLogger(__name__)

In [8]:
from transformers import BertTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
pretrained_weights = 'bert-base-uncased'
# Tokenizer later passed to convert_examples_to_features.
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

In [9]:
def convert_examples_to_features(examples,
                                 tokenizer,
                                 label_map,
                                 max_length=512,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """Encode sentence-pair examples as fixed-length `InputFeatures`.

    Args:
        examples: sequence of objects exposing `text_a`, `text_b` and `label`.
        tokenizer: object whose `encode_plus(text_a, text_b, ...)` returns a
            mapping with "input_ids" and "token_type_ids" lists.
        label_map: dict mapping `example.label` to the label stored in the
            feature.
        max_length: length every encoded sequence is padded to; it is also
            forwarded to `encode_plus` as `max_length`.
        pad_on_left: if True padding is prepended, otherwise appended.
        pad_token: value used to pad `input_ids`.
        pad_token_segment_id: value used to pad `token_type_ids`.
        mask_padding_with_zero: if True the attention mask is 1 on real
            tokens and 0 on padding; if False the two values are swapped.

    Returns:
        A list with one `InputFeatures` per example, in input order.

    Raises:
        AssertionError: if a padded sequence does not have `max_length`
            items (e.g. the tokenizer returned more than `max_length` ids).
        KeyError: if an example label is missing from `label_map`.
    """
    # Mask values for real-token positions and for padding positions.
    real_mask, pad_mask = (1, 0) if mask_padding_with_zero else (0, 1)

    def _pad(sequence, fill, padding_length):
        """Add `padding_length` copies of `fill` on the configured side."""
        padding = [fill] * padding_length
        return padding + sequence if pad_on_left else sequence + padding

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            # Lazy logger arguments: the message is only formatted when the
            # record is actually emitted, and len() is only evaluated here.
            logger.info("Writing example %d/%d", ex_index, len(examples))

        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # Only real tokens are attended to; padding gets the opposite value.
        attention_mask = [real_mask] * len(input_ids)

        # All three sequences are padded by the amount input_ids is short of
        # max_length, so a length mismatch is still caught by the asserts.
        padding_length = max_length - len(input_ids)
        input_ids = _pad(input_ids, pad_token, padding_length)
        attention_mask = _pad(attention_mask, pad_mask, padding_length)
        token_type_ids = _pad(token_type_ids, pad_token_segment_id, padding_length)

        for padded in (input_ids, attention_mask, token_type_ids):
            assert len(padded) == max_length, "Error with input length {} vs {}".format(len(padded), max_length)

        features.append(InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      label=label_map[example.label]))
    return features


In [10]:
# Encode both splits with the same tokenizer and label mapping
# (all other convert_examples_to_features options keep their defaults).
train_features = convert_examples_to_features(train_examples,
                                              tokenizer,
                                              processor.get_label_map())
dev_features = convert_examples_to_features(dev_examples,
                                            tokenizer,
                                            processor.get_label_map())

# Sanity check: number of encoded examples per split.
print(len(train_features), len(dev_features))

800 200
