Merge pull request #91 from microsoft/staging
Staging to master after PR #36
saidbleik committed Jun 10, 2019
2 parents 4b81887 + 2afdb73 commit 42340a7
Showing 7 changed files with 1,291 additions and 28 deletions.
621 changes: 621 additions & 0 deletions scenarios/named_entity_recognition/ner_wikigold_bert.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions scenarios/text_classification/tc_mnli_bert.ipynb
@@ -26,7 +26,7 @@
"from sklearn.model_selection import train_test_split\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.eval.classification import eval_classification\n",
"from utils_nlp.bert.sequence_classification import SequenceClassifier\n",
"from utils_nlp.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.bert.common import Language, Tokenizer\n",
"from utils_nlp.common.timer import Timer\n",
"import torch\n",
@@ -307,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"classifier = SequenceClassifier(\n",
"classifier = BERTSequenceClassifier(\n",
" language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n",
")"
]
@@ -508,7 +508,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -522,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.5.5"
}
},
"nbformat": 4,
8 changes: 4 additions & 4 deletions scenarios/text_classification/tc_yahoo_answers_bert.ipynb
@@ -24,7 +24,7 @@
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"import utils_nlp.dataset.yahoo_answers as ya_dataset\n",
"from utils_nlp.eval.classification import eval_classification\n",
"from utils_nlp.bert.sequence_classification import SequenceClassifier\n",
"from utils_nlp.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.bert.common import Language, Tokenizer\n",
"from utils_nlp.common.timer import Timer\n",
"import torch\n",
@@ -166,7 +166,7 @@
"metadata": {},
"outputs": [],
"source": [
"classifier = SequenceClassifier(\n",
"classifier = BERTSequenceClassifier(\n",
" language=Language.ENGLISH, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n",
")"
]
@@ -402,7 +402,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -416,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.5.5"
}
},
"nbformat": 4,
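
Both classification notebooks pick up the rename; apart from the kernel metadata moving from Python 3.6.8 to 3.5.5, the only source change in each is the import and the constructor call. A minimal sketch of the updated usage, with placeholder values for the names each notebook defines earlier (num_labels, BERT_CACHE_DIR):

from utils_nlp.bert.common import Language
from utils_nlp.bert.sequence_classification import BERTSequenceClassifier  # formerly SequenceClassifier

num_labels = 5              # placeholder: number of classes in the dataset
BERT_CACHE_DIR = "./temp"   # placeholder: where pretrained weights are cached

classifier = BERTSequenceClassifier(
    language=Language.ENGLISH, num_labels=num_labels, cache_dir=BERT_CACHE_DIR
)
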
209 changes: 204 additions & 5 deletions utils_nlp/bert/common.py
@@ -3,6 +3,16 @@

from pytorch_pretrained_bert.tokenization import BertTokenizer
from enum import Enum
import warnings
import torch

from torch.utils.data import (
DataLoader,
RandomSampler,
SequentialSampler,
TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler

# Max supported sequence length
BERT_MAX_LEN = 512
@@ -27,19 +37,20 @@ def __init__(
Args:
language (Language, optional): The pretrained model's language.
Defaults to Language.ENGLISH.
cache_dir (str, optional): Location of BERT's cache directory. Defaults to ".".
cache_dir (str, optional): Location of BERT's cache directory.
Defaults to ".".
"""
self.tokenizer = BertTokenizer.from_pretrained(
language.value, do_lower_case=to_lower, cache_dir=cache_dir
)
self.language = language

def tokenize(self, text):
"""Uses a BERT tokenizer
"""Uses a BERT tokenizer
Args:
text (list): List of strings (documents) to tokenize.
Returns:
[list]: List of token lists, one per input document.
"""
Expand All @@ -51,7 +62,7 @@ def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):
- add BERT sentence markers ([CLS] and [SEP])
- map tokens to indices
- pad and truncate sequences
- create an input_mask
- create an input_mask
Args:
tokens (list): List of tokens to preprocess.
max_len (int, optional): Maximum number of tokens
@@ -78,3 +89,191 @@ def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):
# create input mask
input_mask = [[min(1, x) for x in y] for y in tokens]
return tokens, input_mask

def preprocess_ner_tokens(
self,
text,
max_len=BERT_MAX_LEN,
labels=None,
label_map=None,
trailing_piece_tag="X",
):
"""
Preprocesses input text, involving the following steps
0. Tokenize input text.
1. Convert string tokens to token ids.
2. Convert input labels to label ids, if labels and label_map are
provided.
3. If a word is tokenized into multiple pieces of tokens by the
WordPiece tokenizer, label the extra tokens with
trailing_piece_tag.
4. Pad or truncate input text according to max_seq_length
5. Create input_mask for masking out padded tokens.
Args:
text (list): List of input sentences/paragraphs.
max_len (int, optional): Maximum length of the list of
tokens. Lists longer than this are truncated; shorter ones
are zero-padded (their labels are padded with "O"). Default
value is BERT_MAX_LEN=512.
labels (list, optional): List of token label lists. Default
value is None.
label_map (dict, optional): Dictionary for mapping original token
labels (which may be string type) to integers. Default value
is None.
trailing_piece_tag (str, optional): Tag used to label trailing
word pieces. For example, "playing" is broken into "play"
and "##ing", "play" preserves its original label and "##ing"
is labeled as trailing_piece_tag. Default value is "X".
Returns:
tuple: A tuple containing the following three or four lists.
1. input_ids_all: List of lists. Each sublist contains
numerical values, i.e. token ids, corresponding to the
tokens in the input text data.
2. input_mask_all: List of lists. Each sublist
contains the attention mask of the input token id list,
1 for input tokens and 0 for padded tokens, so that
padded tokens are not attended to.
3. trailing_token_mask: List of lists. Each sublist is
a boolean list, True for the first word piece of each
original word, False for the trailing word pieces,
e.g. "##ing". This mask is useful for removing the
predictions on trailing word pieces, so that each
original word in the input text has a unique predicted
label.
4. label_ids_all: List of lists of numerical labels,
each sublist contains the token labels of an input
sentence/paragraph, if labels is provided.
"""
if max_len > BERT_MAX_LEN:
warnings.warn(
"setting max_len to max allowed tokens: {}".format(
BERT_MAX_LEN
)
)
max_len = BERT_MAX_LEN

label_available = True
if labels is None:
label_available = False
# create an artificial label list for creating trailing token mask
labels = ["O"] * len(text)

input_ids_all = []
input_mask_all = []
label_ids_all = []
trailing_token_mask_all = []
for t, t_labels in zip(text, labels):
new_labels = []
tokens = []
for word, tag in zip(t.split(), t_labels):
sub_words = self.tokenizer.tokenize(word)
for count, sub_word in enumerate(sub_words):
if count > 0:
tag = trailing_piece_tag
new_labels.append(tag)
tokens.append(sub_word)

if len(tokens) > max_len:
tokens = tokens[:max_len]
new_labels = new_labels[:max_len]

input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

# The mask has 1 for real tokens and 0 for padding tokens.
# Only real tokens are attended to.
input_mask = [1.0] * len(input_ids)

# Zero-pad up to the max sequence length.
padding = [0.0] * (max_len - len(input_ids))
label_padding = ["O"] * (max_len - len(input_ids))

input_ids += padding
input_mask += padding
new_labels += label_padding

trailing_token_mask_all.append(
[
True if label != trailing_piece_tag else False
for label in new_labels
]
)

if label_map:
label_ids = [label_map[label] for label in new_labels]
else:
label_ids = new_labels

input_ids_all.append(input_ids)
input_mask_all.append(input_mask)
label_ids_all.append(label_ids)

if label_available:
return (
input_ids_all,
input_mask_all,
trailing_token_mask_all,
label_ids_all,
)
else:
return input_ids_all, input_mask_all, trailing_token_mask_all


def create_data_loader(
input_ids,
input_mask,
label_ids=None,
sample_method="random",
batch_size=32,
):
"""
Create a dataloader for sampling and serving data batches.
Args:
input_ids (list): List of lists. Each sublist contains numerical
values, i.e. token ids, corresponding to the tokens in the input
text data.
input_mask (list): List of lists. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
padded tokens, so that padded tokens are not attended to.
label_ids (list, optional): List of lists of numerical labels,
each sublist contains the token labels of an input
sentence/paragraph. Default value is None.
sample_method (str, optional): Order of data sampling. Accepted
values are "random", "sequential" and "distributed". Default
value is "random".
batch_size (int, optional): Number of samples used in each training
iteration. Default value is 32.
Returns:
DataLoader: A PyTorch DataLoader containing the input_ids tensor,
input_mask tensor, and label_ids (if provided) tensor.
"""
input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)

if label_ids:
label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)
tensor_data = TensorDataset(
input_ids_tensor, input_mask_tensor, label_ids_tensor
)
else:
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)

if sample_method == "random":
sampler = RandomSampler(tensor_data)
elif sample_method == "sequential":
sampler = SequentialSampler(tensor_data)
elif sample_method == "distributed":
sampler = DistributedSampler(tensor_data)
else:
raise ValueError(
"Invalid sample_method value, accepted values are: "
"random, sequential, and distributed"
)

dataloader = DataLoader(
tensor_data, sampler=sampler, batch_size=batch_size
)

return dataloader
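
For orientation, a minimal sketch of how the two new helpers compose for NER preprocessing. The sentence, labels, label map, and max_len below are invented for illustration, and the Tokenizer keyword arguments are assumed to match the docstring above:

from utils_nlp.bert.common import Language, Tokenizer, create_data_loader

# toy example; the wikigold notebook loads real sentences and tags instead
text = ["Johannes Brahms was born in Hamburg ."]
labels = [["I-PER", "I-PER", "O", "O", "O", "I-LOC", "O"]]
# the map must include the trailing-piece tag, since extra word pieces are labeled "X"
label_map = {"O": 0, "I-PER": 1, "I-LOC": 2, "X": 3}

tokenizer = Tokenizer(language=Language.ENGLISH, cache_dir=".")

# tokenize, convert tokens to ids, tag trailing word pieces, pad/truncate to max_len
input_ids, input_mask, trailing_mask, label_ids = tokenizer.preprocess_ner_tokens(
    text, max_len=128, labels=labels, label_map=label_map, trailing_piece_tag="X"
)

# wrap the preprocessed lists in a DataLoader that shuffles batches during training
train_dataloader = create_data_loader(
    input_ids, input_mask, label_ids=label_ids, sample_method="random", batch_size=32
)

At scoring time the same call with labels=None returns only the first three lists, and create_data_loader can be given sample_method="sequential" so predictions stay aligned with the input order.
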
32 changes: 17 additions & 15 deletions utils_nlp/bert/sequence_classification.py
@@ -8,21 +8,22 @@
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.bert.common import BERT_MAX_LEN, Language
from utils_nlp.bert.common import Language
from utils_nlp.pytorch.device_utils import get_device, move_to_device


class SequenceClassifier:
class BERTSequenceClassifier:
"""BERT-based sequence classifier"""

def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
"""Initializes the classifier and the underlying pretrained model.
Args:
language (Language, optional): The pretrained model's language.
Defaults to Language.ENGLISH.
num_labels (int, optional): The number of unique labels in the training data.
Defaults to 2.
cache_dir (str, optional): Location of BERT's cache directory. Defaults to ".".
num_labels (int, optional): The number of unique labels in the
training data. Defaults to 2.
cache_dir (str, optional): Location of BERT's cache directory.
Defaults to ".".
"""
if num_labels < 2:
raise Exception("Number of labels should be at least 2.")
@@ -54,14 +55,15 @@ def fit(
labels (list): List of training labels.
device (str, optional): Device used for training ("cpu" or "gpu").
Defaults to "gpu".
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs will be used.
Defaults to None.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs
will be used. Defaults to None.
num_epochs (int, optional): Number of training epochs.
Defaults to 1.
batch_size (int, optional): Training batch size. Defaults to 32.
lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
verbose (bool, optional): If True, shows the training progress and loss values.
Defaults to True.
verbose (bool, optional): If True, shows the training progress and
loss values. Defaults to True.
"""

device = get_device("cpu" if num_gpus == 0 else "gpu")
@@ -142,14 +144,14 @@ def fit(
del [x_batch, y_batch, mask_batch]
torch.cuda.empty_cache()

def predict(self, token_ids, input_mask, num_gpus=1, batch_size=32):
def predict(self, token_ids, input_mask, num_gpus=None, batch_size=32):
"""Scores the given dataset and returns the predicted classes.
Args:
token_ids (list): List of token id lists to score.
input_mask (list): List of input mask lists.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs will be used.
Defaults to 1.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs
will be used. Defaults to None.
batch_size (int, optional): Scoring batch size. Defaults to 32.
Returns:
[ndarray]: Predicted classes.
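
And a sketch of the end-to-end classification flow the renamed class supports. The toy data and max_len are invented; the fit keyword names (token_ids, input_mask, labels) are assumed from the predict docstring above, since the full fit signature sits outside this hunk:

from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.bert.sequence_classification import BERTSequenceClassifier

# toy data; the notebooks load MultiNLI / Yahoo Answers instead
train_text = ["the cat sat on the mat", "stocks rallied on friday"]
train_labels = [0, 1]

tokenizer = Tokenizer(language=Language.ENGLISH, cache_dir=".")
tokens = tokenizer.tokenize(train_text)
# add [CLS]/[SEP], map tokens to ids, pad/truncate, and build the attention mask
token_ids, input_mask = tokenizer.preprocess_classification_tokens(tokens, max_len=128)

classifier = BERTSequenceClassifier(language=Language.ENGLISH, num_labels=2, cache_dir=".")
classifier.fit(
    token_ids=token_ids,
    input_mask=input_mask,
    labels=train_labels,
    num_epochs=1,
    batch_size=32,
    verbose=True,
)

# with num_gpus left as None, all available GPUs are used (the new default)
preds = classifier.predict(token_ids, input_mask, batch_size=32)
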
