Merge pull request #91 from microsoft/staging
Staging to master after PR #36
saidbleik committed Jun 10, 2019
2 parents 4b81887 + 2afdb73 commit 42340a7
Showing 7 changed files with 1,291 additions and 28 deletions.
621 changes: 621 additions & 0 deletions scenarios/named_entity_recognition/ner_wikigold_bert.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions scenarios/text_classification/tc_mnli_bert.ipynb
@@ -26,7 +26,7 @@
"from sklearn.model_selection import train_test_split\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.eval.classification import eval_classification\n",
"from utils_nlp.bert.sequence_classification import SequenceClassifier\n",
"from utils_nlp.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.bert.common import Language, Tokenizer\n",
"from utils_nlp.common.timer import Timer\n",
"import torch\n",
@@ -307,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"classifier = SequenceClassifier(\n",
"classifier = BERTSequenceClassifier(\n",
" language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n",
")"
]
@@ -508,7 +508,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -522,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.5.5"
}
},
"nbformat": 4,
8 changes: 4 additions & 4 deletions scenarios/text_classification/tc_yahoo_answers_bert.ipynb
@@ -24,7 +24,7 @@
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"import utils_nlp.dataset.yahoo_answers as ya_dataset\n",
"from utils_nlp.eval.classification import eval_classification\n",
"from utils_nlp.bert.sequence_classification import SequenceClassifier\n",
"from utils_nlp.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.bert.common import Language, Tokenizer\n",
"from utils_nlp.common.timer import Timer\n",
"import torch\n",
@@ -166,7 +166,7 @@
"metadata": {},
"outputs": [],
"source": [
"classifier = SequenceClassifier(\n",
"classifier = BERTSequenceClassifier(\n",
" language=Language.ENGLISH, num_labels=num_labels, cache_dir=BERT_CACHE_DIR\n",
")"
]
@@ -402,7 +402,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
@@ -416,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.5.5"
}
},
"nbformat": 4,
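
Both classification notebooks pick up the rename; apart from the kernel metadata moving from Python 3.6.8 to 3.5.5, the only source change in each is the import and the constructor call. A minimal sketch of the updated usage, with placeholder values for the names each notebook defines earlier (num_labels, BERT_CACHE_DIR):

from utils_nlp.bert.common import Language
from utils_nlp.bert.sequence_classification import BERTSequenceClassifier  # formerly SequenceClassifier

num_labels = 5              # placeholder: number of classes in the dataset
BERT_CACHE_DIR = "./temp"   # placeholder: where pretrained weights are cached

classifier = BERTSequenceClassifier(
    language=Language.ENGLISH, num_labels=num_labels, cache_dir=BERT_CACHE_DIR
)
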
209 changes: 204 additions & 5 deletions utils_nlp/bert/common.py
@@ -3,6 +3,16 @@

from pytorch_pretrained_bert.tokenization import BertTokenizer
from enum import Enum
import warnings
import torch

from torch.utils.data import (
DataLoader,
RandomSampler,
SequentialSampler,
TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler

# Max supported sequence length
BERT_MAX_LEN = 512
@@ -27,19 +37,20 @@ def __init__(
Args:
language (Language, optional): The pretrained model's language.
Defaults to Language.ENGLISH.
cache_dir (str, optional): Location of BERT's cache directory. Defaults to ".".
cache_dir (str, optional): Location of BERT's cache directory.
Defaults to ".".
"""
self.tokenizer = BertTokenizer.from_pretrained(
language.value, do_lower_case=to_lower, cache_dir=cache_dir
)
self.language = language

def tokenize(self, text):
"""Uses a BERT tokenizer
"""Uses a BERT tokenizer
Args:
text (list): List of strings (documents) to tokenize.
Returns:
[list]: List of token lists, one per input document.
"""
Expand All @@ -51,7 +62,7 @@ def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):
- add BERT sentence markers ([CLS] and [SEP])
- map tokens to indices
- pad and truncate sequences
- create an input_mask
- create an input_mask
Args:
tokens (list): List of tokens to preprocess.
max_len (int, optional): Maximum number of tokens
@@ -78,3 +89,191 @@ def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN):
# create input mask
input_mask = [[min(1, x) for x in y] for y in tokens]
return tokens, input_mask

def preprocess_ner_tokens(
self,
text,
max_len=BERT_MAX_LEN,
labels=None,
label_map=None,
trailing_piece_tag="X",
):
"""
Preprocesses input text, involving the following steps
0. Tokenize input text.
1. Convert string tokens to token ids.
2. Convert input labels to label ids, if labels and label_map are
provided.
3. If a word is tokenized into multiple pieces of tokens by the
WordPiece tokenizer, label the extra tokens with
trailing_piece_tag.
4. Pad or truncate input text according to max_seq_length
5. Create input_mask for masking out padded tokens.
Args:
text (list): List of input sentences/paragraphs.
max_len (int, optional): Maximum length of the list of
tokens. Lists longer than this are truncated; shorter ones
are zero-padded (their labels are padded with "O"). Default
value is BERT_MAX_LEN=512.
labels (list, optional): List of token label lists. Default
value is None.
label_map (dict, optional): Dictionary for mapping original token
labels (which may be string type) to integers. Default value
is None.
trailing_piece_tag (str, optional): Tag used to label trailing
word pieces. For example, "playing" is broken into "play"
and "##ing", "play" preserves its original label and "##ing"
is labeled as trailing_piece_tag. Default value is "X".
Returns:
tuple: A tuple containing the following three or four lists.
1. input_ids_all: List of lists. Each sublist contains
numerical values, i.e. token ids, corresponding to the
tokens in the input text data.
2. input_mask_all: List of lists. Each sublist
contains the attention mask of the input token id list,
1 for input tokens and 0 for padded tokens, so that
padded tokens are not attended to.
3. trailing_token_mask: List of lists. Each sublist is
a boolean list, True for the first word piece of each
original word, False for the trailing word pieces,
e.g. "##ing". This mask is useful for removing the
predictions on trailing word pieces, so that each
original word in the input text has a unique predicted
label.
4. label_ids_all: List of lists of numerical labels,
each sublist contains the token labels of an input
sentence/paragraph, if labels is provided.
"""
if max_len > BERT_MAX_LEN:
warnings.warn(
"setting max_len to max allowed tokens: {}".format(
BERT_MAX_LEN
)
)
max_len = BERT_MAX_LEN

label_available = True
if labels is None:
label_available = False
# create an artificial label list for creating trailing token mask
labels = ["O"] * len(text)

input_ids_all = []
input_mask_all = []
label_ids_all = []
trailing_token_mask_all = []
for t, t_labels in zip(text, labels):
new_labels = []
tokens = []
for word, tag in zip(t.split(), t_labels):
sub_words = self.tokenizer.tokenize(word)
for count, sub_word in enumerate(sub_words):
if count > 0:
tag = trailing_piece_tag
new_labels.append(tag)
tokens.append(sub_word)

if len(tokens) > max_len:
tokens = tokens[:max_len]
new_labels = new_labels[:max_len]

input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

# The mask has 1 for real tokens and 0 for padding tokens.
# Only real tokens are attended to.
input_mask = [1.0] * len(input_ids)

# Zero-pad up to the max sequence length.
padding = [0.0] * (max_len - len(input_ids))
label_padding = ["O"] * (max_len - len(input_ids))

input_ids += padding
input_mask += padding
new_labels += label_padding

trailing_token_mask_all.append(
[
True if label != trailing_piece_tag else False
for label in new_labels
]
)

if label_map:
label_ids = [label_map[label] for label in new_labels]
else:
label_ids = new_labels

input_ids_all.append(input_ids)
input_mask_all.append(input_mask)
label_ids_all.append(label_ids)

if label_available:
return (
input_ids_all,
input_mask_all,
trailing_token_mask_all,
label_ids_all,
)
else:
return input_ids_all, input_mask_all, trailing_token_mask_all


def create_data_loader(
input_ids,
input_mask,
label_ids=None,
sample_method="random",
batch_size=32,
):
"""
Create a dataloader for sampling and serving data batches.
Args:
input_ids (list): List of lists. Each sublist contains numerical
values, i.e. token ids, corresponding to the tokens in the input
text data.
input_mask (list): List of lists. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
padded tokens, so that padded tokens are not attended to.
label_ids (list, optional): List of lists of numerical labels,
each sublist contains the token labels of an input
sentence/paragraph. Default value is None.
sample_method (str, optional): Order of data sampling. Accepted
values are "random", "sequential" and "distributed". Default
value is "random".
batch_size (int, optional): Number of samples used in each training
iteration. Default value is 32.
Returns:
DataLoader: A PyTorch DataLoader containing the input_ids tensor,
input_mask tensor, and label_ids (if provided) tensor.
"""
input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)

if label_ids:
label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)
tensor_data = TensorDataset(
input_ids_tensor, input_mask_tensor, label_ids_tensor
)
else:
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)

if sample_method == "random":
sampler = RandomSampler(tensor_data)
elif sample_method == "sequential":
sampler = SequentialSampler(tensor_data)
elif sample_method == "distributed":
sampler = DistributedSampler(tensor_data)
else:
raise ValueError(
"Invalid sample_method value, accepted values are: "
"random, sequential, and distributed"
)

dataloader = DataLoader(
tensor_data, sampler=sampler, batch_size=batch_size
)

return dataloader
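
For orientation, a minimal sketch of how the two new helpers compose for NER preprocessing. The sentence, labels, label map, and max_len below are invented for illustration, and the Tokenizer keyword arguments are assumed to match the docstring above:

from utils_nlp.bert.common import Language, Tokenizer, create_data_loader

# toy example; the wikigold notebook loads real sentences and tags instead
text = ["Johannes Brahms was born in Hamburg ."]
labels = [["I-PER", "I-PER", "O", "O", "O", "I-LOC", "O"]]
# the map must include the trailing-piece tag, since extra word pieces are labeled "X"
label_map = {"O": 0, "I-PER": 1, "I-LOC": 2, "X": 3}

tokenizer = Tokenizer(language=Language.ENGLISH, cache_dir=".")

# tokenize, convert tokens to ids, tag trailing word pieces, pad/truncate to max_len
input_ids, input_mask, trailing_mask, label_ids = tokenizer.preprocess_ner_tokens(
    text, max_len=128, labels=labels, label_map=label_map, trailing_piece_tag="X"
)

# wrap the preprocessed lists in a DataLoader that shuffles batches during training
train_dataloader = create_data_loader(
    input_ids, input_mask, label_ids=label_ids, sample_method="random", batch_size=32
)

At scoring time the same call with labels=None returns only the first three lists, and create_data_loader can be given sample_method="sequential" so predictions stay aligned with the input order.
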
32 changes: 17 additions & 15 deletions utils_nlp/bert/sequence_classification.py
@@ -8,21 +8,22 @@
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.bert.common import BERT_MAX_LEN, Language
from utils_nlp.bert.common import Language
from utils_nlp.pytorch.device_utils import get_device, move_to_device


class SequenceClassifier:
class BERTSequenceClassifier:
"""BERT-based sequence classifier"""

def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
"""Initializes the classifier and the underlying pretrained model.
Args:
language (Language, optional): The pretrained model's language.
Defaults to Language.ENGLISH.
num_labels (int, optional): The number of unique labels in the training data.
Defaults to 2.
cache_dir (str, optional): Location of BERT's cache directory. Defaults to ".".
num_labels (int, optional): The number of unique labels in the
training data. Defaults to 2.
cache_dir (str, optional): Location of BERT's cache directory.
Defaults to ".".
"""
if num_labels < 2:
raise Exception("Number of labels should be at least 2.")
@@ -54,14 +55,15 @@ def fit(
labels (list): List of training labels.
device (str, optional): Device used for training ("cpu" or "gpu").
Defaults to "gpu".
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs will be used.
Defaults to None.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs
will be used. Defaults to None.
num_epochs (int, optional): Number of training epochs.
Defaults to 1.
batch_size (int, optional): Training batch size. Defaults to 32.
lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
verbose (bool, optional): If True, shows the training progress and loss values.
Defaults to True.
verbose (bool, optional): If True, shows the training progress and
loss values. Defaults to True.
"""

device = get_device("cpu" if num_gpus == 0 else "gpu")
@@ -142,14 +144,14 @@ def fit(
del [x_batch, y_batch, mask_batch]
torch.cuda.empty_cache()

def predict(self, token_ids, input_mask, num_gpus=1, batch_size=32):
def predict(self, token_ids, input_mask, num_gpus=None, batch_size=32):
"""Scores the given dataset and returns the predicted classes.
Args:
token_ids (list): List of token id lists to score.
input_mask (list): List of input mask lists.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs will be used.
Defaults to 1.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs
will be used. Defaults to None.
batch_size (int, optional): Scoring batch size. Defaults to 32.
Returns:
[ndarray]: Predicted classes.
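
And a sketch of the end-to-end classification flow the renamed class supports. The toy data and max_len are invented; the fit keyword names (token_ids, input_mask, labels) are assumed from the predict docstring above, since the full fit signature sits outside this hunk:

from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.bert.sequence_classification import BERTSequenceClassifier

# toy data; the notebooks load MultiNLI / Yahoo Answers instead
train_text = ["the cat sat on the mat", "stocks rallied on friday"]
train_labels = [0, 1]

tokenizer = Tokenizer(language=Language.ENGLISH, cache_dir=".")
tokens = tokenizer.tokenize(train_text)
# add [CLS]/[SEP], map tokens to ids, pad/truncate, and build the attention mask
token_ids, input_mask = tokenizer.preprocess_classification_tokens(tokens, max_len=128)

classifier = BERTSequenceClassifier(language=Language.ENGLISH, num_labels=2, cache_dir=".")
classifier.fit(
    token_ids=token_ids,
    input_mask=input_mask,
    labels=train_labels,
    num_epochs=1,
    batch_size=32,
    verbose=True,
)

# with num_gpus left as None, all available GPUs are used (the new default)
preds = classifier.predict(token_ids, input_mask, batch_size=32)
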
