# Using Pre-trained Word Embeddings

In [1]:
!pip install gluonnlp
!pip install -U sacremoses

Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/83/29/c7dffbfc39f8dd8bb9314df7aaf92a67f6c7826ed35d546c8fa63d6e5925/gluonnlp-0.8.3.tar.gz (236kB)
[K    100% |████████████████████████████████| 245kB 27.9MB/s ta 0:00:01
Building wheels for collected packages: gluonnlp
  Running setup.py bdist_wheel for gluonnlp ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/50/6e/32/521aa84da7f9ee725d3c9be0b5e0d771df659bf25da5929f6c
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.8.3
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K    100% |████████████████████████████████| 870kB 25.0MB/s ta 0:00:01
[?2

In [2]:
import mxnet as mx
from mxnet import gluon
from mxnet import nd
import gluonnlp as nlp

import re
import io
import time
import multiprocessing as mp
import numpy as np

In [3]:
ctx = mx.gpu(0) if mx.test_utils.list_gpus() else mx.cpu()

In [4]:
text = " hello world \n hello nice world \n hi world \n"

We need a tokenizer to process this string

In [5]:
def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
    return filter(None, re.split(token_delim + '|' + seq_delim, source_str))
counter = nlp.data.count_tokens(simple_tokenize(text))

In [6]:
counter

Counter({'hello': 2, 'world': 3, 'nice': 1, 'hi': 1})

In [7]:
vocab = nlp.Vocab(counter)

In [8]:
vocab.idx_to_token

['<unk>', '<pad>', '<bos>', '<eos>', 'world', 'hello', 'hi', 'nice']

In [9]:
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.simple')

Embedding file wiki.simple.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /home/ec2-user/.mxnet/embedding/fasttext/wiki.simple.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/fasttext/wiki.simple.npz...


In [10]:
vocab.set_embedding(fasttext_simple)

In [11]:
vocab.embedding['beautiful']


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 300 @cpu(0)>

In [12]:
vocab.embedding['hello', 'world'][:, :5]


[[ 0.39567   0.21454  -0.035389 -0.24299  -0.095645]
 [ 0.10444  -0.10858   0.27212   0.13299  -0.33165 ]]
<NDArray 2x5 @cpu(0)>

## Application of Pre-trained Word Embeddings

In [13]:
embedding = nlp.embedding.create('glove', source='glove.6B.50d')

Embedding file glove.6B.50d.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /home/ec2-user/.mxnet/embedding/glove/glove.6B.50d.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.50d.npz...


In [14]:
vocab = nlp.Vocab(nlp.data.Counter(embedding.idx_to_token))
vocab.set_embedding(embedding)

In [15]:
len(vocab.idx_to_token)

400004

In [16]:
print(vocab['beautiful'])
print(vocab.idx_to_token[71424])

71424
beautiful


### Word Similarity

![](support/cosinesimilarity.png)

In [17]:
def cos_sim(x, y):
    return nd.dot(x, y) / (nd.norm(x) * nd.norm(y))

In [18]:
def norm_vecs_by_row(x):
    return x / nd.sqrt(nd.sum(x * x, axis=1)).reshape((-1,1))

def get_knn(vocab, k, word):
    word_vec = vocab.embedding[word].reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
    dot_prod = nd.dot(vocab_vecs[4:], word_vec)
    indices = nd.topk(dot_prod.squeeze(), k=k+1, ret_typ='indices')
    indices = [int(i.asscalar())+4 for i in indices]
    # Remove unknown and input tokens.
    return vocab.to_tokens(indices[1:])

In [19]:
get_knn(vocab, 5, 'baby')

['babies', 'boy', 'girl', 'newborn', 'pregnant']

We can verify the cosine similarity of vectors of 'baby' and 'babies'.

In [20]:
cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])


[0.83871305]
<NDArray 1 @cpu(0)>

Let us find the 5 most similar words of 'beautiful' from the vocabulary.

In [21]:
get_knn(vocab, 5, 'beautiful')

['lovely', 'gorgeous', 'wonderful', 'charming', 'beauty']

### Word Analogy

# Sentence Embeddings with Pretrained ELMo



<img align="middle" src="https://pbs.twimg.com/profile_images/1092451830758547457/EqQ6Csl3_400x400.jpg" />

In [22]:
elmo_intro = """
Extensive experiments demonstrate that ELMo representations work extremely well in practice.
We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.
The addition of ELMo representations alone significantly improves the state of the art in every case, including up to 20% relative error reductions.
For tasks where direct comparisons are possible, ELMo outperforms CoVe (McCann et al., 2017), which computes contextualized representations using a neural machine translation encoder.
Finally, an analysis of both ELMo and CoVe reveals that deep representations outperform those derived from just the top layer of an LSTM.
Our trained models and code are publicly available, and we expect that ELMo will provide similar gains for many other NLP problems.
"""

elmo_intro_file = 'elmo_intro.txt'
with io.open(elmo_intro_file, 'w', encoding='utf8') as f:
    f.write(elmo_intro)

dataset = nlp.data.TextLineDataset(elmo_intro_file, 'utf8')
print(len(dataset))
print(dataset[2]) # print an example sentence from the input data

7
We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.


## Data Transform

### Tokenization

In [23]:
tokenizer = nlp.data.NLTKMosesTokenizer()
dataset = dataset.transform(tokenizer)
dataset = dataset.transform(lambda x: ['<bos>'] + x + ['<eos>'])
print(dataset[2]) # print the same tokenized sentence as above
print(len(dataset[2]))

  'NLTK or relevant packages are not installed. '


['<bos>', 'We', 'first', 'show', 'that', 'they', 'can', 'be', 'easily', 'added', 'to', 'existing', 'models', 'for', 'six', 'diverse', 'and', 'challenging', 'language', 'understanding', 'problems', ',', 'including', 'textual', 'entailment', ',', 'question', 'answering', 'and', 'sentiment', 'analysis', '.', '<eos>']
33


### Pretrained ELMo Vocab

In [24]:
vocab = nlp.vocab.ELMoCharVocab()
dataset = dataset.transform(lambda x: (vocab[x], len(x)), lazy=False)

### DataLoader

In [25]:
batch_size = 2
dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
                                              nlp.data.batchify.Stack())
data_loader = gluon.data.DataLoader(dataset,
                                    batch_size=batch_size,
                                    batchify_fn=dataset_batchify_fn)

  'Padding value is not given and will be set automatically to 0 '


## Load pretrained ELMo Model

In [26]:
elmo_bilm, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
                                   dataset_name='gbw',
                                   pretrained=True,
                                   ctx=mx.cpu())
#print(elmo_bilm)

Downloading /home/ec2-user/.mxnet/models/elmo_2x1024_128_2048cnn_1xhighway_gbw-8c9257d9.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/elmo_2x1024_128_2048cnn_1xhighway_gbw-8c9257d9.zip...


![](support/elmo_arch.png)

## Get sentence Features from Elmo

In [27]:
def get_features(data, valid_lengths):
    length = data.shape[1]
    hidden_state = elmo_bilm.begin_state(mx.nd.zeros, batch_size=batch_size)
    mask = mx.nd.arange(length).expand_dims(0).broadcast_axes(axis=(0,), size=(batch_size,))
    mask = mask < valid_lengths.expand_dims(1).astype('float32')
    output, hidden_state = elmo_bilm(data, hidden_state, mask)
    return output


In [28]:
batch = next(iter(data_loader))
features = get_features(*batch)
print([x.shape for x in features])

[(2, 14, 256), (2, 14, 256), (2, 14, 256)]


# Finetuning BERT for sentence classification

<img align="middle" src="https://miro.medium.com/max/854/1*oUpWrMdvDWcWE_QSne-jOw.jpeg" />

## Get BERT base model

In [29]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

Vocab file is not found. Downloading.
Downloading /home/ec2-user/.mxnet/models/1579802082.77426book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /home/ec2-user/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


![](support/bert.png)

## Data Preprocessing


In [30]:
train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
                               for segment in ('train', 'test')]

In [31]:
def process_label(x):
    data, label = x
    # Label is a review score from 1 to 10. We take 6..10 as a positive sentiment
    # and 1..5 as a negative
    label = int(label > 5)
    return [data, label]

def process_dataset(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.SimpleDataset(pool.map(process_label, dataset))
    end = time.time()
    print('Done! Label processing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset

train_dataset = process_dataset(train_dataset)
test_dataset = process_dataset(test_dataset)

Done! Label processing Time=1.50s, #Sentences=25000
Done! Label processing Time=1.51s, #Sentences=25000


## Data preprocessing for BERT

In [32]:
from gluonnlp.data import BERTSentenceTransform

class BERTDatasetTransform(object):
    def __init__(self, tokenizer, max_seq_length, class_labels=None,
                 label_alias=None, pad=True, pair=True, has_label=True):
        self.class_labels = class_labels
        self.has_label = has_label
        self._label_dtype = 'int32' if class_labels else 'float32'
        
        if has_label and class_labels:
            self._label_map = {}
            for (i, label) in enumerate(class_labels):
                self._label_map[label] = i
            if label_alias:
                for key in label_alias:
                    self._label_map[key] = self._label_map[label_alias[key]]
        
        self._bert_xform = BERTSentenceTransform(
            tokenizer, max_seq_length, pad=pad, pair=pair)

    def __call__(self, line):
        if self.has_label:
            input_ids, valid_length, segment_ids = self._bert_xform(line[:-1])
            label = line[-1]
            # map to int if class labels are available
            if self.class_labels:
                label = self._label_map[label]
            label = np.array([label], dtype=self._label_dtype)
            return input_ids, valid_length, segment_ids, label
        else:
            return self._bert_xform(line)


In [33]:
# Use the vocabulary from pre-trained model for tokenization
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

# The maximum length of an input sequence
max_len = 500

# The labels for the two classes
all_labels = [0, 1]

transform = BERTDatasetTransform(bert_tokenizer, max_len,
                                 class_labels=all_labels,
                                 has_label=True,
                                 pad=True,
                                 pair=False)

data_train = train_dataset.transform(transform)
data_test = test_dataset.transform(transform)


In [34]:
sample_id = 5
print(vocabulary)
print('%s token id = %s' % (vocabulary.padding_token, vocabulary[vocabulary.padding_token]))
print('%s token id = %s' % (vocabulary.cls_token, vocabulary[vocabulary.cls_token]))
print('%s token id = %s' % (vocabulary.sep_token, vocabulary[vocabulary.sep_token]))
print('token ids = %s' % data_train[sample_id][0][:11])
print('valid length = %s' % data_train[sample_id][1])
print('label = %s' % data_train[sample_id][3])

Vocab(size=30522, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
token ids = [    2  2023  3475  1005  1056  1996 21699  5863  3766  1010  4496]
valid length = 270
label = [1]


## Classifier Model using BERT

In [35]:
class BERTClassifier(gluon.nn.Block):
    def __init__(self, bert, num_classes=2, dropout=0.0, prefix=None, params=None):
        super(BERTClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert

        with self.name_scope():
            self.classifier = gluon.nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(gluon.nn.Dropout(rate=dropout))
            self.classifier.add(gluon.nn.Dense(units=num_classes))

    def forward(self, inputs, token_types, valid_length=None):
        _, pooler_out = self.bert(inputs, token_types, valid_length)
        return self.classifier(pooler_out)

In [36]:
model = BERTClassifier(bert_base, num_classes=2, dropout=0.1)
# only need to initialize the classifier layer.
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
#bert_classifier.hybridize(static_alloc=True)

## Loss, Trainer, DataLoader

In [37]:
batch_size = 10
lr = 5e-6
log_interval = 300
num_epochs = 1
grad_clip = 1

loss_function = mx.gluon.loss.SoftmaxCELoss()
trainer = mx.gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr, 'epsilon': 1e-9})

train_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=10)
test_dataloader = mx.gluon.data.DataLoader(data_test, batch_size=batch_size, shuffle=False, num_workers=10)

## Evaluation Metric

In [38]:
#accuracy
metric = mx.metric.Accuracy()

def evaluate(model, dataloader, context):
    metric = mx.metric.Accuracy()
    step_loss = 0
    
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(dataloader):
        token_ids = token_ids.as_in_context(ctx)
        valid_length = valid_length.as_in_context(ctx)
        segment_ids = segment_ids.as_in_context(ctx)
        label = label.as_in_context(ctx)

        out = model(token_ids, segment_ids, valid_length.astype('float32'))
        ls = loss_function(out, label).mean()

        step_loss += ls.asscalar()
        metric.update([label], [out])

    return metric.get()[1], step_loss / len(dataloader)

## Training Loop

In [39]:
def train(model, ctx, num_epochs):
    metric = mx.metric.Accuracy()
    # Collect all differentiable parameters for gradient clipping
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']

    for epoch_id in range(num_epochs):
        metric.reset()
        step_loss = 0
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            with mx.autograd.record():
                out = model(token_ids, segment_ids, valid_length.astype('float32'))
                ls = loss_function(out, label).mean()

            ls.backward()

            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            trainer.update(1)

            step_loss += ls.asscalar()
            metric.update([label], [out])

            if (batch_id + 1) % (log_interval) == 0:
                print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                             .format(epoch_id, batch_id + 1, len(train_dataloader),
                                     step_loss / log_interval, trainer.learning_rate, metric.get()[1]))
                step_loss = 0
        
        test_acc, test_loss = evaluate(model, test_dataloader, ctx)
        print('[Epoch {}] test_loss={:.4f}, test_acc={:.3f}'
             .format(epoch_id, test_loss, test_acc))

In [40]:
train(model, ctx, num_epochs)

MXNetError: [17:55:08] /home/ubuntu/pip_build/mxnet-build/3rdparty/mshadow/mshadow/././././cuda/tensor_gpu-inl.cuh:110: Check failed: err == cudaSuccess (2 vs. 0) : Name: MapPlanKernel ErrStr:out of memory
Stack trace:
  [bt] (0) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x6b4e0b) [0x7fb5d2bcee0b]
  [bt] (1) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x5064646) [0x7fb5d757e646]
  [bt] (2) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x507bc84) [0x7fb5d7595c84]
  [bt] (3) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x6772606) [0x7fb5d8c8c606]
  [bt] (4) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x375) [0x7fb5d5cf6305]
  [bt] (5) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36ef064) [0x7fb5d5c09064]
  [bt] (6) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36fd071) [0x7fb5d5c17071]
  [bt] (7) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x3700690) [0x7fb5d5c1a690]
  [bt] (8) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x3700926) [0x7fb5d5c1a926]



## Test with example reveiw

In [None]:
review_text = 'I would like to say something positive about this movie, and I can\'t'
review_transformed = transform((review_text, 0))

token_ids =  mx.nd.array(review_transformed[0], ctx=ctx).reshape(1, -1)
segment_ids =  mx.nd.array(review_transformed[2], ctx=ctx).reshape(1, -1)
valid_length = mx.nd.array(review_transformed[1], ctx=ctx).reshape(1)

In [None]:
positive_review_probability = model(token_ids, segment_ids, valid_length.astype('float32')).softmax()
print('"{}" is {:.1f}% likely to be positive'.format(
    review_text,
    100 * positive_review_probability[0][1].asscalar()))