In [2]:
import os

# Directory containing the pre-trained embedding files that the word-level
# augmenter cells below look up through the MODEL_DIR environment variable.
os.environ.update(MODEL_DIR='../model')

# Config

In [2]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf

from nlpaug.util import Action

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# Sample sentence passed to every augmenter demonstrated in this notebook.
text = "The quick brown fox jumps over the lazy dog"
print(text)

The quick brown fox jumps over the lazy dog


# Character Augmentation

Augmenting data at the character level. Possible scenarios include image-to-text and chatbots. When recognizing text from an image, we need an optical character recognition (OCR) model, but OCR introduces errors such as confusing "o" and "0". `OcrAug` simulates these errors to perform data augmentation. For chatbots, typos still occur even though most applications come with word correction. Therefore, `QwertyAug` is introduced to simulate this kind of error.

## Substitute character by pre-defined OCR error

In [4]:
# Substitute characters using the pre-defined OCR error mapping.
aug = nac.OcrAug()
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The quick 6rown fux jumps over the lazy dog


## Substitute character by keyboard distance

In [5]:
# Substitute characters according to keyboard distance.
aug = nac.QwertyAug()
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
Tbe quiSk nrown fIx jKmps ov2r tje laAy don


## Insert character randomly

In [6]:
# Random character augmenter configured with the INSERT action.
aug = nac.RandomCharAug(action=Action.INSERT)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
T3he quicNk @brown fEox juamps $over th6e la1zy d*og


## Substitute character randomly

In [7]:
# Random character augmenter configured with the SUBSTITUTE action.
aug = nac.RandomCharAug(action=Action.SUBSTITUTE)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
ThN qDick brow0 foB jumks oveE t+e laz6 dBg


## Swap character randomly

In [4]:
# Random character augmenter configured with the SWAP action.
aug = nac.RandomCharAug(action=Action.SWAP)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
Hte quikc borwn fxo jupms ovre teh lzay dgo


## Delete character randomly

In [8]:
# Random character augmenter configured with the DELETE action.
aug = nac.RandomCharAug(action=Action.DELETE)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
Te quic rown fx jump ver he laz og


# Word Augmentation

Besides character augmentation, word-level augmentation is important as well. We make use of word2vec (Mikolov et al., 2013), GloVe (Pennington et al., 2014), fasttext (Joulin et al., 2016), BERT (Devlin et al., 2018) and WordNet to insert and substitute similar words. `Word2vecAug`, `GloVeAug` and `FasttextAug` use word embeddings to find the most similar group of words to replace the original word. On the other hand, `BertAug` uses a language model to predict a possible target word. `WordNetAug` uses WordNet synonyms to find a group of similar words.

## Insert word randomly by word2vec similarity

In [9]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelGoogleNews-vectors-negative300.bin' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.Word2vecAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'GoogleNews-vectors-negative300.bin'),
    action=Action.INSERT)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The quick brown fox jumps Alzeari over the lazy Superintendents dog


## Substitute word by word2vec similarity

In [10]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelGoogleNews-vectors-negative300.bin' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.Word2vecAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'GoogleNews-vectors-negative300.bin'),
    action=Action.SUBSTITUTE)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The easy brown fox jumps around the lazy dog


## Insert word randomly by GloVe similarity

In [11]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelglove.6B.50d.txt' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.GloVeAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'glove.6B.50d.txt'),
    action=Action.INSERT)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The quick progresul brown fox ogeechee jumps over the lazy dog


## Substitute word by GloVe similarity

In [12]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelglove.6B.50d.txt' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.GloVeAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'glove.6B.50d.txt'),
    action=Action.SUBSTITUTE)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The quick brown abc jumps over which lazy dog


## Insert word randomly by fasttext similarity

In [13]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelwiki-news-300d-1M.vec' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.FasttextAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'wiki-news-300d-1M.vec'),
    action=Action.INSERT)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The quick Leering brown fox jumps over ProLink the lazy dog


## Substitute word by fasttext similarity

In [14]:
# Build the model path with os.path.join: MODEL_DIR is set without a trailing
# separator, so plain string concatenation would yield
# '../modelwiki-news-300d-1M.vec' and the file would not be found.
# Indexing os.environ directly raises a clear KeyError if MODEL_DIR is unset.
aug = naw.FasttextAug(
    model_path=os.path.join(os.environ["MODEL_DIR"], 'wiki-news-300d-1M.vec'),
    action=Action.SUBSTITUTE)

augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The easy brown fox jumping over the lazy dog


## Insert word by BERT similarity

In [15]:
# BERT-based word augmenter configured with the INSERT action.
aug = naw.BertAug(action=Action.INSERT)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
even the quick brown fox usually jumps over the lazy dog


## Substitute word by BERT similarity

In [16]:
# BERT-based word augmenter configured with the SUBSTITUTE action.
aug = naw.BertAug(action=Action.SUBSTITUTE)
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
little quick brown fox jumps over the lazy dog


## Substitute word by synonym

In [17]:
# WordNet augmenter with its default settings (substitutes words by synonym).
aug = naw.WordNetAug()
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The straightaway brown fox jumps over the faineant dog


## Delete word randomly

In [18]:
# Random word augmenter with its default settings; in the recorded run below
# it removed words from the sentence.
aug = naw.RandomWordAug()
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
The brown jumps over the lazy dog


# Flow Augmentation

To combine multiple augmenters, the `Sequential` and `Sometimes` pipelines are introduced to connect them.

## Apply different augmenters sequentially

In [19]:
# Augmenters to chain: character insertion first, then the random word augmenter.
sequential_steps = [
    nac.RandomCharAug(action=Action.INSERT),
    naw.RandomWordAug(),
]
aug = naf.Sequential(sequential_steps)

# Leave the call as the last expression so the notebook displays its result.
aug.augment(text)

'&The b0rown jum@ps ovear %the 1lazy gdog'

## Apply some augmenters randomly

In [20]:
# Candidate augmenters for the Sometimes pipeline.
# NOTE(review): presumably only a random subset is applied on each call; the
# recorded output below equals the input sentence. Confirm against nlpaug docs.
sometimes_steps = [
    nac.RandomCharAug(action=Action.DELETE),
    nac.RandomCharAug(action=Action.INSERT),
    naw.RandomWordAug(),
]
aug = naf.Sometimes(sometimes_steps)

# Leave the call as the last expression so the notebook displays its result.
aug.augment(text)

'The quick brown fox jumps over the lazy dog'