# Config

In [2]:
import os

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf

from nlpaug.util import Action



In [3]:
# Sample sentence used by every example below, split on single spaces into tokens.
text = 'The quick brown fox jumps over the lazy dog'
tokens = text.split(sep=' ')

# Show the token list that the augmenters will receive.
token_report = 'Token:{}'.format(tokens)
print(token_report)

Token:['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


# Character Augmentation

Augmenting data at the character level. Possible scenarios include image-to-text and chatbots. When recognizing text from an image, we need an optical character recognition (OCR) model, but OCR introduces errors such as confusing "o" and "0". `OcrAug` simulates these errors to perform data augmentation. For chatbots, typos still occur even though most applications come with word correction. Therefore, `QwertyAug` is introduced to simulate this kind of error.

## Substitute character by pre-defined OCR error

In [4]:
# OCR-error augmenter with its default settings.
aug = nac.OcrAug()

# Augment one token at a time (wrapped in a single-item list) and print
# the original next to the augmented result.
for token in tokens:
    augmented_token = aug.augment([token])[0]
    print('{} --> {}'.format(token, augmented_token))

The --> The
quick --> quick
brown --> bkown
fox --> fox
jumps --> jumps
over --> ovek
the --> the
lazy --> lazy
dog --> do9


## Substitute character by keyboard distance

In [5]:
# Keyboard-distance augmenter with its default settings.
aug = nac.QwertyAug()

# Augment one token at a time (wrapped in a single-item list) and print
# the original next to the augmented result.
for token in tokens:
    augmented_token = aug.augment([token])[0]
    print('{} --> {}'.format(token, augmented_token))

The --> Th2
quick --> quic.
brown --> browM
fox --> eox
jumps --> jumpE
over --> kver
the --> tne
lazy --> laXy
dog --> d(g


## Insert character randomly

In [6]:
# Random character augmenter configured with the INSERT action.
aug = nac.RandomCharAug(action=Action.INSERT)

# Augment one token at a time (wrapped in a single-item list) and print
# the original next to the augmented result.
for token in tokens:
    augmented_token = aug.augment([token])[0]
    print('{} --> {}'.format(token, augmented_token))

The --> 9The
quick --> quiack
brown --> brOown
fox --> feox
jumps --> jumcps
over --> ovKer
the --> thee
lazy --> l&azy
dog --> do&g


## Substitute character randomly

In [7]:
# Random character augmenter configured with the SUBSTITUTE action.
aug = nac.RandomCharAug(action=Action.SUBSTITUTE)

# Augment one token at a time (wrapped in a single-item list) and print
# the original next to the augmented result.
for token in tokens:
    augmented_token = aug.augment([token])[0]
    print('{} --> {}'.format(token, augmented_token))

The --> Th$
quick --> qu4ck
brown --> brnwn
fox --> fox
jumps --> jumpC
over --> oveu
the --> &he
lazy --> lazH
dog --> dUg


## Delete character randomly

In [8]:
# Random character augmenter configured with the DELETE action.
aug = nac.RandomCharAug(action=Action.DELETE)

# Augment one token at a time (wrapped in a single-item list) and print
# the original next to the augmented result.
for token in tokens:
    augmented_token = aug.augment([token])[0]
    print('{} --> {}'.format(token, augmented_token))

The --> Th
quick --> uick
brown --> bron
fox --> ox
jumps --> umps
over --> oer
the --> te
lazy --> azy
dog --> og


# Word Augmentation

Besides character augmentation, word-level augmentation is important as well. We make use of word2vec (Mikolov et al., 2013), GloVe (Pennington et al., 2014) and WordNet to insert and substitute similar words. `Word2vecAug` and `GloVeAug` use word embeddings to find the group of most similar words to replace the original word. On the other hand, WordNet provides the group of similar words through its synonym sets rather than through embeddings.

## Insert word randomly by word2vec similarity

In [9]:
# Word2vec-based augmenter configured with the INSERT action.
# The model file is located with os.path.join so that MODEL_DIR works with or
# without a trailing path separator. If MODEL_DIR is unset, the file name is
# resolved relative to the working directory instead of failing with a
# TypeError on `None + str`.
aug = naw.Word2vecAug(
    model_path=os.path.join(
        os.environ.get("MODEL_DIR", ""),
        'GoogleNews-vectors-negative300.bin'),
    action=Action.INSERT)

# Augment the whole token list at once and show it next to the original.
print('{} --> {}'.format(tokens, aug.augment(tokens)))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'Ralphs_Safeway', 'quick', 'brown', 'fox', 'jumps', 'Brunswig', 'over', 'the', 'lazy', 'dog']


## Substitute word by word2vec similarity

In [10]:
# Word2vec-based augmenter configured with the SUBSTITUTE action.
# The model file is located with os.path.join so that MODEL_DIR works with or
# without a trailing path separator. If MODEL_DIR is unset, the file name is
# resolved relative to the working directory instead of failing with a
# TypeError on `None + str`.
aug = naw.Word2vecAug(
    model_path=os.path.join(
        os.environ.get("MODEL_DIR", ""),
        'GoogleNews-vectors-negative300.bin'),
    action=Action.SUBSTITUTE)

# Augment the whole token list at once and show it next to the original.
print('{} --> {}'.format(tokens, aug.augment(tokens)))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'in', 'lazy', 'pit_bull']


## Insert word randomly by GloVe similarity

In [11]:
# GloVe-based augmenter configured with the INSERT action.
# The embedding file is located with os.path.join so that MODEL_DIR works with
# or without a trailing path separator. If MODEL_DIR is unset, the file name is
# resolved relative to the working directory instead of failing with a
# TypeError on `None + str`.
aug = naw.GloVeAug(
    model_path=os.path.join(
        os.environ.get("MODEL_DIR", ""),
        'glove.6B.50d.txt'),
    action=Action.INSERT)

# Augment the whole token list at once and show it next to the original.
print('{} --> {}'.format(tokens, aug.augment(tokens)))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'quick', 'brown', 'fox', 'jumps', 'ista', 'over', 'the', 'lazy', 'hirschson', 'dog']


## Substitute word by GloVe similarity

In [12]:
# GloVe-based augmenter configured with the SUBSTITUTE action.
# The embedding file is located with os.path.join so that MODEL_DIR works with
# or without a trailing path separator. If MODEL_DIR is unset, the file name is
# resolved relative to the working directory instead of failing with a
# TypeError on `None + str`.
aug = naw.GloVeAug(
    model_path=os.path.join(
        os.environ.get("MODEL_DIR", ""),
        'glove.6B.50d.txt'),
    action=Action.SUBSTITUTE)

# Augment the whole token list at once and show it next to the original.
print('{} --> {}'.format(tokens, aug.augment(tokens)))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'easy', 'gray', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


## Substitute word by synonym

In [13]:
# Synonym augmenter with its default settings.
aug = naw.SynonymAug()

# Augment the whole token list at once and show it next to the original.
augmented_tokens = aug.augment(tokens)
print('{} --> {}'.format(tokens, augmented_tokens))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'quick', 'Brown_University', 'fox', 'jumps', 'concluded', 'the', 'lazy', 'dog']


## Delete word randomly

In [14]:
# Random word augmenter with its default settings; in the recorded output
# below it drops words from the token list.
aug = naw.RandomWordAug()

# Augment the whole token list at once and show it next to the original.
augmented_tokens = aug.augment(tokens)
print('{} --> {}'.format(tokens, augmented_tokens))

tokens: 9 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
results: 7 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the']
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the']


# Flow Augmentation

To make use of multiple augmenters, the `Sequential` and `Sometimes` pipelines are introduced to connect augmenters.

## Apply different augmenters sequentially

In [15]:
# Build the two stages separately, then chain them in a Sequential flow:
# character insertion first, the random word augmenter second.
char_insert_aug = nac.RandomCharAug(action=Action.INSERT)
random_word_aug = naw.RandomWordAug()
aug = naf.Sequential([char_insert_aug, random_word_aug])

# Run the whole pipeline on the token list and show it next to the original.
augmented_tokens = aug.augment(tokens)
print('{} --> {}'.format(tokens, augmented_tokens))

before:  ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
after:  ['lThe', 'uquick', 'browDn', 'fAox', 'ljumps', 'ojver', 'th$e', 'laTzy', 'Zdog']
tokens: 9 ['lThe', 'uquick', 'browDn', 'fAox', 'ljumps', 'ojver', 'th$e', 'laTzy', 'Zdog']
results: 7 ['lThe', 'uquick', 'browDn', 'fAox', 'ojver', 'th$e', 'laTzy']
after:  ['lThe', 'uquick', 'browDn', 'fAox', 'ojver', 'th$e', 'laTzy']
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['lThe', 'uquick', 'browDn', 'fAox', 'ojver', 'th$e', 'laTzy']


## Apply some augmenters randomly

In [16]:
# Display the current contents of `tokens` before running the next flow.
tokens

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [17]:
# This section demonstrates applying augmenters randomly, so the flow must be
# `Sometimes` rather than `Sequential`; the original cell duplicated the
# Sequential example from the previous section.
# NOTE(review): per the nlpaug flow docs, Sometimes applies only a random
# subset of the listed augmenters on each call, so output varies between runs.
aug = naf.Sometimes([
    nac.RandomCharAug(action=Action.INSERT),
    naw.RandomWordAug()
])

# Run the pipeline on the token list and show it next to the original.
print('{} --> {}'.format(tokens, aug.augment(tokens)))

before:  ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
after:  ['Thve', 'qtuick', 'ibrown', 'fwox', '$jumps', 'oKver', 'tGhe', 'l2azy', 'Ddog']
tokens: 9 ['Thve', 'qtuick', 'ibrown', 'fwox', '$jumps', 'oKver', 'tGhe', 'l2azy', 'Ddog']
results: 7 ['qtuick', 'ibrown', 'fwox', '$jumps', 'oKver', 'tGhe', 'l2azy']
after:  ['Qtuick', 'ibrown', 'fwox', '$jumps', 'oKver', 'tGhe', 'l2azy']
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] --> ['Qtuick', 'ibrown', 'fwox', '$jumps', 'oKver', 'tGhe', 'l2azy']
