## Data augmentation techniques in natural language models

Summary adapted from https://github.com/makcedward/nlpaug

Germán Cheuque C.

In [None]:
pip install nlpaug

## 1. Character Augmenter:

OCRAug: Simulate OCR engine error

In [None]:
import os

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [None]:
# Sample sentence that every augmenter in this notebook is applied to.
text = "The quick brown fox jumps over the lazy dog ."

In [None]:
# Simulate OCR engine errors (this section demonstrates OcrAug, not KeyboardAug).
aug = nac.OcrAug()
# n=2 asks the augmenter for two augmented variants of the input.
augmented_text = aug.augment(text, n=2)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

## 2. KeyboardAug: Simulate keyboard distance error

In [None]:
# Character-level augmenter that simulates keyboard distance errors.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

## 3. Random Augmenter

### 3.1. Insert Character Randomly

In [None]:
# Random character augmenter configured to insert characters.
aug = nac.RandomCharAug(action="insert")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

### 3.2. Substitute character randomly

In [None]:
# Random character augmenter configured to substitute characters.
aug = nac.RandomCharAug(action="substitute")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

### 3.3 Swap character randomly

In [None]:
# Random character augmenter configured to swap characters.
aug = nac.RandomCharAug(action="swap")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

### 3.4 Delete character randomly

In [None]:
# Random character augmenter configured to delete characters.
aug = nac.RandomCharAug(action="delete")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

## 4. Word Augmenter

### 4.1 Substitute word by spelling mistake words dictionary

In [None]:
# Word-level augmenter that substitutes words using a spelling-mistake dictionary.
aug = naw.SpellingAug()
# n=3 asks the augmenter for three augmented variants of the input.
augmented_texts = aug.augment(text, n=3)

# Show the input and the augmented results, one item per line.
print("Original:", text, sep="\n")
print("Augmented Texts:", augmented_texts, sep="\n")

### 4.2 Word Embeddings Augmenter

#### Insert word randomly by word embeddings similarity

In [None]:
# Directory containing the pretrained embedding files. It is read from the
# MODEL_DIR environment variable and falls back to the current directory when
# the variable is unset, so this cell no longer depends on an undefined name.
model_dir = os.environ.get("MODEL_DIR", ".")

# Insert words chosen by word2vec similarity. os.path.join supplies the path
# separator whether or not MODEL_DIR ends with one.
aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=os.path.join(model_dir, 'GoogleNews-vectors-negative300.bin'),
    action="insert")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

#### Substitute word by word2vec similarity

In [None]:
# Directory containing the pretrained embedding files. It is read from the
# MODEL_DIR environment variable and falls back to the current directory when
# the variable is unset, so this cell no longer depends on an undefined name.
model_dir = os.environ.get("MODEL_DIR", ".")

# Substitute words chosen by word2vec similarity.
# model_type: word2vec, glove or fasttext
# os.path.join supplies the path separator whether or not MODEL_DIR ends with one.
aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=os.path.join(model_dir, 'GoogleNews-vectors-negative300.bin'),
    action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

### 4.3. Contextual Word Embeddings Augmenter

#### Insert word by contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet)

In [None]:
# Contextual word-embedding augmenter (bert-base-uncased) inserting new words.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

#### Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet)

In [None]:
# Contextual word-embedding augmenter (bert-base-uncased) substituting existing words.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

### 4.4. Synonym Augmenter

#### Substitute word by WordNet's synonym

In [None]:
# Synonym augmenter using WordNet as its synonym source.
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)

# Show the input and the augmented result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

#### Substitute word by PPDB's synonym

In [None]:
# Locate the PPDB file under the directory named by the MODEL_DIR environment
# variable. Fall back to the current directory when it is unset, because
# concatenating the None returned by os.environ.get would raise a TypeError.
# os.path.join supplies the path separator whether or not MODEL_DIR ends with one.
ppdb_path = os.path.join(os.environ.get("MODEL_DIR", "."), 'ppdb-2.0-s-all')

# Synonym augmenter using PPDB as its synonym source.
aug = naw.SynonymAug(aug_src='ppdb', model_path=ppdb_path)
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

## 5. Sentence Augmentation 

### 5.1 Contextual Word Embeddings for Sentence Augmenter

In [None]:
# Sentence-level augmenter built on a contextual language model.
# model_path: xlnet-base-cased or gpt2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')
# n=3 asks the augmenter for three augmented variants of the input.
augmented_texts = aug.augment(text, n=3)

# Show the input and the augmented results, one item per line.
print("Original:", text, sep="\n")
print("Augmented Texts:", augmented_texts, sep="\n")