# Config

In [2]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.flow as naf

from nlpaug.util import Action



In [3]:
# Sample sentence shared by the augmenter demos further down.
text = 'The quick brown fox jumps over the lazy dog'

# Split on single spaces so that every word becomes one token.
tokens = text.split(sep=' ')

print('Token:{}'.format(tokens))

Token:['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


# Character Augmenter

In [4]:
from nlpaug.augmenter.char import CharAugmenter

In [5]:
class CustomCharAug(CharAugmenter):
    """Character-level augmenter that swaps characters using a fixed lookup table.

    The augmenter registers itself with ``Action.SUBSTITUTE`` and keeps the
    table returned by :meth:`get_model` on ``self.model``.
    """

    def __init__(self, name='CustChar_Aug', aug_min=1, aug_p=0.3, tokenizer=None):
        """Forward the settings to ``CharAugmenter`` and load the lookup table.

        :param name: augmenter name, passed through to the base class
        :param aug_min: passed through to the base class
            (presumably the minimum number of augmented characters -- confirm
            against the nlpaug documentation)
        :param aug_p: passed through to the base class
            (presumably the fraction of characters to augment -- confirm)
        :param tokenizer: passed through to the base class; ``substitute``
            later calls ``self.tokenizer`` to break a token into characters
        """
        super().__init__(
            action=Action.SUBSTITUTE,
            name=name,
            aug_p=aug_p,
            aug_min=aug_min,
            tokenizer=tokenizer,
        )

        self.model = self.get_model()

    def substitute(self, tokens):
        """Build a new token list with some characters replaced via ``self.model``.

        :param tokens: list of token
        :return: list of token, one output string per input token
        """
        augmented_tokens = []

        for word in tokens:
            letters = self.tokenizer(word)
            change_cnt = self.generate_aug_cnt(len(letters))
            positions = list(range(len(letters)))
            chosen = self.sample(positions, change_cnt)

            pieces = []
            for pos, letter in enumerate(letters):
                # A character is replaced only when its position was sampled
                # AND the table holds at least one candidate for it.
                replaceable = (
                    pos in chosen
                    and letter in self.model
                    and len(self.model[letter]) >= 1
                )
                if replaceable:
                    pieces.append(self.sample(self.model[letter], 1)[0])
                else:
                    pieces.append(letter)

            augmented_tokens.append(''.join(pieces))

        return augmented_tokens

    def get_model(self):
        """Return the substitution table mapping a character to its replacement candidates."""
        return {'T': 't', 'h': 'H', 'e': 'E'}

# Build the augmenter with the defaults declared on CustomCharAug.__init__.
aug = CustomCharAug()

# Only the first token is demonstrated; slicing keeps the loop a no-op
# when `tokens` is empty.
for token in tokens[:1]:
    print('{} --> {}'.format(token, aug.augment([token])[0]))

The --> THe


# Word Augmenter

In [6]:
from nlpaug.augmenter.word import WordAugmenter

In [7]:
class CustomWordAug(WordAugmenter):
    """Word-level augmenter that inserts words drawn from a fixed vocabulary.

    The augmenter registers itself with ``Action.INSERT`` and keeps the
    vocabulary returned by :meth:`get_model` on ``self.model``.
    """

    def __init__(self, name='CustomWord_Aug', aug_min=1, aug_p=0.3, tokenizer=None):
        """Forward the settings to ``WordAugmenter`` and load the vocabulary.

        :param name: augmenter name, passed through to the base class
        :param aug_min: passed through to the base class
            (presumably the minimum number of insertions -- confirm against
            the nlpaug documentation)
        :param aug_p: passed through to the base class
            (presumably the fraction of tokens to augment -- confirm)
        :param tokenizer: passed through to the base class
        """
        super().__init__(
            action=Action.INSERT,
            name=name,
            aug_p=aug_p,
            aug_min=aug_min,
            tokenizer=tokenizer,
        )

        self.model = self.get_model()

    def insert(self, tokens):
        """Return a copy of ``tokens`` with vocabulary words inserted.

        The input list itself is left untouched; all insertions happen on a copy.

        :param tokens: list of token
        :return: list of token
        """
        augmented = tokens.copy()

        insert_cnt = self.generate_aug_cnt(len(tokens))
        positions = list(range(len(tokens)))
        chosen = self.sample(positions, insert_cnt)

        # Walk the sampled positions from highest to lowest so an insertion
        # never shifts a position that is still waiting to be processed.
        for position in sorted(chosen, reverse=True):
            extra_word = self.sample(self.model, 1)[0]
            augmented.insert(position, extra_word)

        return augmented

    def get_model(self):
        """Return the list of words that may be inserted."""
        return ['Custom1', 'Custom2']
        
        

# Build the augmenter with the defaults declared on CustomWordAug.__init__.
aug = CustomWordAug()

# NOTE(review): each call receives a one-element list, so the only position
# `insert` can sample is 0 and the new word lands in front of the token.
# `[0]` therefore shows the inserted word, not the original token -- this
# assumes `augment` dispatches to `insert`; confirm against nlpaug.
for token in tokens:
    first_item = aug.augment([token])[0]
    print('{} --> {}'.format(token, first_item))

The --> Custom1
quick --> Custom1
brown --> Custom1
fox --> Custom2
jumps --> Custom1
over --> Custom2
the --> Custom2
lazy --> Custom1
dog --> Custom1
