Merge pull request #125 from makcedward/dev

Dev
makcedward · Apr 25, 2020 · d977348 · d977348
2 parents 092adb7 + 597733b
commit d977348
Show file tree

Hide file tree

Showing 8 changed files with 52 additions and 14 deletions.
diff --git a/CHANGE.md b/CHANGE.md
@@ -1,6 +1,13 @@
 NLPAUG Change Log
 ================
 
+**0.0.14 Apr 24, 2020
+*   Remove QWERTAug example (Replaced by KeyboardAug) [#110] (https://github.com/makcedward/nlpaug/issues/110)
+*   Fix [#117] (https://github.com/makcedward/nlpaug/issues/117), [#114] (https://github.com/makcedward/nlpaug/issues/114), [#111] (https://github.com/makcedward/nlpaug/issues/111),  [#105](https://github.com/makcedward/nlpaug/issues/105)
+*   Support Change Log [#116] (https://github.com/makcedward/nlpaug/issues/116)
+*   Fix typo [#123] (https://github.com/makcedward/nlpaug/issues/123)
+*   Support accepting candidates in RandomCharAug [#125] (https://github.com/makcedward/nlpaug/issues/125)
+
 **0.0.13 Feb 25, 2020
 *   Fix spectrogram tutorial notebook [#98] (https://github.com/makcedward/nlpaug/issues/98)
 *   Fix RandomWordAug missed aug_max parameter [#100] (https://github.com/makcedward/nlpaug/issues/100)

diff --git a/README.md b/README.md
@@ -43,6 +43,7 @@ This python library helps you with augmenting nlp for your machine learning proj
 *   [Example of Augmentation for Spectrogram Inputs](https://github.com/makcedward/nlpaug/blob/master/example/spectrogram_augmenter.ipynb)
 *   [Example of Augmentation for Audio Inputs](https://github.com/makcedward/nlpaug/blob/master/example/audio_augmenter.ipynb)
 *   [Example of Orchestra Multiple Augmenters](https://github.com/makcedward/nlpaug/blob/master/example/flow.ipynb)
+*   [Example of Showing Augmentation History](https://github.com/makcedward/nlpaug/blob/master/example/change_log.ipynb)
 *   How to train [TF-IDF model](https://github.com/makcedward/nlpaug/blob/master/example/tfidf-train_model.ipynb)
 *   How to create [custom augmentation](https://github.com/makcedward/nlpaug/blob/master/example/custom_augmenter.ipynb)
 *   [API Documentation](https://nlpaug.readthedocs.io/en/latest/)
@@ -121,9 +122,12 @@ pip install librosa>=0.7.1
 
 ## Recent Changes
 
-**0.0.14dev Mar, 2020
+**0.0.14 Apr 24, 2020
 *   Remove QWERTAug example (Replaced by KeyboardAug) [#110] (https://github.com/makcedward/nlpaug/issues/110)
-*   Fix [#117] (https://github.com/makcedward/nlpaug/issues/117)), [#114] (https://github.com/makcedward/nlpaug/issues/114)), [#111] (https://github.com/makcedward/nlpaug/issues/111)),  [#105](https://github.com/makcedward/nlpaug/issues/105))
+*   Fix [#117] (https://github.com/makcedward/nlpaug/issues/117), [#114] (https://github.com/makcedward/nlpaug/issues/114), [#111] (https://github.com/makcedward/nlpaug/issues/111),  [#105](https://github.com/makcedward/nlpaug/issues/105)
+*   Support Change Log [#116] (https://github.com/makcedward/nlpaug/issues/116)
+*   Fix typo [#123] (https://github.com/makcedward/nlpaug/issues/123)
+*   Support accepting candidates in RandomCharAug [#125] (https://github.com/makcedward/nlpaug/issues/125)
 
 **0.0.13 Feb 25, 2020
 *   Fix spectrogram tutorial notebook [#98] (https://github.com/makcedward/nlpaug/issues/98)

diff --git a/docs/conf.py b/docs/conf.py
@@ -74,9 +74,9 @@ def __getattr__(cls, name):
 # built documents.
 #
 # The short X.Y version.
-version = '0.0.13'
+version = '0.0.14'
 # The full version, including alpha/beta/rc tags.
-release = '0.0.13'
+release = '0.0.14'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/nlpaug/__init__.py b/nlpaug/__init__.py
@@ -3,4 +3,4 @@
 
 __all__ = ['base_augmenter']
 
-__version__ = '0.0.14dev'
+__version__ = '0.0.14'
diff --git a/nlpaug/augmenter/char/random.py b/nlpaug/augmenter/char/random.py
@@ -29,19 +29,25 @@ class RandomCharAug(CharAugmenter):
     :param int aug_word_max: Maximum number of words to be augmented. If None is passed, the number of augmentations
         is calculated via aug_word_p. If the result calculated from aug_word_p is smaller than aug_word_max, the
         calculated result is used. Otherwise, aug_word_max is used.
-    :param bool include_upper_case: If True, upper case character may be included in augmented data.
-    :param bool include_lower_case: If True, lower case character may be included in augmented data.
-    :param bool include_numeric: If True, numeric character may be included in augmented data.
+    :param bool include_upper_case: If True, upper case characters may be included in augmented data. If a
+        `candidiates` value is provided, this param will be ignored.
+    :param bool include_lower_case: If True, lower case characters may be included in augmented data. If a
+        `candidiates` value is provided, this param will be ignored.
+    :param bool include_numeric: If True, numeric characters may be included in augmented data. If a
+        `candidiates` value is provided, this param will be ignored.
     :param int min_char: If word less than this value, do not draw word for augmentation
     :param swap_mode: When action is 'swap', you may pass 'adjacent', 'middle' or 'random'. 'adjacent' means swap action
         only consider adjacent character (within same word). 'middle' means swap action consider adjacent character but
         not the first and last character of word. 'random' means swap action will be executed without constraint.
-    :param str spec_char: Special character may be included in augmented data.
+    :param str spec_char: Special characters that may be included in augmented data. If a `candidiates`
+        value is provided, this param will be ignored.
     :param list stopwords: List of words which will be skipped from augment operation.
     :param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
     :param func tokenizer: Customize tokenization process
     :param func reverse_tokenizer: Customize reverse of tokenization process
     :param bool include_detail: Change detail will be returned if it is True.
+    :param list candidiates: List of strings used for augmentation, e.g. ['AAA', '11', '===']. If a value is provided,
+        `include_upper_case`, `include_lower_case`, `include_numeric` and `spec_char` will be ignored.
     :param str name: Name of this augmenter.
 
     >>> import nlpaug.augmenter.char as nac
@@ -51,7 +57,8 @@ class RandomCharAug(CharAugmenter):
     def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
                  aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True,
                  include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None,
-                 tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, include_detail=False):
+                 tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, include_detail=False,
+                 candidiates=None):
         super().__init__(
             action=action, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
             aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
@@ -63,6 +70,7 @@ def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min
         self.include_numeric = include_numeric
         self.swap_mode = swap_mode
         self.spec_char = spec_char
+        self.candidiates = candidiates
 
         self.model = self.get_model()
 
@@ -94,8 +102,8 @@ def insert(self, data):
 
             new_token = ''.join(chars)
             change_seq += 1
-            doc.add_token(token_i, token=new_token, action=Action.INSERT,
-                          change_seq=self.parent_change_seq + change_seq)
+            doc.add_change_log(token_i, new_token=new_token, action=Action.INSERT,
+                                  change_seq=self.parent_change_seq + change_seq)
 
         if self.include_detail:
             return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
@@ -230,14 +238,18 @@ def delete(self, data):
             return self.reverse_tokenizer(doc.get_augmented_tokens())
 
     def get_model(self):
+        if self.candidiates:
+            return self.candidiates
+
         candidates = []
         if self.include_upper_case:
             candidates += string.ascii_uppercase
         if self.include_lower_case:
             candidates += string.ascii_lowercase
         if self.include_numeric:
             candidates += string.digits
-        candidates += self.spec_char
+        if self.spec_char:
+            candidates += self.spec_char
 
         return candidates
 

diff --git a/res/textual_example.png b/res/textual_example.png
diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name="nlpaug",
-    version="0.0.14dev",
+    version="0.0.14",
     author="Edward Ma",
     author_email="makcedward@gmail.com",
     url="https://github.com/makcedward/nlpaug",

diff --git a/test/augmenter/char/test_random_char.py b/test/augmenter/char/test_random_char.py
@@ -134,3 +134,18 @@ def test_swap_random(self):
         augmented_text = aug.augment(text)
         self.assertNotEqual(text, augmented_text)
         self.assertEqual(len(augmented_text), len(text))
+
+    def test_candidiates(self):
+        candidiates = ['AAA', '11', '===', '中文']
+        text = 'quick brown jumps over lazy'
+        aug = RandomCharAug(min_char=4, candidiates=candidiates)
+        augmented_text = aug.augment(text)
+        self.assertNotEqual(text, augmented_text)
+
+        match = False
+        for c in candidiates:
+            if c in augmented_text:
+                match = True
+                break
+
+        self.assertTrue(match)