Skip to content

Commit

Permalink
Merge pull request #97 from makcedward/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
makcedward committed Feb 6, 2020
2 parents 214decf + 24eb79d commit 9d4fb11
Show file tree
Hide file tree
Showing 15 changed files with 80 additions and 35 deletions.
10 changes: 10 additions & 0 deletions CHANGE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
NLPAUG Change Log
================

**0.0.12 Feb 5, 2020**
* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
* Fix replacing character in RandomCharAug error [#77](https://github.com/makcedward/nlpaug/issues/77)
* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)
* Fix word casing issue [#82](https://github.com/makcedward/nlpaug/issues/82)

**0.0.11 Dec 6, 2019**
* Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
* Support given background noise in audio's NoiseAug
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ pip install librosa>=0.7.1

## Recent Changes

**0.0.12dev Feb, 2020
**0.0.12 Feb 5, 2020
* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def __getattr__(cls, name):
# built documents.
#
# The short X.Y version.
version = '0.0.12dev'
version = '0.0.12'
# The full version, including alpha/beta/rc tags.
release = '0.0.12dev'
release = '0.0.12'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion nlpaug/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

__all__ = ['base_augmenter']

__version__ = '0.0.12dev'
__version__ = '0.0.12'
2 changes: 2 additions & 0 deletions nlpaug/augmenter/char/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def substitute(self, data):

result += self.sample(self.model.predict(chars[char_i]), 1)[0]

# No capitalization alignment as this augmenter tries to simulate typos

results.append(result)

return self.reverse_tokenizer(results)
Expand Down
2 changes: 2 additions & 0 deletions nlpaug/augmenter/char/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def substitute(self, data):

result += self.sample(self.model.predict(chars[char_i]), 1)[0]

# No capitalization alignment as this augmenter tries to simulate OCR engine errors

results.append(result)

return self.reverse_tokenizer(results)
Expand Down
8 changes: 8 additions & 0 deletions nlpaug/augmenter/char/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ def insert(self, data):
for char_i in aug_char_idxes:
chars.insert(char_i, self.sample(self.model, 1)[0])

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down Expand Up @@ -120,6 +122,8 @@ def substitute(self, data):

result += self.sample(self.model, 1)[0]

# No capitalization alignment as this augmenter tries to simulate random errors

results.append(result)

return self.reverse_tokenizer(results)
Expand Down Expand Up @@ -163,6 +167,8 @@ def swap(self, data):

result += self.sample(self.model, 1)[0]

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down Expand Up @@ -191,6 +197,8 @@ def delete(self, data):
for i in aug_char_idxes:
del chars[i]

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down
13 changes: 8 additions & 5 deletions nlpaug/augmenter/word/antonym.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ def substitute(self, data):
if aug_idxes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idxes:
results.append(token)
results.append(original_token)
continue

word_poses = PartOfSpeech.constituent2pos(pos[i][1])
Expand All @@ -88,14 +88,17 @@ def substitute(self, data):
for word_pos in word_poses:
candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

candidates = [c for c in candidates if c.lower() != token.lower()]
candidates = [c for c in candidates if c.lower() != original_token.lower()]

if len(candidates) == 0:
results.append(token)
results.append(original_token)
else:
candidate = self.sample(candidates, 1)[0]
candidate = candidate.replace("_", " ").replace("-", " ").lower()
results.append(self.align_capitalization(token, candidate))
results.append(self.align_capitalization(original_token, candidate))

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
11 changes: 7 additions & 4 deletions nlpaug/augmenter/word/spelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,21 @@ def substitute(self, data):
if aug_idexes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idexes:
results.append(token)
results.append(original_token)
continue

candidate_words = self.model.predict(token)
candidate_words = self.model.predict(original_token)
if candidate_words:
results.append(self.sample(candidate_words, 1)[0])
else:
# Unexpected scenario: no candidates found, so keep the original token
results.append(token)
results.append(original_token)

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
13 changes: 8 additions & 5 deletions nlpaug/augmenter/word/synonym.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def substitute(self, data):
if aug_idxes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idxes:
results.append(token)
results.append(original_token)
continue

word_poses = PartOfSpeech.constituent2pos(pos[i][1])
Expand All @@ -112,14 +112,17 @@ def substitute(self, data):
for word_pos in word_poses:
candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

candidates = [c for c in candidates if c.lower() != token.lower()]
candidates = [c for c in candidates if c.lower() != original_token.lower()]

if len(candidates) == 0:
results.append(token)
results.append(original_token)
else:
candidate = self.sample(candidates, 1)[0]
candidate = candidate.replace("_", " ").replace("-", " ").lower()
results.append(self.align_capitalization(token, candidate))
results.append(self.align_capitalization(original_token, candidate))

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
12 changes: 10 additions & 2 deletions nlpaug/augmenter/word/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ def insert(self, data):
new_word = self.sample(candidate_words, 1)[0]
results.insert(aug_idx, new_word)

if aug_idx == 0:
results[0] = results[0].capitalize()
if self.get_word_case(results[1]) == 'capitalize':
results[1] = results[1].lower()

return self.reverse_tokenizer(results)

def substitute(self, data):
Expand All @@ -131,12 +136,15 @@ def substitute(self, data):
return data

for aug_idx in aug_idxes:
original_word = results[aug_idx]
candidate_words = self.model.predict(original_word, top_k=self.top_k)
original_token = results[aug_idx]
candidate_words = self.model.predict(original_token, top_k=self.top_k)
substitute_word = self.sample(candidate_words, 1)[0]

results[aug_idx] = substitute_word

if aug_idx == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

def get_model(self, force_reload=False):
Expand Down
11 changes: 0 additions & 11 deletions nlpaug/augmenter/word/word_augmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,6 @@ def align_capitalization(self, src_token, dest_token):
return dest_token.capitalize()
return dest_token

# @classmethod
# def align_capitalization(cls, src_token, dest_token):
# # For whole word is upper case
# if src_token.isupper():
# return dest_token.upper()
# # For capitalize word
# elif src_token and src_token[0].isupper():
# return dest_token.capitalize()
# else:
# return dest_token

def _get_aug_idxes(self, tokens):
aug_cnt = self.generate_aug_cnt(len(tokens))
word_idxes = self.pre_skip_aug(tokens)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setup(
name="nlpaug",
version="0.0.11",
version="0.0.12",
author="Edward Ma",
author_email="makcedward@gmail.com",
url="https://github.com/makcedward/nlpaug",
Expand Down
4 changes: 2 additions & 2 deletions test/augmenter/word/test_random_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_swap(self):
aug = naw.RandomWordAug(action="swap")

for text in texts:
tokens = text.split(' ')
tokens = text.lower().split(' ')
orig_token_freq = {}
for w in tokens:
orig_token_freq[w] = tokens.count(w)
Expand All @@ -22,7 +22,7 @@ def test_swap(self):
for i in range(10):
augmented_text = aug.augment(augmented_text)

aug_tokens = augmented_text.split(' ')
aug_tokens = augmented_text.lower().split(' ')
aug_token_freq = {}
for w in tokens:
aug_token_freq[w] = aug_tokens.count(w)
Expand Down
19 changes: 18 additions & 1 deletion test/augmenter/word/test_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def test_stopwords_regex(self):

# https://github.com/makcedward/nlpaug/issues/82
def test_case(self):
# Swap case
# Swap
aug = naw.RandomWordAug(action='swap')
self.assertEqual('bB aA', aug.augment('aA bB'))
self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
Expand All @@ -175,6 +175,16 @@ def test_case(self):
self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

# Insert
aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
expected = False
for i in range(10):
augmented_text = aug.augment('Good')
if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
expected = True
break
self.assertTrue(expected)

# Substitute
aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
expected = False
Expand All @@ -185,6 +195,13 @@ def test_case(self):
break
self.assertTrue(expected)

aug = naw.AntonymAug()
self.assertEqual('Unhappy', aug.augment('Happy'))

# Do not change casing if the target word is not fully lowercase
aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
self.assertEqual('RE', aug.augment('Re'))

# Delete case
aug = naw.RandomWordAug(action='delete')
expected = False
Expand Down

0 comments on commit 9d4fb11

Please sign in to comment.