From 8ee28d0247cb1485a39b0d64bb36955c87ac2a07 Mon Sep 17 00:00:00 2001
From: Edward Ma <makcedward@gmail.com>
Date: Wed, 5 Feb 2020 20:08:49 -0800
Subject: [PATCH 1/2] Align capitalization

---
 nlpaug/augmenter/char/keyboard.py       |  2 ++
 nlpaug/augmenter/char/ocr.py            |  2 ++
 nlpaug/augmenter/char/random.py         |  8 ++++++++
 nlpaug/augmenter/word/antonym.py        | 13 ++++++++-----
 nlpaug/augmenter/word/spelling.py       | 11 +++++++----
 nlpaug/augmenter/word/synonym.py        | 13 ++++++++-----
 nlpaug/augmenter/word/tfidf.py          | 12 ++++++++++--
 nlpaug/augmenter/word/word_augmenter.py | 11 -----------
 test/augmenter/word/test_random_word.py |  4 ++--
 test/augmenter/word/test_word.py        | 19 ++++++++++++++++++-
 10 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/nlpaug/augmenter/char/keyboard.py b/nlpaug/augmenter/char/keyboard.py
index 707e2ea..751180d 100755
--- a/nlpaug/augmenter/char/keyboard.py
+++ b/nlpaug/augmenter/char/keyboard.py
@@ -90,6 +90,8 @@ def substitute(self, data):
 
             result += self.sample(self.model.predict(chars[char_i]), 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate typos
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/char/ocr.py b/nlpaug/augmenter/char/ocr.py
index dc23d33..7d4942b 100755
--- a/nlpaug/augmenter/char/ocr.py
+++ b/nlpaug/augmenter/char/ocr.py
@@ -78,6 +78,8 @@ def substitute(self, data):
 
             result += self.sample(self.model.predict(chars[char_i]), 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate OCR engine errors
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/char/random.py b/nlpaug/augmenter/char/random.py
index 7455474..9137ed3 100755
--- a/nlpaug/augmenter/char/random.py
+++ b/nlpaug/augmenter/char/random.py
@@ -88,6 +88,8 @@ def insert(self, data):
             for char_i in aug_char_idxes:
                 chars.insert(char_i, self.sample(self.model, 1)[0])
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
@@ -120,6 +122,8 @@ def substitute(self, data):
 
             result += self.sample(self.model, 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
@@ -163,6 +167,8 @@ def swap(self, data):
 
             result += self.sample(self.model, 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
@@ -191,6 +197,8 @@ def delete(self, data):
             for i in aug_char_idxes:
                 del chars[i]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
diff --git a/nlpaug/augmenter/word/antonym.py b/nlpaug/augmenter/word/antonym.py
index 57f47a9..75f263f 100755
--- a/nlpaug/augmenter/word/antonym.py
+++ b/nlpaug/augmenter/word/antonym.py
@@ -73,10 +73,10 @@ def substitute(self, data):
         if aug_idxes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idxes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
             word_poses = PartOfSpeech.constituent2pos(pos[i][1])
@@ -88,14 +88,17 @@ def substitute(self, data):
             for word_pos in word_poses:
                 candidates.extend(self.model.predict(pos[i][0], pos=word_pos))
 
-            candidates = [c for c in candidates if c.lower() != token.lower()]
+            candidates = [c for c in candidates if c.lower() != original_token.lower()]
 
             if len(candidates) == 0:
-                results.append(token)
+                results.append(original_token)
             else:
                 candidate = self.sample(candidates, 1)[0]
                 candidate = candidate.replace("_", " ").replace("-", " ").lower()
-                results.append(self.align_capitalization(token, candidate))
+                results.append(self.align_capitalization(original_token, candidate))
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/spelling.py b/nlpaug/augmenter/word/spelling.py
index cdfc264..3b3588a 100755
--- a/nlpaug/augmenter/word/spelling.py
+++ b/nlpaug/augmenter/word/spelling.py
@@ -73,18 +73,21 @@ def substitute(self, data):
         if aug_idexes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idexes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
-            candidate_words = self.model.predict(token)
+            candidate_words = self.model.predict(original_token)
             if candidate_words:
                 results.append(self.sample(candidate_words, 1)[0])
             else:
                 # Unexpected scenario. Adding original token
-                results.append(token)
+                results.append(original_token)
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/synonym.py b/nlpaug/augmenter/word/synonym.py
index 0a80352..a8bc150 100755
--- a/nlpaug/augmenter/word/synonym.py
+++ b/nlpaug/augmenter/word/synonym.py
@@ -97,10 +97,10 @@ def substitute(self, data):
         if aug_idxes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idxes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
             word_poses = PartOfSpeech.constituent2pos(pos[i][1])
@@ -112,14 +112,17 @@ def substitute(self, data):
             for word_pos in word_poses:
                 candidates.extend(self.model.predict(pos[i][0], pos=word_pos))
 
-            candidates = [c for c in candidates if c.lower() != token.lower()]
+            candidates = [c for c in candidates if c.lower() != original_token.lower()]
 
             if len(candidates) == 0:
-                results.append(token)
+                results.append(original_token)
             else:
                 candidate = self.sample(candidates, 1)[0]
                 candidate = candidate.replace("_", " ").replace("-", " ").lower()
-                results.append(self.align_capitalization(token, candidate))
+                results.append(self.align_capitalization(original_token, candidate))
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/tfidf.py b/nlpaug/augmenter/word/tfidf.py
index 667dc84..8cf20ee 100755
--- a/nlpaug/augmenter/word/tfidf.py
+++ b/nlpaug/augmenter/word/tfidf.py
@@ -120,6 +120,11 @@ def insert(self, data):
             new_word = self.sample(candidate_words, 1)[0]
             results.insert(aug_idx, new_word)
 
+            if aug_idx == 0:
+                results[0] = results[0].capitalize()
+                if self.get_word_case(results[1]) == 'capitalize':
+                    results[1] = results[1].lower()
+
         return self.reverse_tokenizer(results)
 
     def substitute(self, data):
@@ -131,12 +136,15 @@ def substitute(self, data):
             return data
 
         for aug_idx in aug_idxes:
-            original_word = results[aug_idx]
-            candidate_words = self.model.predict(original_word, top_k=self.top_k)
+            original_token = results[aug_idx]
+            candidate_words = self.model.predict(original_token, top_k=self.top_k)
             substitute_word = self.sample(candidate_words, 1)[0]
             results[aug_idx] = substitute_word
 
+            if aug_idx == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
+
         return self.reverse_tokenizer(results)
 
     def get_model(self, force_reload=False):
diff --git a/nlpaug/augmenter/word/word_augmenter.py b/nlpaug/augmenter/word/word_augmenter.py
index 5ef3f32..453d439 100755
--- a/nlpaug/augmenter/word/word_augmenter.py
+++ b/nlpaug/augmenter/word/word_augmenter.py
@@ -76,17 +76,6 @@ def align_capitalization(self, src_token, dest_token):
             return dest_token.capitalize()
         return dest_token
 
-    # @classmethod
-    # def align_capitalization(cls, src_token, dest_token):
-    #     # For whole word is upper case
-    #     if src_token.isupper():
-    #         return dest_token.upper()
-    #     # For capitalize word
-    #     elif src_token and src_token[0].isupper():
-    #         return dest_token.capitalize()
-    #     else:
-    #         return dest_token
-
     def _get_aug_idxes(self, tokens):
         aug_cnt = self.generate_aug_cnt(len(tokens))
         word_idxes = self.pre_skip_aug(tokens)
diff --git a/test/augmenter/word/test_random_word.py b/test/augmenter/word/test_random_word.py
index 9d4f23b..c1775d5 100755
--- a/test/augmenter/word/test_random_word.py
+++ b/test/augmenter/word/test_random_word.py
@@ -11,7 +11,7 @@ def test_swap(self):
         aug = naw.RandomWordAug(action="swap")
 
         for text in texts:
-            tokens = text.split(' ')
+            tokens = text.lower().split(' ')
             orig_token_freq = {}
             for w in tokens:
                 orig_token_freq[w] = tokens.count(w)
@@ -22,7 +22,7 @@ def test_swap(self):
             for i in range(10):
                 augmented_text = aug.augment(augmented_text)
 
-            aug_tokens = augmented_text.split(' ')
+            aug_tokens = augmented_text.lower().split(' ')
             aug_token_freq = {}
             for w in tokens:
                 aug_token_freq[w] = aug_tokens.count(w)
diff --git a/test/augmenter/word/test_word.py b/test/augmenter/word/test_word.py
index 7cd5ee8..a24d919 100755
--- a/test/augmenter/word/test_word.py
+++ b/test/augmenter/word/test_word.py
@@ -166,7 +166,7 @@ def test_stopwords_regex(self):
 
     # https://github.com/makcedward/nlpaug/issues/82
     def test_case(self):
-        # Swap case
+        # Swap
         aug = naw.RandomWordAug(action='swap')
         self.assertEqual('bB aA', aug.augment('aA bB'))
         self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
@@ -175,6 +175,16 @@ def test_case(self):
         self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
         self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))
 
+        # Insert
+        aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
+        expected = False
+        for i in range(10):
+            augmented_text = aug.augment('Good')
+            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
+                expected = True
+                break
+        self.assertTrue(expected)
+
         # Substitute
         aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
         expected = False
@@ -185,6 +195,13 @@ def test_case(self):
                 break
         self.assertTrue(expected)
 
+        aug = naw.AntonymAug()
+        self.assertEqual('Unhappy', aug.augment('Happy'))
+
+        # Do not change case when the target word is not lower case
+        aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
+        self.assertEqual('RE', aug.augment('Re'))
+
         # Delete case
         aug = naw.RandomWordAug(action='delete')
         expected = False

From 24eb79dbce7846c973e9f8381419f1124c36425a Mon Sep 17 00:00:00 2001
From: Edward Ma <makcedward@gmail.com>
Date: Wed, 5 Feb 2020 20:09:05 -0800
Subject: [PATCH 2/2] Release 0.0.12

---
 CHANGE.md          | 10 ++++++++++
 README.md          |  2 +-
 docs/conf.py       |  4 ++--
 nlpaug/__init__.py |  2 +-
 setup.py           |  2 +-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/CHANGE.md b/CHANGE.md
index f48834b..673526c 100755
--- a/CHANGE.md
+++ b/CHANGE.md
@@ -1,6 +1,16 @@
 NLPAUG Change Log
 ================
 
+**0.0.12 Feb 5, 2020
+* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
+* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
+* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
+* Fix replacing character in RandomCharAug error [#77](https://github.com/makcedward/nlpaug/issues/77)
+* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
+* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
+* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)
+* Fix word casing issue [#82](https://github.com/makcedward/nlpaug/issues/82)
+
 **0.0.11 Dec 6, 2019
 * Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
 * Support given background noise in audio's NoiseAug
diff --git a/README.md b/README.md
index ed812a3..ee7a7d4 100755
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ pip install librosa>=0.7.1
 
 ## Recent Changes
 
-**0.0.12dev Feb, 2020
+**0.0.12 Feb 5, 2020
 * ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
 * Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
 * Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
diff --git a/docs/conf.py b/docs/conf.py
index 861e37c..0fcc745 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -74,9 +74,9 @@ def __getattr__(cls, name):
 # built documents.
 #
 # The short X.Y version.
-version = '0.0.12dev'
+version = '0.0.12'
 # The full version, including alpha/beta/rc tags.
-release = '0.0.12dev'
+release = '0.0.12'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/nlpaug/__init__.py b/nlpaug/__init__.py
index 2231f48..5b405c2 100755
--- a/nlpaug/__init__.py
+++ b/nlpaug/__init__.py
@@ -3,4 +3,4 @@
 
 __all__ = ['base_augmenter']
 
-__version__ = '0.0.12dev'
+__version__ = '0.0.12'
diff --git a/setup.py b/setup.py
index e87a913..5a205a4 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name="nlpaug",
-    version="0.0.11",
+    version="0.0.12",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
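
Note for reviewers: the word augmenters in [PATCH 1/2] all hinge on one rule — when the first token of a sentence is replaced, the replacement keeps the original token's casing. Below is a minimal standalone Python sketch of that rule, mirroring the commented-out helper the patch deletes from word_augmenter.py; in nlpaug itself align_capitalization is an instance method on the word augmenter, not a free function.

def align_capitalization(src_token, dest_token):
    # Whole source word is upper case: 'RE' keeps the candidate fully upper
    if src_token.isupper():
        return dest_token.upper()
    # Source word is capitalized: antonym 'unhappy' for 'Happy' becomes 'Unhappy'
    if src_token and src_token[0].isupper():
        return dest_token.capitalize()
    # Lower-case source: leave the candidate untouched
    return dest_token

# The same expectations test_case exercises through AntonymAug and SpellingAug
assert align_capitalization('Happy', 'unhappy') == 'Unhappy'
assert align_capitalization('RE', 're') == 'RE'
assert align_capitalization('happy', 'unhappy') == 'unhappy'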
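
The insert action in tfidf.py needs a different treatment, since there is no original token to copy the casing from: the newly inserted first word takes the sentence-initial capital, and the displaced word is lower-cased only if it was merely capitalized. A sketch of that step, assuming str.istitle() approximates the library's get_word_case(...) == 'capitalize' check; realign_after_insert is a hypothetical name, not nlpaug API.

def realign_after_insert(tokens):
    # The new first token takes the sentence-initial capital
    tokens[0] = tokens[0].capitalize()
    # Lower-case the displaced word only if it was capitalized;
    # an all-caps word such as 'USA' keeps its casing
    if len(tokens) > 1 and tokens[1].istitle():
        tokens[1] = tokens[1].lower()
    return tokens

assert realign_after_insert(['new', 'Good', 'day']) == ['New', 'good', 'day']
assert realign_after_insert(['new', 'USA']) == ['New', 'USA']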