From 8ee28d0247cb1485a39b0d64bb36955c87ac2a07 Mon Sep 17 00:00:00 2001
From: Edward Ma <makcedward@gmail.com>
Date: Wed, 5 Feb 2020 20:08:49 -0800
Subject: [PATCH 1/2] Align capitalization

---
 nlpaug/augmenter/char/keyboard.py       |  2 ++
 nlpaug/augmenter/char/ocr.py            |  2 ++
 nlpaug/augmenter/char/random.py         |  8 ++++++++
 nlpaug/augmenter/word/antonym.py        | 13 ++++++++-----
 nlpaug/augmenter/word/spelling.py       | 11 +++++++----
 nlpaug/augmenter/word/synonym.py        | 13 ++++++++-----
 nlpaug/augmenter/word/tfidf.py          | 12 ++++++++++--
 nlpaug/augmenter/word/word_augmenter.py | 11 -----------
 test/augmenter/word/test_random_word.py |  4 ++--
 test/augmenter/word/test_word.py        | 19 ++++++++++++++++++-
 10 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/nlpaug/augmenter/char/keyboard.py b/nlpaug/augmenter/char/keyboard.py
index 707e2ea..751180d 100755
--- a/nlpaug/augmenter/char/keyboard.py
+++ b/nlpaug/augmenter/char/keyboard.py
@@ -90,6 +90,8 @@ def substitute(self, data):
 
             result += self.sample(self.model.predict(chars[char_i]), 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate typos
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/char/ocr.py b/nlpaug/augmenter/char/ocr.py
index dc23d33..7d4942b 100755
--- a/nlpaug/augmenter/char/ocr.py
+++ b/nlpaug/augmenter/char/ocr.py
@@ -78,6 +78,8 @@ def substitute(self, data):
 
             result += self.sample(self.model.predict(chars[char_i]), 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate OCR engine errors
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/char/random.py b/nlpaug/augmenter/char/random.py
index 7455474..9137ed3 100755
--- a/nlpaug/augmenter/char/random.py
+++ b/nlpaug/augmenter/char/random.py
@@ -88,6 +88,8 @@ def insert(self, data):
             for char_i in aug_char_idxes:
                 chars.insert(char_i, self.sample(self.model, 1)[0])
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
@@ -120,6 +122,8 @@ def substitute(self, data):
 
             result += self.sample(self.model, 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             results.append(result)
 
         return self.reverse_tokenizer(results)
@@ -163,6 +167,8 @@ def swap(self, data):
 
             result += self.sample(self.model, 1)[0]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
@@ -191,6 +197,8 @@ def delete(self, data):
             for i in aug_char_idxes:
                 del chars[i]
 
+            # No capitalization alignment as this augmenter tries to simulate random errors
+
             result = ''.join(chars)
             results.append(result)
 
diff --git a/nlpaug/augmenter/word/antonym.py b/nlpaug/augmenter/word/antonym.py
index 57f47a9..75f263f 100755
--- a/nlpaug/augmenter/word/antonym.py
+++ b/nlpaug/augmenter/word/antonym.py
@@ -73,10 +73,10 @@ def substitute(self, data):
         if aug_idxes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idxes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
             word_poses = PartOfSpeech.constituent2pos(pos[i][1])
@@ -88,14 +88,17 @@ def substitute(self, data):
             for word_pos in word_poses:
                 candidates.extend(self.model.predict(pos[i][0], pos=word_pos))
 
-            candidates = [c for c in candidates if c.lower() != token.lower()]
+            candidates = [c for c in candidates if c.lower() != original_token.lower()]
 
             if len(candidates) == 0:
-                results.append(token)
+                results.append(original_token)
             else:
                 candidate = self.sample(candidates, 1)[0]
                 candidate = candidate.replace("_", " ").replace("-", " ").lower()
-                results.append(self.align_capitalization(token, candidate))
+                results.append(self.align_capitalization(original_token, candidate))
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/spelling.py b/nlpaug/augmenter/word/spelling.py
index cdfc264..3b3588a 100755
--- a/nlpaug/augmenter/word/spelling.py
+++ b/nlpaug/augmenter/word/spelling.py
@@ -73,18 +73,21 @@ def substitute(self, data):
         if aug_idexes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idexes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
-            candidate_words = self.model.predict(token)
+            candidate_words = self.model.predict(original_token)
             if candidate_words:
                 results.append(self.sample(candidate_words, 1)[0])
             else:
                 # Unexpected scenario. Adding original token
-                results.append(token)
+                results.append(original_token)
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/synonym.py b/nlpaug/augmenter/word/synonym.py
index 0a80352..a8bc150 100755
--- a/nlpaug/augmenter/word/synonym.py
+++ b/nlpaug/augmenter/word/synonym.py
@@ -97,10 +97,10 @@ def substitute(self, data):
         if aug_idxes is None:
             return data
 
-        for i, token in enumerate(tokens):
+        for i, original_token in enumerate(tokens):
             # Skip if no augment for word
             if i not in aug_idxes:
-                results.append(token)
+                results.append(original_token)
                 continue
 
             word_poses = PartOfSpeech.constituent2pos(pos[i][1])
@@ -112,14 +112,17 @@ def substitute(self, data):
             for word_pos in word_poses:
                 candidates.extend(self.model.predict(pos[i][0], pos=word_pos))
 
-            candidates = [c for c in candidates if c.lower() != token.lower()]
+            candidates = [c for c in candidates if c.lower() != original_token.lower()]
 
             if len(candidates) == 0:
-                results.append(token)
+                results.append(original_token)
             else:
                 candidate = self.sample(candidates, 1)[0]
                 candidate = candidate.replace("_", " ").replace("-", " ").lower()
-                results.append(self.align_capitalization(token, candidate))
+                results.append(self.align_capitalization(original_token, candidate))
+
+            if i == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
 
         return self.reverse_tokenizer(results)
diff --git a/nlpaug/augmenter/word/tfidf.py b/nlpaug/augmenter/word/tfidf.py
index 667dc84..8cf20ee 100755
--- a/nlpaug/augmenter/word/tfidf.py
+++ b/nlpaug/augmenter/word/tfidf.py
@@ -120,6 +120,11 @@ def insert(self, data):
             new_word = self.sample(candidate_words, 1)[0]
             results.insert(aug_idx, new_word)
 
+            if aug_idx == 0:
+                results[0] = results[0].capitalize()
+                if self.get_word_case(results[1]) == 'capitalize':
+                    results[1] = results[1].lower()
+
         return self.reverse_tokenizer(results)
 
     def substitute(self, data):
@@ -131,12 +136,15 @@ def substitute(self, data):
             return data
 
         for aug_idx in aug_idxes:
-            original_word = results[aug_idx]
-            candidate_words = self.model.predict(original_word, top_k=self.top_k)
+            original_token = results[aug_idx]
+            candidate_words = self.model.predict(original_token, top_k=self.top_k)
             substitute_word = self.sample(candidate_words, 1)[0]
             results[aug_idx] = substitute_word
 
+            if aug_idx == 0:
+                results[0] = self.align_capitalization(original_token, results[0])
+
         return self.reverse_tokenizer(results)
 
     def get_model(self, force_reload=False):
diff --git a/nlpaug/augmenter/word/word_augmenter.py b/nlpaug/augmenter/word/word_augmenter.py
index 5ef3f32..453d439 100755
--- a/nlpaug/augmenter/word/word_augmenter.py
+++ b/nlpaug/augmenter/word/word_augmenter.py
@@ -76,17 +76,6 @@ def align_capitalization(self, src_token, dest_token):
             return dest_token.capitalize()
         return dest_token
 
-    # @classmethod
-    # def align_capitalization(cls, src_token, dest_token):
-    #     # For whole word is upper case
-    #     if src_token.isupper():
-    #         return dest_token.upper()
-    #     # For capitalize word
-    #     elif src_token and src_token[0].isupper():
-    #         return dest_token.capitalize()
-    #     else:
-    #         return dest_token
-
     def _get_aug_idxes(self, tokens):
         aug_cnt = self.generate_aug_cnt(len(tokens))
         word_idxes = self.pre_skip_aug(tokens)
diff --git a/test/augmenter/word/test_random_word.py b/test/augmenter/word/test_random_word.py
index 9d4f23b..c1775d5 100755
--- a/test/augmenter/word/test_random_word.py
+++ b/test/augmenter/word/test_random_word.py
@@ -11,7 +11,7 @@ def test_swap(self):
         aug = naw.RandomWordAug(action="swap")
 
         for text in texts:
-            tokens = text.split(' ')
+            tokens = text.lower().split(' ')
             orig_token_freq = {}
             for w in tokens:
                 orig_token_freq[w] = tokens.count(w)
@@ -22,7 +22,7 @@ def test_swap(self):
             for i in range(10):
                 augmented_text = aug.augment(augmented_text)
 
-            aug_tokens = augmented_text.split(' ')
+            aug_tokens = augmented_text.lower().split(' ')
             aug_token_freq = {}
             for w in tokens:
                 aug_token_freq[w] = aug_tokens.count(w)
diff --git a/test/augmenter/word/test_word.py b/test/augmenter/word/test_word.py
index 7cd5ee8..a24d919 100755
--- a/test/augmenter/word/test_word.py
+++ b/test/augmenter/word/test_word.py
@@ -166,7 +166,7 @@ def test_stopwords_regex(self):
 
     # https://github.com/makcedward/nlpaug/issues/82
     def test_case(self):
-        # Swap case
+        # Swap
         aug = naw.RandomWordAug(action='swap')
         self.assertEqual('bB aA', aug.augment('aA bB'))
         self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
@@ -175,6 +175,16 @@ def test_case(self):
         self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
         self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))
 
+        # Insert
+        aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
+        expected = False
+        for i in range(10):
+            augmented_text = aug.augment('Good')
+            if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
+                expected = True
+                break
+        self.assertTrue(expected)
+
         # Substitute
         aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
         expected = False
@@ -185,6 +195,13 @@ def test_case(self):
                 break
         self.assertTrue(expected)
 
+        aug = naw.AntonymAug()
+        self.assertEqual('Unhappy', aug.augment('Happy'))
+
+        # Do not change case when the target word is not lower case
+        aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
+        self.assertEqual('RE', aug.augment('Re'))
+
         # Delete case
         aug = naw.RandomWordAug(action='delete')
         expected = False

From 24eb79dbce7846c973e9f8381419f1124c36425a Mon Sep 17 00:00:00 2001
From: Edward Ma <makcedward@gmail.com>
Date: Wed, 5 Feb 2020 20:09:05 -0800
Subject: [PATCH 2/2] Release 0.0.12

---
 CHANGE.md          | 10 ++++++++++
 README.md          |  2 +-
 docs/conf.py       |  4 ++--
 nlpaug/__init__.py |  2 +-
 setup.py           |  2 +-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/CHANGE.md b/CHANGE.md
index f48834b..673526c 100755
--- a/CHANGE.md
+++ b/CHANGE.md
@@ -1,6 +1,16 @@
 NLPAUG Change Log
 ================
 
+**0.0.12 Feb 5, 2020
+* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
+* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
+* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
+* Fix replacing character in RandomCharAug error [#77](https://github.com/makcedward/nlpaug/issues/77)
+* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
+* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
+* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)
+* Fix word casing issue [#82](https://github.com/makcedward/nlpaug/issues/82)
+
 **0.0.11 Dec 6, 2019
 * Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
 * Support given background noise in audio's NoiseAug
diff --git a/README.md b/README.md
index ed812a3..ee7a7d4 100755
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ pip install librosa>=0.7.1
 
 ## Recent Changes
 
-**0.0.12dev Feb, 2020
+**0.0.12 Feb 5, 2020
 * ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
 * Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
 * Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
diff --git a/docs/conf.py b/docs/conf.py
index 861e37c..0fcc745 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -74,9 +74,9 @@ def __getattr__(cls, name):
 # built documents.
 #
 # The short X.Y version.
-version = '0.0.12dev'
+version = '0.0.12'
 # The full version, including alpha/beta/rc tags.
-release = '0.0.12dev'
+release = '0.0.12'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/nlpaug/__init__.py b/nlpaug/__init__.py
index 2231f48..5b405c2 100755
--- a/nlpaug/__init__.py
+++ b/nlpaug/__init__.py
@@ -3,4 +3,4 @@
 
 __all__ = ['base_augmenter']
 
-__version__ = '0.0.12dev'
+__version__ = '0.0.12'
diff --git a/setup.py b/setup.py
index e87a913..5a205a4 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name="nlpaug",
-    version="0.0.11",
+    version="0.0.12",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
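
Note for reviewers: the word augmenters in [PATCH 1/2] all hinge on one rule — when the first token of a sentence is replaced, the replacement keeps the original token's casing. Below is a minimal standalone Python sketch of that rule, mirroring the commented-out helper the patch deletes from word_augmenter.py; in nlpaug itself align_capitalization is an instance method on the word augmenter, not a free function.

def align_capitalization(src_token, dest_token):
    # Whole source word is upper case: 'RE' keeps the candidate fully upper
    if src_token.isupper():
        return dest_token.upper()
    # Source word is capitalized: antonym 'unhappy' for 'Happy' becomes 'Unhappy'
    if src_token and src_token[0].isupper():
        return dest_token.capitalize()
    # Lower-case source: leave the candidate untouched
    return dest_token

# The same expectations test_case exercises through AntonymAug and SpellingAug
assert align_capitalization('Happy', 'unhappy') == 'Unhappy'
assert align_capitalization('RE', 're') == 'RE'
assert align_capitalization('happy', 'unhappy') == 'unhappy'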
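
The insert action in tfidf.py needs a different treatment, since there is no original token to copy the casing from: the newly inserted first word takes the sentence-initial capital, and the displaced word is lower-cased only if it was merely capitalized. A sketch of that step, assuming str.istitle() approximates the library's get_word_case(...) == 'capitalize' check; realign_after_insert is a hypothetical name, not nlpaug API.

def realign_after_insert(tokens):
    # The new first token takes the sentence-initial capital
    tokens[0] = tokens[0].capitalize()
    # Lower-case the displaced word only if it was capitalized;
    # an all-caps word such as 'USA' keeps its casing
    if len(tokens) > 1 and tokens[1].istitle():
        tokens[1] = tokens[1].lower()
    return tokens

assert realign_after_insert(['new', 'Good', 'day']) == ['New', 'good', 'day']
assert realign_after_insert(['new', 'USA']) == ['New', 'USA']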