Skip to content

Commit

Permalink
Merge pull request #97 from makcedward/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
makcedward committed Feb 6, 2020
2 parents 214decf + 24eb79d commit 9d4fb11
Show file tree
Hide file tree
Showing 15 changed files with 80 additions and 35 deletions.
10 changes: 10 additions & 0 deletions CHANGE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
NLPAUG Change Log
================

**0.0.12 Feb 5, 2020**
* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
* Fix replacing character in RandomCharAug error [#77](https://github.com/makcedward/nlpaug/issues/77)
* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)
* Fix word casing issue [#82](https://github.com/makcedward/nlpaug/issues/82)

**0.0.11 Dec 6, 2019**
* Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
* Support given background noise in audio's NoiseAug
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ pip install librosa>=0.7.1

## Recent Changes

**0.0.12dev Feb, 2020
**0.0.12 Feb 5, 2020
* ContextualWordEmbsAug supports bert-base-multilingual-uncased (for non English inputs)
* Fix missing library dependency [#74](https://github.com/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https://github.com/makcedward/nlpaug/issues/76)
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def __getattr__(cls, name):
# built documents.
#
# The short X.Y version.
version = '0.0.12dev'
version = '0.0.12'
# The full version, including alpha/beta/rc tags.
release = '0.0.12dev'
release = '0.0.12'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion nlpaug/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

__all__ = ['base_augmenter']

__version__ = '0.0.12dev'
__version__ = '0.0.12'
2 changes: 2 additions & 0 deletions nlpaug/augmenter/char/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def substitute(self, data):

result += self.sample(self.model.predict(chars[char_i]), 1)[0]

# No capitalization alignment as this augmenter tries to simulate typos

results.append(result)

return self.reverse_tokenizer(results)
Expand Down
2 changes: 2 additions & 0 deletions nlpaug/augmenter/char/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def substitute(self, data):

result += self.sample(self.model.predict(chars[char_i]), 1)[0]

# No capitalization alignment as this augmenter tries to simulate OCR engine errors

results.append(result)

return self.reverse_tokenizer(results)
Expand Down
8 changes: 8 additions & 0 deletions nlpaug/augmenter/char/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ def insert(self, data):
for char_i in aug_char_idxes:
chars.insert(char_i, self.sample(self.model, 1)[0])

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down Expand Up @@ -120,6 +122,8 @@ def substitute(self, data):

result += self.sample(self.model, 1)[0]

# No capitalization alignment as this augmenter tries to simulate random errors

results.append(result)

return self.reverse_tokenizer(results)
Expand Down Expand Up @@ -163,6 +167,8 @@ def swap(self, data):

result += self.sample(self.model, 1)[0]

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down Expand Up @@ -191,6 +197,8 @@ def delete(self, data):
for i in aug_char_idxes:
del chars[i]

# No capitalization alignment as this augmenter tries to simulate random errors

result = ''.join(chars)
results.append(result)

Expand Down
13 changes: 8 additions & 5 deletions nlpaug/augmenter/word/antonym.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ def substitute(self, data):
if aug_idxes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idxes:
results.append(token)
results.append(original_token)
continue

word_poses = PartOfSpeech.constituent2pos(pos[i][1])
Expand All @@ -88,14 +88,17 @@ def substitute(self, data):
for word_pos in word_poses:
candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

candidates = [c for c in candidates if c.lower() != token.lower()]
candidates = [c for c in candidates if c.lower() != original_token.lower()]

if len(candidates) == 0:
results.append(token)
results.append(original_token)
else:
candidate = self.sample(candidates, 1)[0]
candidate = candidate.replace("_", " ").replace("-", " ").lower()
results.append(self.align_capitalization(token, candidate))
results.append(self.align_capitalization(original_token, candidate))

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
11 changes: 7 additions & 4 deletions nlpaug/augmenter/word/spelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,21 @@ def substitute(self, data):
if aug_idexes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idexes:
results.append(token)
results.append(original_token)
continue

candidate_words = self.model.predict(token)
candidate_words = self.model.predict(original_token)
if candidate_words:
results.append(self.sample(candidate_words, 1)[0])
else:
# Unexpected scenario: no candidates found, so keep the original token
results.append(token)
results.append(original_token)

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
13 changes: 8 additions & 5 deletions nlpaug/augmenter/word/synonym.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def substitute(self, data):
if aug_idxes is None:
return data

for i, token in enumerate(tokens):
for i, original_token in enumerate(tokens):
# Skip if no augment for word
if i not in aug_idxes:
results.append(token)
results.append(original_token)
continue

word_poses = PartOfSpeech.constituent2pos(pos[i][1])
Expand All @@ -112,14 +112,17 @@ def substitute(self, data):
for word_pos in word_poses:
candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

candidates = [c for c in candidates if c.lower() != token.lower()]
candidates = [c for c in candidates if c.lower() != original_token.lower()]

if len(candidates) == 0:
results.append(token)
results.append(original_token)
else:
candidate = self.sample(candidates, 1)[0]
candidate = candidate.replace("_", " ").replace("-", " ").lower()
results.append(self.align_capitalization(token, candidate))
results.append(self.align_capitalization(original_token, candidate))

if i == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

Expand Down
12 changes: 10 additions & 2 deletions nlpaug/augmenter/word/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ def insert(self, data):
new_word = self.sample(candidate_words, 1)[0]
results.insert(aug_idx, new_word)

if aug_idx == 0:
results[0] = results[0].capitalize()
if self.get_word_case(results[1]) == 'capitalize':
results[1] = results[1].lower()

return self.reverse_tokenizer(results)

def substitute(self, data):
Expand All @@ -131,12 +136,15 @@ def substitute(self, data):
return data

for aug_idx in aug_idxes:
original_word = results[aug_idx]
candidate_words = self.model.predict(original_word, top_k=self.top_k)
original_token = results[aug_idx]
candidate_words = self.model.predict(original_token, top_k=self.top_k)
substitute_word = self.sample(candidate_words, 1)[0]

results[aug_idx] = substitute_word

if aug_idx == 0:
results[0] = self.align_capitalization(original_token, results[0])

return self.reverse_tokenizer(results)

def get_model(self, force_reload=False):
Expand Down
11 changes: 0 additions & 11 deletions nlpaug/augmenter/word/word_augmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,6 @@ def align_capitalization(self, src_token, dest_token):
return dest_token.capitalize()
return dest_token

# @classmethod
# def align_capitalization(cls, src_token, dest_token):
# # For whole word is upper case
# if src_token.isupper():
# return dest_token.upper()
# # For capitalize word
# elif src_token and src_token[0].isupper():
# return dest_token.capitalize()
# else:
# return dest_token

def _get_aug_idxes(self, tokens):
aug_cnt = self.generate_aug_cnt(len(tokens))
word_idxes = self.pre_skip_aug(tokens)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setup(
name="nlpaug",
version="0.0.11",
version="0.0.12",
author="Edward Ma",
author_email="makcedward@gmail.com",
url="https://github.com/makcedward/nlpaug",
Expand Down
4 changes: 2 additions & 2 deletions test/augmenter/word/test_random_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_swap(self):
aug = naw.RandomWordAug(action="swap")

for text in texts:
tokens = text.split(' ')
tokens = text.lower().split(' ')
orig_token_freq = {}
for w in tokens:
orig_token_freq[w] = tokens.count(w)
Expand All @@ -22,7 +22,7 @@ def test_swap(self):
for i in range(10):
augmented_text = aug.augment(augmented_text)

aug_tokens = augmented_text.split(' ')
aug_tokens = augmented_text.lower().split(' ')
aug_token_freq = {}
for w in tokens:
aug_token_freq[w] = aug_tokens.count(w)
Expand Down
19 changes: 18 additions & 1 deletion test/augmenter/word/test_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def test_stopwords_regex(self):

# https://github.com/makcedward/nlpaug/issues/82
def test_case(self):
# Swap case
# Swap
aug = naw.RandomWordAug(action='swap')
self.assertEqual('bB aA', aug.augment('aA bB'))
self.assertEqual(['Love', 'I', 'McDonalds'], aug.change_case('I love McDonalds'.split(' '), 1, 0))
Expand All @@ -175,6 +175,16 @@ def test_case(self):
self.assertEqual(['Loves', 'he', 'McDonalds'], aug.change_case('He loves McDonalds'.split(' '), 0, 1))
self.assertEqual(['He', 'McDonalds', 'loves'], aug.change_case('He loves McDonalds'.split(' '), 2, 1))

# Insert
aug = naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action='insert')
expected = False
for i in range(10):
augmented_text = aug.augment('Good')
if 'good' in augmented_text and aug.get_word_case(augmented_text.split(' ')[0]) == 'capitalize':
expected = True
break
self.assertTrue(expected)

# Substitute
aug = naw.RandomWordAug(action='substitute', target_words=['abc'])
expected = False
Expand All @@ -185,6 +195,13 @@ def test_case(self):
break
self.assertTrue(expected)

aug = naw.AntonymAug()
self.assertEqual('Unhappy', aug.augment('Happy'))

# Do not change casing if the target word is not fully lowercase
aug = naw.SpellingAug(dict_path=os.environ.get("MODEL_DIR") + 'spelling_en.txt')
self.assertEqual('RE', aug.augment('Re'))

# Delete case
aug = naw.RandomWordAug(action='delete')
expected = False
Expand Down

0 comments on commit 9d4fb11

Please sign in to comment.