Skip to content

Commit

Permalink
Enhancement: Add ZWJ sequences Emoji and Skin Tone Modifier Emoji sup…
Browse files Browse the repository at this point in the history
…port to TweetTokenizer  (#2843)

* add tokenizer support for ZWJ sequence emoji and skin tone modifier emoji

* update AUTHORS.md

* fix corner case in TweetTokenizer with Emojis

* Replace the post-process function in TweetTokenizer with regex matching patterns. Both approaches support ZWJ sequence emoji and skin tone modifier emoji, but the regex approach makes the code easier to organize.
  • Loading branch information
Saibo-creator committed Oct 10, 2021
1 parent 836b98e commit 2538164
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 0 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
Expand Up @@ -280,6 +280,7 @@
- Hiroki Teranishi <https://github.com/chantera>
- Ruben Cartuyvels <https://github.com/rubencart>
- Dalton Pearson <https://github.com/daltonpearson>
- Saibo Geng <https://github.com/Saibo-creator>

## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:

Expand Down
48 changes: 48 additions & 0 deletions nltk/test/unit/test_tokenize.py
Expand Up @@ -334,6 +334,54 @@ def test_phone_tokenizer(self):
result = tokenizer.tokenize(test2)
assert result == expected

def test_emoji_tokenizer(self):
    """
    Check that TweetTokenizer keeps multi-code-point emoji in one token.

    Three inputs are exercised in order: a lone ZWJ sequence, a lone emoji
    followed by a skin tone modifier, and a sentence mixing both kinds
    with ordinary words and punctuation.
    """
    cases = [
        # ZWJ sequence: several emoji joined by U+200D render as one glyph
        # and must come back as one token.
        ("👨‍👩‍👧‍👧", ["👨‍👩‍👧‍👧"]),
        # Base emoji + skin tone modifier: two code points, one token.
        ("👨🏿", ["👨🏿"]),
        # Mixed text containing plain emoji, skin tone modifiers and ZWJ
        # sequences, some of them glued directly to neighbouring words.
        (
            "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽",
            [
                "🤔",
                "🙈",
                "me",
                "así",
                ",",
                "se",
                "😌",
                "ds",
                "💕",
                "👭",
                "👙",
                "hello",
                "👩🏾\u200d🎓",
                "emoji",
                "hello",
                "👨\u200d👩\u200d👦\u200d👦",
                "how",
                "are",
                "😊",
                "you",
                "today",
                "🙅🏽",
                "🙅🏽",
            ],
        ),
    ]

    tokenizer = TweetTokenizer()
    for text, expected_tokens in cases:
        assert tokenizer.tokenize(text) == expected_tokens

def test_pad_asterisk(self):
"""
Test padding of asterisk for word tokenization.
Expand Down
7 changes: 7 additions & 0 deletions nltk/tokenize/casual.py
Expand Up @@ -157,6 +157,12 @@
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
# email addresses
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
# Zero-Width-Joiner and Skin tone modifier emojis
""".(?:
[\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
|
[\U0001F3FB-\U0001F3FF]
)""",
# Remaining word types:
r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
Expand Down Expand Up @@ -194,6 +200,7 @@
r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)


######################################################################
# Functions for converting html entities
######################################################################
Expand Down

0 comments on commit 2538164

Please sign in to comment.