Enhancement: Add ZWJ sequences Emoji and Skin Tone Modifier Emoji sup…

…port to TweetTokenizer (#2843) * add tokenizer support for ZWJ sequences Emoji and skin tone modifier Emoji * update AUTHORS.md * fix corner case in TweetTokenizer with Emojis * Replace the post-process function in TweetTokenizer with RegEx matching patterns; both approaches are able to support ZWJ sequences Emoji and Skin Tone Modifier Emoji, while the latter makes the code easier to organize.
nltk · Oct 10, 2021 · 2538164 · 2538164
1 parent 836b98e
commit 2538164
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 0 deletions.
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -280,6 +280,7 @@
 - Hiroki Teranishi <https://github.com/chantera>
 - Ruben Cartuyvels <https://github.com/rubencart>
 - Dalton Pearson <https://github.com/daltonpearson>
+- Saibo Geng <https://github.com/Saibo-creator>
 
 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
 

diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py
@@ -334,6 +334,54 @@ def test_phone_tokenizer(self):
         result = tokenizer.tokenize(test2)
         assert result == expected
 
+    def test_emoji_tokenizer(self):
+        """
+        Test a string that contains Emoji ZWJ Sequences and skin tone modifier
+        """
+        tokenizer = TweetTokenizer()
+
+        # An Emoji ZWJ sequence: its characters together form a single emoji and should not be split.
+        test1 = "👨‍👩‍👧‍👧"
+        expected = ["👨‍👩‍👧‍👧"]
+        result = tokenizer.tokenize(test1)
+        assert result == expected
+
+        # An Emoji with a skin tone modifier: the two characters form a single emoji and should not be split.
+        test2 = "👨🏿"
+        expected = ["👨🏿"]
+        result = tokenizer.tokenize(test2)
+        assert result == expected
+
+        # A string containing both skin tone modifiers and ZWJ sequences
+        test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
+        expected = [
+            "🤔",
+            "🙈",
+            "me",
+            "así",
+            ",",
+            "se",
+            "😌",
+            "ds",
+            "💕",
+            "👭",
+            "👙",
+            "hello",
+            "👩🏾\u200d🎓",
+            "emoji",
+            "hello",
+            "👨\u200d👩\u200d👦\u200d👦",
+            "how",
+            "are",
+            "😊",
+            "you",
+            "today",
+            "🙅🏽",
+            "🙅🏽",
+        ]
+        result = tokenizer.tokenize(test3)
+        assert result == expected
+
     def test_pad_asterisk(self):
         """
         Test padding of asterisk for word tokenization.

diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py
@@ -157,6 +157,12 @@
     r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
     # email addresses
     r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
+    # Zero-Width-Joiner and Skin tone modifier emojis
+    """.(?:
+        [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
+        |
+        [\U0001F3FB-\U0001F3FF]
+    )""",
     # Remaining word types:
     r"""
     (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
@@ -194,6 +200,7 @@
     r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
 )
 
+
 ######################################################################
 # Functions for converting html entities
 ######################################################################