GitHub issue (nltk/nltk): `sent_tokenize` raises `IndexError: list index out of range` on tweet-style text.
"""Reproduction script: NLTK punkt sentence tokenizer crash on tweet text.

In NLTK 3.6.6, tokenizing a tweet that begins with ".@mention" raised
IndexError inside punkt's _match_potential_end_contexts (fixed in 3.6.7).
"""
import nltk

# sent_tokenize needs the punkt model; quiet=True suppresses the repeated
# download/progress chatter on every run.
nltk.download('punkt', quiet=True)

from nltk import tokenize as nltktokenize

# The leading "." before the @mention is what triggered the 3.6.6 crash.
test_sentence = (
    ".@JordanClarksons is ready for an 'amazing' stint in representing "
    "the country at the #AsianGames! pic.twitter.com/AQK35h8tcU"
)

# Bug in original: the tokenizer's return value was discarded, so running
# this as a script (rather than a notebook cell) showed nothing. Capture
# and print it so the reproduction is observable either way.
sentences = nltktokenize.sent_tokenize(test_sentence)
print(sentences)
Running the code above produces the following error:
IndexError: list index out of range
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
R:\Temp/ipykernel_4488/1710349248.py in <module>
----> 1 nltktokenize.sent_tokenize(".@JordanClarksons is ready for an 'amazing' stint in representing the country at the #AsianGames! pic.twitter.com/AQK35h8tcU")
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
105 """
106 tokenizer = load(f"tokenizers/punkt/{language}.pickle")
--> 107 return tokenizer.tokenize(text)
108
109
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
1274 Given a text, returns a list of the sentences in that text.
1275 """
-> 1276 return list(self.sentences_from_text(text, realign_boundaries))
1277
1278 def debug_decisions(self, text):
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
1330 follows the period.
1331 """
-> 1332 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1333
1334 def _match_potential_end_contexts(self, text):
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
1330 follows the period.
1331 """
-> 1332 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1333
1334 def _match_potential_end_contexts(self, text):
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
1320 if realign_boundaries:
1321 slices = self._realign_boundaries(text, slices)
-> 1322 for sentence in slices:
1323 yield (sentence.start, sentence.stop)
1324
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
1419 """
1420 realign = 0
-> 1421 for sentence1, sentence2 in _pair_iter(slices):
1422 sentence1 = slice(sentence1.start + realign, sentence1.stop)
1423 if not sentence2:
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(iterator)
316 iterator = iter(iterator)
317 try:
--> 318 prev = next(iterator)
319 except StopIteration:
320 return
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
1393 def _slices_from_text(self, text):
1394 last_break = 0
-> 1395 for match, context in self._match_potential_end_contexts(text):
1396 if self.text_contains_sentbreak(context):
1397 yield slice(last_break, match.end())
D:\ProgramData\Miniconda3\envs\nlp\lib\site-packages\nltk\tokenize\punkt.py in _match_potential_end_contexts(self, text)
1380 split = text[: match.start()].rsplit(maxsplit=1)
1381 before_start = len(split[0]) if len(split) == 2 else 0
-> 1382 before_words[match] = split[-1]
1383 matches.append(match)
1384
IndexError: list index out of range
The text was updated successfully, but these errors were encountered:
@tingjhenjiang Hello! This was an issue from NLTK 3.6.6 (See #2925, #2921), and has been patched in NLTK 3.6.7.
This can easily be resolved by updating to version NLTK 3.6.7 (The current version).
Also, this is beside the point, but for Tweets and other "casual" texts, I would personally recommend the TweetTokenizer. It allows for filtering away tags, hashtags, URLs, etc.
I'll close this as this should be resolved in NLTK 3.6.7.
The following code:
would produce an error:
The text was updated successfully, but these errors were encountered: