diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark_sbd_tools.py
similarity index 96%
rename from benchmarks/benchmark.py
rename to benchmarks/benchmark_sbd_tools.py
index f0eae20..691f925 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark_sbd_tools.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text for sent in nlp(text).sents]
diff --git a/benchmarks/bigtext_speed_benchmark.py b/benchmarks/bigtext_speed_benchmark.py
new file mode 100644
index 0000000..33b76c0
--- /dev/null
+++ b/benchmarks/bigtext_speed_benchmark.py
@@ -0,0 +1,75 @@
+import blingfire
+import nltk
+import pysbd
+import spacy
+import stanza
+
+from syntok.tokenizer import Tokenizer
+import syntok.segmenter as syntok_segmenter
+
+pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)
+
+nlp = spacy.blank('en')
+nlp.add_pipe(nlp.create_pipe("sentencizer"))
+nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
+#stanza.download('en')
+stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')
+
+syntok_tokenizer = Tokenizer()
+
+def blingfire_tokenize(text):
+    return blingfire.text_to_sentences(text).split('\n')
+
+def nltk_tokenize(text):
+    return nltk.sent_tokenize(text)
+
+def pysbd_tokenize(text):
+    segments = pysbd_segmenter.segment(text)
+    segments = [s.strip() for s in segments]
+    return segments
+
+def spacy_tokenize(text):
+    return [sent.text.strip("\n") for sent in nlp(text).sents]
+
+def spacy_dep_tokenize(text):
+    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]
+
+def stanza_tokenize(text):
+    return [e.text for e in stanza_nlp(text).sentences]
+
+def make_sentences(segmented_tokens):
+    for sentence in segmented_tokens:
+        yield "".join(str(token) for token in sentence).strip()
+
+def syntok_tokenize(text):
+    tokens = syntok_tokenizer.split(text)
+    result = syntok_segmenter.split(iter(tokens))
+    segments = [sent for sent in make_sentences(result)]
+    return segments
+
+def speed_benchmark(big_text, tokenize_func):
+    segments = tokenize_func(big_text)
+    return segments
+
+if __name__ == "__main__":
+    import time
+    libraries = (
+        blingfire_tokenize,
+        nltk_tokenize,
+        pysbd_tokenize,
+        spacy_tokenize,
+        spacy_dep_tokenize,
+        stanza_tokenize,
+        syntok_tokenize)
+
+    for tokenize_func in libraries:
+        t = time.time()
+        # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
+        with open('benchmarks/1661-0.txt') as bigfile:
+            big_text = bigfile.read()
+        sentences = speed_benchmark(big_text, tokenize_func)
+
+        time_taken = time.time() - t
+        print()
+        print(tokenize_func.__name__)
+        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
diff --git a/benchmarks/genia_benchmark.py b/benchmarks/genia_benchmark.py
index 8a835bf..82a7540 100644
--- a/benchmarks/genia_benchmark.py
+++ b/benchmarks/genia_benchmark.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text.strip("\n") for sent in nlp(text).sents]
diff --git a/pysbd/abbreviation_replacer.py b/pysbd/abbreviation_replacer.py
index 7e80f10..3cf3932 100644
--- a/pysbd/abbreviation_replacer.py
+++ b/pysbd/abbreviation_replacer.py
@@ -32,7 +32,10 @@ def replace(self):
             self.lang.KommanditgesellschaftRule,
             *self.lang.SingleLetterAbbreviationRules.All
         )
-        self.text = self.search_for_abbreviations_in_string()
+        abbr_handled_text = ""
+        for line in self.text.splitlines(True):
+            abbr_handled_text += self.search_for_abbreviations_in_string(line)
+        self.text = abbr_handled_text
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
         self.text = self.replace_abbreviation_as_sentence_boundary()
@@ -72,25 +75,24 @@ def replace_period_of_abbr(self, txt, abbr):
         return txt
 
 
-    def search_for_abbreviations_in_string(self):
-        original = self.text
-        lowered = original.lower()
+    def search_for_abbreviations_in_string(self, text):
+        lowered = text.lower()
         for abbr in self.lang.Abbreviation.ABBREVIATIONS:
             stripped = abbr.strip()
             if stripped not in lowered:
                 continue
             abbrev_match = re.findall(
-                r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
+                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
             )
             if not abbrev_match:
                 continue
             next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
-            char_array = re.findall(next_word_start, self.text)
+            char_array = re.findall(next_word_start, text)
             for ind, match in enumerate(abbrev_match):
-                self.text = self.scan_for_replacements(
-                    self.text, match, ind, char_array
+                text = self.scan_for_replacements(
+                    text, match, ind, char_array
                 )
-        return self.text
+        return text
 
     def scan_for_replacements(self, txt, am, ind, char_array):
         try:
diff --git a/pysbd/about.py b/pysbd/about.py
index e73ef72..4de8a4f 100644
--- a/pysbd/about.py
+++ b/pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.0rc"
+__version__ = "0.3.0"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"
diff --git a/pysbd/lang/common/common.py b/pysbd/lang/common/common.py
index dc833fb..7ddef28 100644
--- a/pysbd/lang/common/common.py
+++ b/pysbd/lang/common/common.py
@@ -5,7 +5,6 @@ class Common(object):
 
     # added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
-    # TODO: above special cases group can be updated as per developer needs
     SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"
 
     # # Rubular: http://rubular.com/r/NqCqv372Ix
 
diff --git a/pysbd/lang/deutsch.py b/pysbd/lang/deutsch.py
index 26298a5..def6be6 100644
--- a/pysbd/lang/deutsch.py
+++ b/pysbd/lang/deutsch.py
@@ -63,7 +63,7 @@ def replace(self):
                 SingleLowerCaseLetterRule,
                 SingleLowerCaseLetterAtStartOfLineRule)
 
-            self.text = self.search_for_abbreviations_in_string()
+            self.text = self.search_for_abbreviations_in_string(self.text)
             self.replace_multi_period_abbreviations()
             self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
             self.text = self.replace_abbreviation_as_sentence_boundary()
diff --git a/pysbd/processor.py b/pysbd/processor.py
index 5e4bf1b..f140b8f 100644
--- a/pysbd/processor.py
+++ b/pysbd/processor.py
@@ -77,17 +77,18 @@ def split_into_segments(self):
         sents = [self.check_for_punctuation(s) for s in sents]
         # flatten list of list of sentences
         sents = self.rm_none_flatten(sents)
-        new_sents = []
+        postprocessed_sents = []
         for sent in sents:
             sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
             post_process_sent = self.post_process_segments(sent)
             if post_process_sent and isinstance(post_process_sent, str):
-                new_sents.append(post_process_sent)
+                postprocessed_sents.append(post_process_sent)
             elif isinstance(post_process_sent, list):
                 for pps in post_process_sent:
-                    new_sents.append(pps)
-        new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents]
-        return new_sents
+                    postprocessed_sents.append(pps)
+        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
+                               for ns in postprocessed_sents]
+        return postprocessed_sents
 
     def post_process_segments(self, txt):
         if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py
index d286fa6..98dfc77 100644
--- a/pysbd/segmenter.py
+++ b/pysbd/segmenter.py
@@ -14,7 +14,7 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
 
         Parameters
         ----------
-        language : str, optional
+        language : str, required
             specify a language use its two character ISO 639-1 code,
             by default "en"
         clean : bool, optional
@@ -49,11 +49,23 @@ def processor(self, text):
 
     def sentences_with_char_spans(self, sentences):
         # since SENTENCE_BOUNDARY_REGEX doesnt account
-        # for trailing whitespaces \s* is used as suffix
+        # for trailing whitespaces \s* & is used as suffix
         # to keep non-destructive text after segments joins
-        return [TextSpan(m.group(), m.start(), m.end()) for sent in sentences
-                for m in re.finditer('{0}\s*'.format(re.escape(sent)),
-                                     self.original_text)]
+        sent_spans = []
+        prior_start_char_idx = 0
+        for sent in sentences:
+            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+                match_str = match.group()
+                match_start_idx, match_end_idx = match.span()
+                if match_start_idx >= prior_start_char_idx:
+                    # making sure if curren sentence and its span
+                    # is either first sentence along with its char spans
+                    # or current sent spans adjacent to prior sentence spans
+                    sent_spans.append(
+                        TextSpan(match_str, match_start_idx, match_end_idx))
+                    prior_start_char_idx = match_start_idx
+                    break
+        return sent_spans
 
     def segment(self, text):
         self.original_text = text
@@ -66,11 +78,11 @@ def segment(self, text):
             text = self.cleaner(text).clean()
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
-        if self.clean:
+        if self.char_span:
+            return sentence_w_char_spans
+        elif self.clean:
             # clean and destructed sentences
             return postprocessed_sents
-        elif self.char_span:
-            return sentence_w_char_spans
         else:
             # nondestructive with whitespaces
             return [textspan.sent for textspan in sentence_w_char_spans]
diff --git a/pysbd/utils.py b/pysbd/utils.py
index 7ba98ae..41e6716 100644
--- a/pysbd/utils.py
+++ b/pysbd/utils.py
@@ -56,13 +56,12 @@ def __init__(self, sent, start, end):
         self.end = end
 
     def __repr__(self):  # pragma: no cover
-        return "{0}(sent='{1}', start={2}, end={3})".format(
-            self.__class__.__name__, self.sent, self.start, self.end)
+        return "{0}(sent={1}, start={2}, end={3})".format(
+            self.__class__.__name__, repr(self.sent), self.start, self.end)
 
     def __eq__(self, other):
         if isinstance(self, other.__class__):
-            return self.sent == other.sent and self.start == other.start and self.end == self.end
-        return False
+            return self.sent == other.sent and self.start == other.start and self.end == other.end
 
 
 class PySBDFactory(object):
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 61e9f80..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-spacy>=2.2.4
diff --git a/setup.py b/setup.py
index 8095b82..16b192d 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 REQUIRES_PYTHON = ">=3"
 
 # What packages are required for this module to be executed?
-REQUIRED = ["spacy"]
+REQUIRED = []
 
 with io.open(os.path.join(root, "pysbd", "about.py"), encoding="utf8") as f:
     about = {}
diff --git a/tests/regression/test_issues.py b/tests/regression/test_issues.py
index 36852d9..d9ac4f5 100644
--- a/tests/regression/test_issues.py
+++ b/tests/regression/test_issues.py
@@ -33,31 +33,31 @@
 ]
 
 TEST_ISSUE_DATA_CHAR_SPANS = [
    ('#49', "1) The first item. 2) The second item.",
-        [('1) The first item. ', 0, 18), ('2) The second item.', 19, 38)]
    ),
    ('#49', "a. The first item. b. The second item. c. The third list item", [
-        ('a. The first item. ', 0, 18), ('b. The second item. ', 19, 38),
        ('c. The third list item', 39, 61)]
    ),
    ('#53', "Trust in journalism is not associated with frequency of media use (except in the case of television as mentioned above), indicating that trust is not an important predictor of media use, though it might have an important impact on information processing. This counterintuitive fi nding can be explained by taking into account the fact that audiences do not watch informative content merely to inform themselves; they have other motivations that might override credibility concerns. For example, they might follow media primarily for entertainment purposes and consequently put less emphasis on the quality of the received information.As <|CITE|> have claimed, audiences tend to approach and process information differently depending on the channel; they approach television primarily for entertainment and newspapers primarily for information. This has implications for trust as well since audiences in an entertainment processing mode will be less attentive to credibility cues, such as news errors, than those in an information processing mode (Ibid.). <|CITE|> research confi rms this claim -he found that audiences tend to approach newspaper reading more actively than television viewing and that credibility assessments differ regarding whether audience members approach news actively or passively. These fi ndings can help explain why we found a weak positive correlation between television news exposure and trust in journalism. It could be that audiences turn to television not because they expect the best quality information but rather the opposite -namely, that they approach television news less critically, focus less attention on credibility concerns and, therefore, develop a higher degree of trust in journalism. The fact that those respondents who follow the commercial television channel POP TV and the tabloid Slovenske Novice exhibit a higher trust in journalistic objectivity compared to those respondents who do not follow these media is also in line with this interpretation. The topic of Janez Janša and exposure to media that are favourable to him and his SDS party is negatively connected to trust in journalism. This phenomenon can be partly explained by the elaboration likelihood model <|CITE|> , according to which highly involved individuals tend to process new information in a way that maintains and confi rms their original opinion by 1) taking information consistent with their views (information that falls within a narrow range of acceptance) as simply veridical and embracing it, and 2) judging counter-attitudinal information to be the product of biased, misguided or ill-informed sources and rejecting it <|CITE|> <|CITE|> . Highly partisan audiences will, therefore, tend to react to dissonant information by lowering the trustworthiness assessment of the source of such information.",
-    [('Trust in journalism is not associated with frequency of media use (except in the case of television as mentioned above), indicating that trust is not an important predictor of media use, though it might have an important impact on information processing. ', 0, 254),
-    ('This counterintuitive fi nding can be explained by taking into account the fact that audiences do not watch informative content merely to inform themselves; they have other motivations that might override credibility concerns. ', 255, 481),
-    ('For example, they might follow media primarily for entertainment purposes and consequently put less emphasis on the quality of the received information.As <|CITE|> have claimed, audiences tend to approach and process information differently depending on the channel; they approach television primarily for entertainment and newspapers primarily for information. ', 482, 843),
-    ('This has implications for trust as well since audiences in an entertainment processing mode will be less attentive to credibility cues, such as news errors, than those in an information processing mode (Ibid.). ', 844, 1054),
-    ('<|CITE|> research confi rms this claim -he found that audiences tend to approach newspaper reading more actively than television viewing and that credibility assessments differ regarding whether audience members approach news actively or passively. ', 1055, 1303),
-    ('These fi ndings can help explain why we found a weak positive correlation between television news exposure and trust in journalism. ', 1304, 1435),
-    ('It could be that audiences turn to television not because they expect the best quality information but rather the opposite -namely, that they approach television news less critically, focus less attention on credibility concerns and, therefore, develop a higher degree of trust in journalism. ', 1436, 1728),
-    ('The fact that those respondents who follow the commercial television channel POP TV and the tabloid Slovenske Novice exhibit a higher trust in journalistic objectivity compared to those respondents who do not follow these media is also in line with this interpretation. ', 1729, 1998),
-    ('The topic of Janez Janša and exposure to media that are favourable to him and his SDS party is negatively connected to trust in journalism. ', 1999, 2138),
-    ('This phenomenon can be partly explained by the elaboration likelihood model <|CITE|> , according to which highly involved individuals tend to process new information in a way that maintains and confi rms their original opinion by ', 2139, 2368),
-    ('1) taking information consistent with their views (information that falls within a narrow range of acceptance) as simply veridical and embracing it, and ', 2369, 2521),
-    ('2) judging counter-attitudinal information to be the product of biased, misguided or ill-informed sources and rejecting it <|CITE|> <|CITE|> . ', 2522, 2664),
+    [('Trust in journalism is not associated with frequency of media use (except in the case of television as mentioned above), indicating that trust is not an important predictor of media use, though it might have an important impact on information processing. ', 0, 255),
+    ('This counterintuitive fi nding can be explained by taking into account the fact that audiences do not watch informative content merely to inform themselves; they have other motivations that might override credibility concerns. ', 255, 482),
+    ('For example, they might follow media primarily for entertainment purposes and consequently put less emphasis on the quality of the received information.As <|CITE|> have claimed, audiences tend to approach and process information differently depending on the channel; they approach television primarily for entertainment and newspapers primarily for information. ', 482, 844),
+    ('This has implications for trust as well since audiences in an entertainment processing mode will be less attentive to credibility cues, such as news errors, than those in an information processing mode (Ibid.). ', 844, 1055),
+    ('<|CITE|> research confi rms this claim -he found that audiences tend to approach newspaper reading more actively than television viewing and that credibility assessments differ regarding whether audience members approach news actively or passively. ', 1055, 1304),
+    ('These fi ndings can help explain why we found a weak positive correlation between television news exposure and trust in journalism. ', 1304, 1436),
+    ('It could be that audiences turn to television not because they expect the best quality information but rather the opposite -namely, that they approach television news less critically, focus less attention on credibility concerns and, therefore, develop a higher degree of trust in journalism. ', 1436, 1729),
+    ('The fact that those respondents who follow the commercial television channel POP TV and the tabloid Slovenske Novice exhibit a higher trust in journalistic objectivity compared to those respondents who do not follow these media is also in line with this interpretation. ', 1729, 1999),
+    ('The topic of Janez Janša and exposure to media that are favourable to him and his SDS party is negatively connected to trust in journalism. ', 1999, 2139),
+    ('This phenomenon can be partly explained by the elaboration likelihood model <|CITE|> , according to which highly involved individuals tend to process new information in a way that maintains and confi rms their original opinion by ', 2139, 2369),
+    ('1) taking information consistent with their views (information that falls within a narrow range of acceptance) as simply veridical and embracing it, and ', 2369, 2522),
+    ('2) judging counter-attitudinal information to be the product of biased, misguided or ill-informed sources and rejecting it <|CITE|> <|CITE|> . ', 2522, 2665),
    ('Highly partisan audiences will, therefore, tend to react to dissonant information by lowering the trustworthiness assessment of the source of such information.', 2665, 2824)]
    ),
    ('#55', "She turned to him, \"This is great.\" She held the book out to show him.",
    [
-        ('She turned to him, "This is great." ', 0, 35), ('She held the book out to show him.', 36, 70)
+        ('She turned to him, "This is great." ', 0, 36), ('She held the book out to show him.', 36, 70)
    ])
]
 
diff --git a/tests/test_segmenter.py b/tests/test_segmenter.py
index 89d0c4b..6ec64ae 100644
--- a/tests/test_segmenter.py
+++ b/tests/test_segmenter.py
@@ -24,7 +24,7 @@ def test_segmenter_doesnt_mutate_input(pysbd_default_en_no_clean_no_span_fixture
 @pytest.mark.parametrize('text,expected',
                          [('My name is Jonas E. Smith. Please turn to p. 55.',
                            [
-                            ('My name is Jonas E. Smith. ', 0, 26),
+                            ('My name is Jonas E. Smith. ', 0, 27),
                             ('Please turn to p. 55.', 27, 48),
                            ])
                          ])
@@ -37,6 +37,32 @@ def test_sbd_char_span(en_no_clean_with_span_fixture, text, expected):
     # clubbing sentences and matching with original text
     assert text == "".join([seg.sent for seg in segments])
 
+def test_same_sentence_different_char_span(en_no_clean_with_span_fixture):
+    """Test same sentences with different char offsets & check for non-destruction"""
+    text = """From the AP comes this story :
+President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.
+***
+After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?
+***
+"THE PRESIDENT: I appreciate that.
+(Laughter.)
+My life is too complicated right now trying to do my job.
+(Laughter.)"""
+    expected_text_spans = [TextSpan(sent='From the AP comes this story :\n', start=0, end=31),
+                    TextSpan(sent='President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.\n', start=31, end=153),
+                    TextSpan(sent='***\n', start=153, end=157),
+                    TextSpan(sent='After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?\n', start=157, end=284),
+                    TextSpan(sent='***\n', start=284, end=288),
+                    TextSpan(sent='"THE PRESIDENT: I appreciate that.\n', start=288, end=323),
+                    TextSpan(sent='(Laughter.)\n', start=323, end=335),
+                    TextSpan(sent='My life is too complicated right now trying to do my job.\n', start=335, end=393),
+                    TextSpan(sent='(Laughter.)', start=393, end=404)]
+    segments_w_spans = en_no_clean_with_span_fixture.segment(text)
+    assert segments_w_spans == expected_text_spans
+    # check for non-destruction
+    # clubbing sentences and matching with original text
+    assert text == "".join([seg.sent for seg in segments_w_spans])
+
 def test_exception_with_both_clean_and_span_true():
     """Test to not allow clean=True and char_span=True
     """