Commit

Merge pull request #71 from nipunsadvilkar/npn-abbr-refactor
nipunsadvilkar committed Aug 4, 2020
2 parents e6c596f + 6b84eaa commit 92362f7
Showing 14 changed files with 164 additions and 51 deletions.
4 changes: 2 additions & 2 deletions benchmarks/benchmark.py → benchmarks/benchmark_sbd_tools.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text for sent in nlp(text).sents]
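
A note on the strip: pysbd in non-destructive mode (clean=False) keeps each segment's trailing whitespace so the segments can be rejoined into the original text, which is why the benchmark strips segments before comparing them against gold sentences. A minimal illustration on a made-up input (output shown as expected for pysbd 0.3.0):

import pysbd

seg = pysbd.Segmenter(language="en", clean=False)
segments = seg.segment("Hello there. General Kenobi.")
# Non-destructive segmentation keeps the trailing space on the first
# segment so that ''.join(segments) reproduces the input exactly.
print(segments)                       # expected: ['Hello there. ', 'General Kenobi.']
print([s.strip() for s in segments])  # expected: ['Hello there.', 'General Kenobi.']
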
75 changes: 75 additions & 0 deletions benchmarks/bigtext_speed_benchmark.py
@@ -0,0 +1,75 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    segments = pysbd_segmenter.segment(text)
    segments = [s.strip() for s in segments]
    return segments

def spacy_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments

def speed_benchmark(big_text, tokenize_func):
    segments = tokenize_func(big_text)
    return segments

if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)

    for tokenize_func in libraries:
        t = time.time()
        # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
        with open('benchmarks/1661-0.txt') as bigfile:
            big_text = bigfile.read()
        sentences = speed_benchmark(big_text, tokenize_func)

        time_taken = time.time() - t
        print()
        print(tokenize_func.__name__)
        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
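
Running this script assumes the language models are already on disk; a one-time setup sketch (exact commands vary across NLTK, stanza, and spaCy versions):

import nltk
import stanza

nltk.download('punkt')  # Punkt models backing nltk.sent_tokenize
stanza.download('en')   # English models for stanza.Pipeline(lang='en', processors='tokenize')
# spaCy's small English model is installed from a shell:
#   python -m spacy download en_core_web_sm
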
4 changes: 2 additions & 2 deletions benchmarks/genia_benchmark.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text.strip("\n") for sent in nlp(text).sents]
20 changes: 11 additions & 9 deletions pysbd/abbreviation_replacer.py
@@ -32,7 +32,10 @@ def replace(self):
             self.lang.KommanditgesellschaftRule,
             *self.lang.SingleLetterAbbreviationRules.All
         )
-        self.text = self.search_for_abbreviations_in_string()
+        abbr_handled_text = ""
+        for line in self.text.splitlines(True):
+            abbr_handled_text += self.search_for_abbreviations_in_string(line)
+        self.text = abbr_handled_text
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
         self.text = self.replace_abbreviation_as_sentence_boundary()
@@ -72,25 +75,24 @@ def replace_period_of_abbr(self, txt, abbr):
         return txt
 
 
-    def search_for_abbreviations_in_string(self):
-        original = self.text
-        lowered = original.lower()
+    def search_for_abbreviations_in_string(self, text):
+        lowered = text.lower()
         for abbr in self.lang.Abbreviation.ABBREVIATIONS:
             stripped = abbr.strip()
             if stripped not in lowered:
                 continue
             abbrev_match = re.findall(
-                r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
+                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
             )
             if not abbrev_match:
                 continue
             next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
-            char_array = re.findall(next_word_start, self.text)
+            char_array = re.findall(next_word_start, text)
             for ind, match in enumerate(abbrev_match):
-                self.text = self.scan_for_replacements(
-                    self.text, match, ind, char_array
+                text = self.scan_for_replacements(
+                    text, match, ind, char_array
                 )
-        return self.text
+        return text
 
     def scan_for_replacements(self, txt, am, ind, char_array):
         try:
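
The refactor turns search_for_abbreviations_in_string into a function of an explicit text argument and applies it line by line in replace(), so abbreviation handling no longer leaks across newlines. A minimal sketch of the splitlines(True) pattern used above, with lowercasing standing in for the real abbreviation pass:

text = "Dr. Watson\nMr. Holmes\n"
handled = ""
for line in text.splitlines(True):  # keepends=True preserves each '\n'
    handled += line.lower()         # stand-in for search_for_abbreviations_in_string
assert handled == text.lower()      # line boundaries survive the round trip
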
2 changes: 1 addition & 1 deletion pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.0rc"
+__version__ = "0.3.0"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"
1 change: 0 additions & 1 deletion pysbd/lang/common/common.py
@@ -5,7 +5,6 @@
 class Common(object):
 
     # added special case: r"[。．.！!?？].*" to handle intermittent dots, exclamation, etc.
-    # TODO: above special cases group can be updated as per developer needs
     SENTENCE_BOUNDARY_REGEX = r"（(?:[^）])*）(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。．.！!?？].*|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
 
     # # Rubular: http://rubular.com/r/NqCqv372Ix
2 changes: 1 addition & 1 deletion pysbd/lang/deutsch.py
@@ -63,7 +63,7 @@ def replace(self):
                 SingleLowerCaseLetterRule,
                 SingleLowerCaseLetterAtStartOfLineRule)
 
-            self.text = self.search_for_abbreviations_in_string()
+            self.text = self.search_for_abbreviations_in_string(self.text)
             self.replace_multi_period_abbreviations()
             self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
             self.text = self.replace_abbreviation_as_sentence_boundary()
11 changes: 6 additions & 5 deletions pysbd/processor.py
@@ -77,17 +77,18 @@ def split_into_segments(self):
         sents = [self.check_for_punctuation(s) for s in sents]
         # flatten list of list of sentences
         sents = self.rm_none_flatten(sents)
-        new_sents = []
+        postprocessed_sents = []
         for sent in sents:
             sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
             post_process_sent = self.post_process_segments(sent)
             if post_process_sent and isinstance(post_process_sent, str):
-                new_sents.append(post_process_sent)
+                postprocessed_sents.append(post_process_sent)
             elif isinstance(post_process_sent, list):
                 for pps in post_process_sent:
-                    new_sents.append(pps)
-        new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents]
-        return new_sents
+                    postprocessed_sents.append(pps)
+        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
+                               for ns in postprocessed_sents]
+        return postprocessed_sents
 
     def post_process_segments(self, txt):
         if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
28 changes: 20 additions & 8 deletions pysbd/segmenter.py
@@ -14,7 +14,7 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
 
         Parameters
         ----------
-        language : str, optional
+        language : str, required
             specify a language use its two character ISO 639-1 code,
             by default "en"
         clean : bool, optional
@@ -49,11 +49,23 @@ def processor(self, text):
 
     def sentences_with_char_spans(self, sentences):
         # since SENTENCE_BOUNDARY_REGEX doesnt account
-        # for trailing whitespaces \s* is used as suffix
+        # for trailing whitespaces \s* & is used as suffix
         # to keep non-destructive text after segments joins
-        return [TextSpan(m.group(), m.start(), m.end()) for sent in sentences
-                for m in re.finditer('{0}\s*'.format(re.escape(sent)),
-                                     self.original_text)]
+        sent_spans = []
+        prior_start_char_idx = 0
+        for sent in sentences:
+            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+                match_str = match.group()
+                match_start_idx, match_end_idx = match.span()
+                if match_start_idx >= prior_start_char_idx:
+                    # making sure if current sentence and its span
+                    # is either first sentence along with its char spans
+                    # or current sent spans adjacent to prior sentence spans
+                    sent_spans.append(
+                        TextSpan(match_str, match_start_idx, match_end_idx))
+                    prior_start_char_idx = match_start_idx
+                    break
+        return sent_spans
 
     def segment(self, text):
         self.original_text = text
@@ -66,11 +78,11 @@ def segment(self, text):
             text = self.cleaner(text).clean()
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
-        if self.clean:
+        if self.char_span:
+            return sentence_w_char_spans
+        elif self.clean:
             # clean and destructed sentences
             return postprocessed_sents
-        elif self.char_span:
-            return sentence_w_char_spans
         else:
             # nondestructive with whitespaces
             return [textspan.sent for textspan in sentence_w_char_spans]
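
Net effect: char_span=True now takes precedence over clean in segment(), and sentences_with_char_spans() only accepts a match starting at or after the previous match's start, so repeated sentences map to distinct, in-order spans. A quick illustration of the public API (output as documented for pysbd 0.3.0):

import pysbd

seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
print(seg.segment("My name is Jonas E. Smith. Please turn to p. 55."))
# expected:
# [TextSpan(sent='My name is Jonas E. Smith. ', start=0, end=27),
#  TextSpan(sent='Please turn to p. 55.', start=27, end=48)]
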
7 changes: 3 additions & 4 deletions pysbd/utils.py
@@ -56,13 +56,12 @@ def __init__(self, sent, start, end):
         self.end = end
 
     def __repr__(self):  # pragma: no cover
-        return "{0}(sent='{1}', start={2}, end={3})".format(
-            self.__class__.__name__, self.sent, self.start, self.end)
+        return "{0}(sent={1}, start={2}, end={3})".format(
+            self.__class__.__name__, repr(self.sent), self.start, self.end)
 
     def __eq__(self, other):
         if isinstance(self, other.__class__):
-            return self.sent == other.sent and self.start == other.start and self.end == self.end
-        return False
+            return self.sent == other.sent and self.start == other.start and self.end == other.end
 
 
 class PySBDFactory(object):
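
The __eq__ change fixes a real bug: the last clause compared self.end with itself, which is always true, so two spans differing only in their end offset compared equal. An illustrative check:

from pysbd.utils import TextSpan

a = TextSpan("Hello.", 0, 6)
b = TextSpan("Hello.", 0, 60)  # same text and start, different end
# Before the fix a == b was True; it now compares against other.end.
print(a == b)  # False
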
1 change: 0 additions & 1 deletion requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
 
 REQUIRES_PYTHON = ">=3"
 # What packages are required for this module to be executed?
-REQUIRED = ["spacy"]
+REQUIRED = []
 
 with io.open(os.path.join(root, "pysbd", "about.py"), encoding="utf8") as f:
     about = {}