Commit

Merge pull request #71 from nipunsadvilkar/npn-abbr-refactor
nipunsadvilkar committed Aug 4, 2020
2 parents e6c596f + 6b84eaa commit 92362f7
Showing 14 changed files with 164 additions and 51 deletions.
4 changes: 2 additions & 2 deletions benchmarks/benchmark.py → benchmarks/benchmark_sbd_tools.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text for sent in nlp(text).sents]
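
A note on the strip: pysbd in non-destructive mode (clean=False) keeps each segment's trailing whitespace so the segments can be rejoined into the original text, which is why the benchmark strips segments before comparing them against gold sentences. A minimal illustration on a made-up input (output shown as expected for pysbd 0.3.0):

import pysbd

seg = pysbd.Segmenter(language="en", clean=False)
segments = seg.segment("Hello there. General Kenobi.")
# Non-destructive segmentation keeps the trailing space on the first
# segment so that ''.join(segments) reproduces the input exactly.
print(segments)                       # expected: ['Hello there. ', 'General Kenobi.']
print([s.strip() for s in segments])  # expected: ['Hello there.', 'General Kenobi.']
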
75 changes: 75 additions & 0 deletions benchmarks/bigtext_speed_benchmark.py
@@ -0,0 +1,75 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    segments = pysbd_segmenter.segment(text)
    segments = [s.strip() for s in segments]
    return segments

def spacy_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments

def speed_benchmark(big_text, tokenize_func):
    segments = tokenize_func(big_text)
    return segments

if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)

    for tokenize_func in libraries:
        t = time.time()
        # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
        with open('benchmarks/1661-0.txt') as bigfile:
            big_text = bigfile.read()
        sentences = speed_benchmark(big_text, tokenize_func)

        time_taken = time.time() - t
        print()
        print(tokenize_func.__name__)
        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
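
Running this script assumes the language models are already on disk; a one-time setup sketch (exact commands vary across NLTK, stanza, and spaCy versions):

import nltk
import stanza

nltk.download('punkt')  # Punkt models backing nltk.sent_tokenize
stanza.download('en')   # English models for stanza.Pipeline(lang='en', processors='tokenize')
# spaCy's small English model is installed from a shell:
#   python -m spacy download en_core_web_sm
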
4 changes: 2 additions & 2 deletions benchmarks/genia_benchmark.py
@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text.strip("\n") for sent in nlp(text).sents]
20 changes: 11 additions & 9 deletions pysbd/abbreviation_replacer.py
@@ -32,7 +32,10 @@ def replace(self):
             self.lang.KommanditgesellschaftRule,
             *self.lang.SingleLetterAbbreviationRules.All
         )
-        self.text = self.search_for_abbreviations_in_string()
+        abbr_handled_text = ""
+        for line in self.text.splitlines(True):
+            abbr_handled_text += self.search_for_abbreviations_in_string(line)
+        self.text = abbr_handled_text
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
         self.text = self.replace_abbreviation_as_sentence_boundary()
@@ -72,25 +75,24 @@ def replace_period_of_abbr(self, txt, abbr):
         return txt
 
 
-    def search_for_abbreviations_in_string(self):
-        original = self.text
-        lowered = original.lower()
+    def search_for_abbreviations_in_string(self, text):
+        lowered = text.lower()
         for abbr in self.lang.Abbreviation.ABBREVIATIONS:
             stripped = abbr.strip()
             if stripped not in lowered:
                 continue
             abbrev_match = re.findall(
-                r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
+                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
             )
             if not abbrev_match:
                 continue
             next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
-            char_array = re.findall(next_word_start, self.text)
+            char_array = re.findall(next_word_start, text)
             for ind, match in enumerate(abbrev_match):
-                self.text = self.scan_for_replacements(
-                    self.text, match, ind, char_array
+                text = self.scan_for_replacements(
+                    text, match, ind, char_array
                 )
-        return self.text
+        return text
 
     def scan_for_replacements(self, txt, am, ind, char_array):
         try:
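
The refactor turns search_for_abbreviations_in_string into a function of an explicit text argument and applies it line by line in replace(), so abbreviation handling no longer leaks across newlines. A minimal sketch of the splitlines(True) pattern used above, with lowercasing standing in for the real abbreviation pass:

text = "Dr. Watson\nMr. Holmes\n"
handled = ""
for line in text.splitlines(True):  # keepends=True preserves each '\n'
    handled += line.lower()         # stand-in for search_for_abbreviations_in_string
assert handled == text.lower()      # line boundaries survive the round trip
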
2 changes: 1 addition & 1 deletion pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.0rc"
+__version__ = "0.3.0"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"
1 change: 0 additions & 1 deletion pysbd/lang/common/common.py
@@ -5,7 +5,6 @@
 class Common(object):
 
     # added special case: r"[。．.！!?？].*" to handle intermittent dots, exclamation, etc.
-    # TODO: above special cases group can be updated as per developer needs
     SENTENCE_BOUNDARY_REGEX = r"（(?:[^）])*）(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。．.！!?？].*|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
 
     # # Rubular: http://rubular.com/r/NqCqv372Ix
2 changes: 1 addition & 1 deletion pysbd/lang/deutsch.py
@@ -63,7 +63,7 @@ def replace(self):
                 SingleLowerCaseLetterRule,
                 SingleLowerCaseLetterAtStartOfLineRule)
 
-            self.text = self.search_for_abbreviations_in_string()
+            self.text = self.search_for_abbreviations_in_string(self.text)
             self.replace_multi_period_abbreviations()
             self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
             self.text = self.replace_abbreviation_as_sentence_boundary()
11 changes: 6 additions & 5 deletions pysbd/processor.py
@@ -77,17 +77,18 @@ def split_into_segments(self):
         sents = [self.check_for_punctuation(s) for s in sents]
         # flatten list of list of sentences
         sents = self.rm_none_flatten(sents)
-        new_sents = []
+        postprocessed_sents = []
         for sent in sents:
             sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
             post_process_sent = self.post_process_segments(sent)
             if post_process_sent and isinstance(post_process_sent, str):
-                new_sents.append(post_process_sent)
+                postprocessed_sents.append(post_process_sent)
             elif isinstance(post_process_sent, list):
                 for pps in post_process_sent:
-                    new_sents.append(pps)
-        new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents]
-        return new_sents
+                    postprocessed_sents.append(pps)
+        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
+                               for ns in postprocessed_sents]
+        return postprocessed_sents
 
     def post_process_segments(self, txt):
         if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
28 changes: 20 additions & 8 deletions pysbd/segmenter.py
@@ -14,7 +14,7 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
 
         Parameters
         ----------
-        language : str, optional
+        language : str, required
             specify a language use its two character ISO 639-1 code,
             by default "en"
         clean : bool, optional
@@ -49,11 +49,23 @@ def processor(self, text):
 
     def sentences_with_char_spans(self, sentences):
         # since SENTENCE_BOUNDARY_REGEX doesnt account
-        # for trailing whitespaces \s* is used as suffix
+        # for trailing whitespaces \s* & is used as suffix
         # to keep non-destructive text after segments joins
-        return [TextSpan(m.group(), m.start(), m.end()) for sent in sentences
-                for m in re.finditer('{0}\s*'.format(re.escape(sent)),
-                                     self.original_text)]
+        sent_spans = []
+        prior_start_char_idx = 0
+        for sent in sentences:
+            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+                match_str = match.group()
+                match_start_idx, match_end_idx = match.span()
+                if match_start_idx >= prior_start_char_idx:
+                    # making sure if current sentence and its span
+                    # is either first sentence along with its char spans
+                    # or current sent spans adjacent to prior sentence spans
+                    sent_spans.append(
+                        TextSpan(match_str, match_start_idx, match_end_idx))
+                    prior_start_char_idx = match_start_idx
+                    break
+        return sent_spans
 
     def segment(self, text):
         self.original_text = text
@@ -66,11 +78,11 @@ def segment(self, text):
             text = self.cleaner(text).clean()
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
-        if self.clean:
+        if self.char_span:
+            return sentence_w_char_spans
+        elif self.clean:
             # clean and destructed sentences
             return postprocessed_sents
-        elif self.char_span:
-            return sentence_w_char_spans
         else:
             # nondestructive with whitespaces
             return [textspan.sent for textspan in sentence_w_char_spans]
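
Net effect: char_span=True now takes precedence over clean in segment(), and sentences_with_char_spans() only accepts a match starting at or after the previous match's start, so repeated sentences map to distinct, in-order spans. A quick illustration of the public API (output as documented for pysbd 0.3.0):

import pysbd

seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
print(seg.segment("My name is Jonas E. Smith. Please turn to p. 55."))
# expected:
# [TextSpan(sent='My name is Jonas E. Smith. ', start=0, end=27),
#  TextSpan(sent='Please turn to p. 55.', start=27, end=48)]
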
7 changes: 3 additions & 4 deletions pysbd/utils.py
@@ -56,13 +56,12 @@ def __init__(self, sent, start, end):
         self.end = end
 
     def __repr__(self):  # pragma: no cover
-        return "{0}(sent='{1}', start={2}, end={3})".format(
-            self.__class__.__name__, self.sent, self.start, self.end)
+        return "{0}(sent={1}, start={2}, end={3})".format(
+            self.__class__.__name__, repr(self.sent), self.start, self.end)
 
     def __eq__(self, other):
         if isinstance(self, other.__class__):
-            return self.sent == other.sent and self.start == other.start and self.end == self.end
-        return False
+            return self.sent == other.sent and self.start == other.start and self.end == other.end
 
 
 class PySBDFactory(object):
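
The __eq__ change fixes a real bug: the last clause compared self.end with itself, which is always true, so two spans differing only in their end offset compared equal. An illustrative check:

from pysbd.utils import TextSpan

a = TextSpan("Hello.", 0, 6)
b = TextSpan("Hello.", 0, 60)  # same text and start, different end
# Before the fix a == b was True; it now compares against other.end.
print(a == b)  # False
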
1 change: 0 additions & 1 deletion requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
 
 REQUIRES_PYTHON = ">=3"
 # What packages are required for this module to be executed?
-REQUIRED = ["spacy"]
+REQUIRED = []
 
 with io.open(os.path.join(root, "pysbd", "about.py"), encoding="utf8") as f:
     about = {}