# nltk/stem/porter.py -- PorterStemmer: letter/measure helpers and step 1a/1b.
#
# This span was a unified diff whose line breaks and indentation had been
# lost; what follows is its post-patch ("+" and context) side, re-flowed.
# Every function below takes ``self`` and is a method of
# ``class PorterStemmer(StemmerI)``.

# The main part of the stemming algorithm starts here.
# Note that only lower case sequences are stemmed. Forcing to lower case
# should be done before stem(...) is called.

def __init__(self):

    ## --NEW--
    ## This is a table of irregular forms. It is quite short, but still
    ## reflects the errors actually drawn to Martin Porter's attention over
    # [... the irregular_forms table and the creation of self.pool are not
    #      visible in this excerpt and are left untouched ...]
    for key in irregular_forms:
        for val in irregular_forms[key]:
            self.pool[val] = key

    # Letters that are always vowels; 'y' is decided by context in _cons().
    self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])


def _cons(self, word, i):
    """Return True if ``word[i]`` is a consonant, False if it is a vowel.

    a, e, i, o and u are vowels.  'y' is a consonant when it is the first
    letter or follows a vowel ("toy"), and a vowel when it follows a
    consonant ("syzygy").
    """
    ch = word[i]
    if ch in self.vowels:
        return False
    if ch == 'y':
        # 'y' takes the opposite class of the letter before it.
        return i == 0 or not self._cons(word, i - 1)
    return True


def _m(self, word, j):
    """Return the measure m of the prefix ``word[0:j+1]``.

    With c a maximal run of consonants and v a maximal run of vowels, the
    prefix has the shape [c](vc){m}[v]; m is the number of vc pairs:

        m=0    tr, ee, tree, y, by
        m=1    trouble, oats, trees, ivy
        m=2    troubles, private, oaten, orrery

    ``j`` is the index of the last letter to include; ``j == -1`` (empty
    prefix) gives 0.
    """
    n = 0
    prev_is_cons = True  # "no vowel seen yet" behaves like a leading consonant
    for i in range(j + 1):
        is_cons = self._cons(word, i)
        if is_cons and not prev_is_cons:
            # A vowel run has just been closed by a consonant: one more "vc".
            n += 1
        prev_is_cons = is_cons
    return n


def _vowelinstem(self, stem):
    """Return True if ``stem`` contains at least one vowel."""
    return any(not self._cons(stem, i) for i in range(len(stem)))


def _doublec(self, word):
    """Return True if ``word`` ends with a doubled consonant (e.g. "hopp")."""
    return (
        len(word) >= 2
        and word[-1] == word[-2]
        and self._cons(word, len(word) - 1)
    )


def _cvc(self, word, i):
    """Return True if ``word`` ends a "short syllable" at index ``i``, i.e.

    a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or

    b) word[i - 2], word[i - 1], word[i] has the form consonant -
       vowel - consonant and the second consonant is not w, x or y.

    This is used when trying to restore an e at the end of a short word,
    e.g. cav(e), lov(e), hop(e), crim(e), but not snow, box, tray.
    """
    if i == 0:
        return False  # i == 0 never happens perhaps
    if i == 1:
        return not self._cons(word, 0) and self._cons(word, 1)
    if (
        not self._cons(word, i)
        or self._cons(word, i - 1)
        or not self._cons(word, i - 2)
    ):
        return False
    return word[i] not in ('w', 'x', 'y')


def _step1ab(self, word):
    """Remove plurals and -ed / -ing, returning the shortened word.

    ``word`` must be lower case and at least two characters long
    (``stem_word`` only passes words of three or more characters).  e.g.

        caresses  ->  caress
        ponies    ->  poni
        dies      ->  die
        cats      ->  cat

        agreed    ->  agree
        plastered ->  plaster
        bled      ->  bled
        motoring  ->  motor
        sing      ->  sing

        conflated ->  conflate
        troubled  ->  trouble
        sized     ->  size
        hopping   ->  hop
        falling   ->  fall
        filing    ->  file
        meetings  ->  meet
    """
    # ---- Step 1a: plurals -------------------------------------------------
    if word[-1] == 's':
        if word.endswith("sses"):
            word = word[:-2]
        elif word.endswith("ies"):
            # Extension of the original algorithm: four-letter words keep
            # the 'e', so 'flies'->'fli' but 'dies'->'die' etc.
            word = word[:-1] if len(word) == 4 else word[:-2]
        elif word[-2] != 's':
            word = word[:-1]

    # ---- Step 1b: -ied, -eed, -ed, -ing -----------------------------------
    if word.endswith("ied"):
        # Same extension as above: 'spied'->'spi' but 'died'->'die' etc.
        return word[:-1] if len(word) == 4 else word[:-2]

    if word.endswith("eed"):
        # Only drop the 'd' when the part before "eed" has m > 0.
        if self._m(word, len(word) - 4) > 0:
            word = word[:-1]
        return word

    for suffix in ("ed", "ing"):
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            if not self._vowelinstem(stem):
                return word  # e.g. 'bled', 'sing': nothing to strip
            word = stem
            break
    else:
        return word

    # ---- Tidy up the stem left after removing -ed / -ing ------------------
    if word.endswith(("at", "bl", "iz")):
        return word + 'e'
    if self._doublec(word):
        # Undouble the final consonant, except for -ll, -ss and -zz.
        return word if word[-1] in ('l', 's', 'z') else word[:-1]
    last = len(word) - 1
    if self._m(word, last) == 1 and self._cvc(word, last):
        return word + 'e'  # restore the e of a short word: 'fil' -> 'file'
    return word
# nltk/stem/porter.py -- PorterStemmer: steps 1c-5 and the public entry points.
#
# This span was a unified diff whose line breaks and indentation had been
# lost; what follows is its post-patch ("+" and context) side, re-flowed.
# Every function below takes ``self`` and is a method of
# ``class PorterStemmer(StemmerI)``.

def _step1c(self, word):
    """Turn a terminal y into i when it is preceded by a consonant.

    --NEW--: this departs from the original Porter algorithm.  The change
    is only made when the word is longer than two characters and the
    letter before the y is a consonant, so

        happy -> happi        sky -> ski
        say   -> say          by  -> by
    """
    if (
        word.endswith('y')
        and len(word) > 2
        and self._cons(word, len(word) - 2)
    ):
        return word[:-1] + 'i'
    return word


def _step2(self, word):
    """Map double suffixes to single ones, e.g. -ization ( = -ize plus
    -ation) maps to -ize.

    A rule only fires when the string before the suffix has m > 0.  The
    first suffix that matches decides the outcome: if its m-test fails the
    word is returned unchanged and no shorter suffix is tried (so
    "rational" is not rewritten by the -tional rule).
    """
    # Ordered so that a longer suffix is always tried before a shorter one
    # it ends with ("ational" before "tional", "ization" before "ation").
    rules = (
        ("ational", "ate"),
        ("tional", "tion"),
        ("enci", "ence"),
        ("anci", "ance"),
        ("izer", "ize"),
        # --DEPARTURE--
        # To match the published algorithm, replace "bli" with "abli" and
        # "ble" with "able".
        ("bli", "ble"),
        ("alli", "al"),      # --NEW--
        ("fulli", "ful"),    # --NEW--
        ("entli", "ent"),
        ("eli", "e"),
        ("ousli", "ous"),
        ("ization", "ize"),
        ("ation", "ate"),
        ("ator", "ate"),
        ("alism", "al"),
        ("iveness", "ive"),
        ("fulness", "ful"),
        ("ousness", "ous"),
        ("aliti", "al"),
        ("iviti", "ive"),
        ("biliti", "ble"),
    )
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        stem = word[:-len(suffix)]
        if self._m(word, len(stem) - 1) <= 0:
            return word
        if suffix == "alli":
            # --NEW-- the result may itself end in a step-2 suffix.
            return self._step2(stem + replacement)
        return stem + replacement

    # --DEPARTURE-- / --NEW-- (Barry Wilkins): -logi -> -log.  Unlike the
    # rules above, the measure is taken on the stem *including* the 'l'.
    # The published algorithm has no such rule.
    if word.endswith("logi"):
        stem = word[:-3]
        if self._m(word, len(stem) - 1) > 0:
            return word[:-1]
    return word


def _step3(self, word):
    """Deal with -ic-, -full, -ness etc., using the same strategy as step 2:
    the first matching suffix is replaced if the string before it has
    m > 0, otherwise the word is returned unchanged.
    """
    rules = (
        ("icate", "ic"),
        ("ative", ""),
        ("alize", "al"),
        ("iciti", "ic"),
        ("ical", "ic"),
        ("ful", ""),
        ("ness", ""),
    )
    for suffix, replacement in rules:
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            if self._m(word, len(stem) - 1) > 0:
                return stem + replacement
            return word
    return word


def _step4(self, word):
    """Take off -ant, -ence etc., in context <c>vcvc<v> (stem has m > 1).

    The first matching suffix decides: if the remaining stem does not have
    m > 1 the word is returned unchanged ("cement" is not retried as
    c + "ement" -> ce + "ment").
    """
    if word.endswith(("sion", "tion")):
        # Slightly different logic to all the other cases: only "ion" is
        # removed, and only when it follows an s or a t.
        stem = word[:-3]
    else:
        # "ement" must precede "ment", which must precede "ent".
        suffixes = (
            "al", "ance", "ence", "er", "ic", "able", "ible", "ant",
            "ement", "ment", "ent", "ou", "ism", "ate", "iti", "ous",
            "ive", "ize",
        )
        for suffix in suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                break
        else:
            return word
    if self._m(word, len(stem) - 1) > 1:
        return stem
    return word


def _step5(self, word):
    """Remove a final -e if m > 1 (or m == 1 and the stem does not end
    consonant-vowel-consonant), and change -ll to -l if m > 1.
    """
    if word.endswith('e'):
        a = self._m(word, len(word) - 1)
        if a > 1 or (a == 1 and not self._cvc(word, len(word) - 2)):
            word = word[:-1]
    if word.endswith('ll') and self._m(word, len(word) - 1) > 1:
        word = word[:-1]
    return word


def stem_word(self, p, i=0, j=None):
    """
    Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].

    ``p`` is expected to be lower case already.  Words found in
    ``self.pool`` (the irregular forms) are looked up directly.
    """
    ## --NLTK--
    word = p[i:] if j is None else p[i:j + 1]

    if word in self.pool:
        return self.pool[word]

    if len(word) <= 2:
        return word  # --DEPARTURE--
    # With this line, strings of length 1 or 2 don't go through the
    # stemming process, although no mention is made of this in the
    # published algorithm. Remove the line to match the published
    # algorithm.

    word = self._step1ab(word)
    word = self._step1c(word)
    word = self._step2(word)
    word = self._step3(word)
    word = self._step4(word)
    word = self._step5(word)
    return word


def _adjust_case(self, word, stem):
    lower = word.lower()

    ret = ""
    # [... the rest of _adjust_case is not visible in this excerpt and is
    #      left untouched ...]


## --NLTK--
## Define a stem() method that implements the StemmerI interface.
def stem(self, word):
    """Return the stem of ``word`` with its original casing re-applied.

    The word is lower-cased, stemmed with ``stem_word`` and then handed,
    together with the original, to ``_adjust_case``.
    """
    # Stem the whole lower-cased string; deriving the end index from
    # len(word) would truncate it whenever lower-casing changes the length.
    stem = self.stem_word(word.lower())
    return self._adjust_case(word, stem)

## --NLTK--
## Add a string representation function