-
Notifications
You must be signed in to change notification settings - Fork 2.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix issue ArabicStemmer AttributeError #1852 #1856
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels): | |
|
||
return rv | ||
|
||
class ArabicStemmer(_LanguageSpecificStemmer): | ||
class ArabicStemmer(_StandardStemmer): | ||
""" | ||
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) | ||
The Snowball Arabic light Stemmer | ||
|
@@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token): | |
|
||
def __Suffix_Verb_Step2a(self, token): | ||
for suffix in self.__suffix_verb_step2a: | ||
if token.endswith(suffix): | ||
if token.endswith(suffix) and len(token) > 3: | ||
if suffix == '\u062a' and len(token) >= 4: | ||
token = token[:-1] | ||
self.suffix_verb_step2a_success = True | ||
|
@@ -750,14 +750,19 @@ def stem(self, word): | |
self.__checks_1(modified_word) | ||
# checks2 | ||
self.__checks_2(modified_word) | ||
# Pre_Normalization | ||
modified_word = self.__normalize_pre(modified_word) | ||
# Avoid stopwords | ||
if modified_word in self.stopwords or len(modified_word) <= 2: | ||
return modified_word | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 |
||
# Start stemming | ||
if self.is_verb: | ||
modified_word = self.__Suffix_Verb_Step1(modified_word) | ||
if self.suffixes_verb_step1_success: | ||
modified_word = self.__Suffix_Verb_Step2a(modified_word) | ||
if not self.suffix_verb_step2a_success : | ||
modified_word = self.__Suffix_Verb_Step2c(modified_word) | ||
#or next | ||
#or next TODO: How to deal with or next instruction | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In which cases would there be more steps that need to be applied here? Perhaps it'll be good to list these cases down. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're working on it and the other TODOs; when we solve them, we'll send a PR with the updates. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add a link to assem-ch/arabicstemmer#1 in the GitHub comment too? That'll be helpful for us to track later. Thanks! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've created an issue in nltk to track the changes later and added a comment in assem-ch/arabicstemmer#1; I hope this is helpful. Sorry for the late reply. |
||
else: | ||
modified_word = self.__Suffix_Verb_Step2b(modified_word) | ||
if not self.suffix_verb_step2b_success: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,14 +15,16 @@ def test_arabic(self): | |
this unit testing for test the snowball arabic light stemmer | ||
this stemmer deals with prefixes and suffixes | ||
""" | ||
ar_stemmer = SnowballStemmer("arabic") | ||
ar_stemmer = SnowballStemmer("arabic", True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add another test where the |
||
assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب" | ||
assert ar_stemmer.stem("العربية") == "عرب" | ||
assert ar_stemmer.stem("فقالوا") == "قال" | ||
assert ar_stemmer.stem("الطالبات") == "طالب" | ||
assert ar_stemmer.stem("فالطالبات") == "طالب" | ||
assert ar_stemmer.stem("والطالبات") == "طالب" | ||
assert ar_stemmer.stem("الطالبون") == "طالب" | ||
assert ar_stemmer.stem("اللذان") == "اللذان" | ||
assert ar_stemmer.stem("من") == "من" | ||
|
||
def test_russian(self): | ||
# Russian words both consisting of Cyrillic | ||
|
@@ -54,11 +56,11 @@ def test_short_strings_bug(self): | |
assert stemmer.stem("y's") == 'y' | ||
|
||
class PorterTest(unittest.TestCase): | ||
|
||
def _vocabulary(self): | ||
with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp: | ||
return fp.read().splitlines() | ||
|
||
def _test_against_expected_output(self, stemmer_mode, expected_stems): | ||
stemmer = PorterStemmer(mode=stemmer_mode) | ||
for word, true_stem in zip(self._vocabulary(), expected_stems): | ||
|
@@ -68,10 +70,10 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems): | |
word, true_stem, stemmer_mode, our_stem | ||
) | ||
) | ||
|
||
def test_vocabulary_martin_mode(self): | ||
"""Tests all words from the test vocabulary provided by M Porter | ||
|
||
The sample vocabulary and output were sourced from: | ||
http://tartarus.org/martin/PorterStemmer/voc.txt | ||
http://tartarus.org/martin/PorterStemmer/output.txt | ||
|
@@ -84,14 +86,14 @@ def test_vocabulary_martin_mode(self): | |
PorterStemmer.MARTIN_EXTENSIONS, | ||
fp.read().splitlines() | ||
) | ||
|
||
def test_vocabulary_nltk_mode(self): | ||
with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp: | ||
self._test_against_expected_output( | ||
PorterStemmer.NLTK_EXTENSIONS, | ||
fp.read().splitlines() | ||
) | ||
|
||
def test_vocabulary_original_mode(self): | ||
# The list of stems for this test was generated by taking the | ||
# Martin-blessed stemmer from | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just out of curiosity, is there a linguistic reason to avoid words with 2 characters?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We haven't studied the case of words of length 2 yet; we mentioned it in the list of our TODOs too.