|
| 1 | +# -*- coding: utf-8 -*- |
1 | 2 | import json |
2 | 3 | import unittest |
3 | 4 | from datetime import datetime, timedelta |
|
24 | 25 | from kitsune.sumo.tests import LocalizingClient |
25 | 26 | from kitsune.sumo.urlresolvers import reverse |
26 | 27 | from kitsune.users.tests import group, user |
| 28 | +from kitsune.wiki.models import DocumentMappingType |
27 | 29 | from kitsune.wiki.tests import document, revision, helpful_vote |
28 | 30 |
|
29 | 31 |
|
@@ -1100,3 +1102,132 @@ def test_mappings(self): |
1100 | 1102 | merged_mapping[key][1].append(cls_name) |
1101 | 1103 |
|
1102 | 1104 | # If we get here, then we're fine. |
| 1105 | + |
| 1106 | + |
class TestAnalyzers(ElasticTestCase):
    """Check analyzer selection and tokenization for localized documents.

    One document is created per locale in `setUp`; the tests then inspect
    what ended up in the index for each of them.
    """

    def setUp(self):
        """Create one document with an approved revision for each locale."""
        super(TestAnalyzers, self).setUp()

        self.locale_data = {
            'en-US': {
                'analyzer': 'snowball-english',
                'content': 'I have a cat.',
            },
            'es': {
                'analyzer': 'snowball-spanish',
                'content': 'Tieno un gato.',
            },
            'ar': {
                'analyzer': 'arabic',
                'content': u'لدي اثنين من القطط',
            },
            'my': {
                'analyzer': 'custom-burmese',
                'content': u'အနုပညာ',
            },
            'he': {
                'analyzer': 'standard',
                'content': u'גאולוגיה היא אחד',
            }
        }

        # Locale -> document. `test_analyzer_choices` reads the ids from
        # here, so every created document must be recorded; otherwise that
        # test iterates over nothing and passes without checking anything.
        self.docs = {}
        for locale, data in self.locale_data.items():
            d = document(locale=locale, save=True)
            revision(document=d, content=data['content'],
                     is_approved=True, save=True)
            self.locale_data[locale]['doc'] = d
            self.docs[locale] = d

        # NOTE(review): presumably makes the new documents visible to
        # searches before the tests run -- confirm against ElasticTestCase.
        self.refresh()

    def test_analyzer_choices(self):
        """Check that the indexer picked the right analyzer."""

        ids = [d.id for d in self.docs.values()]
        docs = es_utils.get_documents(DocumentMappingType, ids)
        for doc in docs:
            # Compare the analyzer recorded on the indexed document with
            # the one expected for that document's locale.
            locale = doc['locale']
            eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])

    def _check_locale_tokenization(self, locale, expected_tokens, p_tag=True):
        """
        Check that a given locale's document was tokenized correctly.

        * `locale` - The locale to check.
        * `expected_tokens` - An iterable of the tokens that should be
          found. If any tokens from this list are missing, or if any
          tokens not in this list are found, the check will fail.
        * `p_tag` - Default True. If True, an extra token will be added
          to `expected_tokens`: "p".

          This is because our wiki parser wraps its content in <p>
          tags and many analyzers will tokenize a string like
          '<p>Foo</p>' as ['p', 'foo'] (the HTML tag is included in
          the tokenization). So this will show up in the tokenization
          during this test. Not all the analyzers do this, which is
          why it can be turned off.

        Why can't we fix the analyzers to strip out that HTML, and not
        generate spurious tokens? That could probably be done, but it
        probably isn't worthwhile because:

        * ES will weight common words lower, thanks to its TF-IDF
          algorithms, which judge words based on how often they
          appear in the entire corpus and in the document, so the p
          tokens will be largely ignored.
        * The pre-l10n search code did it this way, so it doesn't
          break search.
        * When implementing l10n search, I wanted to minimize the
          number of changes needed, and this seemed like an unneeded
          change.
        """

        search = es_utils.Sphilastic(DocumentMappingType)
        search = search.filter(document_locale=locale)
        # The facet gets the same locale filter as the search itself, so
        # only terms from this locale's documents are counted.
        facet_filter = search._process_filters([('document_locale', locale)])
        search = search.facet_raw(tokens={
            'terms': {'field': 'document_content'},
            'facet_filter': facet_filter,
        })
        facets = search.facet_counts()

        expected = set(expected_tokens)
        if p_tag:
            # Since `expected` is a set, there is no problem adding this
            # twice, since duplicates will be ignored.
            expected.add(u'p')
        actual = set(t['term'] for t in facets['tokens'])
        eq_(actual, expected)

    # These 5 languages were chosen for tokenization testing because
    # they represent the 5 kinds of languages we have: English, Snowball
    # supported languages, ES supported languages, Languages with custom
    # analyzers, and languages with no analyzer, which use the standard
    # analyzer.

    def test_english_tokenization(self):
        """Test that English stemming and stop words work."""
        self._check_locale_tokenization('en-US', ['i', 'have', 'cat'])

    def test_spanish_tokenization(self):
        """Test that Spanish stemming and stop words work."""
        self._check_locale_tokenization('es', ['tien', 'un', 'gat'])

    def test_arabic_tokenization(self):
        """Test that Arabic stemming works.

        I don't read Arabic, this is just what ES gave me when I asked
        it to analyze an Arabic text as Arabic. If someone who reads
        Arabic can improve this test, go for it!
        """
        self._check_locale_tokenization('ar', [u'لد', u'اثن', u'قطط'])

    def test_burmese_tokenization(self):
        """Test that the shingle analyzer is active for Burmese."""
        tokens = [u'အန', u'နု', u'ုပ', u'ပည', u'ညာ']
        # No "p" token is expected for this analyzer, hence p_tag=False.
        self._check_locale_tokenization('my', tokens, False)

    def test_herbrew_tokenization(self):
        """Test that Hebrew uses the standard analyzer."""
        tokens = [u'גאולוגיה', u'היא', u'אחד']
        self._check_locale_tokenization('he', tokens)
0 commit comments