Fix unicode tokenizer (#447)

neulab · Jun 29, 2018 · 6274a12
1 parent 316c72c
commit 6274a12
Showing 1 changed file with 2 additions and 4 deletions.
diff --git a/xnmt/preproc.py b/xnmt/preproc.py
@@ -177,9 +177,7 @@ def tokenize(self, sent: str) -> str:
 
   @staticmethod
   def _is_weird(c):
-    return not (unicodedata.category(c)[0] == 'L'
-                or unicodedata.category(c)[0] == 'N'
-                or c.isspace())
+    return not (unicodedata.category(c)[0] in 'LMN' or c.isspace())
 
 class ExternalTokenizer(Tokenizer, Serializable):
   """
@@ -529,4 +527,4 @@ def extract_to(self, in_file, out_file):
         for features, db_item in zip(data, db_by_speaker[speaker_id]):
           features = normalize(features, mean, std)
           hf.create_dataset(str(db_item["index"]), data=features)
-    logger.debug(f"feature extraction took {time.time()-start_time:.3f} seconds")
+    logger.debug(f"feature extraction took {time.time()-start_time:.3f} seconds")