Skip to content

Commit

Permalink
Fix unicode tokenizer (#447)
Browse files (browse the repository at this point in the history)
  • Loading branch information
neubig authored and msperber committed Jun 29, 2018
1 parent 316c72c commit 6274a12
Showing 1 changed file with 2 additions and 4 deletions.
6 changes: 2 additions & 4 deletions xnmt/preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ def tokenize(self, sent: str) -> str:

@staticmethod
def _is_weird(c):
return not (unicodedata.category(c)[0] == 'L'
or unicodedata.category(c)[0] == 'N'
or c.isspace())
return not (unicodedata.category(c)[0] in 'LMN' or c.isspace())

class ExternalTokenizer(Tokenizer, Serializable):
"""
Expand Down Expand Up @@ -529,4 +527,4 @@ def extract_to(self, in_file, out_file):
for features, db_item in zip(data, db_by_speaker[speaker_id]):
features = normalize(features, mean, std)
hf.create_dataset(str(db_item["index"]), data=features)
logger.debug(f"feature extraction took {time.time()-start_time:.3f} seconds")
logger.debug(f"feature extraction took {time.time()-start_time:.3f} seconds")

0 comments on commit 6274a12

Please sign in to comment.