Skip to content

Commit

Permalink
Further speedup by not using re
Browse files — browse the repository at this point in the history
  • Loading branch information
m3at authored and rwightman committed May 9, 2024
1 parent 4b7ac9c commit 2e8de83
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/open_clip/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
_nltk_init = False

DEFAULT_CONTEXT_LENGTH = 77 # default context length for OpenAI CLIP
RE_DEDUPLICATE_WHITESPACE = re.compile(r"\s+")


@lru_cache()
Expand Down Expand Up @@ -71,7 +70,7 @@ def basic_clean(text):


def whitespace_clean(text):
text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text)
text = " ".join(text.split())
text = text.strip()
return text

Expand Down Expand Up @@ -127,7 +126,7 @@ def canonicalize_text(
else:
text = text.translate(trans_punctuation)
text = text.lower()
text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text)
text = " ".join(text.split())
return text.strip()


Expand Down Expand Up @@ -191,7 +190,7 @@ def bpe(self, token):
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except Exception:
new_word.extend(word[i:])
break

Expand Down

0 comments on commit 2e8de83

Please sign in to comment.