Skip to content

Commit

Permalink
Further speedup by not using re
Browse files — browse the repository at this point in the history
  • Loading branch information
m3at authored and rwightman committed May 9, 2024
1 parent 4b7ac9c commit 2e8de83
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/open_clip/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
_nltk_init = False

DEFAULT_CONTEXT_LENGTH = 77 # default context length for OpenAI CLIP
RE_DEDUPLICATE_WHITESPACE = re.compile(r"\s+")


@lru_cache()
Expand Down Expand Up @@ -71,7 +70,7 @@ def basic_clean(text):


def whitespace_clean(text):
text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text)
text = " ".join(text.split())
text = text.strip()
return text

Expand Down Expand Up @@ -127,7 +126,7 @@ def canonicalize_text(
else:
text = text.translate(trans_punctuation)
text = text.lower()
text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text)
text = " ".join(text.split())
return text.strip()


Expand Down Expand Up @@ -191,7 +190,7 @@ def bpe(self, token):
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except Exception:
new_word.extend(word[i:])
break

Expand Down

0 comments on commit 2e8de83

Please sign in to comment.