Add get_logits method and NLLB tokenizer #756

Merged · 7 commits · Dec 9, 2023
Changes from 4 commits
12 changes: 9 additions & 3 deletions src/open_clip/factory.py
@@ -18,8 +18,7 @@
from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
list_pretrained_tags_by_model, download_pretrained_from_hf
from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
from .tokenizer import HFTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH

from .tokenizer import HFTokenizer, NLLBTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH

HF_HUB_PREFIX = 'hf-hub:'
_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
@@ -111,11 +110,18 @@ def get_tokenizer(
        context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)

    if 'hf_tokenizer_name' in text_config:
        tokenizer = HFTokenizer(
        if model_name.startswith("nllb"):
            tokenizer = NLLBTokenizer(
Collaborator: really not a fan of having a model name based hack

                text_config['hf_tokenizer_name'],
                context_length=context_length,
                **tokenizer_kwargs,
            )
        else:
            tokenizer = HFTokenizer(
                text_config['hf_tokenizer_name'],
                context_length=context_length,
                **tokenizer_kwargs,
            )
    else:
        tokenizer = SimpleTokenizer(
            context_length=context_length,
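For context on the dispatch above: the tokenizer class is chosen purely from the model name prefix, which is what the review comment objects to. A minimal usage sketch, assuming an NLLB-CLIP config such as 'nllb-clip-base' is registered (the exact config names are an assumption, not part of this diff):

import open_clip

# Any config whose name starts with "nllb" and sets hf_tokenizer_name would
# take the NLLBTokenizer branch; other configs keep the existing behavior.
nllb_tok = open_clip.get_tokenizer('nllb-clip-base')  # hypothetical NLLB-CLIP config -> NLLBTokenizer
clip_tok = open_clip.get_tokenizer('ViT-B-32')        # no hf_tokenizer_name -> SimpleTokenizer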
20 changes: 19 additions & 1 deletion src/open_clip/model.py
@@ -285,6 +285,15 @@ def encode_text(self, text, normalize: bool = False):

        return F.normalize(x, dim=-1) if normalize else x

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=False)
        text_features = self.encode_text(text, normalize=False)
        image_logits = self.logit_scale * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

Collaborator: This should be

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=True)
        text_features = self.encode_text(text, normalize=True)
        image_logits = self.logit_scale.exp() * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

Contributor Author: My bad. Fixed.
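As background for the fix above: open_clip stores the temperature as a log-space parameter, so the raw attribute must be exponentiated before scaling the similarities. A short sketch of the usual initialization (not part of this diff):

import numpy as np
import torch
from torch import nn

# logit_scale holds log(1 / temperature), so cosine similarities are scaled
# by logit_scale.exp() — which is why the suggestion above uses `.exp()`.
logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
print(logit_scale.exp())  # ~14.29, the effective multiplier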

    def forward(
            self,
            image: Optional[torch.Tensor] = None,

@@ -354,6 +363,15 @@ def encode_text(self, text, normalize: bool = False):
        features = self.text(text)
        return F.normalize(features, dim=-1) if normalize else features

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=False)
        text_features = self.encode_text(text, normalize=False)
        image_logits = self.logit_scale * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

    def forward(
            self,
            image: Optional[torch.Tensor] = None,

@@ -603,4 +621,4 @@ def get_model_tokenize_cfg(model):
    vocab_size = getattr(module, 'vocab_size', None)
    if vocab_size is not None:
        cfg['vocab_size'] = vocab_size
    return cfg
    return cfg
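With the corrected get_logits (normalized features, exponentiated scale), the returned image logits are temperature-scaled cosine similarities that can be softmaxed directly. A usage sketch following the standard open_clip loading pattern; the model name, pretrained tag, and image path are illustrative, not part of this PR:

import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')
model.eval()

image = preprocess(Image.open("cat.png")).unsqueeze(0)      # (1, 3, 224, 224)
text = tokenizer(["a photo of a cat", "a photo of a dog"])  # (2, 77)

with torch.no_grad():
    image_logits, text_logits = model.get_logits(image, text)  # (1, 2) and (2, 1)
    probs = image_logits.softmax(dim=-1)  # probabilities over the two captions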
64 changes: 64 additions & 0 deletions src/open_clip/tokenizer.py
@@ -495,3 +495,67 @@ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] =
            truncation=True,
        )
        return output.input_ids


class NLLBTokenizer:
    """HuggingFace tokenizer wrapper for NLLB models"""

    def __init__(
            self,
            tokenizer_name: str,
            context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
            clean: str = "whitespace",
    ):
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)

    def save_pretrained(self, dest):
        self.tokenizer.save_pretrained(dest)

    def __call__(
            self,
            texts: Union[str, List[str]],
            langs: Union[str, List[str], None],
            context_length: Optional[int] = None,
    ) -> torch.Tensor:
        import warnings

        if isinstance(texts, str):
            texts = [texts]

        context_length = context_length or self.context_length
        assert (
            context_length
        ), "Please set a valid context length in class init or call."

        # same cleaning as for default tokenizer, except lowercasing
        # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
        texts = [self.clean_fn(text) for text in texts]
        if langs is None:
            warnings.warn("No languages provided, assuming all texts are in English.")
            input_ids = self.tokenizer.batch_encode_plus(
                texts,
                return_tensors="pt",
                max_length=context_length,
                padding="max_length",
                truncation=True,
            ).input_ids
        else:
            assert len(texts) == len(langs), "Please provide a language for each text."
            text_input_ids = []
            for i, text in enumerate(texts):
                self.tokenizer.set_src_lang_special_tokens(langs[i])
                text_input_ids.append(
                    self.tokenizer.batch_encode_plus(
                        [text],
                        return_tensors="pt",
                        max_length=context_length,
                        padding="max_length",
                        truncation=True,
                    ).input_ids
                )
            input_ids = torch.stack(text_input_ids).squeeze()
        return input_ids
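For illustration, a minimal sketch of calling the tokenizer with per-text language codes; the checkpoint name and FLORES-200 codes are assumptions, not taken from this PR:

from open_clip.tokenizer import NLLBTokenizer

# Illustrative NLLB checkpoint; any HF tokenizer exposing set_src_lang_special_tokens works.
tokenizer = NLLBTokenizer("facebook/nllb-200-distilled-600M", context_length=77)

texts = ["a photo of a cat", "une photo d'un chat"]
langs = ["eng_Latn", "fra_Latn"]      # FLORES-200 language codes, one per text

input_ids = tokenizer(texts, langs)   # tensor of shape (2, 77)

# With langs=None a warning is emitted and all texts are encoded as English.
input_ids_en = tokenizer(texts, langs=None)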