Add get_logits method and NLLB tokenizer #756

Merged · 7 commits · Dec 9, 2023
Changes from 4 commits
12 changes: 9 additions & 3 deletions src/open_clip/factory.py
@@ -18,8 +18,7 @@
from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
list_pretrained_tags_by_model, download_pretrained_from_hf
from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
from .tokenizer import HFTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH

from .tokenizer import HFTokenizer, NLLBTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH

HF_HUB_PREFIX = 'hf-hub:'
_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
@@ -111,11 +110,18 @@ def get_tokenizer(
        context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)

    if 'hf_tokenizer_name' in text_config:
        tokenizer = HFTokenizer(
        if model_name.startswith("nllb"):
            tokenizer = NLLBTokenizer(
Collaborator: really not a fan of having a model name based hack

                text_config['hf_tokenizer_name'],
                context_length=context_length,
                **tokenizer_kwargs,
            )
        else:
            tokenizer = HFTokenizer(
                text_config['hf_tokenizer_name'],
                context_length=context_length,
                **tokenizer_kwargs,
            )
    else:
        tokenizer = SimpleTokenizer(
            context_length=context_length,
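For context on the dispatch above: the tokenizer class is chosen purely from the model name prefix, which is what the review comment objects to. A minimal usage sketch, assuming an NLLB-CLIP config such as 'nllb-clip-base' is registered (the exact config names are an assumption, not part of this diff):

import open_clip

# Any config whose name starts with "nllb" and sets hf_tokenizer_name would
# take the NLLBTokenizer branch; other configs keep the existing behavior.
nllb_tok = open_clip.get_tokenizer('nllb-clip-base')  # hypothetical NLLB-CLIP config -> NLLBTokenizer
clip_tok = open_clip.get_tokenizer('ViT-B-32')        # no hf_tokenizer_name -> SimpleTokenizer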
20 changes: 19 additions & 1 deletion src/open_clip/model.py
@@ -285,6 +285,15 @@ def encode_text(self, text, normalize: bool = False):

        return F.normalize(x, dim=-1) if normalize else x

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=False)
        text_features = self.encode_text(text, normalize=False)
        image_logits = self.logit_scale * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

Collaborator: This should be

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=True)
        text_features = self.encode_text(text, normalize=True)
        image_logits = self.logit_scale.exp() * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

Contributor Author: My bad. Fixed.
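As background for the fix above: open_clip stores the temperature as a log-space parameter, so the raw attribute must be exponentiated before scaling the similarities. A short sketch of the usual initialization (not part of this diff):

import numpy as np
import torch
from torch import nn

# logit_scale holds log(1 / temperature), so cosine similarities are scaled
# by logit_scale.exp() — which is why the suggestion above uses `.exp()`.
logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
print(logit_scale.exp())  # ~14.29, the effective multiplier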

    def forward(
            self,
            image: Optional[torch.Tensor] = None,

@@ -354,6 +363,15 @@ def encode_text(self, text, normalize: bool = False):
        features = self.text(text)
        return F.normalize(features, dim=-1) if normalize else features

    def get_logits(self, image, text):
        image_features = self.encode_image(image, normalize=False)
        text_features = self.encode_text(text, normalize=False)
        image_logits = self.logit_scale * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

    def forward(
            self,
            image: Optional[torch.Tensor] = None,

@@ -603,4 +621,4 @@ def get_model_tokenize_cfg(model):
    vocab_size = getattr(module, 'vocab_size', None)
    if vocab_size is not None:
        cfg['vocab_size'] = vocab_size
    return cfg
    return cfg
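With the corrected get_logits (normalized features, exponentiated scale), the returned image logits are temperature-scaled cosine similarities that can be softmaxed directly. A usage sketch following the standard open_clip loading pattern; the model name, pretrained tag, and image path are illustrative, not part of this PR:

import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')
model.eval()

image = preprocess(Image.open("cat.png")).unsqueeze(0)      # (1, 3, 224, 224)
text = tokenizer(["a photo of a cat", "a photo of a dog"])  # (2, 77)

with torch.no_grad():
    image_logits, text_logits = model.get_logits(image, text)  # (1, 2) and (2, 1)
    probs = image_logits.softmax(dim=-1)  # probabilities over the two captions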
64 changes: 64 additions & 0 deletions src/open_clip/tokenizer.py
@@ -495,3 +495,67 @@ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] =
            truncation=True,
        )
        return output.input_ids


class NLLBTokenizer:
    """HuggingFace tokenizer wrapper for NLLB models"""

    def __init__(
            self,
            tokenizer_name: str,
            context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
            clean: str = "whitespace",
    ):
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)

    def save_pretrained(self, dest):
        self.tokenizer.save_pretrained(dest)

    def __call__(
            self,
            texts: Union[str, List[str]],
            langs: Union[str, List[str], None],
            context_length: Optional[int] = None,
    ) -> torch.Tensor:
        import warnings

        if isinstance(texts, str):
            texts = [texts]

        context_length = context_length or self.context_length
        assert (
            context_length
        ), "Please set a valid context length in class init or call."

        # same cleaning as for default tokenizer, except lowercasing
        # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
        texts = [self.clean_fn(text) for text in texts]
        if langs is None:
            warnings.warn("No languages provided, assuming all texts are in English.")
            input_ids = self.tokenizer.batch_encode_plus(
                texts,
                return_tensors="pt",
                max_length=context_length,
                padding="max_length",
                truncation=True,
            ).input_ids
        else:
            assert len(texts) == len(langs), "Please provide a language for each text."
            text_input_ids = []
            for i, text in enumerate(texts):
                self.tokenizer.set_src_lang_special_tokens(langs[i])
                text_input_ids.append(
                    self.tokenizer.batch_encode_plus(
                        [text],
                        return_tensors="pt",
                        max_length=context_length,
                        padding="max_length",
                        truncation=True,
                    ).input_ids
                )
            input_ids = torch.stack(text_input_ids).squeeze()
        return input_ids
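For illustration, a minimal sketch of calling the tokenizer with per-text language codes; the checkpoint name and FLORES-200 codes are assumptions, not taken from this PR:

from open_clip.tokenizer import NLLBTokenizer

# Illustrative NLLB checkpoint; any HF tokenizer exposing set_src_lang_special_tokens works.
tokenizer = NLLBTokenizer("facebook/nllb-200-distilled-600M", context_length=77)

texts = ["a photo of a cat", "une photo d'un chat"]
langs = ["eng_Latn", "fra_Latn"]      # FLORES-200 language codes, one per text

input_ids = tokenizer(texts, langs)   # tensor of shape (2, 77)

# With langs=None a warning is emitted and all texts are encoded as English.
input_ids_en = tokenizer(texts, langs=None)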