## Setting up the environment

In [45]:
import json
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM

In [75]:
pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                           Version
--------------------------------- --------------
aiofiles                          24.1.0
aiohappyeyeballs                  2.4.8
aiohttp                           3.11.13
aiosignal                         1.3.2
airportsdata                      20250224
annotated-types                   0.7.0
anyio                             4.8.0
appnope                           0.1.4
argon2-cffi                       23.1.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
astor                             0.8.1
asttokens                         3.0.0
async-lru                         2.0.4
attrs                             25.1.0
babel                             2.17.0
beautifulsoup4                    4.13.3
blake3                            1.0.4
bleach                            6.2.0
certifi                           2024.8.30
cffi                              1.17.1
charset-normalizer                3.4.0
click        

In [54]:
def dump_object(obj):
    for attribute in dir(obj):
        if not attribute.startswith("__"): # Avoid special methods
            try:
                value = getattr(obj, attribute)
                print(f"{attribute}: {value}")
            except AttributeError:
                print(f"{attribute}: <not accessible>") # Handle potential errors

In [62]:
embeddings_model_path = "ibm-granite/granite-embedding-30m-english"
sparse_embeddings_model_path = "ibm-granite/granite-embedding-30m-sparse"

In [63]:
embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model_path)
sparse_embeddings_tokenizer = AutoTokenizer.from_pretrained(sparse_embeddings_model_path)

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [64]:
config = config = AutoConfig.from_pretrained(embeddings_model_path)
sparse_config = AutoConfig.from_pretrained(sparse_embeddings_model_path)

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

In [60]:
dump_obj(config)

return_dict: True
output_hidden_states: False
output_attentions: False
torchscript: False
torch_dtype: torch.bfloat16
use_bfloat16: False
tf_legacy_loss: False
pruned_heads: {}
tie_word_embeddings: True
chunk_size_feed_forward: 0
is_encoder_decoder: False
is_decoder: False
cross_attention_hidden_size: None
add_cross_attention: False
tie_encoder_decoder: False
max_length: 20
min_length: 0
do_sample: False
early_stopping: False
num_beams: 1
num_beam_groups: 1
diversity_penalty: 0.0
temperature: 1.0
top_k: 50
top_p: 1.0
typical_p: 1.0
repetition_penalty: 1.0
length_penalty: 1.0
no_repeat_ngram_size: 0
encoder_no_repeat_ngram_size: 0
bad_words_ids: None
num_return_sequences: 1
output_scores: False
return_dict_in_generate: False
forced_bos_token_id: None
forced_eos_token_id: None
remove_invalid_values: False
exponential_decay_length_penalty: None
suppress_tokens: None
begin_suppress_tokens: None
architectures: ['RobertaModel']
finetuning_task: None
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
lab

In [65]:
dump_obj(sparse_config)

return_dict: True
output_hidden_states: False
output_attentions: False
torchscript: False
torch_dtype: torch.bfloat16
use_bfloat16: False
tf_legacy_loss: False
pruned_heads: {}
tie_word_embeddings: True
chunk_size_feed_forward: 0
is_encoder_decoder: False
is_decoder: False
cross_attention_hidden_size: None
add_cross_attention: False
tie_encoder_decoder: False
max_length: 20
min_length: 0
do_sample: False
early_stopping: False
num_beams: 1
num_beam_groups: 1
diversity_penalty: 0.0
temperature: 1.0
top_k: 50
top_p: 1.0
typical_p: 1.0
repetition_penalty: 1.0
length_penalty: 1.0
no_repeat_ngram_size: 0
encoder_no_repeat_ngram_size: 0
bad_words_ids: None
num_return_sequences: 1
output_scores: False
return_dict_in_generate: False
forced_bos_token_id: None
forced_eos_token_id: None
remove_invalid_values: False
exponential_decay_length_penalty: None
suppress_tokens: None
begin_suppress_tokens: None
architectures: ['RobertaForMaskedLM']
finetuning_task: None
id2label: {0: 'LABEL_0', 1: 'LABEL_1

In [66]:
model = AutoModel.from_pretrained(embeddings_model_path, config=config)

In [67]:
sparse_model = AutoModelForMaskedLM.from_pretrained(sparse_embeddings_model_path, config=config)

model.safetensors:   0%|          | 0.00/60.7M [00:00<?, ?B/s]

In [68]:
text = "This is a sample sentence."
tokenized_input = sparse_embeddings_tokenizer(text, return_tensors="pt") #  or tokenizer(text)
print(tokenized_input)

{'input_ids': tensor([[   0,  713,   16,   10, 7728, 3645,    4,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [69]:
output = model(**tokenized_input)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0917,  0.6270,  0.2908,  ...,  0.8090, -7.7908,  0.4029],
         [-0.9681,  0.4096,  1.1218,  ...,  1.9385, -8.3621, -1.2195],
         [-0.6250,  0.4209,  1.2987,  ...,  2.0155, -8.8782, -1.3207],
         ...,
         [-0.7753,  0.2901,  1.3131,  ...,  1.8260, -7.9409, -1.2732],
         [-0.7346,  0.1128,  0.9503,  ...,  1.8745, -8.2791, -1.2013],
         [-1.1832,  0.1630,  0.9972,  ...,  1.6418, -8.7131, -1.3595]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.0740, -0.3853, -0.3833,  0.8431,  0.8284, -0.5442,  0.0567, -0.7814,
         -0.0306, -0.7020, -0.5249,  0.8708,  0.2952,  0.2342, -0.1667, -0.3459,
          0.9935,  0.2066,  0.4474, -0.2957,  0.6336, -0.9835, -0.0151,  0.6406,
          0.3330, -0.7784,  0.4974, -0.1368,  0.8830,  0.9333, -0.8930, -0.5642,
         -0.9814, -0.4812, -0.2500,  0.5129,  0.2521,  0.2317,  0.2613, -0.5998,
         -0.8011,  0.3829,  0.39

In [76]:
type(output)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [72]:
dump_obj(output)

last_hidden_state: tensor([[[ 0.0917,  0.6270,  0.2908,  ...,  0.8090, -7.7908,  0.4029],
         [-0.9681,  0.4096,  1.1218,  ...,  1.9385, -8.3621, -1.2195],
         [-0.6250,  0.4209,  1.2987,  ...,  2.0155, -8.8782, -1.3207],
         ...,
         [-0.7753,  0.2901,  1.3131,  ...,  1.8260, -7.9409, -1.2732],
         [-0.7346,  0.1128,  0.9503,  ...,  1.8745, -8.2791, -1.2013],
         [-1.1832,  0.1630,  0.9972,  ...,  1.6418, -8.7131, -1.3595]]],
       grad_fn=<NativeLayerNormBackward0>)
pooler_output: tensor([[ 0.0740, -0.3853, -0.3833,  0.8431,  0.8284, -0.5442,  0.0567, -0.7814,
         -0.0306, -0.7020, -0.5249,  0.8708,  0.2952,  0.2342, -0.1667, -0.3459,
          0.9935,  0.2066,  0.4474, -0.2957,  0.6336, -0.9835, -0.0151,  0.6406,
          0.3330, -0.7784,  0.4974, -0.1368,  0.8830,  0.9333, -0.8930, -0.5642,
         -0.9814, -0.4812, -0.2500,  0.5129,  0.2521,  0.2317,  0.2613, -0.5998,
         -0.8011,  0.3829,  0.3943,  0.6421, -0.0899,  0.7475, -0.1785, -0.0

from HF Transformers: [transformers/src/transformers modeling_outputs.py](https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py#L70)
```
class BaseModelOutputWithPooling(ModelOutput):
...
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Last layer hidden-state of the first token of the sequence (classification token) after further processing
    through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
    the classification token after processing through a linear layer and a tanh activation function (The tanh
    function outputs values in the range of -1 to +1). 
    The linear layer weights are trained from the next sentence prediction (classification) objective during
    pretraining.
```

Understanding shapes from PyTorch: 
- [Tensor Shapes](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html#tensor-shapes)
- [torch.Tensor.shape](https://pytorch.org/docs/stable/generated/torch.Tensor.shape.html)

In [91]:
# ([rows, num_rows (matches input tokens), columns (sequence_length)])
# contains: logits (un-normalized values)
output.last_hidden_state.shape

torch.Size([1, 8, 384])

In [92]:
# ([rows, sequence_length])
# containst: embeddings (normlized, using tanh fx.)
output.pooler_output.shape

torch.Size([1, 384])