## Setting up the environment

In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM

In [8]:
def dump_object(obj):
    for attribute in dir(obj):
        if not attribute.startswith("__") and not attribute.startswith("_"): # Avoid special methods
            try:
                value = getattr(obj, attribute)
                if not callable(value):                
                    print(f"{attribute}: {value}")
            except AttributeError:
                print(f"{attribute}: <not accessible>") # Handle potential errors

In [9]:
embeddings_model_path = "ibm-granite/granite-embedding-30m-english"
sparse_embeddings_model_path = "ibm-granite/granite-embedding-30m-sparse"

In [10]:
embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model_path)
sparse_embeddings_tokenizer = AutoTokenizer.from_pretrained(sparse_embeddings_model_path)

In [11]:
config = config = AutoConfig.from_pretrained(embeddings_model_path)
sparse_config = AutoConfig.from_pretrained(sparse_embeddings_model_path)

In [12]:
dump_object(config)

add_cross_attention: False
architectures: ['RobertaModel']
attention_probs_dropout_prob: 0.1
attribute_map: {}
bad_words_ids: None
base_config_key: 
base_model_pp_plan: None
base_model_tp_plan: None
begin_suppress_tokens: None
bos_token_id: 0
chunk_size_feed_forward: 0
classifier_dropout: None
cross_attention_hidden_size: None
decoder_start_token_id: None
diversity_penalty: 0.0
do_sample: False
early_stopping: False
encoder_no_repeat_ngram_size: 0
eos_token_id: 2
exponential_decay_length_penalty: None
finetuning_task: None
forced_bos_token_id: None
forced_eos_token_id: None
hidden_act: gelu
hidden_dropout_prob: 0.1
hidden_size: 384
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
initializer_range: 0.02
intermediate_size: 1536
is_composition: False
is_decoder: False
is_encoder_decoder: False
label2id: {'LABEL_0': 0, 'LABEL_1': 1}
layer_norm_eps: 1e-12
length_penalty: 1.0
max_length: 20
max_position_embeddings: 514
min_length: 0
model_type: roberta
name_or_path: ibm-granite/granite-embedding-30m-

In [13]:
dump_object(sparse_config)

add_cross_attention: False
architectures: ['RobertaForMaskedLM']
attention_probs_dropout_prob: 0.1
attribute_map: {}
bad_words_ids: None
base_config_key: 
base_model_pp_plan: None
base_model_tp_plan: None
begin_suppress_tokens: None
bos_token_id: 0
chunk_size_feed_forward: 0
classifier_dropout: None
cross_attention_hidden_size: None
decoder_start_token_id: None
diversity_penalty: 0.0
do_sample: False
early_stopping: False
encoder_no_repeat_ngram_size: 0
eos_token_id: 2
exponential_decay_length_penalty: None
finetuning_task: None
forced_bos_token_id: None
forced_eos_token_id: None
hidden_act: gelu
hidden_dropout_prob: 0.1
hidden_size: 384
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
initializer_range: 0.02
intermediate_size: 1536
is_composition: False
is_decoder: False
is_encoder_decoder: False
label2id: {'LABEL_0': 0, 'LABEL_1': 1}
layer_norm_eps: 1e-12
length_penalty: 1.0
max_length: 20
max_position_embeddings: 514
min_length: 0
model_type: roberta
name_or_path: ibm-granite/granite-embeddin

In [14]:
model = AutoModel.from_pretrained(embeddings_model_path, config=config)

In [15]:
sparse_model = AutoModelForMaskedLM.from_pretrained(sparse_embeddings_model_path, config=config)

In [16]:
text = "This is a sample sentence."
tokenized_input = sparse_embeddings_tokenizer(text, return_tensors="pt") #  or tokenizer(text)
print(tokenized_input)

{'input_ids': tensor([[   0,  713,   16,   10, 7728, 3645,    4,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [17]:
output = model(**tokenized_input)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0917,  0.6270,  0.2908,  ...,  0.8090, -7.7908,  0.4029],
         [-0.9681,  0.4096,  1.1218,  ...,  1.9385, -8.3621, -1.2195],
         [-0.6250,  0.4209,  1.2987,  ...,  2.0155, -8.8782, -1.3207],
         ...,
         [-0.7753,  0.2901,  1.3131,  ...,  1.8260, -7.9409, -1.2732],
         [-0.7346,  0.1128,  0.9503,  ...,  1.8745, -8.2791, -1.2013],
         [-1.1832,  0.1630,  0.9972,  ...,  1.6418, -8.7131, -1.3595]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.0740, -0.3853, -0.3833,  0.8431,  0.8284, -0.5442,  0.0567, -0.7814,
         -0.0306, -0.7020, -0.5249,  0.8708,  0.2952,  0.2342, -0.1667, -0.3459,
          0.9935,  0.2066,  0.4474, -0.2957,  0.6336, -0.9835, -0.0151,  0.6406,
          0.3330, -0.7784,  0.4974, -0.1368,  0.8830,  0.9333, -0.8930, -0.5642,
         -0.9814, -0.4812, -0.2500,  0.5129,  0.2521,  0.2317,  0.2613, -0.5998,
         -0.8011,  0.3829,  0.39

In [18]:
type(output)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [19]:
dump_object(output)

attentions: None
cross_attentions: None
hidden_states: None
last_hidden_state: tensor([[[ 0.0917,  0.6270,  0.2908,  ...,  0.8090, -7.7908,  0.4029],
         [-0.9681,  0.4096,  1.1218,  ...,  1.9385, -8.3621, -1.2195],
         [-0.6250,  0.4209,  1.2987,  ...,  2.0155, -8.8782, -1.3207],
         ...,
         [-0.7753,  0.2901,  1.3131,  ...,  1.8260, -7.9409, -1.2732],
         [-0.7346,  0.1128,  0.9503,  ...,  1.8745, -8.2791, -1.2013],
         [-1.1832,  0.1630,  0.9972,  ...,  1.6418, -8.7131, -1.3595]]],
       grad_fn=<NativeLayerNormBackward0>)
past_key_values: None
pooler_output: tensor([[ 0.0740, -0.3853, -0.3833,  0.8431,  0.8284, -0.5442,  0.0567, -0.7814,
         -0.0306, -0.7020, -0.5249,  0.8708,  0.2952,  0.2342, -0.1667, -0.3459,
          0.9935,  0.2066,  0.4474, -0.2957,  0.6336, -0.9835, -0.0151,  0.6406,
          0.3330, -0.7784,  0.4974, -0.1368,  0.8830,  0.9333, -0.8930, -0.5642,
         -0.9814, -0.4812, -0.2500,  0.5129,  0.2521,  0.2317,  0.2613, -0.

from HF Transformers: [transformers/src/transformers modeling_outputs.py](https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py#L70)
```
class BaseModelOutputWithPooling(ModelOutput):
...
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Last layer hidden-state of the first token of the sequence (classification token) after further processing
    through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
    the classification token after processing through a linear layer and a tanh activation function (The tanh
    function outputs values in the range of -1 to +1). 
    The linear layer weights are trained from the next sentence prediction (classification) objective during
    pretraining.
```

Understanding shapes from PyTorch: 
- [Tensor Shapes](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html#tensor-shapes)
- [torch.Tensor.shape](https://pytorch.org/docs/stable/generated/torch.Tensor.shape.html)

In [20]:
# ([rows, num_rows (matches input tokens), columns (sequence_length)])
# contains: logits (un-normalized values)
output.last_hidden_state.shape

torch.Size([1, 8, 384])

In [21]:
# ([rows, sequence_length])
# containst: embeddings (normlized, using tanh fx.)
output.pooler_output.shape

torch.Size([1, 384])