## Setting up the environment

In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM

In [2]:
pip list

Package                           Version
--------------------------------- --------------
aiofiles                          24.1.0
aiohappyeyeballs                  2.4.8
aiohttp                           3.11.13
aiosignal                         1.3.2
airportsdata                      20250224
annotated-types                   0.7.0
anyio                             4.8.0
appnope                           0.1.4
argon2-cffi                       23.1.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
astor                             0.8.1
asttokens                         3.0.0
async-lru                         2.0.4
attrs                             25.1.0
babel                             2.17.0
beautifulsoup4                    4.13.3
blake3                            1.0.4
bleach                            6.2.0
certifi                           2024.8.30
cffi                              1.17.1
charset-normalizer                3.4.0
click        

In [18]:
def dump_object(obj):
    """Print the public, non-callable attributes of ``obj``, one per line.

    Each attribute name returned by ``dir(obj)`` that does not start with an
    underscore is looked up with ``getattr``. Callables (methods, functions)
    are skipped; everything else is printed as ``name: value``. If reading or
    printing an attribute raises ``AttributeError``, the line
    ``name: <not accessible>`` is printed instead.

    Args:
        obj: Any object to inspect.

    Returns:
        None. The result is written to standard output.
    """
    for attribute in dir(obj):
        # A single leading underscore also covers dunder names.
        if attribute.startswith("_"):
            continue
        try:
            value = getattr(obj, attribute)
            if callable(value):
                continue
            print(f"{attribute}: {value}")
        except AttributeError:
            # Some attributes are listed by dir() but fail on access.
            print(f"{attribute}: <not accessible>")

In [10]:
# Hugging Face Hub ids of the two Granite embedding checkpoints explored below.
embeddings_model_path, sparse_embeddings_model_path = (
    "ibm-granite/granite-embedding-30m-english",
    "ibm-granite/granite-embedding-30m-sparse",
)

In [11]:
# One tokenizer per checkpoint, each loaded from the same id as its model.
embeddings_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=embeddings_model_path
)
sparse_embeddings_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=sparse_embeddings_model_path
)

In [12]:
# Load the configuration of each checkpoint so the two can be compared.
config = AutoConfig.from_pretrained(embeddings_model_path)
sparse_config = AutoConfig.from_pretrained(sparse_embeddings_model_path)

In [19]:
# Print every public, non-callable attribute of the (non-sparse) embeddings model configuration.
dump_object(config)

add_cross_attention: False
architectures: ['RobertaModel']
attention_probs_dropout_prob: 0.1
attribute_map: {}
bad_words_ids: None
base_config_key: 
base_model_pp_plan: None
base_model_tp_plan: None
begin_suppress_tokens: None
bos_token_id: 0
chunk_size_feed_forward: 0
classifier_dropout: None
cross_attention_hidden_size: None
decoder_start_token_id: None
diversity_penalty: 0.0
do_sample: False
early_stopping: False
encoder_no_repeat_ngram_size: 0
eos_token_id: 2
exponential_decay_length_penalty: None
finetuning_task: None
forced_bos_token_id: None
forced_eos_token_id: None
hidden_act: gelu
hidden_dropout_prob: 0.1
hidden_size: 384
id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
initializer_range: 0.02
intermediate_size: 1536
is_composition: False
is_decoder: False
is_encoder_decoder: False
label2id: {'LABEL_0': 0, 'LABEL_1': 1}
layer_norm_eps: 1e-12
length_penalty: 1.0
max_length: 20
max_position_embeddings: 514
min_length: 0
model_type: roberta
name_or_path: ibm-granite/granite-embedding-30m-

In [14]:
# Print every public, non-callable attribute of the sparse embeddings model configuration.
dump_object(sparse_config)

NameError: name 'dump_obj' is not defined

In [None]:
# Load the embeddings model as a bare encoder (AutoModel), using the configuration loaded for it.
model = AutoModel.from_pretrained(
    pretrained_model_name_or_path=embeddings_model_path,
    config=config,
)

In [None]:
# Load the sparse checkpoint with a masked-LM head. It must be given its own
# configuration (sparse_config), not the one loaded for the other checkpoint.
sparse_model = AutoModelForMaskedLM.from_pretrained(sparse_embeddings_model_path, config=sparse_config)

In [None]:
text = "This is a sample sentence."
# Tokenize with the tokenizer that was loaded from the same checkpoint as `model`,
# because this encoding is what gets passed to `model` below.
# return_tensors="pt" makes the tokenizer return PyTorch tensors.
tokenized_input = embeddings_tokenizer(text, return_tensors="pt")
print(tokenized_input)

In [None]:
# Inference only: disable autograd so no computation graph is built for the forward pass.
with torch.no_grad():
    output = model(**tokenized_input)
output

In [None]:
# Show which output class the model returned; its fields are described in the markdown below.
type(output)

In [None]:
# Print the public, non-callable attributes of the model output.
# The helper defined earlier in this notebook is `dump_object`; `dump_obj` raises NameError.
dump_object(output)

From HF Transformers: [src/transformers/modeling_outputs.py](https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py#L70)
```
class BaseModelOutputWithPooling(ModelOutput):
...
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Last layer hidden-state of the first token of the sequence (classification token) after further processing
    through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
    the classification token after processing through a linear layer and a tanh activation function (The tanh
    function outputs values in the range of -1 to +1). 
    The linear layer weights are trained from the next sentence prediction (classification) objective during
    pretraining.
```

Understanding shapes from PyTorch: 
- [Tensor Shapes](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html#tensor-shapes)
- [torch.Tensor.shape](https://pytorch.org/docs/stable/generated/torch.Tensor.shape.html)

In [None]:
# Shape: (batch_size, sequence_length, hidden_size)
#   batch_size      - number of input texts
#   sequence_length - number of tokens in the tokenized input
#   hidden_size     - width of each token vector (hidden_size in the model config)
# Contains the hidden state of every token at the output of the last layer (these are not logits).
output.last_hidden_state.shape

In [None]:
# Shape: (batch_size, hidden_size)
# Contains one vector per input text: the last-layer hidden state of the first
# (classification) token after further processing; for BERT-family models that is
# a linear layer followed by tanh, so values lie in the range -1 to +1.
output.pooler_output.shape