# Granite Sparse Demo
- See: https://github.com/primeqa/docuverse/blob/v0.0.12/notebooks/GraniteSparseTest.ipynb

In [None]:
pip list

### Ingest and Retriece using PyMilvus

`pip install pymilvus[model]`

In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [101]:
# Let's see a bit more of the tensors...
torch.set_printoptions(threshold=10_000, linewidth=200, edgeitems=4) # edgeitems=3, precision=4, sci_mode=False

In [2]:
def dump_object(obj):
    for attribute in dir(obj):
        if not attribute.startswith("__") and not attribute.startswith("_"): # Avoid special methods
            try:
                value = getattr(obj, attribute)
                if not callable(value):                
                    print(f"{attribute}: {value}")
            except AttributeError:
                print(f"{attribute}: <not accessible>") # Handle potential errors

In [84]:
def saveNumpy3D(numpy_array, file_path):
    with open(file_path, 'w') as file:
       for i in range(numpy_array.shape[0]):
           np.savetxt(file, numpy_array[i, :, :], fmt='%f')
           file.write("\n")  # Add a newline between slices

In [11]:
from pymilvus import model
from pymilvus import MilvusClient, DataType

client = MilvusClient("./milvus_demo.db")
client.drop_collection(collection_name="my_sparse_collection")

In [12]:
schema = client.create_schema(
    auto_id=True,
    enable_dynamic_fields=True,
)
schema.add_field(field_name="pk", datatype=DataType.VARCHAR, is_primary=True, max_length=100)

{'auto_id': True, 'description': '', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}], 'enable_dynamic_field': False}

In [13]:
schema.add_field(field_name="id", datatype=DataType.VARCHAR, is_primary=False, max_length=100)

{'auto_id': True, 'description': '', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}], 'enable_dynamic_field': False}

In [14]:
schema.add_field(field_name="embeddings", datatype=DataType.SPARSE_FLOAT_VECTOR)

{'auto_id': True, 'description': '', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'embeddings', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [25]:
index_params = client.prepare_index_params()
index_params.add_index(field_name="embeddings",
                               index_name="sparse_inverted_index",
                               index_type="SPARSE_INVERTED_INDEX",
                               metric_type="IP",
                               params={"drop_ratio_build": 0.2})
client.create_collection(
    collection_name="my_sparse_collection",
    schema=schema,
    index_params=index_params
)

In [26]:
embeddings_model = model.sparse.SpladeEmbeddingFunction(
    model_name="ibm-granite/granite-embedding-30m-sparse", 
    device="cpu",
    batch_size=2,
    k_tokens_query=50,
    k_tokens_document=192
)

In [27]:
# Prepare documents to be ingested
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# SpladeEmbeddingFunction.encode_documents returns sparse matrix or sparse array depending
# on the milvus-model version. reshape(1,-1) ensures the format is correct for ingestion.
doc_vector = [{"embeddings": doc_emb.reshape(1,-1), "id": f"item_{i}"} for i, doc_emb in enumerate(embeddings_model.encode_documents(docs))]


In [28]:
print(f"type: {type(doc_vector)}\n{doc_vector}")
dump_object(doc_vector)

type: <class 'list'>
[{'embeddings': <COOrdinate sparse array of dtype 'float32'
	with 192 stored elements and shape (1, 50265)>, 'id': 'item_0'}, {'embeddings': <COOrdinate sparse array of dtype 'float32'
	with 192 stored elements and shape (1, 50265)>, 'id': 'item_1'}, {'embeddings': <COOrdinate sparse array of dtype 'float32'
	with 192 stored elements and shape (1, 50265)>, 'id': 'item_2'}]


In [29]:
print(f"{doc_vector[0]}")

{'embeddings': <COOrdinate sparse array of dtype 'float32'
	with 192 stored elements and shape (1, 50265)>, 'id': 'item_0'}


In [30]:

client.insert(
    collection_name="my_sparse_collection",
    data=doc_vector
)

# Prepare search parameters
search_params = {
    "params": {"drop_ratio_search": 0.2},  # Additional optional search parameters
}

# Prepare the query vector

queries = [
      "When was artificial intelligence founded", 
      "Where was Turing born?"
]
query_vector = embeddings_model.encode_documents(queries)

res = client.search(
    collection_name="my_sparse_collection",
    data=query_vector,
    limit=1, #top k documents to return
    output_fields=["id"],
    search_params=search_params,
)

for r in res:
    print(r)


[{'id': '456922559366037504', 'distance': 12.364128112792969, 'entity': {'id': 'item_0'}}]
[{'id': '456922559366037506', 'distance': 17.135875701904297, 'entity': {'id': 'item_2'}}]


### Get Embeddings using HF Transformers

In [62]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

> torch.log() function calculates the natural logarithm (base e) of each element in the input tensor, returning a new tensor with the log values

```python
import torch
a = torch.rand(5) * 5
print(a)
# Output: tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739])
b = torch.log(a)
print(b)
# Output: tensor([ 1.5637, 1.4640, 0.1952, -1.4226, 1.5204])
```

In [102]:
class SparseSentenceTransformer:
    def __init__(self, model_name_or_path, device:str= 'cpu'):
        self.model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.device = device
        self.model.to(device)
        if device == "cuda":
            self.model = self.model.cuda()
            self.model = self.model.bfloat16()

    @torch.no_grad()
    def encode(self, sentences, max_tokens=20):        
        if type(sentences) == str:
            sentences = [sentences]
        
        input_dict = self.tokenizer(sentences, max_length=512, padding=True, return_tensors='pt', truncation=True)
        attention_mask = input_dict['attention_mask']  # (bs,seqlen)
        print(f"---\nattention_mask: shape: {attention_mask.shape}:\n{attention_mask}")

        if self.device == "cuda":
            input_dict['input_ids'] = input_dict['input_ids'].cuda()
            input_dict['attention_mask'] = input_dict['attention_mask'].cuda()
            if 'token_type_ids' in input_dict:
                input_dict['token_type_ids'] = input_dict['token_type_ids'].cuda()

        hidden_state = self.model(**input_dict)[0]
        print(f"---\nhidden_state: shape: {hidden_state.shape}:\n{hidden_state}")
        
        # Note we add 1.0 to avoid (default) of (-)Infty being set as values
        maxarg = torch.log(1.0 + torch.relu(hidden_state))
        print(f"---\nmaxarg: shape: {maxarg.shape}:\n{maxarg}")
        saveNumpy3D(maxarg, "maxarg.txt")
        
        input_mask_expanded = attention_mask.unsqueeze(-1).to(maxarg.device) # bs * seqlen * voc
        print(f"---\ninput_mask_expanded: shape: {input_mask_expanded.shape}:\n{input_mask_expanded}")   
        saveNumpy3D(input_mask_expanded, "input_mask_expanded.txt")
        
        maxdim1 = torch.max(maxarg * input_mask_expanded, dim=1).values  # bs * voc
        print(f"---\nmaxdim1: shape: {maxdim1.shape}:\n{maxdim1}")
        # saveNumpy3D(maxdim1, "maxdim1.txt")
        np.savetxt("maxdim1.txt", maxdim1, fmt='%f')        
        
        # get topk high weights
        topk, indices = torch.topk(maxdim1, k=max_tokens) # (weight - (bs * max_terms), index - (bs * max_terms))
        print (f"---\ntopk.shape: {topk.shape}:\n{topk}")

        expansions = [[(self.tokenizer.decode(int(indices[sidx][tidx])), float(topk[sidx][tidx])) for tidx in range(topk.shape[1])] for sidx in range(topk.shape[0]) ]  

        return expansions

In [103]:
sparse_model = SparseSentenceTransformer("ibm-granite/granite-embedding-30m-sparse")

In [109]:
#change max_tokens to produce more or less expansions for the sentences
sparse_model.encode(["Artificial intelligence was founded as an academic discipline in 1956."], max_tokens=40)

---
attention_mask: shape: torch.Size([1, 14]):
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
---
hidden_state: shape: torch.Size([1, 14, 50265]):
tensor([[[ -1.7427,  -7.6422,  -1.9149,  -8.9616,  ..., -10.7275, -11.0216, -11.6262, -10.0269],
         [ -2.0706,  -7.3661,  -1.9145,  -7.8359,  ...,  -9.7414,  -9.9994, -10.6886,  -8.8855],
         [ -2.7542,  -8.0620,  -2.3382, -10.5351,  ..., -12.5394, -13.0191, -13.6108, -11.1574],
         [ -2.4148,  -7.8324,  -1.5616, -10.6918,  ..., -12.1566, -12.6419, -13.1174, -11.0873],
         ...,
         [ -1.5354,  -7.3953,  -2.3199,  -8.7108,  ..., -10.0220, -10.3128, -11.0252,  -9.5884],
         [ -3.2807,  -7.0404,  -3.2199,  -8.2233,  ...,  -9.7368, -10.0086, -10.7926, -10.0631],
         [ -1.4550,  -7.6613,  -2.2228,  -8.3820,  ...,  -9.8845, -10.1485, -10.7153,  -8.8985],
         [ -1.5066,  -7.5331,  -2.0919,  -7.9797,  ...,  -9.3315,  -9.5452, -10.1722,  -8.3641]]])
---
maxarg: shape: torch.Size([1, 14, 50265]):
tensor(

[[(' AI', 1.6671510934829712),
  (' intelligence', 1.4905357360839844),
  (' artificial', 1.2501304149627686),
  (' discipline', 1.2192901372909546),
  (' founded', 1.060374140739441),
  (' 1956', 1.0351001024246216),
  (' invention', 0.9785776138305664),
  ('56', 0.7224239706993103),
  (' learning', 0.6999132037162781),
  (' scientific', 0.6892709732055664),
  (' computer', 0.6566586494445801),
  (' academic', 0.62173992395401),
  (' university', 0.5886272192001343),
  (' robot', 0.5613628625869751),
  (' establishment', 0.5508407950401306),
  (' philosophy', 0.5431841611862183),
  ('A', 0.5025954842567444),
  (' brain', 0.47637760639190674),
  (' machine', 0.44881123304367065),
  ('1960', 0.446492463350296),
  ('1950', 0.4327620565891266),
  (' algorithm', 0.4083283841609955),
  (' science', 0.37494218349456787),
  (' regression', 0.3722963333129883),
  (' comput', 0.33030468225479126),
  (' Discipline', 0.32480597496032715),
  (' institute', 0.3156873881816864),
  (' automatic', 0.2