<a href="https://www.kaggle.com/code/lukalafaye/stella-en-embedding-model?scriptVersionId=201553731" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Run this notebook with a GPU accelerator enabled (P100).

!pip install sentence-transformers flash_attn

In [None]:
!pip install xformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the Stella 400M embedding model and move it to the GPU (`.cuda()`), so a CUDA device is required.
# NOTE: The default embedding dimension is 1024. If you need another dimension, clone the model and
# modify `modules.json` to replace `2_Dense_1024` with another dimension, e.g. `2_Dense_256` or `2_Dense_8192`.
model = SentenceTransformer("infgrad/stella_en_400M_v5", trust_remote_code=True).cuda()

In [None]:
# This model supports two prompts: "s2p_query" for sentence-to-passage tasks and
# "s2s_query" for sentence-to-sentence tasks.
# Both are defined in the model's `config_sentence_transformers.json`.
prompt_name = "s2p_query"
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]
# Documents are encoded without any prompt; only the queries get one.
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

# Encode queries (with the selected prompt) and documents (without one), then print both shapes.
query_embeddings = model.encode(queries, prompt_name=prompt_name)
doc_embeddings = model.encode(docs)
print(query_embeddings.shape, doc_embeddings.shape)
# Expected output:
# (2, 1024) (2, 1024)

# Score every query against every document; row i / column j pairs queries[i] with docs[j].
similarities = model.similarity(query_embeddings, doc_embeddings)
print(similarities)
# Expected output (each query scores highest against its matching document):
# tensor([[0.8398, 0.2990],
#         [0.3282, 0.8095]])

In [None]:
# Quick check: encode two short strings without a prompt; the notebook displays the returned embeddings.
model.encode(["aaa", "bbb"])

In [None]:
!pip install chromadb

In [None]:
!pip install chromadb pysqlite3-binary
# Replace the standard-library `sqlite3` entry in sys.modules with pysqlite3, so that any
# later `import sqlite3` (including inside chromadb) resolves to pysqlite3 instead.
# This must run before chromadb is imported.
# NOTE(review): presumably needed because chromadb requires a newer SQLite than the one
# bundled with this runtime — confirm against the chromadb troubleshooting docs.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import torch

In [None]:
import chromadb
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

In [None]:
# Sample input for create_chromadb: a list of (document, id) pairs.
text_chunks = [("text", "0"), ("text1", "1")]

In [None]:
class StellaEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by a SentenceTransformer model.

    Documents are encoded without a prompt and returned as plain Python
    lists of floats, one list per input document.
    """

    def __init__(self, stella=None):
        """Store the encoder to use.

        Args:
            stella: Object exposing ``encode(list_of_str)`` that returns an
                array with a ``tolist()`` method. Defaults to the notebook's
                module-level ``model``.
        """
        # Resolve the global lazily so the default still tracks whatever
        # `model` is bound to when the instance is created.
        self.stella = model if stella is None else stella

    # NOTE(review): recent chromadb releases reportedly validate that this
    # parameter is named `input`; the original name is kept here for
    # compatibility — confirm against the installed chromadb version.
    def __call__(self, texts: Documents) -> Embeddings:
        """Embed ``texts`` and return one embedding (list of floats) per text.

        Args:
            texts: Documents to embed; each item is converted with ``str()``.

        Returns:
            A list with one embedding per input document, in input order.
            An empty input yields an empty list.
        """
        batch = [str(text) for text in texts]
        if not batch:
            return []
        # Encode the whole batch in a single call instead of one call per
        # document; the 2-D result converts to a list of per-document lists.
        return self.stella.encode(batch).tolist()

In [None]:
def create_chromadb(text_chunks, *, path=".", collection_name="collection"):
    """Store text chunks in a persistent Chroma collection embedded with Stella.

    Args:
        text_chunks: Iterable of ``(document, id)`` pairs,
            e.g. ``[("document", "id"), ...]``.
        path: Directory used by the persistent Chroma client.
        collection_name: Name of the collection to fetch or create.

    Returns:
        The Chroma collection, after the given chunks have been added to it.
    """
    # Materialize once so a one-shot iterable (e.g. a generator) can be read
    # for both the documents and the ids.
    chunks = list(text_chunks)

    client = chromadb.PersistentClient(path=path)
    print("Client heartbeat: ", client.heartbeat())

    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=StellaEmbeddingFunction(),
    )

    if not chunks:
        # Nothing to store; skip the add call and hand back the collection.
        return collection

    docs = [chunk[0] for chunk in chunks]
    ids = [chunk[1] for chunk in chunks]

    collection.add(documents=docs, ids=ids)
    return collection

In [None]:
# Build (or reopen) the persistent collection and add the sample chunks to it.
collection = create_chromadb(text_chunks)

In [None]:
# Display a sample of the records stored in the collection to verify the insert.
collection.peek()