# Create your Custom Embedder using FastEmbed

This notebook shows how to use an embedding model that is not in FastEmbed's list of supported models but can still be loaded into FastEmbed from Hugging Face.

In this example, I want a dedicated Italian embedding model rather than a multilingual one.

In [None]:
import asyncio
from typing import List
from medha.interfaces import BaseEmbedder
from fastembed import TextEmbedding
from fastembed.common.model_description import PoolingType, ModelSource

## Create your custom embedder

In [None]:
class FastEmbedCustomEmbedder(BaseEmbedder):
    """
    Medha embedder backed by a custom ONNX model registered in FastEmbed.

    On construction the model is registered with FastEmbed (mean pooling,
    normalized output) and then loaded. The async methods run the synchronous
    FastEmbed inference in the event loop's default executor so that the
    event loop is not blocked while embeddings are computed.

    Args:
        model_name: Hugging Face repository id of the model to register.
        model_file: Path of the ONNX file inside that repository.
        dim: Size of the vectors produced by the model. Defaults to 768,
            the size of nickprock/sentence-bert-base-italian-xxl-uncased;
            pass the correct value when using a different model.

    Raises:
        ValueError: If ``dim`` is not a positive integer.
    """

    def __init__(
        self,
        model_name: str = "nickprock/sentence-bert-base-italian-xxl-uncased",
        model_file: str = "onnx/model_qint8_avx512_vnni.onnx",
        dim: int = 768,
    ):
        if dim <= 0:
            raise ValueError(f"dim must be a positive integer, got {dim!r}")

        self._model_name = model_name
        self._dim = dim
        self._model_file = model_file  # Path to the ONNX file inside the HF repo

        # 1. Register the custom model in FastEmbed
        try:
            TextEmbedding.add_custom_model(
                model=self._model_name,
                pooling=PoolingType.MEAN,
                normalization=True,
                sources=ModelSource(hf=self._model_name),
                dim=self._dim,
                model_file=self._model_file,
            )
        except ValueError:
            # Handles the case where the model has already been registered
            # (e.g. this cell is re-run in the same kernel).
            # NOTE(review): presumably the earlier registration stays in
            # effect, so a different dim/model_file passed here is ignored.
            pass

        # 2. Initialize the FastEmbed model
        self._model = TextEmbedding(model_name=self._model_name, threads=None)

    @property
    def dimension(self) -> int:
        """Size of the embedding vectors declared for this model."""
        return self._dim

    @property
    def model_name(self) -> str:
        """Name under which the model is registered in FastEmbed."""
        return self._model_name

    @staticmethod
    def _to_floats(vector) -> List[float]:
        """Convert one embedding (typically a numpy array) to plain Python floats."""
        to_list = getattr(vector, "tolist", None)
        if callable(to_list):
            return to_list()
        return [float(value) for value in vector]

    def _embed_sync(self, texts: List[str]) -> List[List[float]]:
        """Run FastEmbed synchronously and materialize the results as lists of floats."""
        return [self._to_floats(vector) for vector in self._model.embed(texts)]

    async def aembed(self, text: str) -> List[float]:
        """
        Generate the embedding for a single string.

        Returns:
            The embedding as a list of Python floats.
        """
        loop = asyncio.get_running_loop()
        # Inference is synchronous; run it in the default executor so the
        # event loop stays responsive.
        vectors = await loop.run_in_executor(None, self._embed_sync, [text])
        return vectors[0]

    async def aembed_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate the embedding for a list of strings (batch).

        Returns:
            One list of Python floats per input string, in input order.
            An empty input yields an empty list without invoking the model.
        """
        batch = list(texts)
        if not batch:
            return []
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._embed_sync, batch)

# TEST IT!

## Initialize

In [None]:
# Constructing the embedder registers the custom model with FastEmbed and
# then loads it through TextEmbedding (see FastEmbedCustomEmbedder.__init__).
print("Initializing Custom Embedder...")
embedder = FastEmbedCustomEmbedder()

## Test one sentence

In [None]:
text = "Medha permette di creare pipeline RAG flessibili."
vector = await embedder.aembed(text)
print(f"\nSingle embedding completed.")
print(f"Dimension: {len(vector)} (Expected: 768)")
print(f"First 5 values: {vector[:5]}")

## Test batch

In [None]:
texts = [
        "L'integrazione di modelli custom è semplice.",
        "FastEmbed usa ONNX runtime per la velocità."
    ]
batch_vectors = await embedder.aembed_batch(texts)
print(f"\nBatch embedding completed for {len(batch_vectors)} sentences.")
print(f"Dimensions of the first vector: {len(batch_vectors[0])}")