In [1]:
!pip install onnxruntime-gpu -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -qq
!pip install fastembed-gpu huggingface_hub -qqq

In [2]:
from fastembed import TextEmbedding, SparseTextEmbedding,SparseEmbedding
from kaggle_secrets import UserSecretsClient
from datasets import Dataset
from typing import List
import huggingface_hub
import polars as pl
import numpy as np
import ast
import os

In [3]:
# Fetch the secret stored under the label "hf_toke_write" through
# kaggle_secrets.UserSecretsClient and expose it as the HF_TOKEN environment variable.
# NOTE(review): the label spelling ("toke") presumably matches the name configured
# in the notebook's secrets — confirm before "fixing" it.
user_secrets = UserSecretsClient()
hf_toke_write = user_secrets.get_secret("hf_toke_write")
os.environ["HF_TOKEN"] = hf_toke_write

In [None]:
# Log in to the Hugging Face Hub with the token read from the notebook secrets.
huggingface_hub.login(hf_toke_write)

In [5]:
def _parse_list_literal(text):
    """Return the list encoded by ``text``, or ``None`` if it is not a list literal.

    String items of the parsed list are stripped of surrounding whitespace;
    non-string items are kept unchanged.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Text that merely looks bracketed, e.g. a title such as "[Untitled]".
        return None
    if not isinstance(parsed, list):
        # e.g. "[1],[2]" evaluates to a tuple of lists, not a list.
        return None
    return [item.strip() if isinstance(item, str) else item for item in parsed]


def _join_lines(value):
    """Join a list of entries with newlines; return any other value unchanged."""
    if isinstance(value, list):
        return "\n".join(str(item) for item in value)
    # A plain string must not be joined: str.join would split it per character.
    return value


def process_data_structure(row):
    """Normalise one recipe row and append a metadata dict and a document text.

    Every string value is stripped. A string that starts with '[' and ends
    with ']' and parses as a Python list literal is replaced by that list
    (with stripped string items); a bracketed string that does not parse is
    kept as a stripped string instead of raising.

    Positions 2 and 3 (ingredients, directions) are flattened to
    newline-separated text. Two values are then appended:
      * ``{"title": row[1], "NER": row[6]}``
      * the document text built from title, ingredients and directions.

    Args:
        row: Sequence with at least seven values; index 1 is the title,
            2 the ingredients, 3 the directions and 6 the NER entries.

    Returns:
        tuple: the normalised values followed by the metadata dict and the
        document string.
    """
    doc_template = """
Title: {title}
Ingredients:\n{ingredients}
Directions:\n{directions}
"""
    new_row = []

    for value in row:
        if not isinstance(value, str):
            new_row.append(value)
            continue

        list_value = None
        if value.startswith('[') and value.endswith(']'):
            list_value = _parse_list_literal(value)
        new_row.append(value.strip() if list_value is None else list_value)

    new_row[2] = _join_lines(new_row[2])
    new_row[3] = _join_lines(new_row[3])

    new_row.append({"title": new_row[1], "NER": new_row[6]})
    new_row.append(
        doc_template.format(
            title=new_row[1],
            ingredients=new_row[2],
            directions=new_row[3],
        ).strip()
    )

    return tuple(new_row)

In [None]:
# Model identifier and execution-provider list passed to fastembed's TextEmbedding.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
execution_providers = ["CUDAExecutionProvider"]

model = TextEmbedding(
    model_name=embedding_model_name,
    providers=execution_providers,
)

In [7]:
# Final names for the positional columns ("column_0", "column_1", ...) that are
# renamed after `map_rows`. The last two are the metadata dict and the document
# text appended by `process_data_structure`.
output_columns = [
    "id",
    "title",
    "ingredients",
    "directions",
    "link",
    "source",
    "NER",
    "metadata",
    "document",
]
column_renames = {
    f"column_{position}": name for position, name in enumerate(output_columns)
}

recipes_processed = pl.read_parquet(
    'hf://datasets/rk404/recipe_short/final_recipes.parquet'
).map_rows(process_data_structure)
df = recipes_processed.rename(column_renames)

# Column-oriented dict with plain Python values instead of polars Series.
data = df.to_dict(as_series=False)

In [9]:
# Embed every document text in batches of 100; `list(...)` materialises the
# result of `model.embed` so all embeddings are held in memory at once.
# The element type is np.ndarray: np.array is the factory function, not a type.
embeddings_list: List[np.ndarray] = list(
    model.embed(data['document'], batch_size=100)
)

In [11]:
# Store a shallow copy of the embeddings as a new column named after the model.
data['all-MiniLM-L6-v2'] = list(embeddings_list)

In [13]:
# Build a `datasets.Dataset` from the column dict (one key per column).
ds = Dataset.from_dict(data)

In [None]:
# Upload the dataset to the Hub repository "otacilio-psf/recipe_short_embeddings".
# presumably relies on the earlier login / HF_TOKEN for write access — TODO confirm
ds.push_to_hub("otacilio-psf/recipe_short_embeddings")