In [1]:
import openai
import os
import pandas as pd
import polars as pl
import tiktoken

In [2]:
# Read the API key from the environment so it is never hard-coded in the notebook.
# Indexing os.environ (rather than .get) raises KeyError right here if the variable is unset.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
# Single OpenAI client shared by the cells below, authenticated with the key read above.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# Load the food table from the CSV file and preview its first rows.
df_food = pl.read_csv(source="../data/generic-food.csv")
df_food.head(n=5)

FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
str,str,str,str
"""Angelica""","""Angelica keisk…","""Herbs and Spic…","""Herbs"""
"""Savoy cabbage""","""Brassica olera…","""Vegetables""","""Cabbages"""
"""Silver linden""","""Tilia argentea…","""Herbs and Spic…","""Herbs"""
"""Kiwi""","""Actinidia chin…","""Fruits""","""Tropical fruit…"
"""Allium (Onion)…","""Allium""","""Vegetables""","""Onion-family v…"


In [5]:
def num_tokens_from_string(text, encoding_name):
    """Count the tokens ``text`` produces under a named tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Encoding name passed to ``tiktoken.get_encoding``.

    Returns:
        The length of the encoded token sequence.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(text))

In [6]:
# Add a `total_tokens` column: the cl100k_base token count of each food name.
df_food = df_food.with_columns(
    total_tokens=pl.col("FOOD NAME").apply(
        lambda food_name: num_tokens_from_string(food_name, "cl100k_base")
    )
)

In [7]:
# Preview the table with the new token-count column.
df_food.head(n=5)

FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens
str,str,str,str,i64
"""Angelica""","""Angelica keisk…","""Herbs and Spic…","""Herbs""",2
"""Savoy cabbage""","""Brassica olera…","""Vegetables""","""Cabbages""",4
"""Silver linden""","""Tilia argentea…","""Herbs and Spic…","""Herbs""",3
"""Kiwi""","""Actinidia chin…","""Fruits""","""Tropical fruit…",2
"""Allium (Onion)…","""Allium""","""Vegetables""","""Onion-family v…",6


In [8]:
# Total number of tokens across all food names.
df_food.get_column("total_tokens").sum()

2947

In [9]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` using the notebook's OpenAI ``client``.

    Args:
        text: String to embed. Newlines are replaced with single spaces
            before the request is sent.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    # Replace newlines with a space (not ""), so words on adjacent lines are not fused.
    cleaned = text.replace("\n", " ")
    # Forward the caller's `model`; the name used to be hard-coded here, which
    # silently ignored the argument.
    response = client.embeddings.create(input=cleaned, model=model)
    return response.data[0].embedding

In [10]:
# Add an `embeddings` column by calling get_embedding on each food name.
# Note: this calls get_embedding once per row, so re-running the cell repeats every request.
df_food = df_food.with_columns(
    embeddings=pl.col("FOOD NAME").apply(get_embedding)
)

In [11]:
# Preview the first ten rows, now including the embeddings column.
df_food.head(n=10)

FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,embeddings
str,str,str,str,i64,list[f64]
"""Angelica""","""Angelica keisk…","""Herbs and Spic…","""Herbs""",2,"[0.006203, -0.0101, … -0.015106]"
"""Savoy cabbage""","""Brassica olera…","""Vegetables""","""Cabbages""",4,"[0.005499, -0.00497, … 0.000699]"
"""Silver linden""","""Tilia argentea…","""Herbs and Spic…","""Herbs""",3,"[-0.004519, 0.019994, … -0.033409]"
"""Kiwi""","""Actinidia chin…","""Fruits""","""Tropical fruit…",2,"[-0.004589, -0.010032, … -0.020672]"
"""Allium (Onion)…","""Allium""","""Vegetables""","""Onion-family v…",6,"[0.013211, -0.01995, … 0.012451]"
"""Garden onion""","""Allium cepa""","""Vegetables""","""Onion-family v…",3,"[0.009202, -0.021001, … 0.005702]"
"""Leek""","""Allium porrum""","""Vegetables""","""Onion-family v…",2,"[0.005257, -0.004284, … -0.002118]"
"""Garlic""","""Allium sativum…","""Herbs and Spic…","""Herbs""",2,"[0.025597, -0.013733, … 0.004028]"
"""Chives""","""Allium schoeno…","""Herbs and Spic…","""Herbs""",2,"[-0.004742, -0.014766, … 0.015614]"
"""Lemon verbena""","""Aloysia triphy…","""Herbs and Spic…","""Herbs""",4,"[0.016134, 0.001248, … -0.035395]"


In [35]:
# Keep only the embeddings, convert each list to a struct, then unnest the
# struct so every vector component becomes its own column.
df_embeddings = df_food.select(
    pl.col("embeddings").list.to_struct()
).unnest("embeddings")

In [19]:
# Write the embedding columns as a tab-separated file without a header row.
df_embeddings.write_csv(
    file="../data/embeddings.tsv",
    separator="\t",
    has_header=False,
)

In [20]:
# Write the name and grouping columns as a tab-separated labels file (header row included),
# in the same row order as df_food.
label_columns = ["FOOD NAME", "GROUP", "SUB GROUP"]
df_food.select(label_columns).write_csv(
    file="../data/labels_food.tsv",
    separator="\t",
)