In [1]:
import datasets
from sentence_transformers import SentenceTransformer
from random import seed, sample
from tqdm import tqdm
import faiss
import json

2023-07-12 17:16:00.236043: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
file_path = "/Volumes/Extreme SSD/data/recipe_nlg/full_dataset.csv"
dataset = datasets.DatasetDict.from_csv(file_path)
dataset[0]

Found cached dataset csv (/Users/maxwoolf/.cache/huggingface/datasets/csv/default-52b24f0143b2cc1d/0.0.0)


{'Unnamed: 0': 0,
 'title': 'No-Bake Nut Cookies',
 'ingredients': '["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]',
 'directions': '["In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."]',
 'link': 'www.cookbooks.com/Recipe-Details.aspx?id=44874',
 'source': 'Gathered',
 'NER': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}

In [4]:
def format_recipe(row):
    return f"Name: {row['title']}\nKeywords: {', '.join(json.loads(row['NER']))}"

print(format_recipe(dataset[0]))

Name: No-Bake Nut Cookies
Keywords: brown sugar, milk, vanilla, nuts, butter, bite size shredded rice biscuits


In [5]:
features = datasets.Features(
    {
        "id": datasets.Value(dtype="int32"),
        "name": datasets.Value(dtype="string"),
        "embeddings": datasets.Sequence(
            feature=datasets.Value(dtype="float32"), length=384
        ),
    }
)

features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'embeddings': Sequence(feature=Value(dtype='float32', id=None), length=384, id=None)}

In [6]:
num_samples = 1000

# select the same random recipes, given the same sample size
seed(42)
rand_idx = sample(range(0, dataset.num_rows), num_samples)

processed_samples = []
for idx in tqdm(rand_idx):
    row = dataset[idx]
    recipe_formatted = format_recipe(row)
    embedding = model.encode(recipe_formatted)  # numpy array
    processed_samples.append(
        {"id": row["Unnamed: 0"], "name": row["title"], "embeddings": embedding}
    )

100%|██████████| 1000/1000 [00:08<00:00, 120.79it/s]


In [7]:
recipe_dataset = datasets.Dataset.from_list(processed_samples, features=features)
recipe_dataset

Dataset({
    features: ['id', 'name', 'embeddings'],
    num_rows: 1000
})

In [8]:
recipe_dataset.to_parquet("recipe_embeddings.parquet", compression="gzip")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.20ba/s]


1568027

## Test Out The Vector Similarity

In [9]:
recipe_dataset.add_faiss_index(column='embeddings')

100%|██████████| 1/1 [00:00<00:00, 347.24it/s]


Dataset({
    features: ['id', 'name', 'embeddings'],
    num_rows: 1000
})

In [10]:
def get_similar_recipes(query, k=3):
    query_embedding = model.encode(query)
    scores, recipes = recipe_dataset.get_nearest_examples('embeddings', query_embedding, k=k)
    recipes.pop("embeddings")
    return recipes

In [11]:
get_similar_recipes("What's an easy-to-make dish?")

{'id': [1980633, 1950301, 836179],
 'name': ['Easy in the Microwave Curry Doria',
  'Easy Corn Casserole',
  'Easy  Chicken Casserole']}

In [12]:
get_similar_recipes("What can I make with chicken and carrots?")

{'id': [99255, 502840, 469207],
 'name': ["Grandma'S Chicken Soup",
  'Chicken Breast Dressing',
  'Sunshine Carrots']}

In [13]:
get_similar_recipes("yummy dessert")

{'id': [167188, 1488243, 299514],
 'name': ['Creamy Strawberry Pie',
  'Summer Strawberry Pie Recipe',
  'Pudding Cake']}