In [31]:
!pip install pandas
!pip install scikit-learn joblib
!pip install sentence-transformers faiss-cpu
!pip install openai



In [32]:
# import required dependencies
import json
import os
import time
import uuid

import faiss
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tqdm import tqdm

In [33]:
# Load the recipe dataset; the path is relative to the notebook's working directory.
dataset = "files/inputs/3A2M_Test.csv"
# read the CSV into a DataFrame using pandas' default parsing options
data = pd.read_csv(dataset)

In [34]:
# number of rows loaded from the CSV
print(data.shape[0])

30000


In [35]:
# Map the CSV headers onto the column names used by the rest of the notebook.
# Only the unnamed index column and "NER" actually change; the remaining
# entries map a name onto itself.
data = data.rename(
    {
        "Unnamed: 0": "row",
        "title": "title",
        "directions": "directions",
        "NER": "ingredients",
        "genre": "genre",
        "label": "label",
    },
    axis="columns",
)

In [36]:
# helper functions
def generate_id():
    """Return a freshly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)

def create_summary(recipe):
    """Build the plain-text summary of a recipe that is later embedded.

    Args:
        recipe: Mapping with the keys "title", "ingredients", "directions"
            and "genre". "ingredients" and "directions" must be iterables of
            strings; each is joined with ", ".

    Returns:
        A four-line string of the form
        "Recipe Name: ...\\nIngredients: ...\\nDirections: ...\\nGenre: ...".

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Join the lists up front and use single quotes for the subscripts inside
    # the f-strings: reusing the enclosing double quotes inside a replacement
    # field is only accepted by Python 3.12+ and is a SyntaxError before that.
    ingredients = ", ".join(recipe["ingredients"])
    directions = ", ".join(recipe["directions"])
    return (
        f"Recipe Name: {recipe['title']}\n"
        f"Ingredients: {ingredients}\n"
        f"Directions: {directions}\n"
        f"Genre: {recipe['genre']}"
    )

In [37]:
# Turn every dataset row into a plain dict; the "ingredients" and "directions"
# cells hold JSON-encoded text and are decoded here.
recipes = [
    {
        "id": generate_id(),
        "row": row["row"],
        "title": row["title"],
        "ingredients": json.loads(row["ingredients"]),
        "directions": json.loads(row["directions"]),
        "genre": row["genre"],
        "label": row["label"],
    }
    for _, row in data.iterrows()
]

# attach the text summary once all base fields are in place
for recipe in recipes:
    recipe.update(summary=create_summary(recipe))

In [38]:
# sanity check: show the first assembled recipe, including its generated summary
print(recipes[0])

{'id': '495e6287-dfc5-437e-95cc-de237626b301', 'row': 0, 'title': 'Reeses Cups(Candy)  ', 'ingredients': ['peanut butter', 'graham cracker crumbs', 'butter', 'powdered sugar', 'chocolate chips'], 'directions': ['Combine first four ingredients and press in 13 x 9-inch ungreased pan.', 'Melt chocolate chips and spread over mixture. Refrigerate for about 20 minutes and cut into pieces before chocolate gets hard.', 'Keep in refrigerator.'], 'genre': 'drinks', 'label': 2, 'summary': 'Recipe Name: Reeses Cups(Candy)  \nIngredients: peanut butter, graham cracker crumbs, butter, powdered sugar, chocolate chips\nDirections: Combine first four ingredients and press in 13 x 9-inch ungreased pan., Melt chocolate chips and spread over mixture. Refrigerate for about 20 minutes and cut into pieces before chocolate gets hard., Keep in refrigerator.\nGenre: drinks'}


In [9]:
# Write the recipes as JSON Lines: one JSON object per line, non-ASCII
# characters kept as-is.
with open("files/outputs/recipes.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(recipe, ensure_ascii=False) + "\n" for recipe in recipes
    )

In [10]:
# collect the summary text of every recipe, in recipe order, for embedding
summaries = [recipe["summary"] for recipe in recipes]

### All MiniLM L6 V2

In [None]:
# Prepare the sentence-transformers embedding model used in this section.
embedding_model = "all-MiniLM-L6-v2"
# device="cpu" pins inference to the CPU
model = SentenceTransformer(embedding_model, device="cpu")

In [24]:
# prepare embedding function
def embed_summaries(summaries, batch_size=256):
    """Embed summaries in batches with the notebook-level `model`.

    Args:
        summaries: Sequence of summary strings; it is sliced into consecutive
            chunks of at most `batch_size` items.
        batch_size: Number of summaries passed to each `model.encode` call.

    Returns:
        A float32 NumPy array with the per-batch results stacked vertically,
        in input order. The vectors are requested L2-normalized, so a dot
        product between two rows is their cosine similarity.
    """
    def encode_chunk(start):
        # encode one slice and cast it to float32
        chunk = summaries[start:start + batch_size]
        encoded = model.encode(chunk, convert_to_numpy=True, normalize_embeddings=True)
        return encoded.astype(np.float32)

    chunk_starts = range(0, len(summaries), batch_size)
    return np.vstack([encode_chunk(start) for start in tqdm(chunk_starts, desc="Embedding")])

In [None]:
# Embed all summaries and report the elapsed wall-clock time in seconds.
# NOTE: the result name is spelled `embeded_summaries` (single "d"); the FAISS
# cell for this model reads it under that exact spelling.
start_time = time.time()
embeded_summaries = embed_summaries(summaries)
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)

# recorded run (seconds): 1425.6717722415924

Embedding:   0%|          | 0/118 [00:00<?, ?it/s]

Embedding: 100%|██████████| 118/118 [23:45<00:00, 12.08s/it]


1425.6717722415924


In [29]:
# Build an in-memory FAISS inner-product index over the MiniLM embeddings.
# The vectors were requested L2-normalized at encode time, so inner product
# acts as cosine similarity.
d = embeded_summaries.shape[1]  # embedding dimensionality (number of columns)
index = faiss.IndexFlatIP(d)
index.add(embeded_summaries)

In [None]:
# Write the MiniLM FAISS index to disk.
index_path = "files/outputs/recipes-index-all-MiniLM-L6-v2.bin"

faiss.write_index(index, index_path)

### Text Embedding 3 Small

In [None]:
# OpenAI embedding model used in this section (replaces the MiniLM model name).
embedding_model = "text-embedding-3-small"
# Read the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook. The original placeholder is
# kept only as a fallback value when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "add-your-open-AI-key")

client = OpenAI(api_key=openai_api_key)

In [17]:
# prepare embedding function
def embed_summaries(summaries, batch_size=256):
    """Embed summaries in batches through the notebook-level OpenAI `client`.

    This redefines the earlier `embed_summaries`; from here on the name refers
    to the OpenAI-backed version.

    Args:
        summaries: Sequence of summary strings; it is sliced into consecutive
            chunks of at most `batch_size` items.
        batch_size: Number of summaries sent per embeddings request.

    Returns:
        A float32 NumPy array built from the `embedding` of every item in each
        response, in the order the responses list them. No normalization is
        applied here.
    """
    collected = []
    for start in tqdm(range(0, len(summaries), batch_size), desc="Embedding"):
        response = client.embeddings.create(
            input=summaries[start:start + batch_size],
            model=embedding_model,
        )
        collected.extend(item.embedding for item in response.data)

    return np.array(collected, dtype=np.float32)

In [None]:
# Embed all summaries with the OpenAI-backed embed_summaries and report the
# elapsed wall-clock time in seconds.
start_time = time.time()
embedded_summaries = embed_summaries(summaries)
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)

# recorded run (seconds): 306.5218036174774


Embedding:   0%|          | 0/118 [00:00<?, ?it/s]

Embedding: 100%|██████████| 118/118 [04:33<00:00,  2.32s/it]


306.5218036174774


In [23]:
# Normalize vectors for cosine similarity: after dividing each row by its L2
# norm, the inner product used by the index equals cosine similarity.
norms = np.linalg.norm(embedded_summaries, axis=1, keepdims=True)
embedded_summaries = embedded_summaries / norms

# create FAISS index file
d = embedded_summaries.shape[1]  # embedding dimensionality (number of columns)
index = faiss.IndexFlatIP(d)
# Add the OpenAI vectors normalized above. The previous code added
# `embeded_summaries` (single "d"), i.e. the MiniLM vectors, whose
# dimensionality does not match this index.
index.add(embedded_summaries)

In [24]:
# Write the text-embedding-3-small FAISS index to disk.
index_path = "files/outputs/recipes-index-text-embedding-3-small.bin"

faiss.write_index(index, index_path)

In [28]:
# output file for the fitted TF-IDF pipeline and its vectors (written by joblib.dump below)
vector_path = "files/outputs/recipes-tfidf.joblib"

In [None]:
# TF-IDF over unigrams and bigrams of the summaries.
# NOTE(review): max_df=0.9 presumably drops terms present in more than 90% of
# the summaries — confirm against the scikit-learn docs.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.9)
# NOTE(review): TfidfVectorizer presumably already L2-normalizes rows by
# default, which would make this Normalizer step redundant — confirm.
pipeline = make_pipeline(vectorizer, Normalizer(copy=False))

# fit the pipeline on all summaries and time it (wall-clock seconds)
start_time = time.time()
vectors = pipeline.fit_transform(summaries)
end_time = time.time()

time_taken = end_time - start_time
print(time_taken)

# recorded run (seconds): 10.024590730667114

10.024590730667114


In [30]:
# Persist the fitted pipeline together with the transformed summary vectors in
# one file; as the cell's last expression, the return value of joblib.dump is
# displayed.
joblib.dump({"pipeline": pipeline, "vectors": vectors}, vector_path)

['files/outputs/recipes-tfidf.joblib']