In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment (scikit-learn is the distribution that provides `sklearn`).
# NOTE(review): versions are not pinned — consider pinning them for reproducibility.
%pip install pandas
%pip install inflect
%pip install scikit-learn
%pip install chromadb

In [None]:
# Folder containing the cocktails_<n>.json input files read by the loading cell.
DATA_DIRECTORY = './data'
# Folder handed to chromadb.PersistentClient as its storage path.
DB_DIRECTORY = './db'


# Vectorization

In [None]:
import pandas as pd
import json
import pprint
import inflect
import pprint as pprint
from sklearn.feature_extraction import DictVectorizer

# Load every cocktails_<n>.json result set (n = 0..25) and stack them into a
# single DataFrame with one row per drink.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop copies all previously loaded rows on every iteration.
frames = []
for result_set in range(0, 26):
    file_path = f'{DATA_DIRECTORY}/cocktails_{result_set}.json'
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding (drink names may contain non-ASCII characters).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    drinks = data.get('drinks')
    if not drinks:
        # Result sets with a null, missing or empty 'drinks' entry are skipped.
        continue
    frames.append(pd.json_normalize(drinks))

if not frames:
    # Fail with a clear message instead of an AttributeError on `None` below.
    raise ValueError(f"No drinks found in {DATA_DIRECTORY}/cocktails_*.json")

# ignore_index=True gives every row a unique label; without it each file
# restarts at 0 and the combined frame carries duplicate index labels.
df = pd.concat(frames, ignore_index=True)

# DataFrame.size is the number of cells (rows x columns), not the row count.
print("df.size=",df.size)

# Shared inflect engine; its singular_noun() is used below to singularize ingredient names.
p = inflect.engine()

def build_ingredients(row):
    """Collect the ingredients of one drink row as a ``{name: measure}`` dict.

    Reads the ``strIngredient1`` .. ``strIngredient15`` columns in order and
    stops at the first slot that is missing (None/NaN) or blank. Each name is
    singularized with the shared inflect engine ``p`` and title-cased; the
    value is the matching ``strMeasure<i>`` entry, or None when no measure is
    given.

    Args:
        row: a DataFrame row (or any mapping) exposing the ``strIngredient<i>``
            and ``strMeasure<i>`` keys for i in 1..15.

    Returns:
        dict mapping normalized ingredient name -> measure (or None).
    """
    ingredients = {}
    for i in range(1, 16):
        ingredient = row[f'strIngredient{i}']
        # pd.isna covers None as well as NaN/NA, which pandas may substitute
        # for missing values; a bare `is None` check would let NaN through to
        # inflect, which only accepts strings.
        if pd.isna(ingredient):
            break
        ingredient = ingredient.strip()
        if ingredient == '':
            # A blank (or whitespace-only) slot marks the end of the list.
            break
        # singular_noun returns False when the word is not recognised as a
        # plural, hence the fallback to the original text.
        ingredient = p.singular_noun(ingredient) or ingredient
        name = ingredient.strip().title()
        measure = row[f'strMeasure{i}']
        # Normalize a missing measure to None so the dict stays JSON-serializable
        # (json.dumps would otherwise emit the non-standard token NaN).
        ingredients[name] = None if pd.isna(measure) else measure
    return ingredients

def build_features(row):
    """Turn a row's ingredient dict into a binary feature dict for vectorization.

    Every ingredient that is kept becomes a key with value 1; the measure is
    deliberately dropped. Ingredients are skipped when:

    * their name is exactly "water" or "ice" (case-insensitive) — exact match
      on purpose, so names such as "Carbonated Water" or "Ice-Cream" are kept;
    * their name contains "sugar" or "coloring" (case-insensitive).

    Args:
        row: a DataFrame row (or mapping) whose ``"ingredients"`` entry is a
            dict keyed by ingredient name.

    Returns:
        dict mapping normalized (singular, title-cased) ingredient name -> 1.
    """
    ignored_ingredients = {"water", "ice"}
    ignored_ingredients_containing = ("sugar", "coloring")

    features = {}
    for ingredient in row["ingredients"]:
        # Names produced by build_ingredients are title-cased ("Water", "Ice"),
        # so the comparison has to be done on a lower-cased copy; comparing the
        # raw name against the lowercase ignore list never matches.
        lowered = ingredient.strip().lower()
        if lowered in ignored_ingredients:
            continue
        if any(fragment in lowered for fragment in ignored_ingredients_containing):
            continue
        # singular_noun returns False when the word is not recognised as a plural.
        ingredient = p.singular_noun(ingredient) or ingredient
        ingredient = ingredient.strip().title()
        features[ingredient] = 1 # measure is useless
    return features


# Derive the per-drink ingredient dict first, then the binary feature dict:
# build_features reads the 'ingredients' column, so the order matters.
df['ingredients'] = df.apply(build_ingredients, axis=1)
df['features'] = df.apply(build_features, axis=1)

# Recipe key -> source column, listed in the order the keys appear in each recipe.
# NOTE(review): an earlier comment listed 'Alcoholic', 'Non alcoholic' and
# 'Optional alcohol' as the possible strAlcoholic values — confirm against the data.
recipe_columns = {
    'id': 'idDrink',
    'name': 'strDrink',
    'instructions': 'strInstructions',
    'image_thumb': 'strDrinkThumb',
    'alcohol': 'strAlcoholic',
    'ingredients': 'ingredients',
    'glass': 'strGlass',
}

# One plain dict per drink, in DataFrame row order.
recipes = [
    {key: drink[column] for key, column in recipe_columns.items()}
    for _, drink in df.iterrows()
]

# One column per distinct feature name; dense output (sparse=False).
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(df['features'])

## Get the lists, for translation purposes

In [None]:
# Every distinct ingredient name used by at least one recipe.
unique_ingredients = {
    ingredient_name
    for entry in recipes
    for ingredient_name in entry['ingredients'].keys()
}

print(sorted(unique_ingredients))


In [None]:
# Alphabetical list of all recipe names (duplicates, if any, are kept).
sorted(entry['name'] for entry in recipes)

In [None]:
# Alphabetical list of the distinct glass types.
sorted({entry['glass'] for entry in recipes})

# Persist vectors

In [None]:
import chromadb
from chromadb.config import Settings
import os

# Initialize a Chroma client that persists its data on disk under DB_DIRECTORY.
db = chromadb.PersistentClient(
        path=DB_DIRECTORY,
        settings=Settings(allow_reset=True)
    )

# Recreate the "cocktails" collection from scratch so re-running this cell
# does not fail on ids that were already inserted.
# NOTE(review): list_collections() has returned Collection objects or plain
# names depending on the chromadb release; getattr handles both — confirm
# against the installed version.
existing_collections = {getattr(entry, "name", entry) for entry in db.list_collections()}
if "cocktails" in existing_collections:
    print("Deleting existing collection")
    db.delete_collection("cocktails")
collection = db.create_collection("cocktails")

# recipes, X and df['features'] were all built from df in row order, so they
# can be walked in parallel. The feature dict is taken from the DataFrame
# column: `features` is only a local variable inside build_features and does
# not exist at notebook level.
for recipe, vector, recipe_features in zip(recipes, X, df['features']):
    collection.add(
        # One-element lists: exactly one record is added per call.
        embeddings=[vector.tolist()],
        documents=[json.dumps(list(recipe_features.keys()))],
        metadatas=[{
            "name": recipe["name"],
            "id": recipe["id"],
            # Metadata values must be scalars, so the ingredient dict is stored as JSON text.
            "ingredients": json.dumps(recipe["ingredients"]),
            "image_thumb": recipe["image_thumb"],
            "alcohol": recipe["alcohol"],
            "glass": recipe["glass"],
            "instructions": recipe["instructions"],
            }],
        ids=[recipe["id"]]
    )

print("Collection created")

# Test

In [None]:
# Recipes the user likes, looked up by exact name.
liked_recipes_names = ["Pina Colada", "Margarita"]
liked_recipes = [candidate for candidate in recipes if candidate["name"] in liked_recipes_names]
liked_ids = [liked["id"] for liked in liked_recipes]

db_collection = db.get_collection("cocktails")
print(f"Querying {db_collection.count()} cocktails...")

# Fetch the stored vectors of the liked recipes.
liked_recipes_vectors = db_collection.get(ids=liked_ids, include=["metadatas", "embeddings"])

# Exclude the liked recipes themselves from the neighbours.
where_clause = {"id": {"$nin": list(liked_ids)}}

# One result set (5 nearest neighbours) per liked recipe vector.
results = db_collection.query(
    query_embeddings=liked_recipes_vectors["embeddings"],
    where=where_clause,
    n_results=5,
    include=["metadatas", "distances","documents"],
)

# Flatten the per-query result sets into one list of records, remembering
# which query each hit came from.
all_distances = results["distances"]
all_documents = results["documents"]
results_structured = [
    {
        'id': hit_metadata["id"],
        'name': hit_metadata["name"],
        'distance': all_distances[set_index][position],
        'ingredients': json.loads(all_documents[set_index][position]),
        'result_set_index': set_index
    }
    for set_index, set_metadatas in enumerate(results["metadatas"])
    for position, hit_metadata in enumerate(set_metadatas)
]

# Closest matches first (stable sort, ties keep their original order).
results_structured.sort(key=lambda entry: entry['distance'])
pprint.pp(results_structured)
