### load env variables

In [6]:
# One-time setup, if the packages are missing:
# !pip install ipynb # used to import another ipynb
# !pip install python-dotenv # used to load .env file

# Load variables from the .env file into the process environment.
# Run this cell before any other cell in the notebook.
from dotenv import load_dotenv
load_dotenv()


True

### Import

In [7]:
import json
# from preprocess.readcsv import read
# from preprocess import esClient
# Project-local helpers: eshelper is used below for bulk indexing and
# searching, csvhelper for reading the CSV inputs.
import helper.eshelper as eshelper
import helper.csvhelper as csvhelper

# Reload the helper modules so that edits to their source files take effect
# without restarting the kernel.
import importlib
importlib.reload(eshelper)
importlib.reload(csvhelper)



<module 'helper.csvhelper' from '/Users/roychiu/Desktop/ISS Master/GroupProject1/preprocess/helper/csvhelper.py'>

### Read recipe data

In [8]:
# Column names handed to csvhelper.read for ../data/recipe.csv
# (presumably applied to the CSV columns by position -- TODO confirm).
header = [
    "query", "recipe_id", "publisher",
    "source_url", "image_url", "social_rank", "publisher_url",
    "title",
    "sum_cal", "sum_fat", "sum_carb", "sum_protein",
]
recipeData = csvhelper.read("../data/recipe.csv", header)


### push recipe data to es

In [9]:
# Bulk-index the raw recipe rows into the "recipe_raw" index.
# "{recipe_id}" is passed as the third argument (presumably the document-id
# template -- TODO confirm against eshelper.bulkIndex).
# The helper's return value is treated as an error object; None means success.
err = eshelper.bulkIndex(recipeData, "recipe_raw", "{recipe_id}")
if err is None:
    print("pushed to elasticsearch")
else:
    print(err)

pushing 3166 documents
(3166, [])
pushed to elasticsearch


### read ingredient data

In [10]:
# Column names handed to csvhelper.read for ../data/ingredient.csv
# (presumably applied to the CSV columns by position -- TODO confirm that the
# file really has these 13 columns).
#
# A comma was missing after "nutrition_value". Python joins adjacent string
# literals, so the list used to contain a single "nutrition_valuecal" entry,
# was one name short, and every later name was shifted by one column.
#
# NOTE(review): "cah" looks like a typo for "carb" (recipe.csv uses
# "sum_carb"). It is left unchanged because it is a field name that ends up in
# the indexed documents.
header = [
    "query",
    "recipe_id",
    "quantity",
    "unit",
    "weight_g",
    "raw_ingredient",
    "ingredient",
    "nutrition_value",
    "cal",
    'fat',
    'cah',
    'protein',
    'original_ingredient'
]
ingredientData = csvhelper.read("../data/ingredient.csv", header)

### push ingredient data to es

In [11]:
# Bulk-index the raw ingredient rows into the "recipe_ingredient_raw" index.
# "{recipe_id}-{ingredient}" is passed as the third argument (presumably the
# document-id template -- TODO confirm against eshelper.bulkIndex).
# The helper's return value is treated as an error object; None means success.
err = eshelper.bulkIndex(ingredientData, "recipe_ingredient_raw", "{recipe_id}-{ingredient}")
if err is None:
    print("pushed to elasticsearch")
else:
    print(err)

pushing 35827 documents
181 document(s) failed to index.


### get similarity result

In [12]:
# Load the precomputed similarity table: recipe_id -> list of similar recipes.
# Each list entry is read below through its 'recipe_id' and 'score' keys.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open('../data/similarity_result.json', 'r', encoding='utf-8') as f:
    SIMILAR_RECIPES = json.load(f)


### preprocess data

### build one document per recipe (ingredients, weights, popularity)

In [15]:
# Build one enriched document per recipe, keyed by recipe_id.
#
# Fields added to every recipe:
#   ingredients               raw ingredient rows belonging to the recipe
#   ingredients_g             stripped ingredient name -> int(weight_g)
#   ingredients_weight_g      ingredient names, each repeated `times` times
#   ingredients_g_normalised  [{"ingredient", "weight"}], weight = share of total * 10
#   ingredients_popularity    term -> score, scaled so the largest score is 1
#   ingredients_name          all ingredient names joined with spaces
#   total_ingredient_weight_g sum of weight_g over the recipe's ingredients
#   min_ingredient_weight_g   smallest weight_g among the recipe's ingredients
#   similar_recipes           only when the recipe appears in SIMILAR_RECIPES
#
# Compared with the previous version, three crash paths are guarded; results
# for inputs that already ran are unchanged:
#   * a recipe without ingredients (max() on an empty dict raised ValueError)
#   * a minimum ingredient weight of 0 (ZeroDivisionError computing `times`)
#   * a total ingredient weight of 0 (ZeroDivisionError computing the share)
recipeDict = {}
for recipe in recipeData:
    recipe_id = recipe["recipe_id"]
    entry = recipe.copy()
    entry["ingredients"] = []
    entry["ingredients_g"] = {}
    entry["ingredients_weight_g"] = []
    entry["ingredients_g_normalised"] = []
    entry["ingredients_popularity"] = {}
    entry["ingredients_name"] = ""
    entry["total_ingredient_weight_g"] = 0
    # Sentinel: replaced by the first ingredient weight seen for this recipe.
    entry["min_ingredient_weight_g"] = float("inf")
    if recipe_id in SIMILAR_RECIPES:
        entry["similar_recipes"] = [
            {"recipe_id": similar['recipe_id'], "score": similar['score']}
            for similar in SIMILAR_RECIPES[recipe_id]
        ]
    # A repeated recipe_id in recipeData overwrites the earlier entry.
    recipeDict[recipe_id] = entry

# Attach every ingredient row to its recipe and accumulate the weights.
# An ingredient whose recipe_id is not in recipeDict raises KeyError.
for ingredient in ingredientData:
    entry = recipeDict[ingredient["recipe_id"]]
    name = ingredient["ingredient"]
    weight = ingredient["weight_g"]
    entry["ingredients"].append(ingredient)
    entry["ingredients_name"] += " {0}".format(name)
    entry["total_ingredient_weight_g"] += weight
    entry["ingredients_g"][name.strip()] = int(weight)
    if weight < entry["min_ingredient_weight_g"]:
        # remembered so that weights can be expressed as multiples of it below
        entry["min_ingredient_weight_g"] = weight

# Derive the normalised weights, the popularity scores and the repeated-name list.
for entry in recipeDict.values():
    ingredients = entry["ingredients"]
    if not ingredients:
        # Nothing to normalise. Also replace the inf sentinel, which has no
        # representation in strict JSON, before the document is indexed.
        entry["min_ingredient_weight_g"] = 0
        continue

    min_weight = entry["min_ingredient_weight_g"]
    total_weight = entry["total_ingredient_weight_g"]
    popularity = entry["ingredients_popularity"]

    for ingredient in ingredients:
        name = ingredient["ingredient"]
        # How many times the lightest ingredient fits into this one (0 when
        # the lightest ingredient weighs nothing).
        times = int(ingredient["weight_g"] / min_weight) if min_weight else 0

        grams = entry["ingredients_g"][name.strip()]
        entry["ingredients_g_normalised"].append({
            "ingredient": name,
            "weight": grams / total_weight * 10 if total_weight else 0.0,
        })

        # this is used for popularity search: one key for the whole name
        # (spaces -> underscores) plus one key per word.
        # NOTE(review): for a single-word name both keys are the same, so the
        # word ends up with 2 * times; the whole-name assignment also
        # overwrites any score an earlier ingredient added under that key.
        # Kept as-is -- confirm whether this is intended.
        popularity[name.replace(' ', '_')] = times
        for word in name.split():
            popularity[word] = popularity.get(word, 0) + times

        # this is used for text search: the name is repeated `times` times
        entry["ingredients_weight_g"].extend([name] * times)

    # normalize popularity so the highest score becomes 1
    peak = max(popularity.values())
    if peak:
        for term in popularity:
            popularity[term] = popularity[term] / peak


### push preprocessed data to es

In [19]:
# Bulk-index the enriched recipes into the "recipe_detail" index.
# Each document is a shallow copy of the recipeDict entry without its
# "ingredients" list, so the raw ingredient rows are not part of what is sent.
recipes = [
    {field: value for field, value in doc.items() if field != "ingredients"}
    for doc in recipeDict.values()
]

# The helper's return value is treated as an error object; None means success.
err = eshelper.bulkIndex(recipes, "recipe_detail", "{recipe_id}")
if err is None:
    print("pushed to elasticsearch")
else:
    print(err.errors)

pushing 2256 documents
(2256, [])
pushed to elasticsearch


### test search recipe detail

In [12]:
# Smoke-test the index by searching for "cheese" in every index matching
# "recipe_detail*". The third argument is presumably the maximum number of
# hits -- TODO confirm against eshelper.searchByIngredient.
index_pattern, search_term, hit_limit = "recipe_detail*", "cheese", 10
eshelper.searchByIngredient(index_pattern, search_term, hit_limit)

Got 10 Hits:


### read and push nutrition and unit data to es

In [9]:
# Load the unit-conversion and nutrition lookup tables, flatten each into a
# list of documents, and bulk-index them into "units" and "ingredients".
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the reads
# do not depend on the platform's locale default.
with open('../data/unit_to_gram_convertion.json', 'r', encoding='utf-8') as f:
    UNITS_DICT = json.load(f)
with open('../data/nutritions_dict.json', 'r', encoding='utf-8') as f:
    NUTRITIONS_DICT = json.load(f)


# One document per nutrition entry, with its dictionary key stored under
# "ingredient". The entry is modified in place, so NUTRITIONS_DICT's values
# gain the field as well.
NUTRITIONS = []
for name, nutrition in NUTRITIONS_DICT.items():
    nutrition["ingredient"] = name
    NUTRITIONS.append(nutrition)

# One {"unit", "value"} document per conversion entry.
UNITS = [{"unit": unit, "value": value} for unit, value in UNITS_DICT.items()]


# "{ingredient}-{id}" implies that every nutrition entry carries an "id"
# field -- TODO confirm against the data file.
err = eshelper.bulkIndex(NUTRITIONS, "ingredients", "{ingredient}-{id}")
print(err)

err = eshelper.bulkIndex(UNITS, "units", "{unit}")
print(err)

pushing 553 documents
None
pushing 64 documents
None
