In [3]:
import os
import pathlib

# Remember the directory the kernel started in the first time this cell runs.
# A plain relative chdir("../") climbs one more level on every re-run of the
# cell; anchoring on the remembered directory makes the cell idempotent.
try:
    NOTEBOOK_DIR
except NameError:
    NOTEBOOK_DIR = pathlib.Path.cwd()

# Work from the parent of the start directory so that relative paths such as
# `dataset/` are resolved from there.
os.chdir(path=NOTEBOOK_DIR / os.pardir)
print(f"Current directory: {os.getcwd()}")

# Folder (relative to the working directory set above) that holds the datasets.
path_dataset = pathlib.Path("dataset/")

Current directory: /Users/longyyu/Documents/research


In [4]:
def download_resource(resource_url, local_path, timeout=30):
    """Download ``resource_url`` to ``local_path`` unless that path already exists.

    Parameters
    ----------
    resource_url : str
        URL fetched with an HTTP GET request.
    local_path : str or path-like
        Destination file. When it already exists nothing is downloaded and a
        short message is printed instead.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot hang the
        notebook forever. Defaults to 30 seconds.

    Raises
    ------
    requests.exceptions.RequestException
        If the request itself fails or the server answers with an HTTP error
        status (so an error page is never stored as the dataset).

    Notes
    -----
    Failures while writing the file are reported with a printed ``Error: ...``
    message rather than raised. The body is streamed into ``<local_path>.part``
    and only renamed to ``local_path`` once it is complete, so an interrupted
    download is never mistaken for a cached copy on the next run.
    """
    if os.path.exists(local_path):
        print(f"Resource {local_path} already exists.")
        return

    # Imported lazily: only needed when something actually has to be fetched.
    import requests

    partial_path = f"{local_path}.part"

    # stream=True keeps the body out of memory until iter_content() pulls it;
    # the context manager releases the connection when we are done.
    with requests.get(resource_url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        print(f"Content type: {r.headers.get('content-type')}")
        print(f"Content disposition: {r.headers.get('content-disposition')}")
        print(f"Content length: {r.headers.get('content-length')}")
        try:
            # Create the destination folder on first use.
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(partial_path, 'wb') as file_disk:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        file_disk.write(chunk)
            # Publish the file under its final name only when it is complete.
            os.replace(partial_path, local_path)
        except Exception as e:
            print(f"Error: {str(e)}")
            # Do not leave a truncated file behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        else:
            print("Downloaded successfully!")

## Annotated recipes from ["RecipeScape"](https://recipescape.kixlab.org/)

Annotated recipes in JSON format for [chocolate chip cookies](https://recipescape.kixlab.org/ccc_trees.json) and [tomato pasta](https://recipescape.kixlab.org/tomatopasta_trees.json).

In [13]:
# Fetch the chocolate-chip-cookie recipe trees (skipped when a local copy exists).
import json

filename_dataset = "recipescape-choco-chip-cookies.json"
download_resource(
    resource_url="https://recipescape.kixlab.org/ccc_trees.json",
    local_path=str(path_dataset/filename_dataset)
)

# Parse the JSON file. The encoding is spelled out because JSON text is UTF-8,
# while open() otherwise falls back to the platform default, which can garble
# non-ASCII words (the tables printed in this notebook contain e.g. "sauté").
with open(path_dataset/filename_dataset, 'r', encoding='utf-8') as f:
    data = json.load(f)
# data is a list of dict objects, each dict has keys 'id' and 'tree'

Resource dataset/recipescape-choco-chip-cookies.json already exists.


This dataset is a list of dict objects; each dict has the keys `id` and `tree`.

In [42]:
# Top-level shape of the dataset: number of records and the keys of the first one.
# (For the complete structure, use: print(json.dumps(data, indent=4)))
num_records = len(data)
print(f"Num of elements: {num_records}")
first_record_keys = data[0].keys()
print("Keys of an element: ", first_record_keys)

Num of elements: 490
Keys of an element:  dict_keys(['id', 'tree'])


A `tree` is also a list of dict objects, each with the keys `word` and `ingredient` (a list of ingredient strings).

In [45]:
# Look inside the first record: its 'tree' entry, its length and the keys of
# the tree's first element.
# (For the complete structure, use: print(json.dumps(tree, indent=4)))
first_record = data[0]
tree = first_record["tree"]
num_nodes = len(tree)
print(f"Num of elements: {num_nodes}")
print("Keys of an element: ", tree[0].keys())

Num of elements: 7
Keys of an element:  dict_keys(['word', 'ingredient'])


In [75]:
# pretty print the data as a table (first 6 recipes)
def print_row(row):
    """Print one tab-separated table row.

    ``row`` supplies three cells by index: the recipe id (left-aligned, padded
    to 24 characters), the word (left-aligned, padded to 12 characters) and the
    ingredient text (printed as is).
    """
    id_cell, word_cell, ingredient_cell = row[0], row[1], row[2]
    print(f"{id_cell:<24}\t{word_cell:<12}\t{ingredient_cell}")

# Header line, then one line per tree element for the first six recipes.
# NOTE(review): the saved output of this cell lists the same recipe ids as the
# tomato-pasta table further down, so it was probably produced while `data`
# held the tomato-pasta dataset - re-run the notebook top to bottom to refresh.
print_row(["recipe_id", "word", "ingredient"])
for recipe in data[:6]:
    for idx, record in enumerate(recipe["tree"]):
        # The recipe id is shown only on the first line of each recipe.
        id_cell = recipe['id'] if idx == 0 else ' '
        ingredient_cell = ','.join(record['ingredient'])
        print_row(row=[id_cell, record['word'], ingredient_cell])

recipe_id               	word        	ingredient
54a47bb66529d92b2c02c10e	arrange     	bread slices,work surface,bread slice,cheese,tablespoons caponata,half,tomato
                        	remaining   	cheese,bread,brush,bread slices,oil,heat,skillet,medium heat
                        	is          	cheese
54a42ff76529d92b2c01385a	heat        	cooking spray
                        	add         	tomato sauce,cranberry sauce,horseradish,dry mustard,vinegar,stock
                        	lower       	
                        	stirring    	
                        	lower       	
                        	serve       	
                        	reheat      	
54a4154d19925f464b375841	heat        	oil
                        	sauté       	shallots
                        	add         	chicken
                        	turn        	chicken
                        	broth       	
                        	cover       	
                        	simmer      	chicken
                        	boil     

### Tomato pasta

In [72]:
# Fetch the tomato-pasta recipe trees (skipped when a local copy exists).
import json

filename_dataset = "recipescape-tomato-pasta.json"
download_resource(
    resource_url="https://recipescape.kixlab.org/tomatopasta_trees.json",
    local_path=str(path_dataset/filename_dataset)
)

# Parse the JSON file. The encoding is spelled out because JSON text is UTF-8,
# while open() otherwise falls back to the platform default, which can garble
# non-ASCII words (the table printed for this dataset contains e.g. "sauté").
# NOTE: this rebinds `data` (and `tree` below), replacing the chocolate-chip-
# cookie dataset loaded earlier; re-run that section before reusing its cells.
with open(path_dataset/filename_dataset, 'r', encoding='utf-8') as f:
    data = json.load(f)
# data is a list of dict objects, each dict has keys 'id' and 'tree'

# Top-level shape of the dataset.
print(f"Num of elements: {len(data)}")
print("Keys of an element: ", data[0].keys())

# Shape of the first record's tree.
tree = data[0]["tree"]
print(f"\nNum of elements: {len(tree)}")
print("Keys of an element: ", tree[0].keys())

Resource dataset/recipescape-tomato-pasta.json already exists.
Num of elements: 553
Keys of an element:  dict_keys(['id', 'tree'])

Num of elements: 3
Keys of an element:  dict_keys(['word', 'ingredient'])


In [74]:
# pretty print the data as a table (first 6 recipes)
# `print_row` is defined once, in the table cell of the chocolate-chip-cookie
# section above; it is reused here instead of being redefined identically.
print_row(["recipe_id", "word", "ingredient"])
for recipe in data[:6]:
    for idx, record in enumerate(recipe["tree"]):
        # The recipe id is shown only on the first line of each recipe.
        print_row(row=[' ' if idx else recipe['id'], record['word'], ','.join(record['ingredient'])])

recipe_id               	word        	ingredient
54a47bb66529d92b2c02c10e	arrange     	bread slices,work surface,bread slice,cheese,tablespoons caponata,half,tomato
                        	remaining   	cheese,bread,brush,bread slices,oil,heat,skillet,medium heat
                        	is          	cheese
54a42ff76529d92b2c01385a	heat        	cooking spray
                        	add         	tomato sauce,cranberry sauce,horseradish,dry mustard,vinegar,stock
                        	lower       	
                        	stirring    	
                        	lower       	
                        	serve       	
                        	reheat      	
54a4154d19925f464b375841	heat        	oil
                        	sauté       	shallots
                        	add         	chicken
                        	turn        	chicken
                        	broth       	
                        	cover       	
                        	simmer      	chicken
                        	boil     

## Simplified Recipes1M

> Source: https://dominikschmidt.xyz/simplified-recipes-1M/

In [10]:
# Fetch the Simplified Recipes1M archive unless a local copy is already present.
filename_dataset = "simplified-recipes-1M.npz"
local_archive = path_dataset / filename_dataset
download_resource(
    "https://github.com/schmidtdominik/RecipeNet/raw/master/simplified-recipes-1M.npz",
    str(local_archive),
)

Resource dataset/simplified-recipes-1M.npz already exists.


In [None]:
import numpy as np

# Open the downloaded archive.
# NOTE(review): for an .npz file np.load presumably returns a lazily-read
# archive object that keeps the file open until it is closed - confirm against
# the numpy documentation. This also rebinds `data`, which earlier cells used
# for the RecipeScape JSON lists.
archive_path = path_dataset / filename_dataset
data = np.load(archive_path)
# Alternative that closes the file automatically once the arrays are read:
# with np.load(path_dataset/filename_dataset) as data:
#     recipes = data['recipes']
#     ingredients = data['ingredients']