In [1]:
import requests
import json
import pandas as pd
import numpy as np
from datetime import datetime
import os
import glob
import time

In [112]:
# Load the API key used in the Spoonacular request URLs below;
# it is stored under the 'protonmail' entry of apikey.json.
with open("apikey.json") as key_file:
    API_KEY = json.load(key_file)['protonmail']

## Get recipe data from Spoonacular API

In [3]:
# Helper function
# export as csv
def export_df_as_csv(data):
    '''
    Write a DataFrame to a new timestamped CSV file inside the data/ folder.

    The file is named data/recipedata-<YYYY-MM-DD>-<HH>-<MM>-<SS>.csv.

    params:
        - data: pd.DataFrame - the table to export (its index is written too)
    return: None
    '''
    # Create the output folder on first use instead of failing inside to_csv
    os.makedirs("data", exist_ok=True)
    # Zero-padded fields joined by '-' only: ':' is not allowed in Windows file names
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    data.to_csv(f"data/recipedata-{timestamp}.csv")
    
def get_recipes(n, request_fn):
    '''
    Invoke request_fn up to n times and gather every returned batch
    of recipes into a single DataFrame.

    request_fn must return a (list_of_recipes, is_success) pair. Collection
    stops at the first unsuccessful call, whose batch is discarded.

    return: pd.Dataframe
    '''
    collected = []
    for _attempt in range(n):
        batch, ok = request_fn()
        if ok:
            collected.extend(batch)
        else:
            # Stop requesting as soon as one call fails
            break
    return pd.DataFrame(collected)

### Random recipes

In [62]:
# Number of 100-recipe batches to request

n_hundred = 10 # 10 hundred = 1000 random recipes

In [63]:
def get_100_random_recipes(timeout=30):
    '''
    This function requests 100 random recipes from Spoonacular
    100 is the highest number of recipes one could get from a single request

    params:
        - timeout: seconds to wait for the server before giving up

    return:
        - List( Dict() ) - a list of recipes (full information)
        - True if status code is 200, False otherwise
          (also False when the request itself fails, e.g. network error or timeout)
    '''
    try:
        res = requests.get(
            "https://api.spoonacular.com/recipes/random",
            params={"apiKey": API_KEY, "number": 100},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Report only the error type: the full message can include the
        # request URL, and with it the API key.
        print(f"Request failed: {type(err).__name__}")
        return [], False

    if res.status_code != 200:
        print(res.status_code)
        print(res.content)
        return [], False

    return json.loads(res.content)["recipes"], True

In [64]:
# Get data from Spoonacular: up to n_hundred batches of random recipes,
# combined into one DataFrame (collection stops early if a request fails)
recipe_data = get_recipes(n_hundred, get_100_random_recipes)

In [65]:
# (rows, columns) of the downloaded recipes
recipe_data.shape

(1000, 38)

In [66]:
# Save the random recipes to a timestamped CSV file under data/
export_df_as_csv(recipe_data)

### Get random recipes based on complex search

One problem with the above approach is that it returns a very similar set of 1000 recipes for each request. Therefore, fewer than 1000 unique recipes are collected even after multiple requests for 1000 recipes. 

To overcome this, we are going to use the complex search method, with the `sort` option set to `random`. 

In [71]:
def random_complex_search_100(timeout=30):
    '''
    This function gets 100 random recipes from Spoonacular with the complex search method

    params:
        - timeout: seconds to wait for the server before giving up

    returns:
        - List( Dict() ) - a list of recipes (limited information)
        - True if status code is 200, False otherwise
          (also False when the request itself fails, e.g. network error or timeout)
    '''
    try:
        res = requests.get(
            "https://api.spoonacular.com/recipes/complexSearch",
            params={"apiKey": API_KEY, "number": 100, "sort": "random"},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Report only the error type: the full message can include the
        # request URL, and with it the API key.
        print(f"Request failed: {type(err).__name__}")
        return [], False

    # Pause after every completed request to space out consecutive API calls
    time.sleep(1)

    if res.status_code != 200:
        print(res.status_code)
        print(res.content)
        return [], False

    return json.loads(res.content)["results"], True

In [113]:
# Fetch a single batch of recipes through the complex search endpoint
recipe_data1 = get_recipes(1, random_complex_search_100)

402
b'{"status":"failure", "code":402,"message":"Your daily points limit of 150 has been reached. Please upgrade your plan to continue using the API."}'


However, instead of getting all data related to the recipes, we only get `id`, `title`, `image` and `imageType`. To overcome this, we will get the information for each recipe by sending another set of requests. 

In [103]:
def get_recipe_info(recipe_ids, timeout=30):
    '''
    Download the full information (including nutrition) of each recipe,
    one request per recipe id.

    Downloading stops at the first failed request (non-200 status, network
    error or timeout); recipes fetched before the failure are still returned.

    params:
        - recipes_ids: List(int) - list of recipe ids
        - timeout: seconds to wait for the server on each request
    returns:
        - pd.DataFrame with one row per successfully downloaded recipe
    '''
    recipes = []
    for recipe_id in recipe_ids:
        try:
            res = requests.get(
                f"https://api.spoonacular.com/recipes/{recipe_id}/information",
                params={"apiKey": API_KEY, "includeNutrition": "true"},
                timeout=timeout,
            )
        except requests.RequestException as err:
            # Report only the error type: the full message can include the
            # request URL, and with it the API key.
            print(f"Request failed: {type(err).__name__}")
            break

        # Pause after every completed request to space out consecutive API calls
        time.sleep(1)

        if res.status_code != 200:
            print(res.status_code)
            print(res.content)
            break
        recipes.append(json.loads(res.content))
    return pd.DataFrame(recipes)

In [104]:
# Look up the full information for every id returned by the complex search
recipe_ids = recipe_data1["id"].to_list()
recipe_info = get_recipe_info(recipe_ids)

402
b'{"status":"failure", "code":402,"message":"Your daily points limit of 150 has been reached. Please upgrade your plan to continue using the API."}'


In [105]:
# Preview the first rows of the downloaded recipe information
recipe_info.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,weightWatcherSmartPoints,gaps,...,cuisines,dishTypes,diets,occasions,winePairing,instructions,analyzedInstructions,originalId,license,spoonacularSourceUrl
0,False,False,False,False,False,False,False,False,16,no,...,[],[side dish],[],[],{},,[],,,
1,False,False,False,False,False,False,True,False,11,no,...,"[Mediterranean, Italian, European]","[morning meal, brunch, breakfast]",[],[],{},,[],,,
2,True,False,False,False,False,False,False,False,8,no,...,[],[],[lacto ovo vegetarian],[father's day],"{'pairedWines': [], 'pairingText': '', 'produc...",Set oven to 350ºF and grease a loaf pan.Combin...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,,
3,False,False,False,True,False,False,False,False,6,no,...,[],"[lunch, main course, main dish, dinner]","[dairy free, pescatarian]",[],"{'pairedWines': ['pinot grigio', 'riesling', '...","1.In wok or large skillet, heat oil over mediu...","[{'name': '', 'steps': [{'number': 1, 'step': ...",,,
4,False,False,True,True,False,False,False,False,24,no,...,[],"[lunch, main course, main dish, dinner]","[gluten free, dairy free]",[],"{'pairedWines': [], 'pairingText': '', 'produc...",Cook onions in bacon drippings until tender. R...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,CC BY 3.0,https://spoonacular.com/choucroute-garni-57810


In [106]:
# Number of recipes fetched before the loop finished or a request failed
print(f"{recipe_info.shape[0]} recipes downloaded")

33 recipes downloaded


In [107]:
# Save the detailed recipe information to a timestamped CSV file under data/
export_df_as_csv(recipe_info)

## Grouping all csv files together

In [108]:
# This is just for us to know how many recipes we originally had
# (used below to report the number of newly added recipes)
grouped = pd.read_csv("./data/recipedata-grouped.csv")

In [109]:
# Read every recipe CSV in data/ (the pattern also matches the previously
# grouped file) and stack them into a single DataFrame

recipe_csv_files = glob.glob('./data/recipedata*.csv')
frames = [pd.read_csv(csv_path, index_col=0) for csv_path in recipe_csv_files]
df = pd.concat(frames)
df.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,weightWatcherSmartPoints,gaps,...,dishTypes,diets,occasions,winePairing,instructions,analyzedInstructions,originalId,license,spoonacularSourceUrl,author
0,True,True,False,True,True,False,True,False,11,no,...,['side dish'],"['dairy free', 'lacto ovo vegetarian', 'vegan']",[],"{'pairedWines': [], 'pairingText': '', 'produc...",Directions: ...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,,,
1,False,False,False,False,False,False,False,False,23,no,...,['dessert'],[],[],"{'pairedWines': ['cream sherry', 'moscato dast...","To make the cake, soak the apricots in the bra...","[{'name': '', 'steps': [{'number': 1, 'step': ...",,,,
2,True,False,True,True,False,False,True,False,4,no,...,['side dish'],"['gluten free', 'dairy free', 'lacto ovo veget...","[""father's day"", '4th of july', 'summer']",{},"Add the mayonnaise, sugar, lemon juice, white ...","[{'name': '', 'steps': [{'number': 1, 'step': ...",,,,
3,True,False,False,False,False,False,False,False,6,no,...,[],['lacto ovo vegetarian'],[],"{'pairedWines': [], 'pairingText': '', 'produc...",<p>Place all ingredients into pan of bread mac...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,,,
4,False,False,False,False,False,False,False,False,14,no,...,"['lunch', 'main course', 'main dish', 'dinner']",[],[],"{'pairedWines': [], 'pairingText': 'No one win...",To make the spicy marinara sauce: Combine the ...,"[{'name': 'To make the spicy marinara sauce', ...",,,,


In [110]:
# Remove exact duplicate rows, then report how many recipes were gained
# compared with the previously grouped file
print(f"Before dropping duplicates: {df.shape}")
df = df.drop_duplicates()
print(f"After dropping duplicates: {df.shape}")
new_recipe_count = len(df) - len(grouped)
print(f"{new_recipe_count} new recipes added.")

Before dropping duplicates: (2981, 40)
After dropping duplicates: (2881, 40)
33 new recipes added.


In [111]:
# Overwrite the grouped file with the de-duplicated set of recipes
df.to_csv("data/recipedata-grouped.csv")