# Recipe1M parser

In [1]:
!pip install quantulum3
!pip install stemming

import pandas as pd
import re
from recipe import Recipe




In [2]:
# Mount Google Drive and put the Recipe1M folder on the module search path
import sys

from google.colab import drive

drive.mount('/content/drive/')

# NOTE(review): presumably recipe.py lives in this folder; if so, the
# `from recipe import Recipe` in the first cell only succeeds once this cell
# has been executed -- confirm that a fresh "Run all" works.
## Place correct Link HERE !!! ##
sys.path.append('/content/drive/My Drive/Datasets/Recipe1M/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Recipe1M data
Recipe1M comes with various json files containing crawled recipes from the web. For our project, two of them are interesting:
* layer1.json: Contains all recipes in full
  
  ![layer1](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/layer1_puml.png?raw=1)

* det_ingrs.json: Contains only the recipe ID, the parsed ingredients, and a validity flag for each parsed ingredient
  
  ![det_ingrs](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/det_ingrs_puml.png?raw=1)

In our first attempt we want to make use of the parsed ingredient list and only consider recipes where all ingredients are marked valid. The parsed ingredients don't contain amounts, so our parser has to merge the content of both files: it takes the ingredient names from one file and their amounts and units from the other.

## Preprocessing
Remove all recipes with invalid entries from both the ingredient JSON and the full recipe JSON to reduce memory usage. The filtered data is stored as pickle instead of JSON.

In [3]:
# Removal of all elements in ingredient json which contain invalid entries according to the data set
ingredient_data = pd.read_json('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs.json')
recipe_raw_data = pd.read_json('/content/drive/My Drive/Datasets/Recipe1M/layer1.json')

# True for every recipe that has at least one ingredient flagged as not valid.
# A boolean mask aligned on the frame's own index replaces the former positional
# counter, which was handed to the label-based drop() and therefore only worked
# as long as the frame carried a default 0..n-1 index.
has_invalid = pd.Series(
    [any(x == False for x in flags) for flags in ingredient_data.valid],
    index=ingredient_data.index,
    dtype=bool,
)

# Frame of ids that have to be dropped from raw data
drop_ids = pd.DataFrame(ingredient_data.loc[has_invalid, 'id'])

# Keep only the fully valid ingredient rows (original index labels are preserved)
ingredient_data = ingredient_data[~has_invalid]
ingredient_data.info()

# Remove data from raw recipes where id matches
recipe_mod = recipe_raw_data[~recipe_raw_data.id.isin(drop_ids.id)]
recipe_mod.info()

# Remove fractions from raw ingredients
fractionRegex = re.compile("[0-9]+/[0-9]+")


def _fractions_to_floats(ingredients):
    """Return a new ingredient list with leading fractions written as floats.

    Args:
        ingredients: list of dicts, each holding the raw ingredient line under
            the key 'text'.

    Returns:
        A new list of {'text': ...} dicts. Every blank-separated word that
        starts with "<int>/<int>" has that fraction replaced by its float
        value; text glued to the fraction (e.g. "1/2-inch") is kept. Each word
        is followed by a single blank, so the text ends with a trailing blank.
    """
    converted = []
    for ingredient in ingredients:
        words = []
        for word in ingredient['text'].split(' '):
            match = fractionRegex.match(word)
            if match:
                numerator, denominator = (int(n) for n in match.group(0).split('/'))
                if denominator != 0:
                    # Keep whatever follows the fraction inside the same word
                    words.append(f'{numerator / denominator}{word[match.end():]}')
                    continue
            # Not a fraction (or an undefined one like "1/0"): keep the word as is
            words.append(word)
        converted.append({'text': ''.join(f'{w} ' for w in words)})
    return converted


# Assign the converted column explicitly. Writing into the rows yielded by
# iterrows() is not guaranteed to reach the frame (pandas may hand out copies).
recipe_mod = recipe_mod.assign(
    ingredients=recipe_mod['ingredients'].apply(_fractions_to_floats)
)

recipe_mod.info()

# TODO: Replace unparseable stuff like "c." --> cup 

# Save data to pickle (it's faster)
ingredient_data.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs_valid.pkl')
recipe_mod.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/layer1_valid.pkl')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   valid        869656 non-null  object
 1   id           869656 non-null  object
 2   ingredients  869656 non-null  object
dtypes: object(3)
memory usage: 26.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ingredients   869656 non-null  object
 1   url           869656 non-null  object
 2   partition     869656 non-null  object
 3   title         869656 non-null  object
 4   id            869656 non-null  object
 5   instructions  869656 non-null  object
dtypes: object(6)
memory usage: 46.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 6 columns):
 #   Column        Non-Null Cou

## Actual parsing

In [2]:
# Parsed recipes are collected in this list
recipes = []

# Load the filtered raw recipes and order the rows by recipe id
# NOTE: only unpickle files you created yourself; pickle can run arbitrary code.
data = pd.read_pickle('/content/drive/My Drive/Datasets/Recipe1M/layer1_valid.pkl')
data = data.sort_values('id')

print(f'Total number of recipes: {len(data)}')
data.head(5)


Total number of recipes: 869656


Unnamed: 0,ingredients,url,partition,title,id,instructions
1,"[{'text': '1 c. elbow macaroni '}, {'text': '1...",http://cookeatshare.com/recipes/dilly-macaroni...,train,Dilly Macaroni Salad Recipe,000033e39b,[{'text': 'Cook macaroni according to package ...
2,"[{'text': '8 tomatoes, quartered '}, {'text': ...",http://www.foodnetwork.com/recipes/gazpacho1.html,train,Gazpacho,000035f7ed,[{'text': 'Add the tomatoes to a food processo...
3,"[{'text': '2 12 cups milk '}, {'text': '1 12 c...",http://www.food.com/recipe/crunchy-onion-potat...,test,Crunchy Onion Potato Bake,00003a70b1,[{'text': 'Preheat oven to 350 degrees Fahrenh...
4,[{'text': '1 (3 ounce) package watermelon gela...,http://www.food.com/recipe/cool-n-easy-creamy-...,train,Cool 'n Easy Creamy Watermelon Pie,00004320bb,"[{'text': 'Dissolve Jello in boiling water.'},..."
5,"[{'text': '12 cup shredded coconut '}, {'text'...",http://www.food.com/recipe/easy-tropical-beef-...,train,Easy Tropical Beef Skillet,0000631d90,"[{'text': 'In a large skillet, toast the cocon..."


In [3]:
# Load the filtered parsed-ingredient rows and order them by recipe id
ingredient_data = pd.read_pickle('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs_valid.pkl')
ingredient_data = ingredient_data.sort_values('id')

print(f'Total number of ingredients: {len(ingredient_data)}')
ingredient_data.head(5)

Total number of ingredients: 869656


Unnamed: 0,valid,id,ingredients
1,"[True, True, True, True, True, True, True, Tru...",000033e39b,"[{'text': 'elbow macaroni'}, {'text': 'America..."
2,"[True, True, True, True, True, True, True, Tru...",000035f7ed,"[{'text': 'tomatoes'}, {'text': 'kosher salt'}..."
3,"[True, True, True, True, True, True, True]",00003a70b1,"[{'text': 'milk'}, {'text': 'water'}, {'text':..."
4,"[True, True, True, True, True]",00004320bb,"[{'text': 'watermelon gelatin'}, {'text': 'boi..."
5,"[True, True, True, True, True, True, True, Tru...",0000631d90,"[{'text': 'shredded coconut'}, {'text': 'lean ..."


In [4]:
# Build one Recipe object per row. `ingredient_data` (parsed ingredient names)
# and `data` (raw recipes) are paired purely by row position.

# Start from an empty list so that re-running this cell does not append duplicates
recipes = []

for i in range(len(ingredient_data)):
    # Report progress before any `continue`; otherwise a milestone is silently
    # skipped whenever the recipe at that position is rejected.
    if i % 10000 == 0:
        print(f'Progress: {i}')

    ingredients = ingredient_data.iloc[i]
    raw_recipe = data.iloc[i]

    # The positional pairing is only correct if both rows describe the same recipe
    recipe_id = ingredients['id']
    if recipe_id != raw_recipe['id']:
        print("Error: ID mismatch")
        continue

    recipe = Recipe(recipe_id)

    # Skip the recipe if the ingredient parser reports failure by returning False
    if False == recipe.parse_ingredients(ingredients['ingredients']):
        continue

    # Skip the recipe if the instruction parser reports failure by returning False
    if False == recipe.parse_instructions(raw_recipe['instructions']):
        continue

    # Amounts are taken from the raw ingredient lines
    recipe.get_ingredient_amounts(raw_recipe['ingredients'])

    recipe.title = raw_recipe['title']
    recipes.append(recipe)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-01-17 16:55:37,071 --- The classifier was built using a different scikit-learn version (=0.24.2, !=1.0.2). The disambiguation tool could behave unexpectedly. Consider running classifier.train_classfier()


Progress: 0
Progress: 1000
Progress: 2000
Progress: 3000
Progress: 4000
Progress: 5000
Progress: 6000
Progress: 7000
Progress: 8000
Progress: 9000
Progress: 10000
Progress: 11000
Progress: 12000
Progress: 13000
Progress: 14000
Progress: 15000
Progress: 16000
Progress: 17000
Progress: 18000
Progress: 19000
Progress: 20000
Progress: 21000
Progress: 22000
Progress: 23000
Progress: 24000
Progress: 25000
Progress: 26000
Progress: 27000
Progress: 28000
Progress: 29000
Progress: 30000
Progress: 31000
Progress: 32000
Progress: 33000
Progress: 34000
Progress: 35000
Progress: 36000
Progress: 37000
Progress: 38000
Progress: 39000
Progress: 40000
Progress: 41000
Progress: 42000
Progress: 43000
Progress: 44000
Progress: 45000
Progress: 46000
Progress: 47000
Progress: 48000
Progress: 49000
Progress: 50000
Progress: 51000
Progress: 52000
Progress: 53000
Progress: 54000
Progress: 55000
Progress: 56000
Progress: 58000
Progress: 59000
Progress: 60000
Progress: 61000
Progress: 62000
Progress: 63000
Progr

In [5]:
# Build the frame once from all collected recipes (one row per Recipe, columns
# taken from its instance attributes) and use the recipe id as index
df = pd.DataFrame(list(map(vars, recipes))).set_index('id')

# Persist the result both as pickle and as json
df.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/recipes_valid.pkl')
df.to_json('/content/drive/My Drive/Datasets/Recipe1M/recipes_valid.json')

df.head(10)

Unnamed: 0_level_0,title,ingredients,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,Dilly Macaroni Salad Recipe,amount unit ingredient 0 1...,0 Cook macaroni according to package direct...
000035f7ed,Gazpacho,amount unit ingredient 0 8 ...,0 Add the tomatoes to a food processor with...
00003a70b1,Crunchy Onion Potato Bake,amount unit ingredient 0 1 ...,0 Preheat oven to 350 degrees Fah...
00004320bb,Cool 'n Easy Creamy Watermelon Pie,amount unit ingredient 0 1 ...,0 Dissolve Jello in boiling water. 1 ...
0000631d90,Easy Tropical Beef Skillet,amount unit ingredient 0 ...,"0 In a large skillet, toast the coconut ove..."
000075604a,Kombu Tea Grilled Chicken Thigh,amount unit ingredient 0 2 ...,0 Pierce the skin of the chicken with a for...
00007bfd16,Strawberry Rhubarb Dump Cake,amount unit ingred...,0 Put ingredients in a buttered 9 x 12 x 2-...
000095fc1d,Yogurt Parfaits,amount unit ingredient 0 ...,0 Layer all ingredients in a serving dish. ...
0000973574,Zucchini Nut Bread,amount unit ingredient 0 2...,0 Sift dry ingr...
0000b1e2b5,Fennel-Rubbed Pork Tenderloin with Roasted Fen...,amount unit ingr...,0 Preheat oven to 350F with rack i...
