# Recipe1M parser

In [None]:
!pip install quantulum3
!pip install stemming

In [None]:
# Add GDrive (only when the notebook runs on Google Colab)
import sys

try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: there is nothing to mount, so this cell becomes a
    # no-op and "Run All" keeps working with a local FILE_DIR.
    drive = None

if drive is not None:
    drive.mount('/content/drive/')
    # Extend the module search path with the dataset folder on the mounted
    # drive (presumably where the local `recipe` module lives - confirm).
    sys.path.append('/content/drive/My Drive/Datasets/Recipe1M/')

In [1]:
import pandas as pd
import re
from recipe import Recipe

# Base directory of all input and output files. File names are appended to it by
# plain string concatenation, so the value must end with a path separator.
FILE_DIR = '../'
# Alternative base directory pointing at the dataset folder on a mounted Google Drive:
#FILE_DIR = '/content/drive/My Drive/Datasets/Recipe1M/'

## Recipe1M data
Recipe1M comes with various json files containing crawled recipes from the web. For our project, two of them are interesting:
* layer1.json: Contains all recipes to their full extent
  
  ![layer1](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/layer1_puml.png?raw=1)

* det_ingrs.json: Contains only the recipe ID, the parsed ingredients and a validity flag for each parsed ingredient
  
  ![det_ingrs](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/det_ingrs_puml.png?raw=1)

In our first attempt we want to make use of the parsed ingredient list and only consider recipes where all ingredients are marked valid. The parsed ingredients don't contain amounts, so our parser has to merge the content of both files: it takes the ingredient names from one file and their amounts and units from the other.

## Preprocessing
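Remove all recipes with invalid ingredient entries from both the ingredient JSON and the full recipe JSON to reduce memory usage. Fractions in the raw ingredient texts are converted to decimal numbers, and the results are stored as pickle files instead of JSON.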

In [3]:
ingredient_file = FILE_DIR + 'ingSub.json'
layer1_file = FILE_DIR + 'layer1_stripped.json'

ingredient_out = FILE_DIR + 'det_ingrs_valid.pkl'
layer1_out = FILE_DIR + 'layer1_valid.pkl'

# Matches a fraction such as "1/2"; it is applied to the start of each word
fractionRegex = re.compile("[0-9]+/[0-9]+")


def _fractions_to_decimals(text):
    """Replace leading fractions of space-separated words by decimal numbers.

    Every word of ``text`` (split on single spaces) that starts with a fraction
    like ``3/4`` gets that fraction replaced by its float value (``0.75``).
    Characters following the fraction inside the same word (``1/2-inch``) are
    kept. A fraction with a zero denominator is left unchanged instead of
    raising ``ZeroDivisionError``. Words are converted independently, so a
    mixed number such as ``2 1/2`` becomes ``2 0.5``.

    Returns the converted words, each followed by one space (the result
    therefore ends with a trailing space, as in earlier versions of this cell).
    """
    words_mod = []
    for word in text.split(' '):
        match = fractionRegex.match(word)
        if match:
            numerator, denominator = (int(n) for n in match.group(0).split('/'))
            if denominator != 0:
                # Keep whatever follows the fraction within the same word
                word = f'{numerator / denominator}{word[match.end():]}'
        words_mod.append(word)
    return ''.join(f'{word} ' for word in words_mod)


def _convert_ingredient_list(ingredient_list):
    """Return a new list of ``{'text': ...}`` dicts with fractions converted."""
    return [{'text': _fractions_to_decimals(ingredient['text'])}
            for ingredient in ingredient_list]


# Removal of all elements in ingredient json which contain invalid entries according to the data set
ingredient_data = pd.read_json(ingredient_file).set_index('id')
recipe_raw_data = pd.read_json(layer1_file).set_index('id')

# Get indices of recipes whose 'valid' list contains at least one False flag
has_invalid_flag = [any(flag == False for flag in flags) for flags in ingredient_data['valid']]
indices = ingredient_data[has_invalid_flag].index

# Drop indices from ingredient data
ingredient_data = ingredient_data.drop(indices).drop(columns=['valid'])

# Remove data from raw recipes where id matches
recipe_mod = recipe_raw_data.drop(indices).drop(columns=['url', 'partition'])

# Remove fractions from raw ingredients. The column is rebuilt and assigned as a
# whole: writing into the rows yielded by iterrows() is not guaranteed to change
# the DataFrame, because iterrows() may hand out copies.
recipe_mod['ingredients'] = recipe_mod['ingredients'].map(_convert_ingredient_list)

# Save data to pickle (it's faster)
ingredient_data.to_pickle(ingredient_out)
recipe_mod.to_pickle(layer1_out)

recipe_mod.head(5)


Unnamed: 0_level_0,ingredients,title,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,"[{'text': '1 c. elbow macaroni '}, {'text': '1...",Dilly Macaroni Salad Recipe,[{'text': 'Cook macaroni according to package ...
000035f7ed,"[{'text': '8 tomatoes, quartered '}, {'text': ...",Gazpacho,[{'text': 'Add the tomatoes to a food processo...
00003a70b1,"[{'text': '2 12 cups milk '}, {'text': '1 12 c...",Crunchy Onion Potato Bake,[{'text': 'Preheat oven to 350 degrees Fahrenh...
00004320bb,[{'text': '1 (3 ounce) package watermelon gela...,Cool 'n Easy Creamy Watermelon Pie,"[{'text': 'Dissolve Jello in boiling water.'},..."
0000631d90,"[{'text': '12 cup shredded coconut '}, {'text'...",Easy Tropical Beef Skillet,"[{'text': 'In a large skillet, toast the cocon..."


## Actual parsing

In [4]:
# Collects the Recipe objects created by the parsing loop
recipes = list()
# Read back the filtered raw recipes from their pickle file
data = pd.read_pickle(f'{FILE_DIR}layer1_valid.pkl')
print('Total number of recipes:', len(data.index))
data.head(5)


Total number of recipes: 22


Unnamed: 0_level_0,ingredients,title,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,"[{'text': '1 c. elbow macaroni '}, {'text': '1...",Dilly Macaroni Salad Recipe,[{'text': 'Cook macaroni according to package ...
000035f7ed,"[{'text': '8 tomatoes, quartered '}, {'text': ...",Gazpacho,[{'text': 'Add the tomatoes to a food processo...
00003a70b1,"[{'text': '2 12 cups milk '}, {'text': '1 12 c...",Crunchy Onion Potato Bake,[{'text': 'Preheat oven to 350 degrees Fahrenh...
00004320bb,[{'text': '1 (3 ounce) package watermelon gela...,Cool 'n Easy Creamy Watermelon Pie,"[{'text': 'Dissolve Jello in boiling water.'},..."
0000631d90,"[{'text': '12 cup shredded coconut '}, {'text'...",Easy Tropical Beef Skillet,"[{'text': 'In a large skillet, toast the cocon..."


In [5]:
# Read back the filtered ingredient lists from their pickle file.
# The count printed below is the number of rows of that frame.
ingredient_data = pd.read_pickle(f'{FILE_DIR}det_ingrs_valid.pkl')
print('Total number of ingredients:', len(ingredient_data.index))
ingredient_data.head(5)

Total number of ingredients: 21


Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
000033e39b,"[{'text': 'elbow macaroni'}, {'text': 'America..."
000035f7ed,"[{'text': 'tomatoes'}, {'text': 'kosher salt'}..."
00003a70b1,"[{'text': 'milk'}, {'text': 'water'}, {'text':..."
00004320bb,"[{'text': 'watermelon gelatin'}, {'text': 'boi..."
0000631d90,"[{'text': 'shredded coconut'}, {'text': 'lean ..."


In [6]:
# Start from an empty list so that re-running this cell does not append every
# recipe a second time (duplicates would end up in the exported data).
recipes = []

# Only the 'ingredients' column is needed, so iterate it directly as
# (recipe id, parsed ingredient list) pairs.
for idx, ingredient_list in ingredient_data['ingredients'].items():

    raw_recipe = data.loc[idx]
    recipe = Recipe(idx)

    # Skip the recipe if the ingredient parser reports failure. The comparison
    # with False is kept explicit because the parser's return type is not
    # visible here (a None result must not count as a failure).
    if recipe.parse_ingredients(ingredient_list) == False:
        continue

    # Skip the recipe if the instruction parser reports failure
    if recipe.parse_instructions(raw_recipe['instructions']) == False:
        continue

    # Amounts and units come from the raw ingredient texts of the full recipe
    recipe.get_ingredient_amounts(raw_recipe['ingredients'])

    recipe.title = raw_recipe['title']
    recipes.append(recipe)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-01-19 23:04:39,841 --- The classifier was built using a different scikit-learn version (=0.24.2, !=1.0.1). The disambiguation tool could behave unexpectedly. Consider running classifier.train_classfier()


[ID: 000033e39b 
Title: Dilly Macaroni Salad Recipe 
Ingredients:   amount        unit       ingredient
0    1.0         cup   elbow macaroni
1      1              American cheese
2    0.5         cup           celery
3      1                green peppers
4      1                     pimentos
5    0.5         cup       mayonnaise
6    1.0  tablespoon          vinegar
7   0.75    teaspoon             salt
8    0.5    teaspoon    dry dill weed 
Instructions 0    Cook macaroni according to package directions;...
1                                                Cold.
2    Combine macaroni, cheese cubes, celery, green ...
3    Blend together mayonnaise or possibly salad dr...
4                                        Toss lightly.
5                           Cover and refrigeratewell.
6       Serve salad in lettuce lined bowl if you like.
7                                    Makes 6 servings.
dtype: object]
[ID: 000033e39b 
Title: Dilly Macaroni Salad Recipe 
Ingredients:   amount        uni

In [5]:
# Build the data frame once from all parsed recipes instead of growing it row
# by row; each recipe contributes its instance attributes as one record.
df = pd.DataFrame([vars(r) for r in recipes])
df = df.set_index('id')
df.to_pickle(FILE_DIR + 'recipes_valid.pkl')
# orient='records' does not write the index, so the recipe id is moved back
# into a regular column for the JSON export; otherwise it would be lost.
df.reset_index().to_json(FILE_DIR + 'recipes_valid.json', indent=2, orient='records')
df.head(10)

Unnamed: 0_level_0,title,ingredients,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,Dilly Macaroni Salad Recipe,amount unit ingredient 0 1...,0 Cook macaroni according to package direct...
000035f7ed,Gazpacho,amount unit ingredient 0 8 ...,0 Add the tomatoes to a food processor with...
00003a70b1,Crunchy Onion Potato Bake,amount unit ingredient 0 1 ...,0 Preheat oven to 350 degrees Fah...
00004320bb,Cool 'n Easy Creamy Watermelon Pie,amount unit ingredient 0 1 ...,0 Dissolve Jello in boiling water. 1 ...
0000631d90,Easy Tropical Beef Skillet,amount unit ingredient 0 ...,"0 In a large skillet, toast the coconut ove..."
000075604a,Kombu Tea Grilled Chicken Thigh,amount unit ingredient 0 2 ...,0 Pierce the skin of the chicken with a for...
00007bfd16,Strawberry Rhubarb Dump Cake,amount unit ingred...,0 Put ingredients in a buttered 9 x 12 x 2-...
000095fc1d,Yogurt Parfaits,amount unit ingredient 0 ...,0 Layer all ingredients in a serving dish. ...
0000973574,Zucchini Nut Bread,amount unit ingredient 0 2...,0 Sift dry ingr...
0000b1e2b5,Fennel-Rubbed Pork Tenderloin with Roasted Fen...,amount unit ingr...,0 Preheat oven to 350F with rack i...
