# Recipe1M parser

In [1]:
!pip install quantulum3
!pip install stemming

import pandas as pd
import re
from recipe import Recipe




In [2]:
# Mount Google Drive and expose the Recipe1M folder on the module search path.
# NOTE(review): `from recipe import Recipe` in the first cell runs before this
# path is appended; presumably recipe.py is importable from elsewhere - confirm,
# otherwise a fresh "Restart & Run All" fails on that import.
import sys

from google.colab import drive

drive.mount('/content/drive/')

# Everything appended here becomes importable for the rest of the notebook.
sys.path.append('/content/drive/My Drive/Datasets/Recipe1M/')  ## Place correct Link HERE !!! ##

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Recipe1M data
Recipe1M comes with various json files containing crawled recipes from the web. For our project, two of them are interesting:
* layer1.json: Contains every recipe in full
  
  ![layer1](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/layer1_puml.png?raw=1)

* det_ingrs.json: Contains only the recipe ID, the parsed ingredients, and a validity flag for each parsed ingredient
  
  ![det_ingrs](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/det_ingrs_puml.png?raw=1)

In our first attempt we want to make use of the parsed ingredient list and only consider recipes where all ingredients are marked valid. The parsed ingredients don't contain amounts, so our parser has to merge the content of both files: it extracts the ingredients from one file and their amounts and units from the other.

## Preprocessing
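Remove all recipes with invalid ingredients from both the ingredient JSON and the full-data JSON to reduce memory usage. The filtered data is stored as pickle instead of JSON because it is faster to work with.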

In [3]:
# Removal of all elements in ingredient json which contain invalid entries according to the data set
ingredient_data = pd.read_json('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs.json')
recipe_raw_data = pd.read_json('/content/drive/My Drive/Datasets/Recipe1M/layer1.json')

# A recipe is rejected as soon as one of its ingredients is flagged invalid.
# `False in flags` compares with ==, i.e. the same test as `x == False`.
# A boolean mask is used instead of positional counters so that filtering
# does not depend on the frame having a default RangeIndex.
has_invalid = ingredient_data['valid'].apply(lambda flags: False in flags).astype(bool)

# Frame of ids that have to be dropped from raw data
drop_ids = ingredient_data.loc[has_invalid, ['id']]

# Keep only the fully valid ingredient rows (original index labels are preserved)
ingredient_data = ingredient_data[~has_invalid]
ingredient_data.info()

# Remove data from raw recipes where id matches.
# .copy() makes recipe_mod an independent frame, so the column assignment
# below can neither touch recipe_raw_data nor raise SettingWithCopyWarning.
recipe_mod = recipe_raw_data[~recipe_raw_data['id'].isin(drop_ids['id'])].copy()
recipe_mod.info()

# Remove fractions from raw ingredients
fractionRegex = re.compile("[0-9]+/[0-9]+")


def _fraction_to_decimal(word):
    """Return `word` with a leading 'a/b' fraction replaced by its decimal value.

    Words that do not start with a fraction are returned unchanged. Any text
    following the fraction inside the same word (e.g. the '-inch' of
    '1/2-inch') is kept. A fraction with a zero denominator is left untouched
    instead of raising ZeroDivisionError.
    """
    match = fractionRegex.match(word)
    if match is None:
        return word
    numerator, denominator = (int(part) for part in match.group(0).split('/'))
    if denominator == 0:
        return word
    return f'{numerator / denominator}{word[match.end():]}'


def _replace_fractions(ingredients):
    """Build a new ingredient list whose 'text' entries use decimals instead of fractions.

    Each text is split on single spaces and converted word by word. Every word
    is followed by one space, so each resulting text ends with a trailing
    space (same format as before this cell was reworked).
    NOTE(review): mixed numbers such as '1 1/2' become '1 0.5' because the
    words are converted independently - confirm the parser copes with that.
    """
    return [
        {'text': ' '.join(_fraction_to_decimal(word) for word in ingredient['text'].split(' ')) + ' '}
        for ingredient in ingredients
    ]


# Assign a freshly built column. Writing into the rows yielded by iterrows()
# is not guaranteed to reach the frame (pandas documents that such writes may
# have no effect), so the converted text could be missing from the pickle.
recipe_mod['ingredients'] = recipe_mod['ingredients'].apply(_replace_fractions)

recipe_mod.info()

# TODO: Replace unparseable stuff like "c." --> cup 

# Save data to pickle (it's faster)
ingredient_data.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs_valid.pkl')
recipe_mod.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/layer1_valid.pkl')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   valid        869656 non-null  object
 1   id           869656 non-null  object
 2   ingredients  869656 non-null  object
dtypes: object(3)
memory usage: 26.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ingredients   869656 non-null  object
 1   url           869656 non-null  object
 2   partition     869656 non-null  object
 3   title         869656 non-null  object
 4   id            869656 non-null  object
 5   instructions  869656 non-null  object
dtypes: object(6)
memory usage: 46.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 869656 entries, 1 to 1029719
Data columns (total 6 columns):
 #   Column        Non-Null Cou

## Actual parsing

In [None]:
# Load the raw recipes that survived preprocessing and report how many there are
data = pd.read_pickle('/content/drive/My Drive/Datasets/Recipe1M/layer1_valid.pkl')
num_recipes = len(data)
print(f'Total number of recipes: {num_recipes}')
# Index by id so every raw recipe can be looked up directly
data = data.set_index('id')

ingredient_data = pd.read_pickle('/content/drive/My Drive/Datasets/Recipe1M/det_ingrs_valid.pkl')

# Successfully parsed recipes are collected here and turned into a frame once at the end
recipes = []
for recipe_id, parsed_ingredients in zip(ingredient_data['id'], ingredient_data['ingredients']):
    recipe = Recipe(recipe_id)

    # Only continue with this recipe when ingredient parsing did not report False
    if recipe.parse_ingredients(parsed_ingredients) == False:
        continue

    # Look up the raw recipe belonging to this id and take the amounts from it
    raw_recipe = data.loc[recipe.id]
    recipe.get_ingredient_amounts(raw_recipe['ingredients'])

    # Keep the recipe only when instruction parsing did not report False
    if recipe.parse_instructions(raw_recipe['instructions']) != False:
        recipe.title = raw_recipe['title']
        recipes.append(recipe)

# Create data frame in the end (according to Stackoverflow this is faster)
recipe_records = list(map(vars, recipes))
df = pd.DataFrame(recipe_records).set_index('id')
df.to_pickle('/content/drive/My Drive/Datasets/Recipe1M/recipes_valid.pkl')
df.to_json('/content/drive/My Drive/Datasets/Recipe1M/recipes_valid.json')
df.head(10)

Total number of recipes: 869656


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-01-16 20:12:48,379 --- The classifier was built using a different scikit-learn version (=0.24.2, !=1.0.2). The disambiguation tool could behave unexpectedly. Consider running classifier.train_classfier()
