# Recipe1M parser

In [None]:
!pip install quantulum3
!pip install stemming

In [None]:
# Google Colab only: mount Google Drive and extend the module search path
from google.colab import drive
import sys
# Make the Drive contents available under /content/drive/
drive.mount('/content/drive/')
# Allow imports from the Recipe1M dataset folder on the mounted Drive
# (presumably where the `recipe` module imported below is stored - verify)
sys.path.append('/content/drive/My Drive/Datasets/Recipe1M/')

In [None]:
import pandas as pd
import re
from recipe import Recipe

# Base directory; every input/output file name in this notebook is appended to it
FILE_DIR = '../'
# Alternative base directory pointing at the dataset folder on the mounted Google Drive
#FILE_DIR = '/content/drive/My Drive/Datasets/Recipe1M/'

## Recipe1M data
Recipe1M comes with various json files containing crawled recipes from the web. For our project, two of them are interesting:
* layer1.json: Contains all recipes to their full extent
  
  ![layer1](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/layer1_puml.png?raw=1)

* det_ingrs.json: Contains only the recipe ID, the parsed ingredients and a validity flag for the parsing
  
  ![det_ingrs](https://github.com/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/dataset-analysis/det_ingrs_puml.png?raw=1)

In our first attempt we want to make use of the parsed ingredient list and only consider recipes in which all ingredients are marked valid. The parsed ingredients don't contain amounts, so our parser has to merge the content of both files: it takes the ingredients from one file and their amounts and units from the other.

## Preprocessing
Remove all invalid entries from the ingredient JSON and the full recipe JSON to reduce memory usage. The filtered data is stored as pickle instead of JSON.

In [None]:
# Source JSON files
ingredient_file = FILE_DIR + 'ingSub.json'
layer1_file = FILE_DIR + 'layer1_stripped.json'

# Target pickle files for the filtered data
ingredient_out = FILE_DIR + 'det_ingrs_valid.pkl'
layer1_out = FILE_DIR + 'layer1_valid.pkl'

# Load both JSON files and use the recipe id as index
ingredient_data = pd.read_json(ingredient_file).set_index('id')
recipe_raw_data = pd.read_json(layer1_file).set_index('id')

# One boolean per row: True as soon as any entry of its 'valid' list equals False
has_invalid_flag = [
    any(flag == False for flag in flags)
    for flags in ingredient_data['valid']
]
indices = ingredient_data[has_invalid_flag].index

# Remove the flagged rows from the ingredient data; the 'valid' column is
# not needed once the filtering is done
ingredient_data = ingredient_data.drop(indices)
ingredient_data = ingredient_data.drop(columns=['valid'])

# Remove the same ids from the raw recipes and drop the unused columns
recipe_mod = recipe_raw_data.drop(indices)
recipe_mod = recipe_mod.drop(columns=['url', 'partition'])

# Replace fractions such as "1/2" in the raw ingredient texts by their
# decimal value (e.g. "0.5").
fractionRegex = re.compile(r"[0-9]+/[0-9]+")


def _fraction_to_decimal(match):
    """Return the decimal string for a matched 'numerator/denominator'.

    A fraction with a zero denominator cannot be evaluated; it is returned
    unchanged instead of raising ZeroDivisionError.
    """
    numerator, denominator = match.group(0).split('/')
    if int(denominator) == 0:
        return match.group(0)
    return str(int(numerator) / int(denominator))


def _replace_fractions(ingredients):
    """Return a new ingredient list with word-leading fractions converted.

    Each element of ``ingredients`` is expected to be a mapping with a
    'text' entry. The text is split on single spaces; a word that starts
    with a fraction gets that fraction replaced by its decimal value, and
    whatever follows the fraction in the same word (e.g. "-inch") is kept.
    Every word is followed by one space, so the result ends with a
    trailing space exactly like the previous implementation's output.
    Only the 'text' key is carried over to the returned dicts.
    """
    converted = []
    for ingredient in ingredients:
        words = []
        for word in ingredient['text'].split(' '):
            match = fractionRegex.match(word)
            if match:
                # Keep the remainder of the word after the fraction
                word = _fraction_to_decimal(match) + word[match.end():]
            words.append(f'{word} ')
        converted.append({'text': ''.join(words)})
    return converted


# Assign the converted lists back as a whole column. Writing to the row
# objects yielded by iterrows() is not guaranteed to modify the frame,
# because they may be copies.
recipe_mod['ingredients'] = recipe_mod['ingredients'].map(_replace_fractions)

# Store both filtered frames as pickle (it's faster)
for frame, target in ((ingredient_data, ingredient_out), (recipe_mod, layer1_out)):
    frame.to_pickle(target)

# Show the first rows of the cleaned recipes
recipe_mod.head(5)


## Actual parsing

In [None]:
# Accumulator for the parsed recipes
recipes = []

# Load the filtered raw recipes from pickle
layer1_valid_path = FILE_DIR + 'layer1_valid.pkl'
data = pd.read_pickle(layer1_valid_path)
print(f'Total number of recipes: {len(data)}')
data.head(5)


In [None]:
# Load the filtered parsed-ingredient table from pickle
det_ingrs_valid_path = FILE_DIR + 'det_ingrs_valid.pkl'
ingredient_data = pd.read_pickle(det_ingrs_valid_path)
# len() of the frame is its number of rows
print(f'Total number of ingredients: {len(ingredient_data)}')
ingredient_data.head(5)

In [None]:
# Build one Recipe per row of the parsed-ingredient table
for idx, row in ingredient_data.iterrows():
    raw_recipe = data.loc[idx]
    recipe = Recipe(idx)

    # Skip the recipe as soon as one parsing step reports False; the
    # instructions are only parsed when the ingredients did not fail.
    if (recipe.parse_ingredients(row['ingredients']) == False
            or recipe.parse_instructions(raw_recipe['instructions']) == False):
        continue

    # Pass the raw ingredient entries of the same recipe to the amount lookup
    recipe.get_ingredient_amounts(raw_recipe['ingredients'])
    recipe.title = raw_recipe['title']

    recipes.append(recipe)

In [None]:
# Release the two source frames
del ingredient_data, data

# Build the data frame in one go from the attribute dicts of all recipes
# (according to Stackoverflow this is faster)
recipe_rows = list(map(vars, recipes))
df = pd.DataFrame(recipe_rows)
df = df.set_index('id')

# Export the result as pickle and as JSON
# NOTE(review): orient='records' does not write the index, so the recipe
# ids are absent from the JSON file - confirm this is intended.
df.to_pickle(FILE_DIR + 'recipes_valid.pkl')
df.to_json(FILE_DIR + 'recipes_valid.json', indent=2, orient='records')
df.head(10)