Libraries

In [1]:
import json, os, pandas as pd, re, copy, csv
from collections import defaultdict
# Base directory: every input and output path in this notebook is built by appending to the current working directory
cwd = os.getcwd()

Get the list of words that need to be deleted before processing

In [2]:
# Location of the header-less CSV holding the words to delete from the raw ingredient strings
remove_ingredient_before_fn = cwd + '/preprocess-now_youre_cooking_data/ingredient-lists/remove-ingredient-before.csv'
# Read the file without a header row, then keep only its first column (label 0) as a plain list
remove_words_frame = pd.read_csv(remove_ingredient_before_fn, header=None)
remove_ingredient_before_list = list(remove_words_frame[0])

Clean up the recipes

In [3]:
# Path of the preprocessed recipes JSON file
recipe_json_fn = cwd + '/data/preprocessed-recipes/recipes.json'
# Read the whole file and parse it into a dict
with open(recipe_json_fn, 'r') as fp:
    recipe_dict = json.loads(fp.read())
# JSON object keys are always strings, so rebuild the dict with integer keys
recipe_dict = dict((int(key), value) for key, value in recipe_dict.items())

In [4]:
# clean_ingredients
def clean_ingredients(ingredient_items_str, remove_list=None, strange_items=None):
    """Split a raw, tab-separated ingredient string into a list of cleaned items.

    Steps, in order:
      1. Strip whitespace from both ends of the whole string.
      2. Delete every entry of the removal list wherever it occurs as a whole
         word (regex word boundaries on both sides).
      3. Replace the bracket tokens -LRB-/-LSB- with '(' and -RRB-/-RSB- with ')'.
      4. Delete parentheses that contain nothing but spaces, then replace
         underscores with spaces.
      5. Split on tabs and, for every item:
           * skip it if it ends with a colon;
           * skip it if it starts with '(' or '-', or ends with ')' without
             containing '(' (treated as the continuation of the previous item);
           * strip spaces and commas from both ends and collapse runs of spaces;
           * if no ASCII letter is left, record the item (as it looked before
             stripping) in the strange-items list and skip it.

    Args:
        ingredient_items_str: raw ingredient string, items separated by tabs.
        remove_list: optional iterable of words/phrases to delete. When None,
            the module-level remove_ingredient_before_list is used.
        strange_items: optional list that receives the letter-less items. When
            None, the module-level strange_items_list is used.

    Returns:
        List of cleaned ingredient strings, in their original order.
    """
    if remove_list is None:
        remove_list = remove_ingredient_before_list
    # Strip newlines and spaces from both ends
    items = ingredient_items_str.strip()
    # Remove the unwanted words using a boundary search, so only whole words are deleted
    for each_ingredient in remove_list:
        items = re.sub(r"\b" + re.escape(each_ingredient) + r"\b", '', items)
    # Replace the -LRB-, -RRB-, -LSB- and -RSB- tokens with parentheses
    items = items.replace('-LRB-', '(').replace('-RRB-', ')')
    items = items.replace('-LSB-', '(').replace('-RSB-', ')')
    # Remove parentheses left empty by the word removal. Any number of spaces may remain
    # inside them (one more than the number of removed words), so match zero or more spaces
    # rather than exactly two. Then replace the underscores with spaces.
    items = re.sub(r'\( *\)', '', items).replace('_', ' ')
    # Create an array that will hold the clean ingredients
    clean_ingredient_items_arr = []
    # Iterate over the individual (tab-separated) items of the recipe
    for each_item in items.split('\t'):
        # If the item ends with a colon then exclude it
        if each_item.endswith(':'):
            continue
        # If the item looks like the completion of the previous item, exclude it
        if each_item.startswith(('(', '-')) or (each_item.endswith(')') and '(' not in each_item):
            continue
        # Preserve the item before the next cleaning part, so we can save it if it's a strange item
        each_item_b4_strip = each_item
        # Strip any mix of spaces and commas from both ends in one pass; alternating
        # strip(' ') / strip(',') calls leave residue on inputs such as ', , salt'
        each_item = each_item.strip(' ,')
        # Collapse extra spaces in the middle
        each_item = re.sub(r' +', ' ', each_item)
        if not re.search('[a-zA-Z]', each_item):
            # No letters left: keep track of the untouched item and skip it
            strange_target = strange_items_list if strange_items is None else strange_items
            strange_target.append(each_item_b4_strip)
            continue
        # Append the item to the clean_ingredient_items_arr
        clean_ingredient_items_arr.append(each_item)
    return clean_ingredient_items_arr
# end clean_ingredients


Clean up the ingredients

In [5]:
# Set the list of strange items (filled by clean_ingredients with the items that contain no letters)
strange_items_list = []
# Copy to a new dict which would store the cleaned ingredients, leaving recipe_dict untouched
clean_recipe_dict = copy.deepcopy(recipe_dict)
# Recipe index list that don't have ingredients
recipe_no_ing_list = []
# Iterate over a snapshot of the keys: entries are removed inside the loop, and removing from a
# dict while iterating its live key view raises "dictionary changed size during iteration" on Python 3
for each_recipe in list(clean_recipe_dict.keys()):
    if 'ingredients' in clean_recipe_dict[each_recipe]:
        # Replace the raw ingredient string by the list of cleaned items
        clean_recipe_dict[each_recipe]['ingredients'] = clean_ingredients(clean_recipe_dict[each_recipe]['ingredients'])
    else:
        # Remember which recipes were dropped for lacking ingredients, then drop them
        recipe_no_ing_list.append(each_recipe)
        clean_recipe_dict.pop(each_recipe, None)
print('Clean recipes dict:', len(clean_recipe_dict))
print('Number of strange items:', len(strange_items_list))

('Clean recipes dict:', 145907)
('Number of strange items:', 6078)


Indexing for ingredient-phrase-tagger tool

In [6]:
# Recipes with fewer ingredients than this are discarded
least_num_items = 3
# Running index of the next ingredient in the flat list, and the recipe -> item indexes map
item_idx = 0
recipe_item_map = defaultdict(list)
# Flat list of all kept ingredients, in recipe-key order
ingredients_list = []
# Walk the recipes in ascending key order (a separate list, so removing recipes below is safe)
sorted_clean_recipe_keys = sorted(clean_recipe_dict.keys())
for recipe_idx in sorted_clean_recipe_keys:
    recipe_ingredients = clean_recipe_dict[recipe_idx]['ingredients']
    # Drop recipes that are too short and move on
    if len(recipe_ingredients) < least_num_items:
        del clean_recipe_dict[recipe_idx]
        continue
    # Position of this recipe's first ingredient in the flat list
    first_item_idx = len(ingredients_list)
    # Add all ingredients of the recipe to the flat list (in order)
    ingredients_list.extend(recipe_ingredients)
    # The next free index is simply the new length of the flat list
    item_idx = len(ingredients_list)
    # Record the consecutive flat-list indexes that belong to this recipe
    recipe_item_map[recipe_idx].extend(range(first_item_idx, item_idx))
print('Check if both dicts recipe_item_map and clean_recipe_dict are equal in length')
print('Number of items in recipe_item_map:', len(recipe_item_map))
print('Number of recipes in clean_recipe_dict:', len(clean_recipe_dict))

Check if both dicts recipe_item_map and clean_recipe_dict are equal in length
('Number of items in recipe_item_map:', 141528)
('Number of recipes in clean_recipe_dict:', 141528)


In [7]:
# Check for unmatching indexes (keys) between recipe_item_map and clean_recipe_dict.
# The keys are sorted before pairing them up: the two dicts were filled in different orders,
# so comparing their raw iteration order could report mismatches even when the key sets are
# identical. Note that zip stops at the shorter key list, so a pure length difference is not
# reported here.
unmatching_indexes = []
for each_item, each_recipe in zip(sorted(recipe_item_map), sorted(clean_recipe_dict)):
    if each_item != each_recipe:
        unmatching_indexes.append((each_item, each_recipe))
print('Number of unmatching_indexes:', len(unmatching_indexes))
print('unmatching_indexes:', unmatching_indexes)

('Number of unmatching_indexes:', 0)
('unmatching_indexes:', [])


Check whether any ingredients in ingredients_list are out of order with respect to clean_recipe_dict

In [8]:
# Iterate over the clean_recipe_dict
for inspect_idx in clean_recipe_dict:
    # Get the min and max indexes of the recipe indexed by inspect_idx in recipe_item_map
    recipe_item_indexes = recipe_item_map[inspect_idx]
    min_ing_idx, max_ing_idx = min(recipe_item_indexes), max(recipe_item_indexes)
    # A slice excludes its end index, so go one past max_ing_idx to include the recipe's last
    # ingredient. Comparing the two lists as a whole (instead of zipping them) also flags a
    # difference in length, which zip would silently truncate.
    if clean_recipe_dict[inspect_idx]['ingredients'] != ingredients_list[min_ing_idx:max_ing_idx + 1]:
        print('The mismatch was found in', inspect_idx)

Store the clean_recipes, recipe_item_map and ingredients_list

In [9]:
# Output locations for the three artefacts produced by this notebook
clean_recipe_fn = cwd + '/data/preprocessed-recipes/clean_recipes.json'
recipe_item_map_fn = cwd + '/data/preprocessed-recipes/recipe_item_map.json'
ingredients_list_fn = cwd + '/data/preprocessed-recipes/ingredients_list.csv'
# Write the cleaned recipes as JSON
with open(clean_recipe_fn, 'w') as fp:
    fp.write(json.dumps(clean_recipe_dict))
# Write the recipe -> ingredient indexes map as JSON
with open(recipe_item_map_fn, 'w') as fp:
    fp.write(json.dumps(recipe_item_map))
# Write the flat ingredients list as a one-column CSV for the ingredient phrase tagger.
# An unlikely separator ('~') is used on purpose, so that commas (or other common separators)
# inside an ingredient do not cause it to be wrapped in double quotes.
ingredients_frame = pd.DataFrame(ingredients_list)
ingredients_frame.to_csv(ingredients_list_fn, header=False, index=False, sep='~')