# Steamboat Squad

Import and load data

In [1036]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1037]:
import json

with open("recipes_ingredients.json", "r") as json_file:
    recipes = json.load(json_file)
    
len(recipes)

4702

Overview of data structure. This is a list of dictionary, where each dictionary is a recipe with its name, ingredients and url

In [1038]:
recipes[0]

{'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/',
 'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

Deleting url key

In [1039]:
for recipe in recipes:
    del recipe['url']
recipes[0]

{'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

# Preprocessing Recipe Names
- Lower-casing (normalise words by using POS tagging)
- Change numbers to fix number (place holder)

NLTK has a help function that explains its POS tags.

In [1040]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [1041]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Using %%capture, save the NLTK help text as a string

In [1042]:
%%capture cap --no-stderr

nltk.help.upenn_tagset()

In [1043]:
cap.stdout

'$: dollar\n    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n\'\': closing quotation mark\n    \' \'\'\n(: opening parenthesis\n    ( [ {\n): closing parenthesis\n    ) ] }\n,: comma\n    ,\n--: dash\n    --\n.: sentence terminator\n    . ! ?\n:: colon or ellipsis\n    : ; ...\nCC: conjunction, coordinating\n    & \'n and both but either et for less minus neither nor or plus so\n    therefore times v. versus vs. whether yet\nCD: numeral, cardinal\n    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n    seven 1987 twenty \'79 zero two 78-degrees eighty-four IX \'60s .025\n    fifteen 271,124 dozen quintillion DM2,000 ...\nDT: determiner\n    all an another any both del each either every half la many much nary\n    neither no some such that the them these this those\nEX: existential there\n    there\nFW: foreign word\n    gemeinschaft hund ich jeux habeas Haementeria Herr K\'ang-si vous\n    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n    terram 

Using RE, get all the tag names

In [1044]:
import re

ALL_POS = re.findall(".*: +", cap.stdout)

for i, pos in enumerate(ALL_POS):
  ALL_POS[i] = pos.replace(': ', '')


ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '    ',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

In [1045]:
ALL_POS.remove('    ')
ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

Create a function to pos tag a text

In [1046]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def tag_pos(corpus):
    text=word_tokenize(corpus)
    return nltk.pos_tag(text)

tag_pos("This is a test sentence.")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('.', '.')]

Create a function that POS tag and returns words with specific POS

In [1047]:
def get_words_with_pos(text, pos):
  tagged = tag_pos(text)
  return [t for t in tagged if t[1].startswith(pos)]

get_words_with_pos("This is a test sentence.", "NN")

[('test', 'NN'), ('sentence', 'NN')]

POS tag all recipe names

In [1048]:
tagged_recipe_names = []

for i, recipe in enumerate(recipes):
  try:
    tagged_recipe_names.append(tag_pos(recipes[i]['name']))
  except Exception as e:
    pass

len(tagged_recipe_names)

4701

## Data cleaning for names based on POS tagging

Looking at the first 10 tagged recipe names, there is a need for pre-processing, as NLTK's tagging is confused by the letter casing.

In [1049]:
tagged_recipe_names[:10]

[[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')],
 [('Pan', 'NNP'),
  ('de', 'FW'),
  ('Muertos', 'NNP'),
  ('(', '('),
  ('Mexican', 'NNP'),
  ('Bread', 'NNP'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Dead', 'NNP'),
  (')', ')')],
 [('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')],
 [('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')],
 [('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')],
 [('Spicy', 'JJ'),
  ('Korean', 'NNP'),
  ('Fried', 'NNP'),
  ('Chicken', 'NNP'),
  ('with', 'IN'),
  ('Gochujang', 'NNP'),
  ('Sauce', 'NNP')],
 [('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')],
 [('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')],
 [('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')],
 [('Tres', 'NNS'),
  ('Leches', 'NNP'),
  ('(', '('),
  ('Milk', 'NNP'),
  ('Cake', 'NNP'),
  (')', ')')]]

Create a function that returns all tagged words with the same tag. NLTK's POS tagging assumes that capitalized noun means proper noun (name).

In [1050]:
def list_words_with_tag(tuple_list, pos):
  results = []
  for name in tuple_list:
    for tag in name:
      if tag[1] == pos:
        results.append(tag[0])
  return results

list_words_with_tag(tagged_recipe_names, "NNP")

['Asparagus',
 'Pan',
 'Muertos',
 'Mexican',
 'Bread',
 'Dead',
 'Creamy',
 'Au',
 'Gratin',
 'Potatoes',
 'Zuppa',
 'Toscana',
 'Teriyaki',
 'Sauce',
 'Korean',
 'Fried',
 'Chicken',
 'Gochujang',
 'Sauce',
 'Spaghetti',
 'Aglio',
 'Olio',
 'Garam',
 'Masala',
 'Easy',
 'Chorizo',
 'Street',
 'Tacos',
 'Leches',
 'Milk',
 'Cake',
 'Cabbage',
 'Rolls',
 'Gravy',
 'Shrimp',
 'Scampi',
 'Pasta',
 'Lemon',
 'Chicken',
 'Potato',
 'Bake',
 'Mexican',
 'Casserole',
 'Caldo',
 'Res',
 'Mexican',
 'Beef',
 'Soup',
 'Nogada',
 'Mexican',
 'Stuffed',
 'Poblano',
 'Peppers',
 'Walnut',
 'Sauce',
 'Apple',
 'Cake',
 'Flan',
 'Pork',
 'Chops',
 'Sauerkraut',
 'Spicy',
 'Thai',
 'Basil',
 'Chicken',
 'Pad',
 'Krapow',
 'Gai',
 'Spaghetti',
 'Cacio',
 'Pepe',
 'Chef',
 'John',
 'Chicken',
 'Kiev',
 'Chicken',
 'Onions',
 'Fajita',
 'Perfect',
 'Sushi',
 'Rice',
 'Baked',
 'Chicken',
 'German',
 'Potato',
 'Salad',
 'Miso',
 'Soup',
 'Mexican',
 'Rice',
 'II',
 'Haluski',
 'Labneh',
 'Lebanese',
 'Y

In [1051]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

In [1052]:
def get_tag_number(tag_list):
  tag_numbers = []
  for tag in tag_list:
    for key, value in tag.items(): 
      new_dict = {key: len(value)}
    tag_numbers.append(new_dict)
  return tag_numbers

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 529},
 {')': 529},
 {',': 63},
 {'--': 0},
 {'.': 10},
 {':': 98},
 {'CC': 555},
 {'CD': 74},
 {'DT': 104},
 {'EX': 0},
 {'FW': 47},
 {'IN': 482},
 {'JJ': 1822},
 {'JJR': 4},
 {'JJS': 27},
 {'LS': 0},
 {'MD': 2},
 {'NN': 571},
 {'NNP': 13139},
 {'NNPS': 46},
 {'NNS': 307},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 72},
 {'PRP$': 20},
 {'RB': 33},
 {'RBR': 0},
 {'RBS': 1},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 24},
 {'VBD': 39},
 {'VBG': 50},
 {'VBN': 133},
 {'VBP': 10},
 {'VBZ': 22},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

Some names have numbers (CD). Some are obviously not numbers, like 'Figgy'

In [1053]:
def get_values_from_dict_list(dict_list, key):
  values = []
  for d in dict_list:
    if key in d:
      values.append(d[key])
  return values

cd_tokens = get_values_from_dict_list(all_name_tags, 'CD')[0]
cd_tokens

['5',
 '16',
 '2',
 '13',
 '300',
 'Figgy',
 '3',
 '9',
 'Two',
 '9',
 '22',
 '10',
 '15',
 'One',
 '18',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 '21',
 'Four',
 '9',
 '65',
 '17',
 '14',
 '10',
 "'n",
 '15',
 '8',
 'Minestrone',
 'Four',
 '35',
 'Fly',
 '15',
 '23',
 '8',
 '15',
 '21',
 "That's-a",
 'Tex-Mex',
 '14',
 '17',
 'Five',
 '10',
 '18',
 '5',
 "'Otai",
 '17',
 '3',
 '17',
 '75',
 '17',
 '20',
 'Take-Out',
 '16',
 '12',
 'Three',
 "'Three",
 '15',
 '20',
 '16',
 '12',
 '15',
 '22',
 '12',
 'Three',
 '21',
 '21',
 '25',
 '7',
 '10',
 '19',
 '20']

Create a function that searches for recipe name with specific string

In [1054]:
def find_value_with_char(dic_list, key, char):
  matches = []
  for recipe in dic_list:
    try:
      if char in recipe[key]:
        matches.append(recipe[key])
    except Exception as e:
      pass
  return matches

find_value_with_char(recipes, 'name', 'Figgy')

['Figgy Pudding']

'Three cup chicken' is indeed a name. On the other hand, numerics, such as 9 and 13 are not part of the actual names of dishes. So, numerics, instead of NLTK's CD, should be treated. This treatment should be done using regex.

In [1055]:
for cd in cd_tokens:
  print(find_value_with_char(recipes, 'name', cd))

['Our 5 Best Avgolemono Soup Recipes', '5-Ingredient Mexican Casserole', '15 Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Chicken 65', 'Pan-Roasted 5-Spice Pork Loin', 'The 15 Most Iconic French Desserts', '35 Quick and Easy Chinese Dinners You Can Make at Home', '15 Essential North Indian Recipes', '15 Essential North Indian Recipes', '18 Easy Mexican Dishes With 5 Ingredients or Less', 'French 75 Cocktail', '15 Top-Rated Traditional German Christmas Cookies', '15 Traditional Italian Christmas Dinner Recipes', "25 Italian Cookies You'll Love"]
['16 German Recipes That Are Comfort Food Favorites', '16 Mexican-Inspired Casseroles for Family-Pleasing Dinners', '16 Essential Puerto Rican Recipes']
['2 Minute Cheese Quesadillas', "22 Recipes Using a Whole Baguette (That Aren't Sandwiches)", 'Our 21 Best Authentic Mexican Recipes', '23 Delicious Ways the World Cooks Pork Shoulder', '21 Easy Dinners That Start with Packaged Gnocchi', 'Our 20 B

Create a function that searches a regex pattern from a text

In [1056]:
def searchWordsPatt(text, patt):
    array = re.findall(patt, text)
    return array

NUMPATTERN = r'[0-9]+'
searchWordsPatt("I want 1 cup of tea", NUMPATTERN)

['1']

Create a function that substitutes regex patterns with a given value

In [1057]:
def searchReplacePatt(text, patt, new_val):
  return re.sub(patt, new_val, text)

NUMSPACEPATTERN = r'(\d+\s)'
searchReplacePatt("I want 1 cup of tea", NUMSPACEPATTERN, "")

'I want cup of tea'

searchReplacePatt, except it iterates recipe list

In [1058]:
def searchReplacePattList(dict_list, patt, new_val, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i]["name"] = searchReplacePatt(dict_list[i][key], patt, new_val)
        except Exception as e:
            pass

Remove numerics from name

In [1059]:
import re

p_recipes = recipes

searchReplacePattList(p_recipes, NUMSPACEPATTERN, "")

def retag(text_list, key):
  new_list = []
  for i, recipe in enumerate(text_list):
    try:
      new_list.append(tag_pos(recipes[i][key]))
    except Exception as e:
      pass
  return new_list

tagged_recipe_names = retag(p_recipes, "name")

Get the new remaining CD

In [1060]:
new_cd_tokens = list_words_with_tag(tagged_recipe_names, "CD")
new_cd_tokens

['Figgy',
 'Two',
 'One',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 'Four',
 '65',
 "'n",
 'Minestrone',
 'Four',
 'Fly',
 "That's-a",
 'Tex-Mex',
 'Five',
 "'Otai",
 'Take-Out',
 'Three',
 "'Three",
 'Three']

The remaining numbers (CD) are part of actual recipe names

In [1061]:
for cd in new_cd_tokens:
  print(find_value_with_char(p_recipes, 'name', cd))

['Figgy Pudding']
['Two-Ingredient Naan', 'Pollo alla Birra for Two']
['A Number One Egg Bread', 'One-Egg Egg Drop Soup', 'One Pot Thai-Style Rice Noodles', 'One-Pot Vegan Potato-Lentil Curry', 'One-Bite Thai "Flavor Bomb" Salad Wraps (Miang Kham)', 'Easy One-Skillet Ground Beef Burrito', 'One-Pot Greek Lemon Chicken and Rice']
['Tender Italian Baked Chicken', 'Tuscan Pork Tenderloin', 'Asian Pork Tenderloin', 'Italian Pork Tenderloin', 'Sweet and Sour Pork Tenderloin', 'Chipotle Crusted Pork Tenderloin', 'Ten Minute Szechuan Chicken', 'Thai Quivering Tenderloins', 'Spicy Pork Tenderloin', 'Chinese Pork Tenderloin', 'Grecian Pork Tenderloin', 'Havana Slow Cooker Pork Tenderloin', 'Curry Pork Tenderloin', 'Tender Juicy Skirt Steak  (Churrasco)', 'Spicy and Tender Corned Beef', 'Pan Roasted Pork Tenderloin with a Blue Cheese and Olive Stuffing']
['Flounder Mediterranean']
['Pastel de Tres Leches (Three Milk Cake)', 'Three-Meat Italian Meatballs', 'Three Cheese Manicotti II', 'Taiwanese-S

In [1062]:
new_all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  new_all_name_tags.append(new_dic)

Can and 'll are the modal verbs found

In [1063]:
md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
md_tokens

['Can', "'ll"]

'can' is caused by words such as Canadian, which is processed in next section. But, 'you'll love' is not part of recipe name and more of an expression

In [1064]:
for md in md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Removing "You'll" and retagging new list

In [1065]:
searchReplacePattList(p_recipes, r"(You'll Love)", "")
tagged_recipe_names = retag(p_recipes, "name")

'll' removed

In [1066]:
new_md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
new_md_tokens

['Can']

In [1067]:
for md in new_md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

In [1068]:
bracket_tokens = list(set(list_words_with_tag(tagged_recipe_names, "(")))
bracket_tokens

['(']

Examining brackers in names. Most of the words in brackets are translations

In [1069]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

"(no red sauce here...golden)" needs to be removed

In [1070]:
searchReplacePattList(p_recipes,  r"(no red sauce here...golden)", "")
searchReplacePattList(p_recipes, r"(From a Swede!)", "")
searchReplacePattList(p_recipes, r"(from a Chinese person)", "")
searchReplacePattList(p_recipes, r"&reg;", "")
tagged_recipe_names = retag(p_recipes, "name")

In [1071]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

In [1072]:
fw_tokens = list(set(list_words_with_tag(tagged_recipe_names, "FW")))
fw_tokens

['Rassolnik', 'et', 'de']

In [1073]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Rassolnik with Rice (Russian Pickle Soup)']
['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Authentic Vietnamese Spring Rolls (Nem Ran Hay Cha Gio)', 'Kotlet Schabowy (Polish Breaded Pork Chop)', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Conchas (Mexican Sweet Bread)', 'Skillet Chicken Picante', 'Spaghett

In [1074]:
fw_names

['Dutch Croquetten',
 'Mango Pudding (Flan de Mango)',
 'Peruvian Chicken Soup (Aguadito de Pollo)',
 'Schupfnudeln (German Fried Potato Dumplings)',
 'Vietnamese Salad Rolls',
 'Vegetarian Korma',
 'African Sweet Potato and Peanut Soup',
 'Sweet and Sour Chicken I',
 'Homemade Pork Fried Rice',
 'Sweet Sausage Marsala',
 'Caldo de Res (Mexican Beef Soup)',
 'Bean Tamales (Tameles de Frijoles)',
 'Tembleque de Coco - Coconut Tembleque',
 'Lamb Feta Peppers',
 'Homemade Hoisin Sauce',
 'Homemade Taco Seasoning',
 'Homemade Scotch Eggs',
 'German Apple Sheet Cake',
 'Poulet a la Moutarde (Chicken in Dijon Mustard Sauce)',
 'Homemade Tom Yum Soup',
 'Vietnamese Beef Pho',
 'Roasted Pork Banh Mi (Vietnamese Sandwich)',
 'Spiced Sweet Roasted Red Pepper Hummus',
 'Pain de Campagne - Country French Bread',
 "Mom's Spaghetti Bolognese",
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Kaes-Spaetzle',
 'Garlic and Herb Marinade',
 "Chef John's French Omelette",
 'Feta Cheese T

Names that both have foreign words and bracket

In [1075]:
bracket_and_fw = [name for name in bracketed_names if name in fw_names]
bracket_and_fw

['Mango Pudding (Flan de Mango)',
 'Peruvian Chicken Soup (Aguadito de Pollo)',
 'Schupfnudeln (German Fried Potato Dumplings)',
 'Caldo de Res (Mexican Beef Soup)',
 'Bean Tamales (Tameles de Frijoles)',
 'Poulet a la Moutarde (Chicken in Dijon Mustard Sauce)',
 'Roasted Pork Banh Mi (Vietnamese Sandwich)',
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Limber de Coco (Coconut Ice)',
 'Hearty Caldo de Res (Mexican Beef Soup)',
 "Chef John's Brazilian Cheese Bread (Pao de Queijo)",
 "World's Best (Now Vegetarian!) Lasagna",
 'Pastel de Elote (Mexican Corn Cake)',
 'Spaghetti alla Nerano (Spaghetti with Fried Zucchini)',
 'Authentic Chicken Empanadas (Empanadas de Pollo)',
 'Homemade Manti (Traditional Turkish Dumplings)',
 'French Butter Cakes (Madeleines)',
 'Pastel de Tres Leches (Three Milk Cake)',
 'Enchiladas Verdes (Green Enchiladas)',
 'Frikadeller (Danish Meatballs)',
 'Rassolnik with Rice (Russian Pickle Soup)',
 'Gourmet Pastelillos (Meat Pies)',
 'Bibimbap

Split the names into two names, one outside and one inside

In [1076]:
BRACKET_REGEX = " \(.*\)"
def break_fw_bracket(name):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name2 = re.sub(BRACKET_REGEX, "", name)
    return name1, name2

print(break_fw_bracket("Hearty Caldo de Res (Mexican Beef Soup)"))
print(break_fw_bracket("Ukha (Russian Fish Soup)"))

('Mexican Beef Soup', 'Hearty Caldo de Res')
('Russian Fish Soup', 'Ukha')


Apply the split function. Delete old recipe with bracket and foreign words. In both of the new recipes, duplicate old ingredients.

In [1077]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in bracket_and_fw:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

There are still remaining names with bracket, mostly due to the foreign words not being recognized.

In [1078]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Tres Leches (Milk Cake)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', 'Kalbi (Korean BBQ Short Ribs)', 'Macaron (French Macaroon)', 'Atsara (Papaya Relish)', 'Authentic Chinese Egg Rolls ()', 'Greek Le

In [1079]:
bracketed_names

['Haydari (Turkish Yogurt Dip)',
 'Polish Noodles (Cottage Cheese and Noodles)',
 'Saboob (Egyptian Flatbread)',
 'Kartoffelsuppe nach Bayrischer Art (Bavarian Potato Soup)',
 'Makhani Chicken (Indian Butter Chicken)',
 'Ricotta Pie (Old Italian Recipe)',
 'Atsara (Papaya Relish)',
 'Half-hour Pudding Cake (Montreal Pudding)',
 'Portokalopita (Greek Orange Phyllo Cake)',
 'Pisang Goreng (Indonesian Banana Fritters)',
 'Moroccan Harira (Bean Soup)',
 'Rahmschnitzel (German Schnitzel in Creamy Mushroom Sauce)',
 'Calimocho (Kalimotxo)',
 'Japanese-Style Pickled Cucumber (Sunomono)',
 'Gigantes (Greek Lima Beans)',
 'South Asian-Style Ground Beef (Keema)',
 'Spinach and Tomato Dal (Indian Lentil Soup)',
 'Spinach and Rice (Spanakorizo)',
 'Tsao Mi Fun (Taiwanese Fried Rice Noodles)',
 'Algerian Kefta (Meatballs)',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Calabaza Con Pollo (Calabaza Squash and Chicken)',
 'Versunkener Apfelkuchen (German Sunken Apple Cake)'

Most of the brackets are at the end of each name. For those that are in the middle, they are translations of one of the words in the name.

In [1080]:
b_name_end = []
b_name_mid = []
for b_name in bracketed_names:
    if b_name.endswith(')'):
        b_name_end.append(b_name)
    else:
        b_name_mid.append(b_name)
        
b_name_end

['Haydari (Turkish Yogurt Dip)',
 'Polish Noodles (Cottage Cheese and Noodles)',
 'Saboob (Egyptian Flatbread)',
 'Kartoffelsuppe nach Bayrischer Art (Bavarian Potato Soup)',
 'Makhani Chicken (Indian Butter Chicken)',
 'Ricotta Pie (Old Italian Recipe)',
 'Atsara (Papaya Relish)',
 'Half-hour Pudding Cake (Montreal Pudding)',
 'Portokalopita (Greek Orange Phyllo Cake)',
 'Pisang Goreng (Indonesian Banana Fritters)',
 'Moroccan Harira (Bean Soup)',
 'Rahmschnitzel (German Schnitzel in Creamy Mushroom Sauce)',
 'Calimocho (Kalimotxo)',
 'Japanese-Style Pickled Cucumber (Sunomono)',
 'Gigantes (Greek Lima Beans)',
 'South Asian-Style Ground Beef (Keema)',
 'Spinach and Tomato Dal (Indian Lentil Soup)',
 'Spinach and Rice (Spanakorizo)',
 'Tsao Mi Fun (Taiwanese Fried Rice Noodles)',
 'Algerian Kefta (Meatballs)',
 'Calabaza Con Pollo (Calabaza Squash and Chicken)',
 'Versunkener Apfelkuchen (German Sunken Apple Cake)',
 'Kalbi (Korean BBQ Short Ribs)',
 'Tostones (Fried Plantains)',
 'Bi

In [1081]:
b_name_mid

['Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Coconut (Haupia) and Chocolate Pie',
 'Spicy Indian (Gujarati) Green Beans',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Lamb (Gosht) Biryani',
 'Ulu (Breadfruit) Pancakes',
 'Lengua (Beef Tongue) Stew',
 'Seaweed (Nori) Soup',
 'Jeera (Cumin) Rice',
 'Zito (Zhito/Koljivo) - Serbian Wheat Pudding',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Korean Bean Curd (Miso) Soup',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Besan (Gram Flour) Halwa',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Bee Sting Cake (Bienenstich) II',
 'Albondigas (Meatballs) en Chipotle',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce']

On the other hand, without parenthesis anymore, names with foregin words tagged are now clean

In [1082]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Rassolnik with Rice']
['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Skillet Chicken Picante', 'Spaghetti Sauce', 'Vegetarian Korma', 'Fettuccini Carbonara', 'Kaese Spaetzle', 'Beef and Beet Borscht', 'Addictive Sweet Potato Burritos', "Chef John's French Omelette", 'Sweet and Spicy Stir Fry with Chicken and Broccoli', 'Simple Sweet and Spicy Chicken Wraps', 'Johnny 

In [1083]:
fw_names

['Dutch Croquetten',
 'Mexican Bruschetta with Beans',
 'Vietnamese Salad Rolls',
 'Tajaditas Dulces de Platano',
 'Vegetarian Korma',
 'Philippine Longanisa de Eugenio',
 'African Sweet Potato and Peanut Soup',
 'Sweet and Sour Chicken I',
 'Homemade Pork Fried Rice',
 'Sweet Sausage Marsala',
 'Tembleque de Coco - Coconut Tembleque',
 'Belgi Galettes',
 'Lamb Feta Peppers',
 'Homemade Hoisin Sauce',
 'Homemade Taco Seasoning',
 'Korean Rice With Mixed Vegetables',
 'Homemade Scotch Eggs',
 'German Apple Sheet Cake',
 'Homemade Tom Yum Soup',
 'Vietnamese Beef Pho',
 'Spiced Sweet Roasted Red Pepper Hummus',
 'Pain de Campagne - Country French Bread',
 "Mom's Spaghetti Bolognese",
 'Polvorones de Canele',
 'Kaes-Spaetzle',
 'Garlic and Herb Marinade',
 "Chef John's French Omelette",
 'Feta Cheese Turkey Burgers',
 'Vegetarian Four Cheese Lasagna',
 'Ground Beef with Homemade Taco Seasoning Mix',
 'Vegetable Tom Yum Soup',
 'Lemon Madeleines',
 "Chef John's Pasta con le Sarde",
 'Veget

For the remaining names with bracket at the end, split into two new recipe names

In [1084]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Tres Leches (Milk Cake)
Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)
Spicy Thai Basil Chicken (Pad Krapow Gai)
Labneh (Lebanese Yogurt)
Indian Chicken Curry (Murgh Kari)
Keema Aloo (Ground Beef and Potatoes)
Turkish Eggs (Cilbir)
South African Melktert (Milk Tart)
Ukrainian Apple Cake (Yabluchnyk)
Spanish Garlic Shrimp (Gambas al Ajillo)
German Potato Dumplings (Kartoffelkloesse)
Apfelkuchen (Apple Cake)
Eggplant Caponata (Sicilian Version)
Chana Masala (Savory Indian Chick Peas)
Ricotta Pie (Old Italian Recipe)
Easy Blini (Russian Pancake)
Easy Bulgogi (Korean BBQ Beef)
Carne en su Jugo (Meat in its Juices)
Ghormeh Sabzi (Persian Herb Stew)
Puerto Rican Tostones (Fried Plantains)
Kalbi (Korean BBQ Short Ribs)
Macaron (French Macaroon)
Atsara (Papaya Relish)
Authentic Chinese Egg Rolls ()
Greek Lentil Soup (Fakes)
Lumpia (Shanghai version)
Northern Ontario Partridge (Ruffed Grouse)
Vampiros Mexicanos (Mexican Vampires)
Jamaican Saltfish Fritters (Stamp and Go)
Slo

Arroz Tapado (Rice-On-Top)
Salted Egg Salad (Itlogna Maalat)
Arrachera (Skirt Steak Taco Filling)
Chipas (Argentinean Cheese Bread)


In [1085]:
# glutten free

For some reasons, need to run the cell twice

In [1086]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Polish Noodles (Cottage Cheese and Noodles)
Oyakodon (Japanese Chicken and Egg Rice Bowl)
Papas Rellenas (Fried Stuffed Potatoes)
Blaukraut (German Red Cabbage)
Irish Boiled Dinner (Corned Beef)
True Dominican Sancocho (Latin 7-Meat Stew)
Blini (Russian Pancakes)
Oeufs Cocotte (Baked Eggs)
Ropa Vieja (Cuban Beef)
Lace Cookies (Florentine Cookies)
Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)
Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)
Roti Canai/Paratha (Indian Pancake)
Melanzana alla Parmigiana (Perfect Eggplant Parmigiana)
Pierogi (Traditional Polish Dumplings)
Nipples of Venus (Capezzoli di Venere)
Samosadilla (Samosa Quesadilla)
Bulgogi (Korean Barbecued Beef)
Sabaayad (Somali Flatbread)
Filipino Baked Milkfish (Baked Bangus)
Ash-e Reshteh (Persian Legume Soup)
Lentil and Cactus Soup (Mom's Recipe)
Ethiopian Cabbage and Potato Dish (Atkilt)
Finnish Kropser (Baked Pancakes)
Oma's Griessnockerlsuppe (Beef and Semolina Dumpling Soup)
Kewa Datshi (Bh

Only the names with bracket in the middle of their names remain

In [1087]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Classic Cuban Midnight (Medianoche) Sandwich', 'Spicy Indian (Gujarati) Green Beans', 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce', 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce', 'Bee Sting Cake (Bienenstich) II', 'Coconut (Haupia) and Chocolate Pie', 'Lamb (Gosht) Biryani', 'Jeera (Cumin) Rice', 'Pollo (Chicken) Fricassee from Puerto Rico', 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish', 'Lazy Golumpki (Stuffed Cabbage) Soup', 'Ulu (Breadfruit) Pancakes', 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican', 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce', 'Seaweed (Nori) Soup', 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms', 'Hawaiian Bruddah Potato Mac (Macaroni) Salad', 'Korean Bean Curd (Miso) Soup', 'Lengua (Beef Tongue) Stew', 'Albondigas (Meatballs) en Chipotle', 'Zito (Zhito/Koljivo) - Serbian Wheat Pudding', 'Besan (Gram Flour) Halwa']


Mac and rapini is only synonymous the the one word before them. Otherwise, the bracketed words are synonymous to all the words before them combined.

In [1088]:
bracketed_names

['Ulu (Breadfruit) Pancakes',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Lengua (Beef Tongue) Stew',
 'Jeera (Cumin) Rice',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Korean Bean Curd (Miso) Soup',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Spicy Indian (Gujarati) Green Beans',
 'Bee Sting Cake (Bienenstich) II',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Besan (Gram Flour) Halwa',
 'Seaweed (Nori) Soup',
 'Zito (Zhito/Koljivo) - Serbian Wheat Pudding',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Coconut (Haupia) and Chocolate Pie',
 'Albondigas (Meatballs) en Chipotle',
 'Lamb (Gosht) Biryani',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad']

The names can still be duplicated into 2, except that the bracketed word replaces the words before in the second new name, treating them as synonyms.

In [1089]:
def convert_bracket_synonym(name, num=0):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name1_suffix = name.split(')')[1]
    if num==0:
        name1 = name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, "", name)
    else:
        name1_prefix = name.split('(')[0]
        name1_prefix = name1_prefix[:-num]
        name1 = name1_prefix + name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, " ", name)
    return name1, name2

print(convert_bracket_synonym("Lamb (Gosht) Biryani"))
print(convert_bracket_synonym("Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce", 1))
print(convert_bracket_synonym("Hawaiian Bruddah Potato Mac (Macaroni) Salad", 1))

('Gosht Biryani', 'Lamb Biryani')
('Fusilli with RapiniBroccoli Rabe, Garlic, and Tomato Wine Sauce', 'Fusilli with Rapini , Garlic, and Tomato Wine Sauce')
('Hawaiian Bruddah Potato MacMacaroni Salad', 'Hawaiian Bruddah Potato Mac  Salad')


In [1090]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_mid:
            newname1, newname2 = convert_bracket_synonym(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Classic Cuban Midnight (Medianoche) Sandwich
Spicy Indian (Gujarati) Green Beans
Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce
Kimchi Jun (Kimchi Pancake) and Dipping Sauce
Bee Sting Cake (Bienenstich) II
Coconut (Haupia) and Chocolate Pie
Lamb (Gosht) Biryani
Jeera (Cumin) Rice
Pollo (Chicken) Fricassee from Puerto Rico
Fish Sinigang (Tilapia) - Filipino Sour Broth Dish
Lazy Golumpki (Stuffed Cabbage) Soup
Ulu (Breadfruit) Pancakes
Fried Chicken Chunks (Chicharrones De Pollo) Dominican
Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce
Seaweed (Nori) Soup
Vareniki (Russian Pierogi) with Potatoes and Mushrooms
Hawaiian Bruddah Potato Mac (Macaroni) Salad
Korean Bean Curd (Miso) Soup
Lengua (Beef Tongue) Stew
Albondigas (Meatballs) en Chipotle
Zito (Zhito/Koljivo) - Serbian Wheat Pudding
Besan (Gram Flour) Halwa


Successfully removed all brackets from recipe names

In [1091]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))
bracketed_names

[]


[]

In [1092]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

[':', '-', ';']

Dashes are mostly adjectives, but things like semi colon need to be removed. As for colons, its mostly translation. Semicolons are caused by K&auml;, which are dishes with special characters or German words.

In [1093]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Doro Wat: Ethiopian Chicken Dish', "Grandma's Focaccia: Baraise Style"]
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chic

In [1096]:
def remove_entry_with(dict_list, target, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            if target in dict_list[i]["name"]:
                dict_list.remove(dict_list[i])
        except Exception as e:
            pass

In [1097]:
for semicolon in ["Quorn&trade;", "Sp&auml;tzle", "Tamales Oaxaque&ntilde;os", "K&auml;sesahnetorte", "Salte&ntilde;as"]:
    remove_entry_with(p_recipes, semicolon)
tagged_recipe_names = retag(p_recipes, "name")

In [1098]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

[':', '-']

In [1099]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Doro Wat: Ethiopian Chicken Dish', "Grandma's Focaccia: Baraise Style"]
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chic

In [1100]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 65},
 {'--': 0},
 {'.': 9},
 {':': 42},
 {'CC': 499},
 {'CD': 26},
 {'DT': 104},
 {'EX': 0},
 {'FW': 66},
 {'IN': 476},
 {'JJ': 1906},
 {'JJR': 3},
 {'JJS': 30},
 {'LS': 0},
 {'MD': 1},
 {'NN': 678},
 {'NNP': 12791},
 {'NNPS': 36},
 {'NNS': 384},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 71},
 {'PRP$': 20},
 {'RB': 31},
 {'RBR': 0},
 {'RBS': 2},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 28},
 {'VBD': 43},
 {'VBG': 58},
 {'VBN': 138},
 {'VBP': 12},
 {'VBZ': 30},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

!, ? and . are found, which are odd for recipe names

In [184]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['!', '!', '!', '!', '!', '.', '.', '?', '!', '!']

The punctuations are mostly slang abbreviations and exclamations

In [185]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

['DIY Finnish Lonkero (a.k.a. Long Drink)', "Our Top P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]
['Real Canadian Butter Tarts, eh?']
['Sangria! Sangria!', 'Oatmeal Apple Crisp To Die For!', 'Mexican Lasagna - No Lasagna Noodles!', "Sushi House Salad Dressing, It's ORANGE!", 'Now Vegetarian!', 'From a Swede!']


## Correcting mis-tagged POS

Chicken is considered dollar?

In [148]:
dol_tokens = list_words_with_tag(tagged_recipe_names, "$")
dol_tokens

['Chicken']

It's a tagging error, so this can be ignored

In [149]:
for dol in dol_tokens:
  print(find_value_with_char(p_recipes, 'name', dol))

['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Indian Chicken Curry (Murgh Kari)', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Enchilada Slow

There are some quotation marks

In [150]:
quote_tokens = list_words_with_tag(tagged_recipe_names, "''")
quote_tokens

["''", "''", "''", "'", "''", "''", "''"]

Quotation marks are caused by possessive -'s

In [151]:
for quote in quote_tokens:
  print(find_value_with_char(p_recipes, 'name', quote))

[]
[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's Best German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favori

Some commas were found

In [None]:
recipe_names_pos[","]

[(',', ',')]

In [None]:
for com in recipe_names_pos[","]:
  print(find_name_with_char(com[0]))

['Bow Ties with Sausage, Tomatoes and Cream', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Chicken, Spinach, and Cheese Pasta Bake', 'Super-Simple, Super-Spicy Mongolian Beef', 'Creamy Potato, Carrot, and Leek Soup', 'Beef, Mushroom and Guinness&reg; Pie', 'Easy, Chewy Flourless Peanut Butter Cookies', 'Rajas con Crema, Elote, y Queso (Creamy Poblano Peppers and Sweet Corn)', 'Filipino Steamed Rice, Cebu Style', 'Orange, Honey and Soy Chicken', 'Chicken Francese, Italian-Style', 'Duck with Honey, Soy, and Ginger', 'Grillhaxe (Grilled Eisbein, Pork Shanks)', 'Steak, Onion, and Pepper Fajitas', 'Indian Carrots, Peas and Potatoes', "Sushi House Salad Dressing, It's ORANGE!", 'Trio of Mashed Roots (Parsnip, Turnip and Carrot)', 'Simple, Baked Finnan Haddie', 'Indian-Style Rice with Cashews, Raisins and Turmeric', 'Serbian Ground Beef, Veggie, and Potato Bake', 'Fried Rice with Ginger, Hoisin, and Sesame', 'Chard Lentil Soup, Lebanese-Style'

 For now, leave the preprocessing of the recipe names first.

In [None]:
for recipe in recipes[:10]:
  print(tag_pos(recipe["name"]))

[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')]
[('Pan', 'NNP'), ('de', 'FW'), ('Muertos', 'NNP'), ('(', '('), ('Mexican', 'NNP'), ('Bread', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Dead', 'NNP'), (')', ')')]
[('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')]
[('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')]
[('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')]
[('Spicy', 'JJ'), ('Korean', 'NNP'), ('Fried', 'NNP'), ('Chicken', 'NNP'), ('with', 'IN'), ('Gochujang', 'NNP'), ('Sauce', 'NNP')]
[('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')]
[('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')]
[('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')]
[('Tres', 'NNS'), ('Leches', 'NNP'), ('(', '('), ('Milk', 'NNP'), ('Cake', 'NNP'), (')', ')')]


NLTK assumes fractions as JJ (adjectives)

In [None]:
for ingredient in recipes[0]["ingredients"]:
  print(tag_pos(ingredient))

[('¼', 'JJ'), ('cup', 'NN'), ('butter', 'NN')]
[('2', 'CD'), ('tablespoons', 'NNS'), ('olive', 'JJ'), ('oil', 'NN')]
[('1', 'CD'), ('teaspoon', 'NN'), ('coarse', 'NN'), ('salt', 'NN')]
[('¼', 'JJ'), ('teaspoon', 'NN'), ('ground', 'NN'), ('black', 'JJ'), ('pepper', 'NN')]
[('3', 'CD'), ('cloves', 'NNS'), ('garlic', 'JJ'), (',', ','), ('minced', 'VBD')]
[('1', 'CD'), ('pound', 'NN'), ('fresh', 'JJ'), ('asparagus', 'JJ'), ('spears', 'NNS'), (',', ','), ('trimmed', 'VBD')]


Create a function that converts any fraction in a text to integer

In [None]:
for recipe in recipes[:3]:
  for ingredient in recipe["ingredients"]:
    print(ingredient)

¼ cup butter 
2 tablespoons olive oil 
1 teaspoon coarse salt 
¼ teaspoon ground black pepper 
3 cloves garlic, minced 
1 pound fresh asparagus spears, trimmed 
¼ cup margarine 
¼ cup milk 
¼ cup warm water (110 degrees F/45 degrees C) 
3 cups all-purpose flour 
1 ¼ teaspoons active dry yeast 
½ teaspoon salt 
2 teaspoons anise seed 
¼ cup white sugar 
2 eggs, beaten 
2 teaspoons orange zest 
¼ cup white sugar 
¼ cup orange juice 
1 tablespoon orange zest 
2 tablespoons white sugar 
4 russet potatoes, sliced into 1/4 inch slices 
1 onion, sliced into rings 
 salt and pepper to taste 
3 tablespoons butter 
3 tablespoons all-purpose flour 
½ teaspoon salt 
2 cups milk 
1 ½ cups shredded Cheddar cheese 


In [None]:
import unicodedata

def fraction_to_int(text):
  for i, char in enumerate(text):
    try:
      # unicode.numeric converts fractions such as ½ to decimal place, 0.25
      # int() removes decimal places, str() allows it to be joined with original text
      text = text[:i] + str(int(unicodedata.numeric(char))) + text[i + 1:]
    except Exception as e:
      pass
  # Because number + fraction, such as 1 1/4 may be converted to 1 0, so use re.sub to remove
  text = re.sub("([0-9]+ [0])+", "4", text)
  return text

for recipe in recipes[:3]:
  for ingredient in recipe["ingredients"]:
    print(fraction_to_int(ingredient))

0 cup butter 
2 tablespoons olive oil 
1 teaspoon coarse salt 
0 teaspoon ground black pepper 
3 cloves garlic, minced 
1 pound fresh asparagus spears, trimmed 
0 cup margarine 
0 cup milk 
0 cup warm water (110 degrees F/45 degrees C) 
3 cups all-purpose flour 
4 teaspoons active dry yeast 
0 teaspoon salt 
2 teaspoons anise seed 
0 cup white sugar 
2 eggs, beaten 
2 teaspoons orange zest 
0 cup white sugar 
0 cup orange juice 
1 tablespoon orange zest 
2 tablespoons white sugar 
4 russet potatoes, sliced into 1/4 inch slices 
1 onion, sliced into rings 
 salt and pepper to taste 
3 tablespoons butter 
3 tablespoons all-purpose flour 
0 teaspoon salt 
2 cups milk 
4 cups shredded Cheddar cheese 


In [None]:
for i, recipe in enumerate(recipes):
  for j, ingredient in enumerate(recipe["ingredients"]):
    recipes[i]["ingredients"][j] = fraction_to_int(recipes[i]["ingredients"][j])

recipes[:3]

[{'ingredients': ['0 cup butter ',
   '2 tablespoons olive oil ',
   '1 teaspoon coarse salt ',
   '0 teaspoon ground black pepper ',
   '3 cloves garlic, minced ',
   '1 pound fresh asparagus spears, trimmed '],
  'name': 'Pan-Fried Asparagus',
  'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/'},
 {'ingredients': ['0 cup margarine ',
   '0 cup milk ',
   '0 cup warm water (110 degrees F/45 degrees C) ',
   '3 cups all-purpose flour ',
   '4 teaspoons active dry yeast ',
   '0 teaspoon salt ',
   '2 teaspoons anise seed ',
   '0 cup white sugar ',
   '2 eggs, beaten ',
   '2 teaspoons orange zest ',
   '0 cup white sugar ',
   '0 cup orange juice ',
   '1 tablespoon orange zest ',
   '2 tablespoons white sugar '],
  'name': 'Pan de Muertos (Mexican Bread of the Dead)',
  'url': 'https://www.allrecipes.com/recipe/7224/pan-de-muertos-mexican-bread-of-the-dead/'},
 {'ingredients': ['4 russet potatoes, sliced into 1/4 inch slices ',
   '1 onion, sliced into rings ',
  

By converting fractions into integers, NLTK stops seeing them as adjectives (JJ) and instead, they are considered numbers (CD)

In [None]:
for recipe in recipes[:10]:
  for ingredient in recipe["ingredients"]:
    print(tag_pos(ingredient))

[('0', 'CD'), ('cup', 'NN'), ('butter', 'NN')]
[('2', 'CD'), ('tablespoons', 'NNS'), ('olive', 'JJ'), ('oil', 'NN')]
[('1', 'CD'), ('teaspoon', 'NN'), ('coarse', 'NN'), ('salt', 'NN')]
[('0', 'CD'), ('teaspoon', 'NN'), ('ground', 'NN'), ('black', 'JJ'), ('pepper', 'NN')]
[('3', 'CD'), ('cloves', 'NNS'), ('garlic', 'JJ'), (',', ','), ('minced', 'VBD')]
[('1', 'CD'), ('pound', 'NN'), ('fresh', 'JJ'), ('asparagus', 'JJ'), ('spears', 'NNS'), (',', ','), ('trimmed', 'VBD')]
[('0', 'CD'), ('cup', 'NN'), ('margarine', 'NN')]
[('0', 'CD'), ('cup', 'NN'), ('milk', 'NN')]
[('0', 'CD'), ('cup', 'NN'), ('warm', 'JJ'), ('water', 'NN'), ('(', '('), ('110', 'CD'), ('degrees', 'NNS'), ('F/45', 'NNP'), ('degrees', 'NNS'), ('C', 'NNP'), (')', ')')]
[('3', 'CD'), ('cups', 'NNS'), ('all-purpose', 'JJ'), ('flour', 'NN')]
[('4', 'CD'), ('teaspoons', 'NNS'), ('active', 'JJ'), ('dry', 'JJ'), ('yeast', 'NN')]
[('0', 'CD'), ('teaspoon', 'NN'), ('salt', 'NN')]
[('2', 'CD'), ('teaspoons', 'NNS'), ('anise', 'VBP')

Replace all the numbers with placeholder of 4

In [None]:
for i, recipe in enumerate(recipes):
  for j, ingredient in enumerate(recipe["ingredients"]):
    recipes[i]["ingredients"][j] = searchReplacePatt(recipes[i]["ingredients"][j], NUMPATTERN, "4")

recipes[:3]

[{'ingredients': ['4 cup butter ',
   '4 tablespoons olive oil ',
   '4 teaspoon coarse salt ',
   '4 teaspoon ground black pepper ',
   '4 cloves garlic, minced ',
   '4 pound fresh asparagus spears, trimmed '],
  'name': 'Pan-Fried Asparagus',
  'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/'},
 {'ingredients': ['4 cup margarine ',
   '4 cup milk ',
   '4 cup warm water (4 degrees F/4 degrees C) ',
   '4 cups all-purpose flour ',
   '4 teaspoons active dry yeast ',
   '4 teaspoon salt ',
   '4 teaspoons anise seed ',
   '4 cup white sugar ',
   '4 eggs, beaten ',
   '4 teaspoons orange zest ',
   '4 cup white sugar ',
   '4 cup orange juice ',
   '4 tablespoon orange zest ',
   '4 tablespoons white sugar '],
  'name': 'Pan de Muertos (Mexican Bread of the Dead)',
  'url': 'https://www.allrecipes.com/recipe/7224/pan-de-muertos-mexican-bread-of-the-dead/'},
 {'ingredients': ['4 russet potatoes, sliced into 4/4 inch slices ',
   '4 onion, sliced into rings ',
   ' 

Flatten each data entry into a string

In [None]:
corpus_list = []
for item in recipes:
    item['ingredients']=','.join(item['ingredients'])
    try:
        item['text'] = item['name'] + " " + item["ingredients"]
    except Exception as e:
        item['name'] = ""
        item['text'] = item['name'] + " " + item["ingredients"]
        print(e)
    corpus_list.append(item['text'])
    
corpus_list[:3]

'name'


['Pan-Fried Asparagus ¼ cup butter ,2 tablespoons olive oil ,1 teaspoon coarse salt ,¼ teaspoon ground black pepper ,3 cloves garlic, minced ,1 pound fresh asparagus spears, trimmed ',
 'Pan de Muertos (Mexican Bread of the Dead) ¼ cup margarine ,¼ cup milk ,¼ cup warm water (110 degrees F/45 degrees C) ,3 cups all-purpose flour ,1\u2009¼ teaspoons active dry yeast ,½ teaspoon salt ,2 teaspoons anise seed ,¼ cup white sugar ,2 eggs, beaten ,2 teaspoons orange zest ,¼ cup white sugar ,¼ cup orange juice ,1 tablespoon orange zest ,2 tablespoons white sugar ',
 'Creamy Au Gratin Potatoes 4 russet potatoes, sliced into 1/4 inch slices ,1 onion, sliced into rings , salt and pepper to taste ,3 tablespoons butter ,3 tablespoons all-purpose flour ,½ teaspoon salt ,2 cups milk ,1\u2009½ cups shredded Cheddar cheese ']

Convert entire flattened list into a string

In [None]:
corpus = ','.join(corpus_list)

Compute bigram

In [None]:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(corpus)
bigrams = nltk.bigrams(tokens)
frequence = nltk.FreqDist(bigrams)
for key,value in frequence.items():
    print(key,value)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('taste', ',7') 1
('and', 'Tangy') 2
('Tangy', 'Thai') 1
('French', '75') 1
('75', 'Cocktail') 1
('ounces', 'lemon') 1
('ounces', 'gin') 1
('gin', ',2') 2
('teaspoons', 'simple') 1
('chilled', 'Champagne') 1
('Champagne', ',1') 1
('or', 'orange') 2
('orange', 'slice') 1
('slice', 'for') 2
('Aunt', 'Bert') 1
('Bert', "'s") 1
("'s", 'Fruitcake') 1
('Fruitcake', 'Cookies') 1
('white', 'raisins') 1
('pound', 'dates') 1
('slices', 'candied') 1
('cinnamon', ',28') 1
(',28', 'ounces') 1
('ounces', 'pecans') 1
(',', 'Kerala') 1
('Kerala', 'Chicken') 1
('root', ',8') 1
(',', 'Adriel') 1
('Adriel', "'s") 1
('Chinese', 'Curry') 1
('Portuguese', 'Kale') 1
('white', 'pea') 1
('pea', 'beans') 1
('beef', 'soup') 1
('soup', 'bones') 1
('bunches', 'kale') 2
('quart', 'hot') 2
('water', 'or') 1
('de', 'Mariscos') 1
('Mariscos', '(') 1
('(', 'Seafood') 1
('Seafood', 'Soup') 2
('stock', '(') 2
('as', 'Swanson®') 1
('Swanson®', ')') 1
('fresh

In [None]:
len(tokens)

299068

Convert bigrams into dictionaries, with bigram as key, frequency as value

In [None]:
result = dict(sorted(frequence.items(), key=lambda item: item[0]))
result

{('!', ')'): 2,
 ('!', ','): 1,
 ('!', '1'): 1,
 ('!', '3'): 1,
 ('!', 'Sangria'): 1,
 ('!', 'cooking'): 1,
 ('!', '½'): 1,
 ('#', '1'): 1,
 ('%', ')'): 2,
 ('%', '-lean'): 1,
 ('%', 'Blue'): 1,
 ('%', 'agave'): 1,
 ('%', 'alcohol'): 1,
 ('%', 'cocao'): 1,
 ('%', 'fat'): 1,
 ('%', 'lean'): 4,
 ('%', 'milk'): 4,
 ('%', 'reduced'): 1,
 ('&', 'Beans'): 1,
 ('&', 'B®'): 1,
 ('&', 'Onion'): 1,
 ('&', 'Salad'): 1,
 ('&', 'Sour'): 1,
 ('&', 'auml'): 3,
 ('&', 'ntilde'): 2,
 ('&', 'reg'): 51,
 ('&', 'trade'): 1,
 ("'", '('): 1,
 ("'", '1'): 1,
 ("'", '5'): 1,
 ("'", '6'): 1,
 ("'", 'Amazing'): 1,
 ("'", 'Fajitas'): 1,
 ("'", 'Green'): 1,
 ("'", 'Hungarian'): 1,
 ("'", 'Jamaican'): 1,
 ("'", 'Mince'): 1,
 ("'", 'Own®'): 1,
 ("'", 'Posole'): 1,
 ("'", 'Salmon'): 1,
 ("'", 'Soup'): 1,
 ("'", 'Stormy'): 2,
 ("'", 'ammonia'): 1,
 ("'", 'coating'): 2,
 ("'", 'liver'): 1,
 ("'", 'sugar'): 153,
 ("'", 'yeast'): 2,
 ("''", '('): 1,
 ("''", ')'): 1,
 ("''", '-long'): 1,
 ("''", 'Chicken'): 2,
 ("''", 'C

Get unique tokens and sort them in an ascending order

In [None]:
unique_tokens = sorted(list(set(tokens)))
unique_tokens

['!',
 '#',
 '%',
 '&',
 "'",
 "''",
 "'Bride",
 "'Calabacitas",
 "'Chinese",
 "'Fricot",
 "'Olivye",
 "'Otai",
 "'Three",
 "'ll",
 "'n",
 "'s",
 '(',
 ')',
 '*',
 ',',
 ',1',
 ',10',
 ',11',
 ',12',
 ',13',
 ',14',
 ',15',
 ',16',
 ',17',
 ',18',
 ',19',
 ',2',
 ',20',
 ',21',
 ',22',
 ',23',
 ',24',
 ',25',
 ',26',
 ',27',
 ',28',
 ',29',
 ',3',
 ',3-Ingredient',
 ',30',
 ',32',
 ',34',
 ',35',
 ',36',
 ',38',
 ',4',
 ',40',
 ',48',
 ',5',
 ',5-Ingredient',
 ',50',
 ',6',
 ',60',
 ',7',
 ',8',
 ',80',
 ',9',
 '-',
 '--',
 '-lean',
 '-long',
 '.',
 '...',
 '.063',
 '.18',
 '.24',
 '.25',
 '.7',
 '.75',
 '/',
 '0.6',
 '00',
 '1',
 '1-1/2',
 '1-1/2-inch',
 '1-inch',
 '1-inch-thick',
 '1-pound',
 '1.063',
 '1.12',
 '1.2',
 '1.25',
 '1.27',
 '1.41',
 '1.5',
 '1.75',
 '1.9',
 '1/2',
 '1/2-',
 '1/2-inch',
 '1/2-inch-long',
 '1/2-inch-thick',
 '1/2-pound',
 '1/2x1/4',
 '1/3',
 '1/3-inch',
 '1/4',
 '1/4-inch',
 '1/4-inch-thick',
 '1/8',
 '1/8-inch',
 '1/8-inch-thick',
 '10',
 '10.25',
 '10.5'

Combine bigrams of the same first word into a dictionary

In [None]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: result[x] for x in result.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('Garlic')

{'token': 'Garlic',
 'bigrams': [{',': 2},
  {',1': 4},
  {',3': 1},
  {'1': 1},
  {'10': 1},
  {'2': 2},
  {'Alfredo': 1},
  {'Beef': 1},
  {'Butter': 1},
  {'Cheddar': 1},
  {'Chicken': 4},
  {'Dill': 1},
  {'Fried': 2},
  {'Mashed': 1},
  {'Parmesan': 1},
  {'Paste': 1},
  {'Pizza': 1},
  {'Pork': 1},
  {'Potato': 1},
  {'Potatoes': 1},
  {'Prawns': 2},
  {'Rice': 1},
  {'Salsa': 1},
  {'Sauce': 5},
  {'Scalloped': 2},
  {'Shrimp': 2},
  {'Soup': 2},
  {'Spinach': 1},
  {'Teriyaki': 1},
  {'Tzatziki': 1},
  {'Wine': 1},
  {'Wings': 1},
  {'and': 3},
  {'without': 1},
  {'¼': 1}]}

Do the same to all the tokens to create a list of dictionaries

In [None]:
bigram_list = []
for value in unique_tokens:
    bigram_list.append(find_dict_tuple_key(value))
    
bigram_list

[{'token': '!',
  'bigrams': [{')': 2},
   {',': 1},
   {'1': 1},
   {'3': 1},
   {'Sangria': 1},
   {'cooking': 1},
   {'½': 1}]},
 {'token': '#', 'bigrams': [{'1': 1}]},
 {'token': '%',
  'bigrams': [{')': 2},
   {'-lean': 1},
   {'Blue': 1},
   {'agave': 1},
   {'alcohol': 1},
   {'cocao': 1},
   {'fat': 1},
   {'lean': 4},
   {'milk': 4},
   {'reduced': 1}]},
 {'token': '&',
  'bigrams': [{'Beans': 1},
   {'B®': 1},
   {'Onion': 1},
   {'Salad': 1},
   {'Sour': 1},
   {'auml': 3},
   {'ntilde': 2},
   {'reg': 51},
   {'trade': 1}]},
 {'token': "'",
  'bigrams': [{'(': 1},
   {'1': 1},
   {'5': 1},
   {'6': 1},
   {'Amazing': 1},
   {'Fajitas': 1},
   {'Green': 1},
   {'Hungarian': 1},
   {'Jamaican': 1},
   {'Mince': 1},
   {'Own®': 1},
   {'Posole': 1},
   {'Salmon': 1},
   {'Soup': 1},
   {'Stormy': 2},
   {'ammonia': 1},
   {'coating': 2},
   {'liver': 1},
   {'sugar': 153},
   {'yeast': 2}]},
 {'token': "''",
  'bigrams': [{'(': 1},
   {')': 1},
   {'-long': 1},
   {'Chicken': 

In [None]:
len(bigram_list)

6107

In [None]:
len(unique_tokens)

6107

## Numbers and placeholder

## POS tagging

# Create edit distance

# Create bigram

#Chunking/Phrases
