# Steamboat Squad

Import and load data

In [333]:
# from google.colab import drive
# drive.mount('/content/drive')

In [334]:
import json

with open("recipes_ingredients.json", "r") as json_file:
    recipes = json.load(json_file)
    
len(recipes)

4702

Overview of data structure. This is a list of dictionary, where each dictionary is a recipe with its name, ingredients and url

In [335]:
recipes[0]

{'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/',
 'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

Deleting url key

In [336]:
for recipe in recipes:
    del recipe['url']
recipes[0]

{'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

# Preprocessing Recipe Names
- Lower-casing (normalise words by using POS tagging)
- Change numbers to fix number (place holder)

NLTK has a help function that explains its POS tags.

In [337]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [338]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Using %%capture, save the NLTK help text as a string

In [339]:
%%capture cap --no-stderr

nltk.help.upenn_tagset()

In [340]:
cap.stdout

'$: dollar\n    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n\'\': closing quotation mark\n    \' \'\'\n(: opening parenthesis\n    ( [ {\n): closing parenthesis\n    ) ] }\n,: comma\n    ,\n--: dash\n    --\n.: sentence terminator\n    . ! ?\n:: colon or ellipsis\n    : ; ...\nCC: conjunction, coordinating\n    & \'n and both but either et for less minus neither nor or plus so\n    therefore times v. versus vs. whether yet\nCD: numeral, cardinal\n    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n    seven 1987 twenty \'79 zero two 78-degrees eighty-four IX \'60s .025\n    fifteen 271,124 dozen quintillion DM2,000 ...\nDT: determiner\n    all an another any both del each either every half la many much nary\n    neither no some such that the them these this those\nEX: existential there\n    there\nFW: foreign word\n    gemeinschaft hund ich jeux habeas Haementeria Herr K\'ang-si vous\n    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n    terram 

Using RE, get all the tag names

In [341]:
import re

ALL_POS = re.findall(".*: +", cap.stdout)

for i, pos in enumerate(ALL_POS):
  ALL_POS[i] = pos.replace(': ', '')


ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '    ',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

In [342]:
ALL_POS.remove('    ')
ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

Create a function to pos tag a text

In [343]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def tag_pos(corpus):
    text=word_tokenize(corpus)
    return nltk.pos_tag(text)

tag_pos("This is a test sentence.")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('.', '.')]

Create a function that POS tag and returns words with specific POS

In [344]:
def get_words_with_pos(text, pos):
  tagged = tag_pos(text)
  return [t for t in tagged if t[1].startswith(pos)]

get_words_with_pos("This is a test sentence.", "NN")

[('test', 'NN'), ('sentence', 'NN')]

POS tag all recipe names

In [345]:
tagged_recipe_names = []

for i, recipe in enumerate(recipes):
  try:
    tagged_recipe_names.append(tag_pos(recipes[i]['name']))
  except Exception as e:
    pass

len(tagged_recipe_names)

4701

## Data cleaning for names based on POS tagging

Looking at the first 10 tagged recipe names, there is a need for pre-processing, as NLTK's tagging is confused by the letter casing.

In [346]:
tagged_recipe_names[:10]

[[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')],
 [('Pan', 'NNP'),
  ('de', 'FW'),
  ('Muertos', 'NNP'),
  ('(', '('),
  ('Mexican', 'NNP'),
  ('Bread', 'NNP'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Dead', 'NNP'),
  (')', ')')],
 [('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')],
 [('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')],
 [('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')],
 [('Spicy', 'JJ'),
  ('Korean', 'NNP'),
  ('Fried', 'NNP'),
  ('Chicken', 'NNP'),
  ('with', 'IN'),
  ('Gochujang', 'NNP'),
  ('Sauce', 'NNP')],
 [('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')],
 [('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')],
 [('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')],
 [('Tres', 'NNS'),
  ('Leches', 'NNP'),
  ('(', '('),
  ('Milk', 'NNP'),
  ('Cake', 'NNP'),
  (')', ')')]]

Create a function that returns all tagged words with the same tag. NLTK's POS tagging assumes that capitalized noun means proper noun (name).

In [347]:
def list_words_with_tag(tuple_list, pos):
  results = []
  for name in tuple_list:
    for tag in name:
      if tag[1] == pos:
        results.append(tag[0])
  return results

list_words_with_tag(tagged_recipe_names, "NNP")

['Asparagus',
 'Pan',
 'Muertos',
 'Mexican',
 'Bread',
 'Dead',
 'Creamy',
 'Au',
 'Gratin',
 'Potatoes',
 'Zuppa',
 'Toscana',
 'Teriyaki',
 'Sauce',
 'Korean',
 'Fried',
 'Chicken',
 'Gochujang',
 'Sauce',
 'Spaghetti',
 'Aglio',
 'Olio',
 'Garam',
 'Masala',
 'Easy',
 'Chorizo',
 'Street',
 'Tacos',
 'Leches',
 'Milk',
 'Cake',
 'Cabbage',
 'Rolls',
 'Gravy',
 'Shrimp',
 'Scampi',
 'Pasta',
 'Lemon',
 'Chicken',
 'Potato',
 'Bake',
 'Mexican',
 'Casserole',
 'Caldo',
 'Res',
 'Mexican',
 'Beef',
 'Soup',
 'Nogada',
 'Mexican',
 'Stuffed',
 'Poblano',
 'Peppers',
 'Walnut',
 'Sauce',
 'Apple',
 'Cake',
 'Flan',
 'Pork',
 'Chops',
 'Sauerkraut',
 'Spicy',
 'Thai',
 'Basil',
 'Chicken',
 'Pad',
 'Krapow',
 'Gai',
 'Spaghetti',
 'Cacio',
 'Pepe',
 'Chef',
 'John',
 'Chicken',
 'Kiev',
 'Chicken',
 'Onions',
 'Fajita',
 'Perfect',
 'Sushi',
 'Rice',
 'Baked',
 'Chicken',
 'German',
 'Potato',
 'Salad',
 'Miso',
 'Soup',
 'Mexican',
 'Rice',
 'II',
 'Haluski',
 'Labneh',
 'Lebanese',
 'Y

Get the number of each POS tag

In [348]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

In [349]:
def get_tag_number(tag_list):
  tag_numbers = []
  for tag in tag_list:
    for key, value in tag.items(): 
      new_dict = {key: len(value)}
    tag_numbers.append(new_dict)
  return tag_numbers

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 529},
 {')': 529},
 {',': 63},
 {'--': 0},
 {'.': 10},
 {':': 98},
 {'CC': 555},
 {'CD': 74},
 {'DT': 104},
 {'EX': 0},
 {'FW': 47},
 {'IN': 482},
 {'JJ': 1822},
 {'JJR': 4},
 {'JJS': 27},
 {'LS': 0},
 {'MD': 2},
 {'NN': 571},
 {'NNP': 13139},
 {'NNPS': 46},
 {'NNS': 307},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 72},
 {'PRP$': 20},
 {'RB': 33},
 {'RBR': 0},
 {'RBS': 1},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 24},
 {'VBD': 39},
 {'VBG': 50},
 {'VBN': 133},
 {'VBP': 10},
 {'VBZ': 22},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

Some names have numbers (CD). Some are obviously not numbers, like 'Figgy'

In [350]:
def get_values_from_dict_list(dict_list, key):
  values = []
  for d in dict_list:
    if key in d:
      values.append(d[key])
  return values

cd_tokens = get_values_from_dict_list(all_name_tags, 'CD')[0]
cd_tokens

['5',
 '16',
 '2',
 '13',
 '300',
 'Figgy',
 '3',
 '9',
 'Two',
 '9',
 '22',
 '10',
 '15',
 'One',
 '18',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 '21',
 'Four',
 '9',
 '65',
 '17',
 '14',
 '10',
 "'n",
 '15',
 '8',
 'Minestrone',
 'Four',
 '35',
 'Fly',
 '15',
 '23',
 '8',
 '15',
 '21',
 "That's-a",
 'Tex-Mex',
 '14',
 '17',
 'Five',
 '10',
 '18',
 '5',
 "'Otai",
 '17',
 '3',
 '17',
 '75',
 '17',
 '20',
 'Take-Out',
 '16',
 '12',
 'Three',
 "'Three",
 '15',
 '20',
 '16',
 '12',
 '15',
 '22',
 '12',
 'Three',
 '21',
 '21',
 '25',
 '7',
 '10',
 '19',
 '20']

Create a function that searches for recipe name with specific string

In [351]:
def find_value_with_char(dic_list, key, char):
  matches = []
  for recipe in dic_list:
    try:
      if char in recipe[key]:
        matches.append(recipe[key])
    except Exception as e:
      pass
  return matches

find_value_with_char(recipes, 'name', 'Figgy')

['Figgy Pudding']

'Three cup chicken' is indeed a name. On the other hand, numerics, such as 9 and 13 are not part of the actual names of dishes. So, numerics, instead of NLTK's CD, should be treated. This treatment should be done using regex.

In [352]:
for cd in cd_tokens:
  print(find_value_with_char(recipes, 'name', cd))

['Our 5 Best Avgolemono Soup Recipes', '5-Ingredient Mexican Casserole', '15 Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Chicken 65', 'Pan-Roasted 5-Spice Pork Loin', 'The 15 Most Iconic French Desserts', '35 Quick and Easy Chinese Dinners You Can Make at Home', '15 Essential North Indian Recipes', '15 Essential North Indian Recipes', '18 Easy Mexican Dishes With 5 Ingredients or Less', 'French 75 Cocktail', '15 Top-Rated Traditional German Christmas Cookies', '15 Traditional Italian Christmas Dinner Recipes', "25 Italian Cookies You'll Love"]
['16 German Recipes That Are Comfort Food Favorites', '16 Mexican-Inspired Casseroles for Family-Pleasing Dinners', '16 Essential Puerto Rican Recipes']
['2 Minute Cheese Quesadillas', "22 Recipes Using a Whole Baguette (That Aren't Sandwiches)", 'Our 21 Best Authentic Mexican Recipes', '23 Delicious Ways the World Cooks Pork Shoulder', '21 Easy Dinners That Start with Packaged Gnocchi', 'Our 20 B

Create a function that searches a regex pattern from a text

In [353]:
def searchWordsPatt(text, patt):
    array = re.findall(patt, text)
    return array

NUMPATTERN = r'[0-9]+'
searchWordsPatt("I want 1 cup of tea", NUMPATTERN)

['1']

Create a function that substitutes regex patterns with a given value

In [354]:
def searchReplacePatt(text, patt, new_val):
  return re.sub(patt, new_val, text)

NUMSPACEPATTERN = r'(\d+\s)'
searchReplacePatt("I want 1 cup of tea", NUMSPACEPATTERN, "")

'I want cup of tea'

searchReplacePatt, except it iterates recipe list

In [355]:
def searchReplacePattList(dict_list, patt, new_val, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
        except Exception as e:
            pass

searchReplacePattList, but adds a substring at given index

In [356]:
def searchReplaceAddPattList(dict_list, patt, new_val, substring, index=0, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
            added_string = list(dict_list[i][key]).insert(index, substring)
            dict_list[i][key]=''.join(added_string)
        except Exception as e:
            pass

Remove numerics from name

In [357]:
import re

p_recipes = recipes

searchReplacePattList(p_recipes, NUMSPACEPATTERN, "")

def retag(text_list, key):
  new_list = []
  for i, recipe in enumerate(text_list):
    try:
      new_list.append(tag_pos(recipes[i][key]))
    except Exception as e:
      pass
  return new_list

tagged_recipe_names = retag(p_recipes, "name")

Get the new remaining CD

In [358]:
new_cd_tokens = list_words_with_tag(tagged_recipe_names, "CD")
new_cd_tokens

['Figgy',
 'Two',
 'One',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 'Four',
 '65',
 "'n",
 'Minestrone',
 'Four',
 'Fly',
 "That's-a",
 'Tex-Mex',
 'Five',
 "'Otai",
 'Take-Out',
 'Three',
 "'Three",
 'Three']

The remaining numbers (CD) are part of actual recipe names

In [359]:
for cd in new_cd_tokens:
  print(find_value_with_char(p_recipes, 'name', cd))

['Figgy Pudding']
['Two-Ingredient Naan', 'Pollo alla Birra for Two']
['A Number One Egg Bread', 'One-Egg Egg Drop Soup', 'One Pot Thai-Style Rice Noodles', 'One-Pot Vegan Potato-Lentil Curry', 'One-Bite Thai "Flavor Bomb" Salad Wraps (Miang Kham)', 'Easy One-Skillet Ground Beef Burrito', 'One-Pot Greek Lemon Chicken and Rice']
['Tender Italian Baked Chicken', 'Tuscan Pork Tenderloin', 'Asian Pork Tenderloin', 'Italian Pork Tenderloin', 'Sweet and Sour Pork Tenderloin', 'Chipotle Crusted Pork Tenderloin', 'Ten Minute Szechuan Chicken', 'Thai Quivering Tenderloins', 'Spicy Pork Tenderloin', 'Chinese Pork Tenderloin', 'Grecian Pork Tenderloin', 'Havana Slow Cooker Pork Tenderloin', 'Curry Pork Tenderloin', 'Tender Juicy Skirt Steak  (Churrasco)', 'Spicy and Tender Corned Beef', 'Pan Roasted Pork Tenderloin with a Blue Cheese and Olive Stuffing']
['Flounder Mediterranean']
['Pastel de Tres Leches (Three Milk Cake)', 'Three-Meat Italian Meatballs', 'Three Cheese Manicotti II', 'Taiwanese-S

In [360]:
new_all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  new_all_name_tags.append(new_dic)

Can and 'll are the modal verbs found

In [361]:
md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
md_tokens

['Can', "'ll"]

'can' is caused by words such as Canadian, which is processed in next section. But, 'you'll love' is not part of recipe name and more of an expression

In [362]:
for md in md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Removing "You'll" and retagging new list

In [363]:
searchReplacePattList(p_recipes, r"(You'll Love)", "")
tagged_recipe_names = retag(p_recipes, "name")

'll' removed

In [364]:
new_md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
new_md_tokens

['Can']

In [365]:
for md in new_md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Replacing any "/" with "or" word

In [366]:
searchReplacePattList(p_recipes, r"\/", " or ")
tagged_recipe_names = retag(p_recipes, "name")

In [367]:
bracket_tokens = list(set(list_words_with_tag(tagged_recipe_names, "(")))
bracket_tokens

['(']

Examining brackers in names. Most of the words in brackets are translations

In [368]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

"(no red sauce here...golden)" needs to be removed

In [369]:
# Redundant descriptions
searchReplacePattList(p_recipes,  r"(no red sauce here...golden)", "")
searchReplacePattList(p_recipes, r"(From a Swede!)", "")
searchReplacePattList(p_recipes, r"(from a Chinese person)", "")
searchReplacePattList(p_recipes, r"(Now Vegetarian!)", "")
searchReplacePattList(p_recipes, r"a.k.a. ", "")
searchReplacePattList(p_recipes, r"(That Aren't Sandwiches)", "")

# Remove copyright symbol
searchReplacePattList(p_recipes, r"&reg;", "")
# Asian Sesame Seared or Grilled Tuna (Gluten Free) => Gluten Free Asian Sesame Seared or Grilled Tuna
searchReplaceAddPattList(p_recipes, r"(Gluten Free)", "", "glutten-free")
tagged_recipe_names = retag(p_recipes, "name")

In [370]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

Only three foreign words detected by NLTK, which is not true

In [371]:
fw_tokens = list(set(list_words_with_tag(tagged_recipe_names, "FW")))
fw_tokens

['et', 'de', 'Rassolnik']

From the three unique foreign words, these are the names

In [372]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Authentic Vietnamese Spring Rolls (Nem Ran Hay Cha Gio)', 'Kotlet Schabowy (Polish Breaded Pork Chop)', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Conchas (Mexican Sweet Bread)', 'Skillet Chicken Picante', 'Spaghetti Sauce', 'Roasted Pork Banh Mi (Vietnamese Sa

In [373]:
fw_names

['Favorite Apple Galette',
 'Sweet and Spicy Stir Fry with Chicken and Broccoli',
 'Empanadas Salte&ntilde;as',
 'Vietnamese Chicken Salad',
 'Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Chicken or Turkey Tetrazzini Deluxe',
 'Indian Vegetable Bhaji',
 'Conchas (Mexican Sweet Bread)',
 'Sweet and Sour Sauce II',
 'Gateau Breton',
 'Easy and Delicious Slow Cooker Cassoulet',
 "Charlie's Sweet Island Brussels Sprouts",
 'Yellow Rice with Vegetables',
 'Vietnamese Lemon Grass Chicken Curry',
 'Galette des Rois',
 'Best Yet Turkey Chili',
 'Pebber Nodder (Danish Christmas Cookies)',
 'Flash Baked Walleye Fillets',
 'Mojo Grilling Marinade',
 'Slow Cooker Guisado Verde',
 'Scandinavian Sweetheart Waffles',
 'Sicilian Homemade Ricotta Cheese',
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Mushroom Stuffed Beef Rouladen',
 'Zucchini Taco Skillet',
 'Easy Sheet Pan Beef Bulgogi',
 'Chicken Enchiladas with Green Chile Sauce (Salsa Verde)',
 'Sunnyside Burger with C

Names that both have foreign words and bracket

In [374]:
bracket_and_fw = [name for name in bracketed_names if name in fw_names]
bracket_and_fw

['Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Conchas (Mexican Sweet Bread)',
 'Pebber Nodder (Danish Christmas Cookies)',
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Chicken Enchiladas with Green Chile Sauce (Salsa Verde)',
 'Rigatoni al Segreto (Rigatoni with Secret Sauce)',
 'Vegan Spaghetti and (Beyond) Meatballs',
 'Mexican Chicken Soup with Rice (Caldo de Pollo con Arroz)',
 'Tokneneng (Filipino Street Food)',
 'Norwegian Butter Sauce (Sandefjordsmor)',
 'Rajas con Crema, Elote, y Queso (Creamy Poblano Peppers and Sweet Corn)',
 'Caldereta (Filipino Beef and Chorizo Stew)',
 'Asado de Puerco (Mexican Pork Stew)',
 "Paksiw na Pata (Pig's Feet Stew)",
 'Nuoc Cham (Vietnamese Dipping Sauce)',
 'Saltimbocca di Pollo alla Romana (Prosciutto-Stuffed Chicken Breast Roulades)',
 'Authentic Vietnamese Spring Rolls (Nem Ran Hay Cha Gio)',
 'Brazilian Style Flan (Pudim de Leite Condensado)',
 'Surullitos de Maiz (Cornmeal Sticks)',
 'Recipes Using a Whole Bag

Split the names into two names, one outside and one inside

In [375]:
BRACKET_REGEX = " \(.*\)"
def break_fw_bracket(name):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name2 = re.sub(BRACKET_REGEX, "", name)
    return name1, name2

print(break_fw_bracket("Hearty Caldo de Res (Mexican Beef Soup)"))
print(break_fw_bracket("Ukha (Russian Fish Soup)"))

('Mexican Beef Soup', 'Hearty Caldo de Res')
('Russian Fish Soup', 'Ukha')


Apply the split function. Delete old recipe with bracket and foreign words. In both of the new recipes, duplicate old ingredients.

In [376]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in bracket_and_fw:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

There are still remaining names with bracket, mostly due to the foreign words not being recognized.

In [377]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Tres Leches (Milk Cake)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', 'Kalbi (Korean BBQ Short Ribs)', 'Macaron (French Macaroon)', 'Atsara (Papaya Relish)', 'Authentic Chinese Egg Rolls ()', 'Greek Le

In [378]:
bracketed_names

['Dolmas (Stuffed Grape Leaves)',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Moroccan Harira (Bean Soup)',
 'Berbere (Ethiopian Spice)',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Calabacitas con Elote (Zucchini with Corn)',
 'Ghormeh Sabzi (Persian Herb Stew)',
 'Gobi Aloo (Indian Style Cauliflower with Potatoes)',
 "Ta'ameya (Egyptian Falafel)",
 'Haydari (Turkish Yogurt Dip)',
 'Gorton (French-Canadian Pork Spread)',
 "Tim Perry's Soup (Creamy Curry Cauliflower and Broccoli Soup)",
 'Faworki (Polish Chrusciki)',
 'Polish Golobki (Gawumpki)',
 'Armenian Stuffed Eggplant (Imam Bayildi)',
 'Arroz con Leche (Mexican Rice Pudding)',
 'Persian Cucumber Yogurt (Maast-o Khiar)',
 'Chipas (Argentinean Cheese Bread)',
 'Papas con Chorizo (Mexican Chorizo and Potatoes)',
 'Jamaican Saltfish Fritters (Stamp and Go)',
 'Albondigas (Meatballs) en Chipotle',
 'Greek Stuffed Tomatoes and Peppers (Yemista)',
 'German Potato Dumplings (Kartoffelkloesse)',
 'Sago Pudding (Gula Melaka

Most of the brackets are at the end of each name. For those that are in the middle, they are translations of one of the words in the name.

In [379]:
b_name_end = []
b_name_mid = []
for b_name in bracketed_names:
    if b_name.endswith(')'):
        b_name_end.append(b_name)
    else:
        b_name_mid.append(b_name)
        
b_name_end

['Dolmas (Stuffed Grape Leaves)',
 'Moroccan Harira (Bean Soup)',
 'Berbere (Ethiopian Spice)',
 'Calabacitas con Elote (Zucchini with Corn)',
 'Ghormeh Sabzi (Persian Herb Stew)',
 'Gobi Aloo (Indian Style Cauliflower with Potatoes)',
 "Ta'ameya (Egyptian Falafel)",
 'Haydari (Turkish Yogurt Dip)',
 'Gorton (French-Canadian Pork Spread)',
 "Tim Perry's Soup (Creamy Curry Cauliflower and Broccoli Soup)",
 'Faworki (Polish Chrusciki)',
 'Polish Golobki (Gawumpki)',
 'Armenian Stuffed Eggplant (Imam Bayildi)',
 'Arroz con Leche (Mexican Rice Pudding)',
 'Persian Cucumber Yogurt (Maast-o Khiar)',
 'Chipas (Argentinean Cheese Bread)',
 'Papas con Chorizo (Mexican Chorizo and Potatoes)',
 'Jamaican Saltfish Fritters (Stamp and Go)',
 'Greek Stuffed Tomatoes and Peppers (Yemista)',
 'German Potato Dumplings (Kartoffelkloesse)',
 'Sago Pudding (Gula Melaka)',
 'Korean Spicy Marinated Pork (Dae Ji Bool Gogi)',
 'Oyakodon (Japanese Chicken and Egg Rice Bowl)',
 'Zucchini e Pomodori Gratinati (Z

In [380]:
b_name_mid

['Lazy Golumpki (Stuffed Cabbage) Soup',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Albondigas (Meatballs) en Chipotle',
 'Lengua (Beef Tongue) Stew',
 'Lamb (Gosht) Biryani',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 "World's Best () Lasagna",
 'Seaweed (Nori) Soup',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Jeera (Cumin) Rice',
 'Besan (Gram Flour) Halwa',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Spicy Indian (Gujarati) Green Beans',
 'Korean Bean Curd (Miso) Soup',
 'Bee Sting Cake (Bienenstich) II',
 'Ulu (Breadfruit) Pancakes',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Coconut (Haupia) and Chocolate Pie',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad']

On the other hand, without parenthesis anymore, names with foregin words tagged are now clean

In [381]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Skillet Chicken Picante', 'Spaghetti Sauce', 'Vegetarian Korma', 'Fettuccini Carbonara', 'Kaese Spaetzle', 'Beef and Beet Borscht', 'Addictive Sweet Potato Burritos', "Chef John's French Omelette", 'Sweet and Spicy Stir Fry with Chicken and Broccoli', 'Simple Sweet and Spicy Chicken Wraps', 'Johnny Marzetti Casserole', 'Th

In [382]:
fw_names

['Favorite Apple Galette',
 'Sweet and Spicy Stir Fry with Chicken and Broccoli',
 'Empanadas Salte&ntilde;as',
 'Vietnamese Chicken Salad',
 'Vegetarian Pho',
 'Chicken or Turkey Tetrazzini Deluxe',
 'Indian Vegetable Bhaji',
 'Limber de Coco',
 'Sweet and Sour Sauce II',
 'Asado de Puerco',
 'Gateau Breton',
 'Easy and Delicious Slow Cooker Cassoulet',
 "Charlie's Sweet Island Brussels Sprouts",
 'Yellow Rice with Vegetables',
 'Vietnamese Lemon Grass Chicken Curry',
 'Galette des Rois',
 'Best Yet Turkey Chili',
 'Flash Baked Walleye Fillets',
 'Mojo Grilling Marinade',
 'Slow Cooker Guisado Verde',
 'Scandinavian Sweetheart Waffles',
 'Sicilian Homemade Ricotta Cheese',
 'Sopa de Tortilla',
 'Mushroom Stuffed Beef Rouladen',
 'Zwetschgendatschi',
 'Zucchini Taco Skillet',
 'Easy Sheet Pan Beef Bulgogi',
 'Sunnyside Burger with Chipotle Aioli',
 'Filet Mignons With Pepper Cream Sauce',
 'Homemade Pork Fried Rice',
 'Filipino Spaghetti',
 'Easy Mexican Chicken Spaghetti',
 'Vegetaria

For the remaining names with bracket at the end, split into two new recipe names

In [383]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Tres Leches (Milk Cake)
Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)
Spicy Thai Basil Chicken (Pad Krapow Gai)
Labneh (Lebanese Yogurt)
Indian Chicken Curry (Murgh Kari)
Keema Aloo (Ground Beef and Potatoes)
Turkish Eggs (Cilbir)
South African Melktert (Milk Tart)
Ukrainian Apple Cake (Yabluchnyk)
Spanish Garlic Shrimp (Gambas al Ajillo)
German Potato Dumplings (Kartoffelkloesse)
Apfelkuchen (Apple Cake)
Eggplant Caponata (Sicilian Version)
Chana Masala (Savory Indian Chick Peas)
Ricotta Pie (Old Italian Recipe)
Easy Blini (Russian Pancake)
Easy Bulgogi (Korean BBQ Beef)
Carne en su Jugo (Meat in its Juices)
Ghormeh Sabzi (Persian Herb Stew)
Puerto Rican Tostones (Fried Plantains)
Kalbi (Korean BBQ Short Ribs)
Macaron (French Macaroon)
Atsara (Papaya Relish)
Authentic Chinese Egg Rolls ()
Greek Lentil Soup (Fakes)
Lumpia (Shanghai version)
Northern Ontario Partridge (Ruffed Grouse)
Vampiros Mexicanos (Mexican Vampires)
Jamaican Saltfish Fritters (Stamp and Go)
Slo

For some reasons, need to run the cell twice

In [384]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Polish Noodles (Cottage Cheese and Noodles)
Oyakodon (Japanese Chicken and Egg Rice Bowl)
Papas Rellenas (Fried Stuffed Potatoes)
Blaukraut (German Red Cabbage)
Irish Boiled Dinner (Corned Beef)
True Dominican Sancocho (Latin 7-Meat Stew)
Blini (Russian Pancakes)
Oeufs Cocotte (Baked Eggs)
Ropa Vieja (Cuban Beef)
Lace Cookies (Florentine Cookies)
Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)
Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)
Roti Canai or Paratha (Indian Pancake)
Melanzana alla Parmigiana (Perfect Eggplant Parmigiana)
Pierogi (Traditional Polish Dumplings)
Nipples of Venus (Capezzoli di Venere)
Samosadilla (Samosa Quesadilla)
Bulgogi (Korean Barbecued Beef)
Sabaayad (Somali Flatbread)
Filipino Baked Milkfish (Baked Bangus)
Ash-e Reshteh (Persian Legume Soup)
Lentil and Cactus Soup (Mom's Recipe)
Ethiopian Cabbage and Potato Dish (Atkilt)
Finnish Kropser (Baked Pancakes)
Oma's Griessnockerlsuppe (Beef and Semolina Dumpling Soup)
Kewa Datshi 

Only the names with bracket in the middle of their names remain

In [385]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Classic Cuban Midnight (Medianoche) Sandwich', 'Spicy Indian (Gujarati) Green Beans', "World's Best () Lasagna", 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce', 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce', 'Bee Sting Cake (Bienenstich) II', 'Coconut (Haupia) and Chocolate Pie', 'Lamb (Gosht) Biryani', 'Jeera (Cumin) Rice', 'Pollo (Chicken) Fricassee from Puerto Rico', 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish', 'Lazy Golumpki (Stuffed Cabbage) Soup', 'Ulu (Breadfruit) Pancakes', 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican', 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce', 'Seaweed (Nori) Soup', 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms', 'Hawaiian Bruddah Potato Mac (Macaroni) Salad', 'Korean Bean Curd (Miso) Soup', 'Lengua (Beef Tongue) Stew', 'Albondigas (Meatballs) en Chipotle', 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding', 'Besan (Gram Flour) Halwa']


Mac and rapini is only synonymous the the one word before them. Otherwise, the bracketed words are synonymous to all the words before them combined.

In [386]:
bracketed_names

['Lazy Golumpki (Stuffed Cabbage) Soup',
 'Albondigas (Meatballs) en Chipotle',
 'Spicy Indian (Gujarati) Green Beans',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Korean Bean Curd (Miso) Soup',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Bee Sting Cake (Bienenstich) II',
 'Coconut (Haupia) and Chocolate Pie',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Lamb (Gosht) Biryani',
 'Jeera (Cumin) Rice',
 'Lengua (Beef Tongue) Stew',
 'Besan (Gram Flour) Halwa',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Seaweed (Nori) Soup',
 "World's Best () Lasagna",
 'Ulu (Breadfruit) Pancakes']

The names can still be duplicated into 2, except that the bracketed word replaces the words before in the second new name, treating them as synonyms.

In [387]:
def convert_bracket_synonym(name, num=0):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name1_suffix = name.split(')')[1]
    if num==0:
        name1 = name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, "", name)
    else:
        name1_prefix = name.split('(')[0]
        name1_prefix = name1_prefix[:-num]
        name1 = name1_prefix + name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, " ", name)
    return name1, name2

print(convert_bracket_synonym("Lamb (Gosht) Biryani"))
print(convert_bracket_synonym("Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce", 1))
print(convert_bracket_synonym("Hawaiian Bruddah Potato Mac (Macaroni) Salad", 1))

('Gosht Biryani', 'Lamb Biryani')
('Fusilli with RapiniBroccoli Rabe, Garlic, and Tomato Wine Sauce', 'Fusilli with Rapini , Garlic, and Tomato Wine Sauce')
('Hawaiian Bruddah Potato MacMacaroni Salad', 'Hawaiian Bruddah Potato Mac  Salad')


In [388]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_mid:
            newname1, newname2 = convert_bracket_synonym(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Classic Cuban Midnight (Medianoche) Sandwich
Spicy Indian (Gujarati) Green Beans
World's Best () Lasagna
Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce
Kimchi Jun (Kimchi Pancake) and Dipping Sauce
Bee Sting Cake (Bienenstich) II
Coconut (Haupia) and Chocolate Pie
Lamb (Gosht) Biryani
Jeera (Cumin) Rice
Pollo (Chicken) Fricassee from Puerto Rico
Fish Sinigang (Tilapia) - Filipino Sour Broth Dish
Lazy Golumpki (Stuffed Cabbage) Soup
Ulu (Breadfruit) Pancakes
Fried Chicken Chunks (Chicharrones De Pollo) Dominican
Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce
Seaweed (Nori) Soup
Vareniki (Russian Pierogi) with Potatoes and Mushrooms
Hawaiian Bruddah Potato Mac (Macaroni) Salad
Korean Bean Curd (Miso) Soup
Lengua (Beef Tongue) Stew
Albondigas (Meatballs) en Chipotle
Zito (Zhito or Koljivo) - Serbian Wheat Pudding
Besan (Gram Flour) Halwa


Successfully removed all brackets from recipe names

In [389]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))
bracketed_names

[]


[]

Dashes are mostly adjectives, but things like semi colon need to be removed. As for colons, its mostly translation. Semicolons are caused by K&auml;, which are dishes with special characters or German words.

In [390]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':', ';']

In [391]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

In [392]:
def remove_entry_with(dict_list, target, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            if target in dict_list[i]["name"]:
                dict_list.remove(dict_list[i])
        except Exception as e:
            pass

In [393]:
for semicolon in ["Quorn&trade;", "Sp&auml;tzle", "Tamales Oaxaque&ntilde;os", "K&auml;sesahnetorte", "Salte&ntilde;as"]:
    remove_entry_with(p_recipes, semicolon)
tagged_recipe_names = retag(p_recipes, "name")

Semi colons cleaned

In [394]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [395]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

For these 2 names, colons are used for describing

In [396]:
# Spaghetti alla Carbonara: the Traditional Italian Recipe => traditional Italian Spaghetti alla Carbonara
searchReplaceAddPattList(p_recipes, r": the Traditional Italian Recipe", "", "traditional Italian ")
# Grandma's Focaccia: Baraise Style => Grandma's Baraise Style Focaccia
searchReplaceAddPattList(p_recipes, r": Baraise Style", "", "Baraise Style ", index=10)
tagged_recipe_names = retag(p_recipes, "name")

Cleaned 2 names with colon. If the dashes are between a word, they are either part of a word's spelling or joining two words together, typically as an adjective. However, if it is between spaces, they are translations.

In [397]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [398]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Chicken French - Rochester, NY Style',
 'Velveting Chicken Breast, Chinese Re

But in some cases, they are words after the dashes describe the dish, such as Rochester, NY Style and Restaurant Style

In [399]:
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Chicken French - Rochester, NY Style
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Italian Subs - Restaurant Style
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Eggplant Parmesan - Gluten-Free
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Hot Pepper Sauce - A Trinidadian Staple
The Sarge's Goetta - German Breakfast Treat
Italian Sausage - Tuscan Style
Honey Milk Tea - Hong Kong Style
Mexican Lasagna - No Lasagna Noodles!
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Curry Pasta - Pakistani Style
Cauliflower and Potato Stir-Fry - East Indian Recipe
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco -

Replace or remove the remaining dashes that are surrounded by spaces

In [400]:
# Chicken French - Rochester, NY Style => Rochester, NY Style Chicken French
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Carnitas - Pressure Cooker => pressure cooker carnitas
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Italian Subs - Restaurant Style => restaurant style Italian subs
searchReplaceAddPattList(p_recipes, r" - Restaurant Style", "", "restaurant style ")
# Eggplant Parmesan - Gluten-Free => glutten-free Eggplant Parmesan
searchReplaceAddPattList(p_recipes, r" - Gluten-Free", "", "glutten-free ")
# Italian Sausage - Tuscan Style => Tuscan style Italian Sausage
searchReplaceAddPattList(p_recipes, r" - Tuscan Style", "", "Tuscan style ")
# Honey Milk Tea - Hong Kong Style => Hong Kong style Honey Milk Tea
searchReplaceAddPattList(p_recipes, r" - Hong Kong Style", "", "Hong Kong style ")
# Curry Pasta - Pakistani Style => Pakistani style Curry Pasta
searchReplaceAddPattList(p_recipes, r" - Pakistani Style", "", "Pakistani style ")
# Cauliflower and Potato Stir-Fry - East Indian Recipe => East Indian style Cauliflower and Potato Stir-Fry
searchReplaceAddPattList(p_recipes, r" - East Indian Recipe", "", "East Indian style ")
# German Potato Salad - Schwabisch Style => Schwabisch style German Potato Salad
searchReplaceAddPattList(p_recipes, r" - Schwabisch Style", "", "Schwabisch style ")
# Tilapia - Filipino Sour Broth Dish => Filipino Sour Broth tilapia
searchReplaceAddPattList(p_recipes, r"Tilapia - ", "", "tilapia", index=20)
# Fish Sinigang - Filipino Sour Broth Dish - Schwabisch Style => Filipino Sour Broth Sinigang fish
searchReplaceAddPattList(p_recipes, r"Fish Sinigang - ", "", "Sinigang fish", index=20)

# remove  - A Trinidadian Staple from Hot Pepper Sauce - A Trinidadian Staple
searchReplacePattList(p_recipes, r" - A Trinidadian Staple", "")
# remove  - German Breakfast Treat from The Sarge's Goetta - German Breakfast Treat
searchReplacePattList(p_recipes, r" - German Breakfast Treat", "")
# remove  - No Lasagna Noodles! from Mexican Lasagna - No Lasagna Noodles!
searchReplacePattList(p_recipes, r" - No Lasagna Noodles!", "")
# remove  - Not Just for Chicken from Sweet and Sour Jam - Not Just for Chicken
searchReplacePattList(p_recipes, r" - Not Just for Chicken", "")
                      
tagged_recipe_names = retag(p_recipes, "name")

In [401]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup'

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',

The remaining names with dashes surrounded by dashes are translations, which can be split into two names

In [402]:
colnames_to_split = []
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)
        colnames_to_split.append(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco - Coconut Tembleque
Kroppkakor - Swedish Potato Dumplings
Ladolemono - Lemon Oil Sauce for Fish or Chicken
Mie Goreng - Indonesian Fried Noodles
Vaselopita - Greek New Years Cake
Knedliky - Czech Dumpling with Sauerkraut
Zhito or Koljivo - Serbian Wheat Pudding
Zito - Serbian Wheat Pudding
Doro Wat: Ethiopian Chicken Dish


In [403]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in colnames_to_split:
            splits = re.split("( - )|(: )", p_recipes[i]["name"])
            newname1 = splits[0]
            newname2 = splits[len(splits)-1]
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

The remaining names with dash are those in words

In [404]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-']

In [405]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup', 'Ube-Macapuno Cake', 'Cuban-Style Yuca', 'Japanese-Style Cabbage Salad', "Jorge's Indian-Spice

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',
 'Korean-style Seaweed Soup',
 'Ube-Macapuno Cake',
 'Cuban-Style Yuca',
 'Japanese-Style Cabbage 

!, ? and . are found, which are odd for recipe names

In [406]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['!', '!', '!', '!', '.', '?']

The punctuations are mostly slang abbreviations and exclamations

In [407]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

['Real Canadian Butter Tarts, eh?']
["Our Top P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]
['Sangria! Sangria!', 'Oatmeal Apple Crisp To Die For!', "Sushi House Salad Dressing, It's ORANGE!"]


Remove the exclamations

In [408]:
searchReplacePattList(p_recipes, r"! Sangria!", "")
searchReplacePattList(p_recipes, r" To Die For!", "")
searchReplacePattList(p_recipes, r", It's ORANGE!", "")
searchReplacePattList(p_recipes, r", eh\?", "")
searchReplacePattList(p_recipes, r"Our Top ", "")

tagged_recipe_names = retag(p_recipes, "name")

Fullstops that remain are part of recipe names

In [409]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['.']

In [410]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

["P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]


Some 'that' can be found

In [411]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

['That', 'That', 'That', 'That']

The 'that's are used to add details, but not actual recipe name

In [412]:
for wdt in list(set(wdt_tokens)):
  print(find_value_with_char(p_recipes, 'name', wdt))

['German Recipes That Are Comfort Food Favorites', 'Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Tuscan Recipes That Reveal the Best of Italian Cooking', 'Easy Dinners That Start with Packaged Gnocchi', "That's-a Meatloaf", 'Favorite Recipes That Show Off Armenian Cuisine', 'Our Best Stir-Fry Recipes That Are Even Better Than Take-Out', 'Comforting Polish Cabbage Recipes That Are Family Favorites']


Remove

In [413]:
searchReplacePattList(p_recipes, r" That Are Comfort Food Favorites", "")
searchReplacePattList(p_recipes, r" That Deliver Big Flavor With Every Satisfying Bite", "")
searchReplacePattList(p_recipes, r" That Reveal the Best of Italian Cooking", "")
searchReplacePattList(p_recipes, r"That's-a ", "")
searchReplacePattList(p_recipes, r"Favorite Recipes That Show Off ", "")
searchReplacePattList(p_recipes, r" That Are Even Better Than Take-Out", "")
searchReplacePattList(p_recipes, r" That Are Family Favorites", "")

searchReplaceAddPattList(p_recipes, r" That Start with Packaged Gnocchi", "", "packaged gnocchi ", index=5)
tagged_recipe_names = retag(p_recipes, "name")

That removed

In [414]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

[]

There's some 'how's

In [415]:
wrb_tokens = list_words_with_tag(tagged_recipe_names, "WRB")
wrb_tokens

['How', 'How', 'How', 'How', 'How', 'How', 'How']

In [416]:
for wrb in list(set(wrb_tokens)):
  print(find_value_with_char(p_recipes, 'name', wrb))

['How to Make Coquilles Saint-Jacques', 'How to Make Bolognese Sauce', 'How to Make Beef Satay', 'How to Make Peanut Dipping Sauce', 'How to Make Tres Leches Cake', 'How to Make Cassoulet', 'How to Make Turkey Manicotti']


Remove the 'how's and keep only the name

In [417]:
searchReplacePattList(p_recipes, r"How to Make ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [418]:
list_words_with_tag(tagged_recipe_names, "WRB")

[]

There's some personal pronouns (possessive)

In [419]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'My',
 'Your',
 'Our',
 'Our',
 'Our',
 'My',
 'its']

In [420]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Our Best Avgolemono Soup Recipes', 'Our Best Authentic Mexican Recipes', 'Our Best Empanada Recipes', 'Our Best Indian Recipes for Beginner Cooks', 'Our Best Stir-Fry Recipes', 'Our Favorite German Potato Recipes', 'Say Aloha to Our Best Hawaiian Recipes']
['My Own Famous Stuffed Grape Leaves', 'My Best Chicken Piccata', 'My Favorite Sesame Noodles', 'My Chicken Parmesan', "My Mom's Greek Lemon Rice", 'My Fly Stir-Fry', 'My Chicken Pho Recipe', 'My Tangy German Potato Salad', 'My Big Fat Greek Baked Beans', "My Grandmother's French Dressing"]
['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']
['Sweet Recipes to Complete Your Indian Dinner', 'Melt-in-Your-Mouth Beef Cacciatore', 'Polish Recipes to Make Your Grandmother Proud']


Most can be removed

In [421]:
searchReplacePattList(p_recipes, r"Our ", "")
searchReplacePattList(p_recipes, r"Your ", "")
searchReplacePattList(p_recipes, r"Melt-in-Your-Mouth ", "")
searchReplacePattList(p_recipes, r"My Own ", "")
searchReplacePattList(p_recipes, r"My Best ", "")
searchReplacePattList(p_recipes, r"My Favorite ", "")
searchReplacePattList(p_recipes, r"My Mom's ", "")
searchReplacePattList(p_recipes, r"My Grandmother's ", "")
searchReplacePattList(p_recipes, r"My ", "")

tagged_recipe_names = retag(p_recipes, "name")

The remaining ones are misclassified tags by nltk

In [422]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['its']

In [423]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


There's some personal pronouns

In [424]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP")
prp_tokens

['I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'We',
 'I',
 'I',
 'I']

In [425]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Shrimp Egg Foo Young', 'Good for You Greek Salad', 'Egg Foo Young', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Young Coconut Jelly', 'Keto Egg Foo Young']
['West African Peanut Stew', 'Real Welsh Rarebit', 'Fabulous Wet Burritos', 'Mexican Wedding Cookies', 'Italian Wedding Cookies III', 'Beef Wellington', 'West African-Style Peanut Stew with Chicken', 'Party Italian Wedding Soup', 'West Coast Trail Cookies', 'Italian Wedding Cake', 'Weeknight Mexican Chicken Lasagna', 'Comforting Russian Soups for Fall and Winter Weather', 'Comforting Russian Soups for Fall and Winter Weather', 'West Indian Curried Chicken', 'Welsh Cakes', "Mrs Welch's Butter Tarts", 'Italian Wedding Cake Martini', 'West African Lime Cake', 'Hawaiian Wedding Cake II', 'Weeknight Wonton Soup', 'Traditional Welsh Rarebit', 'West African Peanut Soup', "We Be Jammin' Jamaican Banana Bread", 'Italian Wedding Soup II', 'Chocolate Mexican Wedding Cookies', 'Traditional Welsh Broth']
['German Apple Cake I', 'In

Not much to remove, since most are misclassified POS

In [426]:
searchReplacePattList(p_recipes, r" You Can Make at Home", "")

tagged_recipe_names = retag(p_recipes, "name")

Some base verbs can be removed

In [427]:
vb_tokens = list_words_with_tag(tagged_recipe_names, "VB")
vb_tokens

['Take',
 'Make',
 'Take',
 'Kedgeree',
 'Swordfish',
 'Serve',
 'Make',
 'Celebrate',
 'Chicken',
 'Pata',
 'aux',
 'Poulet',
 'Papa',
 'Tarte',
 'Pollo',
 'Pancake',
 'Dutch',
 'Kransekake',
 'Dish',
 'Pannekaken']

In [428]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Take Out-Style Fried Rice', 'Take The Night Off Slow Cooker Pineapple Chicken', 'Take-Out Fake-Out Pollo Con Crema', 'Chinese Take-Out Shrimp with Garlic']
['Kransekake']
['German Pancakes II', "Mom's Buttermilk Pancakes", 'Japanese-Style Fluffy Pancakes', 'Arvidson Swedish Pancakes', 'Easy Swedish Pancakes', 'Easy Potato Pancakes', 'Finnish Pancakes', 'Coconut Pancake Syrup', 'Japanese Souffle Pancakes', "Barbarella's German Pancakes", 'Pan-Fried Chinese Pancakes', 'The Best Ricotta Pancakes', "Chef John's Chinese Scallion Pancakes", 'Traditional Swedish Pancakes', 'Chinese Scallion Pancakes', 'Authentic Potato Pancakes', 'German Pancake with Buttermilk Sauce', 'German Puff Pancakes', 'Dutch Pancakes', 'Russian Pancake', 'Russian Cheese Pancakes', 'Czech Savory Potato Pancakes', 'Japanese Pancake', 'Dutch Mini Pancakes', 'Moroccan Pancakes', 'Polish Apple Pancakes', 'Russian Pancakes', 'Indian Pancake', 'Baked Pancakes', 'Kimchi Pancake and Dipping Sauce', 'Breadfruit Pancakes', 'Ul

Remove recipe names with instruction

In [429]:
searchReplacePattList(p_recipes, r" to Make at Home", "")
searchReplacePattList(p_recipes, r" to Make Grandmother Proud", "")
searchReplacePattList(p_recipes, r"Ways The World Makes Chicken And ", "")

searchReplaceAddPattList(p_recipes, r"Make Ahead ", "", "packaged gnocchi ")

tagged_recipe_names = retag(p_recipes, "name")

In [430]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Take Out-Style Fried Rice', 'Take The Night Off Slow Cooker Pineapple Chicken', 'Take-Out Fake-Out Pollo Con Crema', 'Chinese Take-Out Shrimp with Garlic']
['Kransekake']
['German Pancakes II', "Mom's Buttermilk Pancakes", 'Japanese-Style Fluffy Pancakes', 'Arvidson Swedish Pancakes', 'Easy Swedish Pancakes', 'Easy Potato Pancakes', 'Finnish Pancakes', 'Coconut Pancake Syrup', 'Japanese Souffle Pancakes', "Barbarella's German Pancakes", 'Pan-Fried Chinese Pancakes', 'The Best Ricotta Pancakes', "Chef John's Chinese Scallion Pancakes", 'Traditional Swedish Pancakes', 'Chinese Scallion Pancakes', 'Authentic Potato Pancakes', 'German Pancake with Buttermilk Sauce', 'German Puff Pancakes', 'Dutch Pancakes', 'Russian Pancake', 'Russian Cheese Pancakes', 'Czech Savory Potato Pancakes', 'Japanese Pancake', 'Dutch Mini Pancakes', 'Moroccan Pancakes', 'Polish Apple Pancakes', 'Russian Pancakes', 'Indian Pancake', 'Baked Pancakes', 'Kimchi Pancake and Dipping Sauce', 'Breadfruit Pancakes', 'Ul

Words like best and most can be removed

In [431]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

['Best', 'Most', 'Best']

In [432]:
for rbs in list(set(rbs_tokens)):
  print(find_value_with_char(p_recipes, 'name', rbs))

['Best Bobotie', 'Best Fried Walleye', 'Best Avgolemono Soup Recipes', "Chef John's Best German Recipes", 'The Best Thai Peanut Sauce', 'Best Ever Russian Beef Stroganoff', "Grandma's Best Ever Sour Cream Lasagna", 'Best Guacamole', 'Best Ever Slow Cooker Italian Beef Roast', 'The Best Pavlova', "Savannah's Best Marinated Portobello Mushrooms", 'Best Peanut Sauce', 'Best Ever Carne Asada Marinade', "Mom's Best Spaghetti Sauce", 'The Best Korean Chicken Recipes', 'Best Instant Pot Chicken Cacciatore', 'Best Ziti Ever', 'Best Authentic Mexican Recipes', 'Best Empanada Recipes', 'Best Ziti Ever with Sausage', 'Best Chicken Parmesan', 'Best Pernil Ever', 'The Best Ricotta Pancakes', 'Best Indian Recipes for Beginner Cooks', 'Best Hot Sauce', 'Best Ever Irish Soda Bread', 'Best Hummus', 'The Best Thai Tom Kha Soup Recipe', 'Best French Macarons', 'Best Falafel', "Gordo's Best of the Best Lasagna", 'The Best Classic Beef Stroganoff', 'Best Asian Slow Cooker Recipes', 'Best Cheesy Broccoli So

In [433]:
searchReplacePattList(p_recipes, r"Best Ever ", "")
searchReplacePattList(p_recipes, r"Best ", "")
searchReplacePattList(p_recipes, r" ever", "")
searchReplacePattList(p_recipes, r"The Most Iconic ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [434]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

[]

Adverbs with -ly can be removed, except for the misclassified ones mainly caused by foreign recipe names

In [435]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Absolutely',
 'Aebleskiver',
 'Incredibly',
 'Perfectly',
 'Absolutely',
 'Oven',
 'Perfectly',
 'Absolutely',
 'Heavenly',
 'Asiago',
 'Philly',
 'Family',
 'Deadly',
 'Yet',
 'Absolutely',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Here',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [436]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Aebleskiver', 'Dansk Aebleskiver']
['Heavenly Raspberry Dessert']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Yet Turkey Chili']
['Absolutely Fabulous Greek or House Dressing', 'Absolutely Amazing Ahi', 'Absolutely Delicious Stuffed Calamari', 'Absolutely Perfect Palak Paneer']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Perfectly Moist Irish Wheaten Bread', 'Perfectly Dry Roasted Chickpeas']
['No Tomato Paste Here']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Deadly Delicious Lasagna']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Soon Du Bu Jigae']
['Philly Cheesesteak Que

In [437]:
searchReplacePattList(p_recipes, r"Deadly Delicious ", "")
searchReplacePattList(p_recipes, r"Heavenly ", "")
searchReplacePattList(p_recipes, r"Perfectly ", "")
searchReplacePattList(p_recipes, r"Absolutely Fabulous ", "")
searchReplacePattList(p_recipes, r"Absolutely Amazing  ", "")
searchReplacePattList(p_recipes, r"Absolutely Delicious ", "")
searchReplacePattList(p_recipes, r"Absolutely Perfect ", "")

searchReplaceAddPattList(p_recipes, r"No Tomato Paste Here", "", "tomato paste")

tagged_recipe_names = retag(p_recipes, "name")

In [438]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Aebleskiver',
 'Incredibly',
 'Absolutely',
 'Oven',
 'Asiago',
 'Philly',
 'Family',
 'Yet',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [439]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Aebleskiver', 'Dansk Aebleskiver']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Yet Turkey Chili']
['Absolutely Amazing Ahi']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Soon Du Bu Jigae']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Philly Cheesesteak Quesadillas']
['Willard Family German Chocolate Cake', 'Mexican-Inspired Casseroles for Family-Pleasing Dinners', 'Chinese Happy Family', 'Family Sicilian Sauce and Meatballs', 'Greek Ground Beef Recipes Sure To Become Family Favorites']
['French Canadian Tourtiere', 'Traditional Frenc

In [440]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 62},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 506},
 {'CD': 23},
 {'DT': 96},
 {'EX': 0},
 {'FW': 67},
 {'IN': 464},
 {'JJ': 1897},
 {'JJR': 2},
 {'JJS': 1},
 {'LS': 0},
 {'MD': 0},
 {'NN': 659},
 {'NNP': 12712},
 {'NNPS': 36},
 {'NNS': 389},
 {'PDT': 0},
 {'POS': 346},
 {'PRP': 69},
 {'PRP$': 1},
 {'RB': 15},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 10},
 {'UH': 0},
 {'VB': 18},
 {'VBD': 39},
 {'VBG': 59},
 {'VBN': 139},
 {'VBP': 9},
 {'VBZ': 29},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Examining other POS in names

So as to get an idea of POS tagging in the later section

In [441]:
vbz_tokens = list_words_with_tag(tagged_recipe_names, "VBZ")
vbz_tokens

['Ties',
 'el',
 'Leaves',
 'al',
 'al',
 'Leaves',
 'au',
 'di',
 'Ways',
 'de',
 'al',
 'Breasts',
 'en',
 'e',
 'al',
 'Leaves',
 'Breasts',
 'Squares',
 'al',
 'di',
 'aux',
 'di',
 'Leaves',
 'au',
 'di',
 'di',
 'al',
 'en',
 'en']

In [442]:
vbp_tokens = list_words_with_tag(tagged_recipe_names, "VBP")
vbp_tokens

['Rellenos',
 'Greek',
 'Divine',
 'Wat',
 'Be',
 'en',
 'Mexicanos',
 'Rellenos',
 'en']

In [443]:
vbg_tokens = list_words_with_tag(tagged_recipe_names, "VBG")
vbg_tokens

['Seasoning',
 'Dressing',
 'Pudding',
 'Using',
 'Canning',
 'Pudding',
 'Velveting',
 'Pudding',
 'Pudding',
 'Pudding',
 'Seasoning',
 'Comforting',
 'Seasoning',
 'Pouding',
 'Pudding',
 'Amazing',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Dressing',
 'Comforting',
 'Pudding',
 'Making',
 'Comforting',
 'Pudding',
 'Dumpling',
 'Dipping',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Seasoning',
 'Filling',
 'Thanksgiving',
 'Stuffing',
 'Pudding',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Sizzling',
 'Topping',
 'Amazing',
 'Refreshing',
 'Comforting',
 'Dressing',
 'Using',
 'Seasoning',
 'Refreshing',
 'Pudding',
 'Pudding',
 'Pudding',
 'Ping',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Dumpling',
 'Pudding']

In [444]:
vbd_tokens = list_words_with_tag(tagged_recipe_names, "VBD")
vbd_tokens

['Braised',
 'Corned',
 'Corned',
 'Pickled',
 'Shredded',
 'Braised',
 'Fashioned',
 'Filled',
 'Corned',
 'Fashioned',
 'Pickled',
 'Braised',
 'Breaded',
 'Fried',
 'Grilled',
 'Braised',
 'Pickled',
 'Braised',
 'Braised',
 'Planked',
 'Corned',
 'Corned',
 'Braised',
 'Infused',
 'Corned',
 'Obsessed',
 'Pickled',
 'Pulled',
 'Roasted',
 'Broiled',
 'Pickled',
 'Roasted',
 'di',
 'Braised',
 'Braised',
 'Pickled',
 'Mulled',
 'Pickled',
 'Boiled']

In [445]:
rp_tokens = set(list(list_words_with_tag(tagged_recipe_names, "RP")))
rp_tokens

{'Hanout', 'Over'}

In [446]:
comma_tokens = set(list(list_words_with_tag(tagged_recipe_names, ",")))
comma_tokens

{','}

In [447]:
for c in list(set(comma_tokens)):
  print(find_value_with_char(p_recipes, 'name', c))

['Bow Ties with Sausage, Tomatoes and Cream', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Chicken, Spinach, and Cheese Pasta Bake', 'Super-Simple, Super-Spicy Mongolian Beef', 'Creamy Potato, Carrot, and Leek Soup', 'Beef, Mushroom and Guinness Pie', 'Easy, Chewy Flourless Peanut Butter Cookies', 'Filipino Steamed Rice, Cebu Style', 'Orange, Honey and Soy Chicken', 'Chicken Francese, Italian-Style', 'Duck with Honey, Soy, and Ginger', 'Steak, Onion, and Pepper Fajitas', 'Indian Carrots, Peas and Potatoes', 'Simple, Baked Finnan Haddie', 'Indian-Style Rice with Cashews, Raisins and Turmeric', 'Serbian Ground Beef, Veggie, and Potato Bake', 'Fried Rice with Ginger, Hoisin, and Sesame', 'Chard Lentil Soup, Lebanese-Style', 'Easy, Cheesy Tortellini Bake', 'Curried Cashew, Pear, and Grape Salad', 'Pork, Sauerkraut and Dumplings', 'Spinach, Feta, and Pine Nut Ravioli Filling', 'Bell Pepper, Tomato, and Potato Indian Curry', 'Mascarpone Pasta with Chicken, Bacon and Spinach', 'Past

In [448]:
jjr_tokens = list_words_with_tag(tagged_recipe_names, "JJR")
jjr_tokens

['Healthier', 'Lighter']

In [449]:
for j in list(set(jjr_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Lighter Mexican Meatloaf']
['Healthier Bang Bang Chicken in the Air Fryer', 'Healthier Swedish Meatballs', 'Healthier Pan-Fried Honey-Sesame Chicken', 'Healthier Chicken Enchiladas I', 'Healthier Honey-Sesame Chicken']


In [450]:
jjs_tokens = list_words_with_tag(tagged_recipe_names, "JJS")
jjs_tokens

['Oktoberfest']

In [451]:
for j in list(set(jjs_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Oktoberfest Chicken and Red Cabbage', 'Oktoberfest Potato Salad', 'Oktoberfest Chili', 'The Recipes to Celebrate Oktoberfest']


In [452]:
dt_tokens = list_words_with_tag(tagged_recipe_names, "DT")
dt_tokens

['a',
 'The',
 'No',
 'The',
 'the',
 'a',
 'the',
 'The',
 'the',
 'the',
 'a',
 'the',
 'the',
 'A',
 'a',
 'The',
 'a',
 'the',
 'the',
 'a',
 'a',
 'A',
 'The',
 'A',
 'the',
 'a',
 'a',
 'The',
 'a',
 'a',
 'The',
 'the',
 'The',
 'This',
 'The',
 'a',
 'a',
 'the',
 'The',
 'a',
 'a',
 'The',
 'a',
 'A',
 'the',
 'the',
 'No',
 'the',
 'a',
 'a',
 'The',
 'The',
 'a',
 'The',
 'the',
 'the',
 'The',
 'the',
 'a',
 'a',
 'The',
 'a',
 'the',
 'a',
 'The',
 'All',
 'The',
 'a',
 'the',
 'the',
 'the',
 'The',
 'The',
 'A',
 'a',
 'the',
 'a',
 'the',
 'The',
 'the',
 'a',
 'a',
 'a',
 'the',
 'a',
 'a',
 'the',
 'a',
 'An',
 'the',
 'a',
 'a',
 'a',
 'No',
 'a',
 'No']

In [453]:
for dt in list(set(dt_tokens)):
  print(find_value_with_char(p_recipes, 'name', dt))

["Grandma's Noodles II", 'Buche de Noel', 'Norwegian Lefse', 'Vegetarian Chinese Fried Noodles', 'No Fail Bean Pie', 'Vermicelli Noodle Bowl', 'Chicken Udon Noodle Soup', 'Peanut Butter Noodles', 'Lo Mein Noodles', 'Broccoli and Ramen Noodle Salad', 'Cabbage Balushka or Cabbage and Noodles', 'Chinese Fried Noodles', 'Jap Chae Korean Glass Noodles', "Chef John's No-Knead Ciabatta", 'Norwegian Krumkake', "Mom's Nova Scotia Seafood Chowder", 'Cabbage and Noodles', 'Ramen Noodle Soup', 'Asian Ground Beef Noodle Bowls', 'Udon Peanut Butter Noodles', 'Thai-Inspired Noodle Salad', 'Japanese Pan Noodles', 'Homemade Noodles', "Nong's Khao Man Gai", 'Fried Cabbage and Egg Noodles', 'No-Cream Pasta Primavera', 'Sesame Noodles', 'One Pot Thai-Style Rice Noodles', 'Nova Scotia Blueberry Cream Cake', 'Authentic Pad Thai Noodles', 'Thai Rice Noodle Salad', 'Thai Chicken Noodle Soup', "Victor's Non-Dairy Hawaiian Coconut Mochi Cake", 'Shrimp Noodle Soup', 'Nova Scotian Hodge Podge', 'No-Cook Chicken L

In [454]:
to_tokens = list_words_with_tag(tagged_recipe_names, "TO")
to_tokens

['to', 'na', 'to', 'to', 'to', 'To', 'to', 'na', 'na', 'na']

In [455]:
for to in list(set(to_tokens)):
  print(find_value_with_char(p_recipes, 'name', to))

['Super-Delicious Zuppa Toscana', 'Canadian Yellow Split Pea Soup with Ham', "Randy's Slow Cooker Ravioli Lasagna", 'Spinach Tomato Tortellini', 'Traditional Gyros', 'Cheese Lasagna', 'Creamy Chicken Lasagna', 'Chicken Parmigiana', 'French Canadian Tourtiere', 'Pipirrana', "Dash's Donair", 'American Lasagna', 'Taco Lasagna', "Bob's Stuffed Banana Peppers", 'Spaghetti alla Carbonara', 'Kalamata Olive Tapenade', 'Lyonnaise Potatoes', "Chef John's Lasagna", 'Original Homemade Italian Beef', 'Fettuccini Carbonara', 'Simply Traditional Lasagna', 'Spinach Cheese Manicotti', 'The Original Donair From the East Coast of Canada', 'Chicken and Shrimp Carbonara', 'Fried Empanadas', 'Authentic Paella Valenciana', 'Danish Cinnamon Snails', 'Panang Curry with Chicken', 'Easy Lasagna I', 'Eggplant Lasagna', 'Jamaican Fried Snapper', 'Mushrooms and Spinach Italian Style', 'Deep Dish Lasagna', 'Simple Spinach Lasagna', 'Quesadilla Salvadorena', 'Italian Chicken Marinade', 'Korean BBQ Chicken Marinade', 

Chicken is considered dollar?

In [456]:
dol_tokens = list_words_with_tag(tagged_recipe_names, "$")
dol_tokens

['Chicken']

It's a tagging error, so this can be ignored

In [457]:
for dol in dol_tokens:
  print(find_value_with_char(p_recipes, 'name', dol))

['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Enchilada Slow-Cooker Casserole', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Chicken Milanese', 'Chicken Massaman Curry', "Chef J

There are some quotation marks

In [458]:
quote_tokens = list_words_with_tag(tagged_recipe_names, "''")
quote_tokens

["''", "''", "'", "''", "''", "''", "''"]

Quotation marks are caused by possessive -'s

In [459]:
for quote in quote_tokens:
  print(find_value_with_char(p_recipes, 'name', quote))

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

 For now, leave the preprocessing of the recipe names first.

## Preprocessing of ingredients

Ingriendts are a lot more straightforward to preprocess, since recipe names have to be attractive to encourage user to click in

In [460]:
p_ingredients = []

for recipe in p_recipes:
    p_ingredients = p_ingredients + recipe['ingredients']
    
p_ingredients = list(set(p_ingredients))
len(p_ingredients)

19342

In [461]:
p_ingredients[:10]

[' fresh mushrooms, sliced ',
 '1 slice mild Cheddar cheese ',
 '2 (14.5 ounce) cans no-salt-added tomatoes, diced or crushed ',
 '2 quarts hot water ',
 '2 pounds pork tenderloin, cut into 1 1/2-inch cubes ',
 ' superfine sugar as needed ',
 '⅓ cup finely chopped dry roasted peanuts ',
 '1\u2009½ pounds cod ',
 '1 tablespoon finely chopped Chinese chives ',
 '8 chicken tenderloins ']

In [462]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = p_ingredients[i].strip()

p_ingredients[:10]

['fresh mushrooms, sliced',
 '1 slice mild Cheddar cheese',
 '2 (14.5 ounce) cans no-salt-added tomatoes, diced or crushed',
 '2 quarts hot water',
 '2 pounds pork tenderloin, cut into 1 1/2-inch cubes',
 'superfine sugar as needed',
 '⅓ cup finely chopped dry roasted peanuts',
 '1\u2009½ pounds cod',
 '1 tablespoon finely chopped Chinese chives',
 '8 chicken tenderloins']

A reusable function that re-tags ingredients

In [463]:
def retag_ingredients():
    tagged_recipe_ingredients = []

    for ingredient in p_ingredients:
        tagged_recipe_ingredients.append(tag_pos(ingredient))
        
    return tagged_recipe_ingredients

tagged_recipe_ingredients = retag_ingredients()
tagged_recipe_ingredients[:10]

[[('fresh', 'JJ'), ('mushrooms', 'NNS'), (',', ','), ('sliced', 'VBD')],
 [('1', 'CD'),
  ('slice', 'NN'),
  ('mild', 'NN'),
  ('Cheddar', 'NNP'),
  ('cheese', 'NN')],
 [('2', 'CD'),
  ('(', '('),
  ('14.5', 'CD'),
  ('ounce', 'NN'),
  (')', ')'),
  ('cans', 'VBZ'),
  ('no-salt-added', 'JJ'),
  ('tomatoes', 'NNS'),
  (',', ','),
  ('diced', 'VBD'),
  ('or', 'CC'),
  ('crushed', 'VBD')],
 [('2', 'CD'), ('quarts', 'NNS'), ('hot', 'JJ'), ('water', 'NN')],
 [('2', 'CD'),
  ('pounds', 'NNS'),
  ('pork', 'NN'),
  ('tenderloin', 'NN'),
  (',', ','),
  ('cut', 'VBD'),
  ('into', 'IN'),
  ('1', 'CD'),
  ('1/2-inch', 'JJ'),
  ('cubes', 'NNS')],
 [('superfine', 'NN'), ('sugar', 'NN'), ('as', 'IN'), ('needed', 'VBN')],
 [('⅓', 'JJ'),
  ('cup', 'NN'),
  ('finely', 'RB'),
  ('chopped', 'VBD'),
  ('dry', 'JJ'),
  ('roasted', 'VBN'),
  ('peanuts', 'NNS')],
 [('1', 'CD'), ('½', 'JJ'), ('pounds', 'NNS'), ('cod', 'VBP')],
 [('1', 'CD'),
  ('tablespoon', 'NN'),
  ('finely', 'RB'),
  ('chopped', 'VBD'),
  

Numbers need a placeholder

In [464]:
list_words_with_tag(tagged_recipe_ingredients, "CD")

['1',
 '2',
 '14.5',
 '2',
 '2',
 '1',
 '1',
 '1',
 '8',
 '2',
 '3',
 '1',
 '5',
 '1',
 '1',
 '1',
 '12',
 '8',
 '2',
 '1',
 '4',
 '1',
 '2',
 '1',
 '1',
 '2',
 '1',
 '1',
 '3',
 '5',
 '2',
 '1',
 '15',
 '2',
 '2',
 '3',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '4',
 '2',
 '4',
 '1',
 '1.25',
 '1',
 '2',
 '1',
 '2',
 '4',
 '3',
 '1',
 '1',
 '1',
 '3',
 '1',
 '4',
 '1',
 '1',
 '1',
 '16',
 '2',
 '6',
 '1',
 '2',
 '1',
 '3',
 '10.5',
 '1',
 '5.3',
 '8',
 '4',
 '4',
 '2',
 '3',
 '1',
 '1',
 '1',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '3',
 '16',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '1',
 '10',
 '3',
 '4',
 '5',
 '4',
 '1',
 '14',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '3',
 '1',
 '2.2',
 '6',
 '1',
 '1.5',
 '8',
 '4',
 '1',
 '2',
 '2',
 '2',
 '12',
 '3',
 '1',
 '1',
 '15',
 '1',
 '1',
 '2',
 '1',
 '1',
 '2',
 '3',
 '1/4',
 '3',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '1',
 '3',
 '26',
 '2',
 '1',
 '3',
 '1',
 '15.25',
 '2',
 '2',
 '18',
 '1',
 '1',
 '1/2',
 '12',
 '1',
 '1',


NLTK assumes fractions as JJ (adjectives)

In [465]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['fresh',
 'no-salt-added',
 'hot',
 '1/2-inch',
 '⅓',
 'dry',
 '½',
 'Chinese',
 'large',
 'garlic',
 'boneless',
 '⅓',
 'white',
 '⅓',
 'frozen',
 'napa',
 'refried',
 'small',
 'unpeeled',
 'red',
 '½',
 'sun-dried',
 '2-inch',
 '½',
 'such',
 'hot',
 'tablespoon',
 'Japanese',
 'such',
 '½',
 'tomato',
 'unpeeled',
 'large',
 'uncooked',
 'basmati',
 'hot',
 'green',
 '½',
 'white',
 '¾',
 'all-purpose',
 '¼',
 '¼',
 'medium-grain',
 'white',
 'red',
 'green',
 'salmon',
 'thin',
 '¼',
 'black',
 'mixed',
 'whole',
 'small',
 'white',
 'bite-size',
 'teaspoon',
 'whole',
 '½',
 '⅓',
 '½',
 '½',
 'white',
 '½',
 '½',
 'boneless',
 'salmon',
 'large',
 'thin',
 '½',
 'mixed',
 '½',
 'green',
 'beef',
 '¼',
 'Greek',
 '½',
 'minced',
 'garlic',
 '½',
 '⅓',
 'black',
 '½',
 'green',
 'bite-size',
 '1-inch',
 'French',
 'green',
 'large',
 'large',
 'large',
 '3/4-inch',
 '¼',
 'sour',
 '¾',
 'soft',
 'fresh',
 'white',
 'hot',
 '½',
 'fat-free',
 'sour',
 '½',
 '½',
 '½',
 'vegetable',

Create a function that converts any fraction in a text to integer

In [466]:
import unicodedata
from decimal import Decimal

def fraction_to_int(text):
  for i, char in enumerate(text):
    try:
      # unicode.numeric converts fractions such as ½ to decimal place, 0.25
      # remove trailing decimals, otherwise keep decimals
      text = text[:i] + str(Decimal(unicodedata.numeric(char)).normalize()) + text[i + 1:]
    except Exception as e:
      pass
  # Because number + fraction, such as 1 1/4 may be converted to 1 0, so use re.sub to remove
  text = re.sub("([0-9]+ [0])+", "4", text)
  return text

for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = fraction_to_int(p_ingredients[i])

tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['fresh mushrooms, sliced',
 '1 slice mild Cheddar cheese',
 '2 (14.5 ounce) cans no-salt-added tomatoes, diced or crushed',
 '2 quarts hot water',
 '2 pounds pork tenderloin, cut into 1 1/2-inch cubes',
 'superfine sugar as needed',
 '0.3333333333333333148296162562 cup finely chopped dry roasted peanuts',
 '4.5 pounds cod',
 '1 tablespoon finely chopped Chinese chives',
 '8 chicken tenderloins',
 '2 large cloves garlic, thinly sliced',
 '3 pounds boneless beef chuck roast',
 '1 tablespoon oil',
 '5 saffron threads',
 '0.3333333333333333148296162562 cup white vinegar',
 '0.3333333333333333148296162562 teaspoon ground coriander',
 '1 cup frozen corn',
 '1 cup halved grape tomatoes',
 '1 head napa cabbage, chopped',
 '12 (8 inch) flour tortillas']

By converting fractions into integers, NLTK stops seeing them as adjectives (JJ) and instead, they are considered numbers (CD)

In [467]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['fresh',
 'no-salt-added',
 'hot',
 '1/2-inch',
 'dry',
 'Chinese',
 'large',
 'garlic',
 'boneless',
 'cup',
 'white',
 'frozen',
 'napa',
 'refried',
 'small',
 'unpeeled',
 'red',
 'cup',
 'sun-dried',
 '2-inch',
 'such',
 'hot',
 'tablespoon',
 'Japanese',
 'such',
 'lamb',
 'tomato',
 'unpeeled',
 'large',
 'uncooked',
 'basmati',
 'hot',
 'green',
 'white',
 'all-purpose',
 'medium-grain',
 'white',
 'red',
 'green',
 'salmon',
 'thin',
 'cup',
 'black',
 'mixed',
 'whole',
 'small',
 'white',
 'bite-size',
 'teaspoon',
 'whole',
 'white',
 'boneless',
 'salmon',
 'large',
 'thin',
 'mixed',
 'green',
 'beef',
 'Greek',
 'minced',
 'garlic',
 'black',
 'green',
 'bite-size',
 '1-inch',
 'French',
 'green',
 'large',
 'large',
 'large',
 '3/4-inch',
 'sour',
 'soft',
 'fresh',
 'white',
 'hot',
 'cup',
 'fat-free',
 'sour',
 'vegetable',
 'phyllo',
 'frozen',
 'small',
 'masa',
 'vegetable',
 'chicken',
 'olive',
 'hot',
 'such',
 'desired',
 'bunch',
 'fresh',
 'cup',
 'green',


Replace all the numbers with placeholder of 4

In [468]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = searchReplacePatt(p_ingredients[i], NUMPATTERN, "4")
    
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['fresh mushrooms, sliced',
 '4 slice mild Cheddar cheese',
 '4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed',
 '4 quarts hot water',
 '4 pounds pork tenderloin, cut into 4 4/4-inch cubes',
 'superfine sugar as needed',
 '4.4 cup finely chopped dry roasted peanuts',
 '4.4 pounds cod',
 '4 tablespoon finely chopped Chinese chives',
 '4 chicken tenderloins',
 '4 large cloves garlic, thinly sliced',
 '4 pounds boneless beef chuck roast',
 '4 tablespoon oil',
 '4 saffron threads',
 '4.4 cup white vinegar',
 '4.4 teaspoon ground coriander',
 '4 cup frozen corn',
 '4 cup halved grape tomatoes',
 '4 head napa cabbage, chopped',
 '4 (4 inch) flour tortillas']

In [469]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens.remove('4')
new_cd_tokens

['beef4',
 'kalamata',
 'one',
 'seven',
 'provolone',
 'mozzarella',
 'zucchini',
 'millet',
 'zapallo',
 '4up',
 'ziti',
 'fontina',
 'xanthan',
 'marinara',
 'mascarpone',
 '4/4',
 'mostaccioli',
 '4.4',
 '4p',
 'yum',
 '4/4x4/4',
 'four',
 "za'atar",
 'yellow',
 'bleu',
 '4.4.4']

Define a function that returns ingredient with specific substring

In [470]:
def find_ingre_with_substring(sub):
    ingres = []
    for ingre in p_ingredients:
        matches = searchWordsPatt(ingre, sub)
        if len(matches)  > 0:
            ingres.append(ingre)
    return ingres

find_ingre_with_substring('4/4')

['4 pounds pork tenderloin, cut into 4 4/4-inch cubes',
 '4 pineapple, sliced 4/4-inch thick',
 '4 pounds venison, cut into 4/4 thick strips',
 '4 (4 4/4 inch) piece fresh ginger root, sliced, or to taste',
 '4 zucchini, sliced 4/4-inch thick',
 '4.4 pineapple, peeled and cut int4 4/4-inch dice',
 '4 pounds skinless, boneless chicken breast halves - cut into 4/4 inch strips',
 '4.4 cup warm wate4 degrees4/4 degrees C)',
 '4 ounces Spanish chorizo sausage, casing removed, sliced 4/4-inch thick',
 '4 (4/4 inch thick) slices fresh ginger root',
 '4 medium (4-4/4" dia)s sweet onion, peeled and chopped',
 '4 ounces serrano ham, cut into 4/4-inch cubes',
 '4 (4/4 inch thick) slice ginger, finely grated',
 '4.4 pounds russet potatoes, peeled and cut int4 4 4/4-inch thick slices',
 '4 large unpeeled purple eggplant, trimmed and cut into 4/4-inch thick slices',
 '4 large eggplants, peeled and sliced into 4/4-inch rounds',
 '4 pounds frozen calamari (tubes and tentacles), thawed, sliced into 4/4

Define a function that searches and replace specific regex pattern from ingredients

In [471]:
def search_edit_ingredient(regex, new_val):
    for i, ingre in enumerate(p_ingredients):
        p_ingredients[i] = searchReplacePatt(p_ingredients[i], regex, new_val)
        
search_edit_ingredient(r"4/4", "4.4")

find_ingre_with_substring('4/4')

[]

Remove copyright symbols

In [472]:
search_edit_ingredient(r"®", "")

find_ingre_with_substring('®')

[]

Remove 4p

In [473]:
find_ingre_with_substring('4p')

['4.4 4p warm milk (4 degrees F/4 degrees C)',
 '4.4 c4p4.4-inch long vermicelli']

In [474]:
search_edit_ingredient(r"c4p", "")
search_edit_ingredient(r"4p", "")

find_ingre_with_substring('4p')

[]

Change 4up back to 7up

In [475]:
find_ingre_with_substring('4up')

['4.4 4up 4% milk']

In [476]:
search_edit_ingredient(r"4up", "7up")

find_ingre_with_substring('7up')

['4.4 7up 4% milk']

Define a function that splits a list element into two new elements and deletes it

In [477]:
def split_ingre_to_two(target, search, retain_target=False):
    for i, ingre in enumerate(p_ingredients):
        if p_ingredients[i] == target:
            splits = re.split(search, p_ingredients[i])
            new_ingre1 = splits[0].strip()
            new_ingre2 = splits[1].strip()
            if retain_target:
                new_ingre2 = search.strip()
            del p_ingredients[i]
            p_ingredients.append(new_ingre1)
            p_ingredients.append(new_ingre2)

split_ingre_to_two('4.4 7up 4% milk', " 4% milk", retain_target=True)

find_ingre_with_substring('7up')

['4.4 7up']

In [478]:
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['fresh mushrooms, sliced',
 '4 slice mild Cheddar cheese',
 '4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed',
 '4 quarts hot water',
 '4 pounds pork tenderloin, cut into 4 4.4-inch cubes',
 'superfine sugar as needed',
 '4.4 cup finely chopped dry roasted peanuts',
 '4.4 pounds cod',
 '4 tablespoon finely chopped Chinese chives',
 '4 chicken tenderloins',
 '4 large cloves garlic, thinly sliced',
 '4 pounds boneless beef chuck roast',
 '4 tablespoon oil',
 '4 saffron threads',
 '4.4 cup white vinegar',
 '4.4 teaspoon ground coriander',
 '4 cup frozen corn',
 '4 cup halved grape tomatoes',
 '4 head napa cabbage, chopped',
 '4 (4 inch) flour tortillas']

Numbers are mostly cleaned

In [479]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens

['4',
 'beef4',
 'kalamata',
 'one',
 'seven',
 'provolone',
 'mozzarella',
 'zucchini',
 'millet',
 'zapallo',
 'ziti',
 '7up',
 'fontina',
 'xanthan',
 'marinara',
 'mascarpone',
 'mostaccioli',
 '4.4x4.4',
 '4.4',
 'yum',
 'four',
 "za'atar",
 'yellow',
 'bleu',
 '4.4.4']

Looking at the number of each POS tag for ingredient list

In [480]:
tagged_recipe_ingredients = retag_ingredients()

all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3744},
 {')': 3828},
 {',': 8512},
 {'--': 0},
 {'.': 23},
 {':': 304},
 {'CC': 3074},
 {'CD': 21788},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13401},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32987},
 {'NNP': 2411},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 2},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1725},
 {'VBD': 8949},
 {'VBG': 354},
 {'VBN': 3434},
 {'VBP': 646},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

In [481]:
colon_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ":")))
colon_tags

['--', '-', ':', ';']

In [482]:
for c in colon_tags:
    print(find_ingre_with_substring(c))

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']
['4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed', '4 pounds pork tenderloin, cut into 4 4.4-inch cubes', '4.4 cup sun-dried tomatoes, chopped', '4 carrots, cut into 4-inch chunks', '4.4 cups all-purpose flour, divided', '4 cups medium-grain white rice', '4 boneless chicken breast, cut into bite-size pieces', '4 green bell pepper, cut into bite-size pieces', '4 onion, sliced 4-inch thick and separated into rings', '4 pineapple, sliced 4.4-inch thick', '4.4 cup fat-free sour cream', '4.4 cup reduced-sodium soy sauce', '4 green bell pepper, cut into 4-inch pieces', '4 (4.4 ounce) package beef top sirloin, thinly sliced and cut into bite-size pieces', '4 pounds pork spareribs, cut into 4-inch pieces', '4.4 teaspoon garlic-pepper seasoning', '4 (4 ounce) can ranch-style beans', '4 zucchini, sliced 4.4-inch thick', '4 (4 ounce) can reduced-sodium beef broth', '4.4 cup extra-virgin olive o

In [483]:
find_ingre_with_substring("--")

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']

In [484]:
search_edit_ingredient(r"--", ",")

find_ingre_with_substring('--')

[]

Remove the hanging colons

In [485]:
find_ingre_with_substring(":")

['Dipping Sauce:',
 'Chipotle Mayonnaise:',
 'Fillings:',
 'Spice Blend:',
 'Gravy:',
 'Meatballs:',
 'Caramel:']

In [486]:
search_edit_ingredient(r":", "")

find_ingre_with_substring(':')

[]

In [487]:
find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 '4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick']

In [488]:
find_ingre_with_substring(', 4 g')

['4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick']

Remove the \(blanlk\) typo

In [489]:
search_edit_ingredient(r", 4 g; \(blank\)", ", 4g")

find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 '4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)']

In [490]:
split_ingre_to_two('4 raw chop with refuse, 4g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick', "; ")

find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 '4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)']

In [491]:
split_ingre_to_two("4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)", r"\(I like ")

find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 'white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)']

In [492]:
find_ingre_with_substring("/")

['4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4 (4.4 ounce) package corn bread/muffin mix',
 '4.4 cu4 warm water (4 degrees F/4 degrees C)',
 '4 cup warm milk (4 degrees F/4 degrees C)',
 '4.4 tablespoon Guacamole, salsa, and/or sour cream',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4.4  warm milk (4 degrees F/4 degrees C)',
 '4 tablespoons warm water (4 degrees F/4 degrees C)',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4 cup shredded Cheddar/Monterey Jack cheese blend',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4 tablespoons warm milk (4 degrees F/4 degrees C)',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4.4 cups warm wat4(4 degree4F/4 degrees C)',
 '4 (4 ounce) package round gyoza/potsticker wrappers',
 '4 tablespoons warm water (4 degrees F/4 degrees C)']

Replace / with or

In [493]:
search_edit_ingredient(r"\/", " or ")
find_ingre_with_substring("/")

[]

In [494]:
tagged_recipe_ingredients = retag_ingredients()

tagged_recipe_ingredients[:20]

[[('fresh', 'JJ'), ('mushrooms', 'NNS'), (',', ','), ('sliced', 'VBD')],
 [('4', 'CD'),
  ('slice', 'NN'),
  ('mild', 'NN'),
  ('Cheddar', 'NNP'),
  ('cheese', 'NN')],
 [('4', 'CD'),
  ('(', '('),
  ('4.4', 'CD'),
  ('ounce', 'NN'),
  (')', ')'),
  ('cans', 'VBZ'),
  ('no-salt-added', 'JJ'),
  ('tomatoes', 'NNS'),
  (',', ','),
  ('diced', 'VBD'),
  ('or', 'CC'),
  ('crushed', 'VBD')],
 [('4', 'CD'), ('quarts', 'NNS'), ('hot', 'JJ'), ('water', 'NN')],
 [('4', 'CD'),
  ('pounds', 'NNS'),
  ('pork', 'NN'),
  ('tenderloin', 'NN'),
  (',', ','),
  ('cut', 'VBD'),
  ('into', 'IN'),
  ('4', 'CD'),
  ('4.4-inch', 'JJ'),
  ('cubes', 'NNS')],
 [('superfine', 'NN'), ('sugar', 'NN'), ('as', 'IN'), ('needed', 'VBN')],
 [('4.4', 'CD'),
  ('cup', 'NN'),
  ('finely', 'RB'),
  ('chopped', 'VBD'),
  ('dry', 'JJ'),
  ('roasted', 'VBN'),
  ('peanuts', 'NNS')],
 [('4.4', 'CD'), ('pounds', 'NNS'), ('cod', 'NN')],
 [('4', 'CD'),
  ('tablespoon', 'NN'),
  ('finely', 'RB'),
  ('chopped', 'VBD'),
  ('Chinese',

## Examining other POS in ingredients

So as to get an idea of POS tagging in the later section

In [495]:
fw_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "FW")))
fw_tags

['s',
 'arbol',
 'mirin',
 'skin',
 'paprika',
 'herbes',
 'kalonji',
 'gallo',
 'miso',
 'bilbao',
 'vanilla',
 'pico',
 'de',
 'di',
 'kalamansi',
 'kielbasa']

In [496]:
rp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RP")))
rp_tags

['dashi', 'aside', 'off', 'tomato', 'out', 'up']

In [497]:
for rp in rp_tags:
    print(find_ingre_with_substring(" " + rp))

['4 (4 inch) piece dashi kombu (dried kelp) (Optional)', '4 teaspoon dashi granules', '4.4 teaspoon dashi granules', '4.4 cup prepared dashi stock', '4 (4 inch) piece dashi kombu (dried kelp)', '4 cups prepared dashi stock', '4.4 teaspoon white miso paste with dashi', '4 cups dashi stock, made with dashi powder', '4 tablespoon dashi granules', '4.4 teaspoons dashi no moto (instant dashi or fish-broth powder), available at Asian markets', '4 ounce dashi kombu (dried kelp)', '4 cups prepared dashi stock', '4 teaspoons dashi granules', '4.4 tablespoon dashi granules', '4.4 cups prepared dashi stock']
['4 cup chopped Chinese roast duck meat, skin and fat separated and set aside']
['4.4 bunch cilantro, stems cut off and leaves chopped', '4 ear fresh corn, kernels cut off', '4 bell peppers, tops cut off and seeded']
['4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed', '4 cup halved grape tomatoes', '4.4 cup sun-dried tomatoes, chopped', '4 cups tomato sauce, or to taste', '4 mediu

Lamb, lobster and leeks are supposed to be nouns!

In [498]:
rbr_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RBR")))
rbr_tags

['lamb', 'lobster', 'leeks']

In [499]:
wdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "WDT")))
wdt_tags

['whole']

In [500]:
pdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PDT")))
pdt_tags

['half']

In [501]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP")))
prp_tags

['you']

In [502]:
find_ingre_with_substring("you ")

['4 (4 ounce) packages garlic and herb couscous mix (or any flavor you prefer)']

In [503]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP$")))
prp_tags

['your']

In [504]:
find_ingre_with_substring("your")

['4 (4 ounce) package pasta, your choice of shape']

In [505]:
punc_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ".")))
punc_tags

['.', '!']

In [506]:
find_ingre_with_substring("!")

['4.4 cup Greek salad dressing, such as Yazzo!']

In [507]:
quote_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "''")))
quote_tags

["''"]

In [508]:
for q in quote_tags:
    print(find_ingre_with_substring("'"))

["4.4 cups coconut flakes (such as Baker's Angel Flake)", "4.4 cups confectioners' sugar", "4 frozen meatless vegetable meatballs (such as IKEA's frozen vegetable balls)", "4.4 tablespoons confectioners' sugar", "4 (4.4 ounce) can fat-free condensed cream of mushroom soup (such as Campbell's)", "4 teaspoons garlic pepper seasoning (such as SuzyQ's Santa Maria Valley Style Seasoning), or to taste", "4.4 cup cajeta, sweetened caramelized goat's milk syrup", "4 cups parboiled rice (such as Uncle Ben's)", "4 tablespoon confectioners' sugar, or to taste (Optional)", "4 Thai bird's eye chiles, halved lengthwise", "4.4 cup confectioners' sugar, for dusting", "4 tablespoons confectioners' sugar for dusting", "4 (4.4 ounce) can Campbell's Condensed Beef Broth", "sifted confectioners' sugar", "4 bird's eye chile, minced", "4.4 cup confectioners' sugar", "4 (4.4 ounce) package UNCLE BEN'S Jasmine READY RICE", "4 tablespoon golden syrup (such as Lyle's)", "4.4 cups confectioners' sugar", "4.4 cup 

How can these words be symbols?

In [509]:
list(set(list_words_with_tag(tagged_recipe_ingredients, "SYM")))

['lettuce',
 'mango',
 'breast',
 'choy',
 'mangoes',
 'leeks',
 'squash',
 'spinach',
 'thighs',
 'shrimp',
 'avocado',
 'beaten',
 'tomato',
 'kale',
 'mangos',
 'sauerkraut',
 'lemon',
 'basil',
 'avocados',
 'cucumber']

## Casing of recipe names

Because almost all words are capitalized by default in recipe name, need to correct the casing

In [510]:
all_recipe_names = []

for recipe in p_recipes:
    try:
        all_recipe_names.append(recipe['name'])
    except Exception as e:
        pass
    
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Create a corpus by joining all recipe names with \n, because the names were not literally a single text originally. Othwewise it will confuse the tokenisation

In [511]:
all_recipe_names_corpus = ("\n").join(all_recipe_names)

all_recipe_names_corpus

'Pan-Fried Asparagus\nCreamy Au Gratin Potatoes\nSuper-Delicious Zuppa Toscana\nSimple Teriyaki Sauce\nSpicy Korean Fried Chicken with Gochujang Sauce\nSpaghetti Aglio e Olio\nEasy Garam Masala\nEasy Chorizo Street Tacos\nRussian Cabbage Rolls with Gravy\nShrimp Scampi with Pasta\nGreek Lemon Chicken and Potato Bake\nEasy Mexican Casserole\nGerman Apple Cake I\nSpanish Flan\nGerman Pork Chops and Sauerkraut\nSpaghetti Cacio e Pepe\nChef John\'s Chicken Kiev\nIndian-Style Chicken and Onions\nFajita Seasoning\nPerfect Sushi Rice\nTender Italian Baked Chicken\nAuthentic German Potato Salad\nMiso Soup\nMexican Rice II\nSpongy Japanese Cheesecake\nChicken Katsu\nChicken Stir-Fry\nQuick Beef Stir-Fry\nEasy Authentic Mexican Rice\nHerbs de Provence\nGreek or House Dressing\nFrench Bread\nFocaccia Bread\nJamaican Fried Dumplings\nGluehwein\nCoquilles Saint-Jacques\nMexican-Style Chicken Taco Casserole\nRosemary Braised Lamb Shanks\nMake-Ahead Vegetarian Moroccan Stew\nCurry Stand Chicken Tikka

Tokenize

In [512]:
import nltk

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))
recipe_tokens[:10]

['Varuval',
 'Up',
 'Japan',
 'Roulades',
 'Pasties',
 'Florets',
 'Yia',
 'Captain',
 'Roll',
 'Candied']

In [513]:
len(recipe_tokens)

3271

Join ingredients into a text with \n and tokenize

In [514]:
ingredients_corpus = ("\n").join(p_ingredients)

ingredients_corpus

'fresh mushrooms, sliced\n4 slice mild Cheddar cheese\n4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed\n4 quarts hot water\n4 pounds pork tenderloin, cut into 4 4.4-inch cubes\nsuperfine sugar as needed\n4.4 cup finely chopped dry roasted peanuts\n4.4 pounds cod\n4 tablespoon finely chopped Chinese chives\n4 chicken tenderloins\n4 large cloves garlic, thinly sliced\n4 pounds boneless beef chuck roast\n4 tablespoon oil\n4 saffron threads\n4.4 cup white vinegar\n4.4 teaspoon ground coriander\n4 cup frozen corn\n4 cup halved grape tomatoes\n4 head napa cabbage, chopped\n4 (4 inch) flour tortillas\n4 ears corn on the cob, cut into quarters\n4 tablespoon vanilla extract\n4 cups refried beans, divided\n4 small unpeeled red potato, shredded\n4.4 cup sun-dried tomatoes, chopped\n4 carrots, cut into 4-inch chunks\n4 egg\n4.4 cups coconut flakes (such as Baker\'s Angel Flake)\n4 tablespoons milk (Optional)\n4 teaspoon hot chile paste\n4 tablespoon Japanese mayonnaise (such as Kewpie)

In [515]:
ingre_tokens = list(set(nltk.word_tokenize(ingredients_corpus)))
ingre_tokens[:10]

['super',
 'espresso',
 'serrano',
 'breakfast',
 'Oaxaca',
 'halved',
 'spears',
 'apricot',
 'no-boil',
 'a']

In [516]:
len(ingre_tokens)

2815

Most words in recipe tokens are capitalized

In [517]:
lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['a',
 'z',
 'aka',
 'laziale',
 'version',
 'to',
 "all'Amatriciana",
 'on',
 'and',
 'for',
 'bil',
 'aux',
 'na',
 'con',
 'powder',
 'chili',
 'el',
 'alla',
 'without',
 'al',
 'e',
 'or',
 'the',
 'au',
 'et',
 'su',
 'le',
 'from',
 'la',
 'y',
 'by',
 'de',
 'in',
 'des',
 'of',
 'over',
 'with',
 'en',
 "l'Oignon",
 'its',
 'di',
 'nach',
 'sa']

Number of words that are not capitalized increased significantly crosschecking with lowercase words in ingredient tokens

In [518]:
for i, name in enumerate(recipe_tokens):
    for ingre in ingre_tokens:
        if recipe_tokens[i].lower() == ingre:
            recipe_tokens[i] = recipe_tokens[i].lower()

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
len(lower_recipe_tokens)

923

In [519]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

2314

In [520]:
upper_recipe_tokens[:20]

['Varuval',
 'Japan',
 'Roulades',
 'Pasties',
 'Yia',
 'Captain',
 'Frosting',
 'Ragu',
 'Frosted',
 'Lahanosalata',
 'Shakshuka',
 'Slow-Cooker',
 'Mess',
 'Cowboy',
 'Tablet',
 'Venezuelan',
 'Krupnikas',
 'Puto',
 'Shooters',
 'Aphrodisiac']

Use country names to get the words related to country names for capitalization

In [521]:
!pipenv install country_list

Installing country_list...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
[  ==] Installing country_list...
[ ===] Installing country_list...
[====] Installing country_list...
[=== ] Installing country_list...
[==  ] Installing country_list...
[=   ] Installing country_list...
[    ] Installing country_list...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
[  ==] Installing country_list...
[ ===] Installing country_list...
[====] Installing country_list...
[=== ] Installing country_list...
[==  ] Installing country_list.

In [522]:
from country_list import countries_for_language

countries = dict(countries_for_language('en'))
countries = list(countries.values())

countries

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua & Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia & Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean Netherlands',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo - Brazzaville',
 'Congo - Kinshasa',
 'Cook Islands',
 'Costa Rica',
 'Côte d’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egyp

Not all words are captured in the country names library, added some more.

In [523]:
countries = ' '.join([elem for elem in countries])
countries = countries.replace('&', '')
countries = countries.split(" ")
countries = [i.strip() for i in countries]
countries = [string for string in countries if string != ""]
countries = [string for string in countries if string != "-"]

countries = countries + ["Filipino", "Malay", "Spanish", "Danish", "Welsh", "Polish", "Schwabisch", "Rochester", "Asia",
                         "Aussie", "Greek", "German", "Mexica", "Hawaii", "Irish", "Mediterranean", "Middle", "East",
                        "Norwegian", "Persian", "Pollo", "Thai", "West"]

countries

['Afghanistan',
 'Åland',
 'Islands',
 'Albania',
 'Algeria',
 'American',
 'Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua',
 'Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia',
 'Herzegovina',
 'Botswana',
 'Bouvet',
 'Island',
 'Brazil',
 'British',
 'Indian',
 'Ocean',
 'Territory',
 'British',
 'Virgin',
 'Islands',
 'Brunei',
 'Bulgaria',
 'Burkina',
 'Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape',
 'Verde',
 'Caribbean',
 'Netherlands',
 'Cayman',
 'Islands',
 'Central',
 'African',
 'Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas',
 'Island',
 'Cocos',
 '(Keeling)',
 'Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Brazzaville',
 'Congo',
 'Kinshasa',
 'Cook',
 'Islands',
 'Costa',
 'Rica',
 'Côte',
 'd’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'C

Then use stemmer to get the stem of the words in the country names. But if the stem is too short, just use the first 5 characters of the word

In [524]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter = PorterStemmer()
lancaster=LancasterStemmer()

porter_c = []
lancester_c = []

for c in countries:
    port = porter.stem(c.split(' ').pop(0))
    if len(port) < 5:
        port = c[:4]
    porter_c.append(port.capitalize())
    lan = lancaster.stem(c.split(' ').pop(0))
    if len(lan) < 5:
        lan = c[:4]
    lancester_c.append(lan.capitalize())

print(porter_c[:10])
print(lancester_c[:10])

['Afghanistan', 'Åland', 'Island', 'Albania', 'Algeria', 'American', 'Samoa', 'Andorra', 'Angola', 'Anguilla']
['Afgh', 'Åland', 'Island', 'Alban', 'Alger', 'Amer', 'Samo', 'Andorr', 'Angol', 'Anguill']


In [525]:
lancester_c.append("Victoria")
lancester_c

['Afgh',
 'Åland',
 'Island',
 'Alban',
 'Alger',
 'Amer',
 'Samo',
 'Andorr',
 'Angol',
 'Anguill',
 'Antarctic',
 'Antigu',
 'Barbud',
 'Argentin',
 'Armen',
 'Arub',
 'Austral',
 'Austr',
 'Azerbaid',
 'Bahama',
 'Bahrain',
 'Bangladesh',
 'Barbado',
 'Belar',
 'Belg',
 'Beli',
 'Benin',
 'Bermud',
 'Bhut',
 'Boliv',
 'Bosn',
 'Herzegovin',
 'Botswan',
 'Bouvet',
 'Island',
 'Brazil',
 'Brit',
 'Indi',
 'Ocea',
 'Territ',
 'Brit',
 'Virgin',
 'Island',
 'Brune',
 'Bulgar',
 'Burkin',
 'Faso',
 'Burund',
 'Cambod',
 'Cameroon',
 'Canad',
 'Cape',
 'Verd',
 'Carib',
 'Netherland',
 'Caym',
 'Island',
 'Cent',
 'Afri',
 'Republ',
 'Chad',
 'Chil',
 'Chin',
 'Christmas',
 'Island',
 'Coco',
 '(keeling)',
 'Island',
 'Colomb',
 'Comoro',
 'Congo',
 'Brazzavil',
 'Congo',
 'Kinshas',
 'Cook',
 'Island',
 'Cost',
 'Rica',
 'Côte',
 'D’ivoire',
 'Croat',
 'Cuba',
 'Curaçao',
 'Cypr',
 'Czech',
 'Denmark',
 'Djibout',
 'Dominic',
 'Domin',
 'Republ',
 'Ecuad',
 'Egypt',
 'El',
 'Salvad',
 'E

Get all the recipe tokens that have the country names stem and remove the unrelated tokens

In [526]:
token_with_country_prefix = []
for rt in recipe_tokens:
    for lan in lancester_c:
        if lan in rt:
            token_with_country_prefix.append(rt)

token_with_country_prefix = sorted(list(set(token_with_country_prefix)))
token_with_country_prefix.remove("No-Cook")
token_with_country_prefix.remove("Man")
token_with_country_prefix.remove("Slow-Cooked")
token_with_country_prefix.remove("Slow-Cooker")
token_with_country_prefix.remove("Garlic-Anchovy-Sardine")
token_with_country_prefix

["'Chinese",
 'Afghan',
 'Afghani',
 'African',
 'African-Style',
 'Afritada',
 'Algerian',
 'Almond-Ricotta',
 'American',
 'Americano',
 'Arabic',
 'Argentine',
 'Argentinean',
 'Armenian',
 'Asiago',
 'Asian',
 'Asian-Inspired',
 'Asian-Style',
 'Asian-Themed',
 'Australian',
 'Bangladeshi',
 'Belgi',
 'Belgian',
 'Belizean',
 'Bermuda',
 'Bhutanese',
 'Bolivian',
 'Brazilian',
 'Brazilian-Style',
 'British',
 'Bulgarian',
 'Cambodian',
 'Canada',
 'Canadian',
 'Cape',
 'Capezzoli',
 'Caribbean',
 'Caribbean-Spiced',
 'Chad',
 'Chilaquiles',
 'Chilean',
 'Chilean-Style',
 'Chinese',
 'Chinese-Style',
 'Christmas',
 'Coco',
 'Coconut-Lentil',
 'Coconut-Lime',
 'Cocotte',
 'Colombian',
 'Cooker',
 'Cooks',
 'Cookup',
 'Costa',
 'Croatian',
 'Cuban',
 'Cuban-Inspired',
 'Cuban-Style',
 'Cubanos',
 'Curry-Coconut',
 'Czech',
 'Czechoslovakian',
 'Danielle',
 'Danish',
 'Dominican',
 'Dominican-Style',
 'East',
 'Easter',
 'Eastern',
 'Eastern-Style',
 'Egyptian',
 'Elizabeth',
 'Ellen',

In [527]:
token_with_country_prefix

["'Chinese",
 'Afghan',
 'Afghani',
 'African',
 'African-Style',
 'Afritada',
 'Algerian',
 'Almond-Ricotta',
 'American',
 'Americano',
 'Arabic',
 'Argentine',
 'Argentinean',
 'Armenian',
 'Asiago',
 'Asian',
 'Asian-Inspired',
 'Asian-Style',
 'Asian-Themed',
 'Australian',
 'Bangladeshi',
 'Belgi',
 'Belgian',
 'Belizean',
 'Bermuda',
 'Bhutanese',
 'Bolivian',
 'Brazilian',
 'Brazilian-Style',
 'British',
 'Bulgarian',
 'Cambodian',
 'Canada',
 'Canadian',
 'Cape',
 'Capezzoli',
 'Caribbean',
 'Caribbean-Spiced',
 'Chad',
 'Chilaquiles',
 'Chilean',
 'Chilean-Style',
 'Chinese',
 'Chinese-Style',
 'Christmas',
 'Coco',
 'Coconut-Lentil',
 'Coconut-Lime',
 'Cocotte',
 'Colombian',
 'Cooker',
 'Cooks',
 'Cookup',
 'Costa',
 'Croatian',
 'Cuban',
 'Cuban-Inspired',
 'Cuban-Style',
 'Cubanos',
 'Curry-Coconut',
 'Czech',
 'Czechoslovakian',
 'Danielle',
 'Danish',
 'Dominican',
 'Dominican-Style',
 'East',
 'Easter',
 'Eastern',
 'Eastern-Style',
 'Egyptian',
 'Elizabeth',
 'Ellen',

Possessives can also be used for capitalizing, since proper names like Chef John's occur a lot

In [528]:
possesive_tokens = list_words_with_tag(tagged_recipe_names, "''")
possesive_tokens

["''", "''", "'", "''", "''", "''", "''"]

In [529]:
possessive_names = []
for ps in possesive_tokens:
    print(find_value_with_char(p_recipes, 'name', ps))
    possessive_names = possessive_names + find_value_with_char(p_recipes, 'name', ps)

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

In [530]:
possessive_names

["Chef John's Chicken Kiev",
 "Angela's Awesome Enchiladas",
 "Randy's Slow Cooker Ravioli Lasagna",
 "'Chinese Buffet' Green Beans",
 "Chef John's Beef Rouladen",
 "Corned Beef and Cabbage Shepherd's Pie",
 "Gramma's Date Squares",
 "Authentic Russian Salad 'Olivye'",
 "Chef John's Meatless Meatballs",
 "Chef John's Beef Goulash",
 "Grandma's Noodles II",
 "Chef John's Clotted Cream",
 "Newfoundland Jigg's Dinner",
 "Chef John's Coq Au Vin",
 "Chef John's Loco Moco",
 "Dash's Donair",
 "Turkey Shepherd's Pie",
 "Papa Drexler's Bavarian Pretzels",
 "Bob's Stuffed Banana Peppers",
 "Chef John's Swedish Meatballs",
 "Chef John's German Recipes",
 "Chef John's Chicken Tikka Masala",
 "Maria's Mexican Rice",
 "Mom's Buttermilk Pancakes",
 "Geneva's Ultimate Hungarian Mushroom Soup",
 "Charley's Slow Cooker Mexican Style Meat",
 "Ingrid's Rouladen",
 "Chef John's Lasagna",
 "Lola's Horchata",
 "Chef John's Italian Sausage Chili",
 "Kid's Favorite Pizza Casserole",
 "Traci's Adobo Seasoning"

Chef John's Lasagna, but how about just lasagna? Saving both

In [531]:
non_possessive = []
for ps in possessive_names:
    if "'s " in ps:
        non_possessive.append(ps.split("'s ",1)[1].lower())

non_possessive

['chicken kiev',
 'awesome enchiladas',
 'slow cooker ravioli lasagna',
 'beef rouladen',
 'pie',
 'date squares',
 'meatless meatballs',
 'beef goulash',
 'noodles ii',
 'clotted cream',
 'dinner',
 'coq au vin',
 'loco moco',
 'donair',
 'pie',
 'bavarian pretzels',
 'stuffed banana peppers',
 'swedish meatballs',
 'german recipes',
 'chicken tikka masala',
 'mexican rice',
 'buttermilk pancakes',
 'ultimate hungarian mushroom soup',
 'slow cooker mexican style meat',
 'rouladen',
 'lasagna',
 'horchata',
 'italian sausage chili',
 'favorite pizza casserole',
 'adobo seasoning',
 'favorite slow-cooker thai chicken',
 'shrimp fra diavolo',
 'chicken paprikash',
 'french omelette',
 'pie',
 'hazelnut christmas cookies',
 'patatas bravas',
 'italian bread',
 'cuban bread',
 'pie',
 'chimichurri sauce',
 'easy german sauerbraten',
 'pie',
 'german marble cake',
 'steak pizzaiola',
 'sour cream lasagna',
 'beef shish kabobs',
 'polish perogies',
 'indian-spiced tomato lentil soup',
 'shep

In [532]:
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Create a copy of all_recipe_names as backup

In [533]:
all_recipe_names2 = all_recipe_names.copy()
all_recipe_names2[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Drop the recipe names that have possessives temporarily

In [534]:
print(len(all_recipe_names))
  
all_recipe_names2 = [ele for ele in all_recipe_names2 if ele not in possessive_names] 
print(len(all_recipe_names2))

5249
4890


If a word in a recipe does not belong to the tokens with country prefix, lowercase it by default

In [535]:
# https://stackoverflow.com/questions/40291443/python-convert-a-string-to-lowercase-except-some-special-strings/40291577
lowerAllExcept = lambda x: " ".join( a if a in token_with_country_prefix else a.lower()
                                    for a in x.split() )

for i, recipe in enumerate(all_recipe_names2):
    for t in token_with_country_prefix:
        all_recipe_names2[i] = lowerAllExcept(all_recipe_names2[i])

Join the names with possessives back to the list

In [536]:
all_recipe_names2 = all_recipe_names2 +  possessive_names
print(len(all_recipe_names2))
all_recipe_names2 = all_recipe_names2 +  non_possessive
print(len(all_recipe_names2))
all_recipe_names2 = list(set(all_recipe_names2))
print(len(all_recipe_names2))

5249
5577
5362


For some reasons, 'Thai' is saved as 'thai'

In [537]:
all_recipe_names_corpus = ("\n").join(all_recipe_names2)

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))

recipe_tokens = [i.replace('thai','Thai') for i in recipe_tokens]

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['pig-shaped',
 'ii',
 'super',
 'bredie',
 'arrabbiata',
 'number',
 'breakfast',
 'flesh-keek-luh',
 'apricot',
 'lemak',
 'callaloo',
 'ragu',
 'bowls',
 'a',
 'time',
 'solange',
 'banana-caramel',
 'maalat',
 'dolmathes',
 'rainbow',
 'koljivo',
 'gogi',
 'oyster',
 'pudding',
 'baklava',
 'honey-sesame',
 'exotic',
 'brasa',
 'nougat',
 'steam',
 'lahmahjoon',
 'chocolat',
 'boiled',
 'worth',
 'lemonade',
 'blue',
 'pretzels',
 'michigan',
 'stewed',
 'tenderloins',
 'out-style',
 'palusami',
 'beefs',
 'swedish',
 'tongue',
 'tilapia',
 'bowtie',
 'battenburg',
 'pibil',
 'super-delicious',
 'pacific',
 'shahi',
 'kawali',
 'gehakt',
 'masala',
 'jell-o',
 'under',
 'street',
 'hurry',
 'sweet-and-sour',
 'orleans',
 'kid-approved',
 'rois',
 'fresa',
 'old-school',
 'torta',
 'doo',
 'canton',
 'vegetarian',
 'marnier',
 'laotian',
 'maracuja',
 'shakshuka',
 'giniling',
 'make-ahead',
 'jalapeno',
 'melts',
 'vendor',
 'caprese',
 'sinful',
 'key',
 'favorites',
 'halushki',


In [538]:
len(lower_recipe_tokens)

2811

In [539]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

853

In [540]:
upper_recipe_tokens

['Japan',
 'Yia',
 'Captain',
 'Shells',
 'Slow-Cooker',
 'Venezuelan',
 'Buttermilk',
 'Recipes',
 'Puto',
 'Maharajah',
 'Chile',
 'Daddy',
 'Roman',
 'Grandma',
 'Men',
 'Cornish',
 'Singaporean',
 'Gai',
 'Easter',
 'Perry',
 "'Bride",
 'Carrie',
 "O'Brien",
 'Jeanie',
 'Mangonada',
 'Year',
 'Sans',
 'Corned',
 'Belgian',
 'Spanish-Style',
 'Zabaglione',
 'Persian-Inspired',
 'Christmas',
 'Nonna',
 'Tarte',
 'Posole',
 'Lanka',
 'Bulgarian',
 'Jorge',
 'Pizza',
 'Bob',
 'Wassail',
 'Mince',
 "'Olivye'",
 'Broccoli',
 'Mashed',
 'Scarlett',
 'Nut',
 'Puffs',
 'Sauerkraut',
 'Thighs',
 'Beans',
 'Croatian',
 'Louise',
 'Owen',
 'South',
 'Spinach',
 'Haitian',
 'Coconut-Lentil',
 'Tim',
 'Thera',
 'Asiago',
 'Jansson',
 'Ninabell',
 'Biddy',
 'Sarciado',
 'Allie',
 'Dressing',
 'Machaca',
 'Krista',
 'Strawberries',
 'Liberian',
 'Au',
 'Lazy',
 'Sofrito',
 'Al',
 'Lasagna',
 'Shortbread',
 'Honduran',
 'Marinade',
 'Perfect',
 'Sarita',
 'Aloo',
 'Mango-Pineapple',
 'Samoan',
 'Co

## Updating POS tags in names after changing casing

Previously, almost all the words belong to NNP or NNPS, due to capitalization. By fixing the letter casings, now most of the words are NN (common nouns)

In [541]:
final_tagged_names = []

for recipe in all_recipe_names2:
    final_tagged_names.append(tag_pos(recipe))

all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(final_tagged_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 0},
 {"''": 8},
 {'(': 0},
 {')': 0},
 {',': 63},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 509},
 {'CD': 28},
 {'DT': 101},
 {'EX': 3},
 {'FW': 46},
 {'IN': 523},
 {'JJ': 3132},
 {'JJR': 10},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 2},
 {'NN': 9070},
 {'NNP': 1724},
 {'NNPS': 5},
 {'NNS': 1426},
 {'PDT': 0},
 {'POS': 344},
 {'PRP': 3},
 {'PRP$': 1},
 {'RB': 61},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 4},
 {'SYM': 1},
 {'TO': 10},
 {'UH': 0},
 {'VB': 52},
 {'VBD': 261},
 {'VBG': 101},
 {'VBN': 232},
 {'VBP': 211},
 {'VBZ': 20},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Chunking (recipe names)

If the number of words in recipes are more than 2 (since bigram can deal with 2-word names), then it can be treated as a recipe name chunk

In [542]:
def sort_unique_list(old_list):
    return sorted(list(set(old_list)))

In [543]:
recipe_name_chunk = []

for recipe in all_recipe_names2:
    if len(recipe.split()) > 2:
        recipe_name_chunk.append(recipe)

recipe_name_chunk = sort_unique_list(recipe_name_chunk)

for n in recipe_name_chunk:
    print(n)

"million dollar" Chinese cabbage salad
"pantry raid" chicken enchilada casserole
"skinny" chicken tacos
'Chinese Buffet' Green Beans
3-ingredient lemon scones
5-ingredient Mexican casserole
A Firefighter's Meatloaf
A Scotsman's Shepherd Pie
Adriel's Chinese Curry Chicken
Afghan beef raviolis
Afghani kabli pulao
African cabbage stew
African chicken stew
African sweet potato and peanut soup
African sweet potato stew
African-Style oxtail stew
Al's Baked Swiss Steak
Al's Burmese Chicken Curry
Ali's Amazing Bruschetta
Alicia's Aloo Gobi
Allie's Mushroom Pizza
Alysia's Basic Meat Lasagna
Amanda's Stuffed Peppers
Andy's Spicy Green Chile Pork
Angela's Asian-Inspired Chicken Noodle Soup
Angela's Awesome Enchiladas
Anne's Chicken Chilaquiles Rojas
Arabic fattoush salad
Argentine chimichurri bread
Argentine meat empanadas
Argentinean cheese bread
Armenian Easter bread
Armenian shish kabob
Armenian stuffed eggplant
Asiago sun-dried tomato pasta
Asian beef with snow peas
Asian chicken salad
Asian 

gratin dauphinois de solange
great British fry up
grecian green beans in tomato sauce
grecian pork tenderloin
green banana fries
green bean curry
green bean rice
green chicken enchilada
green chicken enchiladas
green chicken tamales
green chile chicken enchilada casserole
green chile spinach quiche
green chile stew
green chili and cheese chicken
green chili stew
green coconut chicken
green herb rice
green hot sauce
green onion cakes
green tea cheesecake
green tea mochi ice cream
grilled "tandoori" lamb
grilled Asian asparagus
grilled Asian chicken
grilled Asian ginger pork chops
grilled Greek chicken
grilled Korean-Style beef short ribs
grilled Mexican steak
grilled Spanish mustard beef
grilled cheese of the gods
grilled chicken adobo
grilled chicken quesadillas
grilled chicken shawarma wraps with raita
grilled chicken spiedies
grilled chicken teriyaki skewers with miso ranch
grilled chicken thighs tandoori
grilled eggplant moussaka
grilled eisbein, pork shanks
grilled fish tacos with 

Get all the prepositions found by NLTK

In [544]:
in_tokens = sort_unique_list(get_values_from_dict_list(all_name_tags, 'IN')[0])

in_tokens

['Of',
 'Under',
 'arroz',
 'bayrischer',
 'before',
 'beyond',
 'brown',
 'by',
 'de',
 'dough',
 'en',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'pina',
 'so',
 'trout',
 'under',
 'with',
 'without',
 'worth']

Keep only the actual prepositions

In [545]:
in_tokens = ['Of',
 'Under',
 'before',
 'beyond',
 'by',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'so',
 'under',
 'with',
 'without']

in_tokens

['Of',
 'Under',
 'before',
 'beyond',
 'by',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'so',
 'under',
 'with',
 'without']

Get all the recipe names with prepositions

In [546]:
names_in_tokens = [s for s in all_recipe_names2 if any(xs in s for xs in in_tokens)]

names_in_tokens

['curry pineapple fried rice',
 'chanterelle risotto',
 'pudding',
 'classic clafouti',
 'steamed garlic prawns Chinese-Style',
 'spongy Japanese cheesecake',
 'Korean-style seaweed soup',
 'traditional gyros',
 'spinach casserole',
 'Ukrainian apple cake',
 'easy Irish colcannon',
 'velveting chicken breast, Chinese restaurant-style',
 'pasta with sardines',
 'eggs poached in tomato sauce',
 'ping gai',
 'chocolate Mexican wedding cookies',
 'Ukrainian meat filled cabbage rolls',
 'posole soup',
 'Irish chicken and dumplings',
 'chicken chow mein',
 'soon du bu jigae',
 'grilled salmon steaks Italian-Style',
 'wine cookies',
 'florentine stuffed chicken',
 'Chinese napa cabbage salad',
 'Norwegian potato dumplings',
 'Indian lentil soup',
 'cock a leekie soup',
 'chicken scallopini',
 'butter soup',
 'easy stuffed zucchini',
 'azteca soup',
 'fried rice with ham',
 'Mediterranean seafood soup',
 'Cambodian lemongrass chicken soup',
 'baby bok choy with garlic',
 'garlic and herb marin

Define a function that chunks based on grammar given, but only return chunk that have more than 2 words, since bigrams already can deal with phrases with 2 words anyway

In [547]:
from nltk import RegexpParser

def chunk(corpus, grammar, target):
    chunker = RegexpParser(grammar)
    tagged = pos_tag(word_tokenize(corpus))
    output = chunker.parse(tagged)
    outputs = []
    for subtree in output.subtrees(filter=lambda t: t.label() == target):
        result = re.sub("(\([A-Z]+ )|(\/[A-Z]+)|(\))+", "", str(subtree))
        if len(result.split()) > 2:
            outputs.append(result)
    return outputs

# https://github.com/nopynospy/pos_tagging/blob/main/pos.ipynb

PP_REGEX = r"""
  ADJP: {<RB>?<JJ|JJR|JJS|RBR|RBS>}    # Adjectives may have comparative and superlative, and come after adverbs like very
  NP: {<DT|WDT|WP$>?<CD>?<AdjP>*<NN|NNS|NNP|NNPS><POS>*<NN|NNS|NNP|NNPS|PP|CD>*<VBG>?}    # Determiner, number and adjectives come before nouns and nouns may have possessive -s and followed by another noun
  NP: {<PRP|EX|CD|WP|WRB|PRP$|WP$>}    # Pronouns and numbers can also replace nouns and function as one
  PP: {<IN>?<IN>?<IN|TO><NP>}    # Prepositions come before nouns and sometimes two prepositions come together
"""

chunk("chicken marsala with portobello mushrooms", PP_REGEX, "PP")

['with portobello mushrooms']

Get prepositional phrases from recipe names

In [549]:
prepositional_phrases = []

for name in names_in_tokens:
    prepositional_phrases = prepositional_phrases + (chunk(name, PP_REGEX, "PP"))
    
prepositional_phrases

['in tomato sauce',
 'de Pollo con arroz',
 'with bok choy',
 'with shiitake mushrooms',
 'in the oven',
 'with Mango-Pineapple slaw',
 'with berry sauce',
 'in tamarind broth',
 'with ground beef',
 'for the instant pot',
 'with miso ranch',
 'in chicken broth',
 'of the lasagna',
 'with Asiago cream sauce',
 'in a hurry',
 'with Mango-Habanero sauce',
 'with cucumber-yogurt sauce',
 'without a pasta machine',
 'in puff pastry',
 'with homemade taco seasoning',
 'with gochujang sauce',
 'with cream cheese',
 'in the instant pot',
 'in the air fryer',
 'with mango coulis',
 'in walnut sauce',
 'with peanut sauce',
 'with pearl sugar',
 'de chili chocolate cupcakes',
 'with chili cream cheese frosting',
 'without the chili powder',
 'of cauliflower soup',
 'in coconut milk',
 'for Mexican soups',
 'with peanut butter',
 'arroz con Pollo',
 'with peanut sauce',
 'with yogurt raita',
 'with coconut milk',
 'with cucumber sauce',
 'without the refry',
 'on the rocks',
 'with artichoke hear

## Chunking (ingredients)

In [550]:
all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3742},
 {')': 3827},
 {',': 8513},
 {'--': 0},
 {'.': 23},
 {':': 295},
 {'CC': 3094},
 {'CD': 21802},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13400},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32984},
 {'NNP': 2416},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 1},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1724},
 {'VBD': 8947},
 {'VBG': 354},
 {'VBN': 3436},
 {'VBP': 645},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

Get all the prepositions detected by NLTK from ingredients

In [551]:
in_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'IN')[0])

in_tokens

['OF',
 'about',
 'across',
 'against',
 'aji',
 'almond',
 'ancho',
 'aonori',
 'as',
 'at',
 'brown',
 'by',
 'de',
 'dough',
 'for',
 'from',
 'if',
 'in',
 'into',
 'nonfat',
 'nutmeg',
 'of',
 'on',
 'orzo',
 'out',
 'over',
 'pepper',
 'per',
 'pimento',
 'pinto',
 'taco',
 'tamarind',
 'through',
 'trout',
 'until',
 'with',
 'without',
 'wrapper']

Keep only actual prepositions

In [552]:
in_tokens = ['OF',
 'about',
 'across',
 'against',
 'as',
 'at',
 'by',
 'for',
 'from',
 'if',
 'in',
 'into',
 'of',
 'on',
 'out',
 'over',
 'per',
 'through',
 'until',
 'with',
 'without']

in_tokens

['OF',
 'about',
 'across',
 'against',
 'as',
 'at',
 'by',
 'for',
 'from',
 'if',
 'in',
 'into',
 'of',
 'on',
 'out',
 'over',
 'per',
 'through',
 'until',
 'with',
 'without']

Get all the ingredients with prepositions

In [553]:
ingres_in_tokens = [s for s in p_ingredients if any(xs in s for xs in in_tokens)]

ingres_in_tokens

['4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed',
 '4 quarts hot water',
 '4 pounds pork tenderloin, cut into 4 4.4-inch cubes',
 'superfine sugar as needed',
 '4.4 cup finely chopped dry roasted peanuts',
 '4 tablespoon finely chopped Chinese chives',
 '4 chicken tenderloins',
 '4 large cloves garlic, thinly sliced',
 '4 pounds boneless beef chuck roast',
 '4 tablespoon oil',
 '4 saffron threads',
 '4.4 cup white vinegar',
 '4.4 teaspoon ground coriander',
 '4 cup halved grape tomatoes',
 '4 (4 inch) flour tortillas',
 '4 ears corn on the cob, cut into quarters',
 '4 tablespoon vanilla extract',
 '4 small unpeeled red potato, shredded',
 '4.4 cup sun-dried tomatoes, chopped',
 '4 carrots, cut into 4-inch chunks',
 "4.4 cups coconut flakes (such as Baker's Angel Flake)",
 '4 tablespoons milk (Optional)',
 '4 teaspoon hot chile paste',
 '4 tablespoon Japanese mayonnaise (such as Kewpie)',
 '4 cups tomato sauce, or to taste',
 '4 tablespoons tamarind pulp',
 '4 pounds unpee

In [554]:
prepositional_phrases2 = []

for name in ingres_in_tokens:
    prepositional_phrases2 = prepositional_phrases2 + (chunk(name, PP_REGEX, "PP"))
    
prepositional_phrases2

['on the cob',
 "as Baker 's Angel Flake",
 'as Kitchen Bouquet',
 'in adobo sauce',
 'into 4 inch pieces',
 'into 4 pieces',
 'into 4 inch pieces',
 'ancho chile powder',
 'into 4 wedges',
 'into julienne strips',
 'of one lime',
 'into 4 inch pieces',
 'into 4 inch pieces',
 'brown sugar cones',
 'for osso buco',
 'as Montreal Steak Seasoning',
 'into 4 wedges',
 'into 4 inch pieces',
 'at room temperature',
 'as Phil Supreme',
 'to 4 % cocao',
 "as IKEA 's",
 'as Sun Gold',
 'at room temperature',
 'as RO * TEL',
 'as Green Giant',
 'at room temperature',
 'about 4 inches thick',
 'at room temperature',
 'into 4 wedges',
 'at room temperature',
 'of mushroom soup',
 "as Campbell 's",
 'into 4 pieces',
 'as RO * TEL Hot',
 'on paper towels',
 'for 4 minutes',
 "(PP\n  as\n  (NP\n    SuzyQ\n    's\n    Santa\n    Maria\n    Valley\n    Style\n    Seasoning",
 'as Cabot Seriously Sharp',
 'with Minced Pimientos',
 'as Archer Farms',
 'in 4 pieces',
 'into 4 inch pieces',
 'into 4.4 inc

Fix typo

In [555]:
prepositional_phrases2 = ["as SuzyQ's Santa Maria Valley Style Seasoning" if x=="(PP\n  as\n  (NP\n    SuzyQ\n    's\n    Santa\n    Maria\n    Valley\n    Style\n    Seasoning" else x for x in prepositional_phrases2]

prepositional_phrases2

['on the cob',
 "as Baker 's Angel Flake",
 'as Kitchen Bouquet',
 'in adobo sauce',
 'into 4 inch pieces',
 'into 4 pieces',
 'into 4 inch pieces',
 'ancho chile powder',
 'into 4 wedges',
 'into julienne strips',
 'of one lime',
 'into 4 inch pieces',
 'into 4 inch pieces',
 'brown sugar cones',
 'for osso buco',
 'as Montreal Steak Seasoning',
 'into 4 wedges',
 'into 4 inch pieces',
 'at room temperature',
 'as Phil Supreme',
 'to 4 % cocao',
 "as IKEA 's",
 'as Sun Gold',
 'at room temperature',
 'as RO * TEL',
 'as Green Giant',
 'at room temperature',
 'about 4 inches thick',
 'at room temperature',
 'into 4 wedges',
 'at room temperature',
 'of mushroom soup',
 "as Campbell 's",
 'into 4 pieces',
 'as RO * TEL Hot',
 'on paper towels',
 'for 4 minutes',
 "as SuzyQ's Santa Maria Valley Style Seasoning",
 'as Cabot Seriously Sharp',
 'with Minced Pimientos',
 'as Archer Farms',
 'in 4 pieces',
 'into 4 inch pieces',
 'into 4.4 inch slices',
 'into 4.4 inch pieces',
 'as Smart Bal

Get all the singular common nouns detected by NLTK from ingredients

In [556]:
nn_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'NN')[0])

nn_tokens

['%',
 '4.4-pound',
 'Caramel',
 'Class',
 'Italian',
 'Moist',
 'Oil',
 'SHAKE-N-BAKE',
 'TOUCH',
 'Yazzo',
 'acacia',
 'achiote',
 'acid',
 'acini',
 'adobo',
 'advieh',
 'agave',
 'ahi',
 'aisle',
 'alcohol',
 'ale',
 'allspice',
 'almond',
 'aluminum',
 'amani',
 'amaretto',
 'amarillo',
 'amber',
 'ammonia',
 'amount',
 'ancho',
 'anchovy',
 'angel',
 'anise',
 'annato',
 'annatto',
 'aperitif',
 'apple',
 'applesauce',
 'apricot',
 'arbol',
 'arborio',
 'arrachera',
 'arrowroot',
 'artichoke',
 'arugula',
 'asadero',
 'asafoetida',
 'asparagus',
 'au',
 'avocado',
 'avocados',
 'baby',
 'bacon',
 'bag',
 'baguette',
 'baking',
 'ball',
 'balsamic',
 'bamboo',
 'banana',
 'bananas',
 'bangus',
 'bar',
 'barbecue',
 'barbeque',
 'barley',
 'base',
 'basil',
 'basmati',
 'bass',
 'batter',
 'bay',
 'bean',
 'beaten',
 'bechamel',
 'bee4',
 'beech',
 'beef',
 'beer',
 'beeswax',
 'beet',
 'bell',
 'bella',
 'bellas',
 'beluga',
 'berry',
 'besan',
 'beverage',
 'bhaji',
 'bias',
 'bi

Get all the ingredients with the common nouns

In [557]:
ingres_nn_tokens = [s for s in p_ingredients if any(xs in s for xs in nn_tokens)]

ingres_nn_tokens

['fresh mushrooms, sliced',
 '4 slice mild Cheddar cheese',
 '4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed',
 '4 quarts hot water',
 '4 pounds pork tenderloin, cut into 4 4.4-inch cubes',
 'superfine sugar as needed',
 '4.4 cup finely chopped dry roasted peanuts',
 '4.4 pounds cod',
 '4 tablespoon finely chopped Chinese chives',
 '4 chicken tenderloins',
 '4 large cloves garlic, thinly sliced',
 '4 pounds boneless beef chuck roast',
 '4 tablespoon oil',
 '4 saffron threads',
 '4.4 cup white vinegar',
 '4.4 teaspoon ground coriander',
 '4 cup frozen corn',
 '4 cup halved grape tomatoes',
 '4 head napa cabbage, chopped',
 '4 (4 inch) flour tortillas',
 '4 ears corn on the cob, cut into quarters',
 '4 tablespoon vanilla extract',
 '4 cups refried beans, divided',
 '4 small unpeeled red potato, shredded',
 '4.4 cup sun-dried tomatoes, chopped',
 '4 carrots, cut into 4-inch chunks',
 '4 egg',
 "4.4 cups coconut flakes (such as Baker's Angel Flake)",
 '4 tablespoons milk (Opti

Filter for those without numbers at the beginning and make sure that each has at least 3 words

In [558]:
ingres_nn_tokens = [s for s in ingres_nn_tokens if not any(xs in s for xs in ["4", "4.4"]) and len(s.split()) > 2]

ingres_nn_tokens

['fresh mushrooms, sliced',
 'superfine sugar as needed',
 'chopped green onions for garnish',
 'fresh cracked black pepper to taste',
 'chopped tomatoes, for garnish',
 'ice cubes, or as needed',
 'salt and freshly ground pepper',
 'cayenne pepper, to taste',
 'chopped peanuts, or to taste',
 'sour cream for garnish',
 'kosher salt and freshly ground black pepper to taste',
 'salt and ground black pepper, to taste',
 'kosher salt, or to taste',
 'vegetable oil as needed',
 'coarse salt to taste',
 'coarse kosher salt',
 'chopped fresh cilantro, for garnish',
 'freshly grated Parmesan cheese',
 'finely grated Parmigiano-Reggiano cheese',
 'coarse salt as needed',
 'Lime wedges for serving',
 "sifted confectioners' sugar",
 'Goya Adobo with Pepper, to taste',
 'salt and ground black pepper',
 'salt and ground black pepper to taste',
 'Hot cooked regular long-grain white rice',
 'avocado oil cooking spray',
 'water, as needed',
 'Hog casing, rinsed well',
 'Freshly ground black pepper to

Get all the proper nouns from ingredients

In [559]:
nnp_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'NNP')[0])

nnp_tokens

['*',
 "Ac'cent",
 'Accent',
 'Adobo',
 'Agave',
 'Aji-No-Moto',
 'Ajinomoto',
 'Alcaparrado',
 'Aleppo',
 'Alfredo',
 'All-Purpose',
 'Aloha™',
 'Aluminum',
 'Anaheim',
 'Ancho',
 'Angel',
 'Angeli',
 'Angostura',
 'Annatto',
 'Arborio',
 'Archer',
 'Arthur',
 'Asafoetida',
 'Asiago',
 'Asian',
 'Authentic',
 'Azafran',
 'B',
 'BC',
 'BEN',
 'BOCA',
 'Bacardi',
 'Badia',
 'Baileys',
 'Baker',
 'Balance',
 'Barbeque',
 'Barilla',
 'Barolo',
 'Base',
 'Basics',
 'Basil',
 'Basmati',
 'Bavarian-style',
 'Bay',
 'Bay™',
 'Beaujolais',
 'Beef',
 'Ben',
 'Bengal',
 'Betty',
 'Beyond',
 'Bing',
 'Bisquick',
 'Black',
 'Blanc',
 'Blend',
 'Blue',
 'Bob',
 'Bold',
 'Bosc',
 'Boston',
 'Bouillon',
 'Bouquet',
 'Bragg',
 'Brand',
 'Branzino',
 'Bread',
 'Brie',
 'Broth',
 'Brown',
 'Brussels',
 'Buffalo',
 'Buitoni',
 "Bull's-Eye",
 'Buns',
 'Burgundy',
 'Butter',
 'Buttercream',
 'C',
 'Cabernet',
 'Cabot',
 'Cajun',
 'California',
 'Calimyrna',
 'Campari',
 'Campbell',
 'Canilla',
 'Canola',
 

Get all the ingredients with the proper noun

In [560]:
ingres_nnp_tokens = [s for s in p_ingredients if any(xs in s for xs in nnp_tokens)]

ingres_nnp_tokens

['4 slice mild Cheddar cheese',
 'superfine sugar as needed',
 '4 tablespoon finely chopped Chinese chives',
 "4.4 cups coconut flakes (such as Baker's Angel Flake)",
 '4 tablespoons milk (Optional)',
 '4 tablespoon Japanese mayonnaise (such as Kewpie)',
 '4 tablespoons white sugar, divided',
 '4 tablespoons chopped green bell pepper (Optional)',
 '4 (4.4 ounce) container Greek yogurt',
 '4 cup French green lentils',
 '4 tablespoon sour cream, or as desired (Optional)',
 '4.4 teaspoon lemon zest',
 '4 tablespoon browning sauce (such as Kitchen Bouquet), or as desired',
 '4 tablespoons grated Parmesan cheese',
 '4 tablespoon honey (Optional)',
 '4.4 teaspoon paprika (Optional)',
 '4 cup chopped Chinese roast duck meat, skin and fat separated and set aside',
 "4.4 cups confectioners' sugar",
 '4 pinch salt and black pepper to taste (Optional)',
 '4 tablespoons red Thai curry paste',
 '4 ounces crumbled Gorgonzola cheese',
 '4.4 tablespoons monosodium glutamate (MSG)',
 '4.4 cup superfine

Filter for those without numbers at the beginning and make sure that each has at least 3 words

In [561]:
ingres_nnp_tokens = [s for s in ingres_nnp_tokens if not any(xs in s for xs in ["4", "4.4"]) and len(s.split()) > 2]

ingres_nnp_tokens

['superfine sugar as needed',
 'freshly grated Parmesan cheese',
 'finely grated Parmigiano-Reggiano cheese',
 'Lime wedges for serving',
 "sifted confectioners' sugar",
 'Goya Adobo with Pepper, to taste',
 'Hot cooked regular long-grain white rice',
 'Hog casing, rinsed well',
 'Freshly ground black pepper to taste',
 'Salt and pepper, to taste',
 "confectioners' sugar for dusting",
 'Salt and black pepper to taste',
 'white sugar for decoration',
 'cooking spray (such as Pam)',
 'Goya Ground Black Pepper, to taste',
 'Kosher salt, to taste',
 'Water to cover',
 'Curry powder to taste',
 'Tomato ketchup and hot mustard or Kikkoman Sweet & Sour Sauce',
 'Finely chopped white onions',
 'Freshly grated lemon zest',
 'Chopped Italian parsley',
 'Salt and freshly ground pepper to taste',
 'sweet Thai basil',
 'Kosher salt and fresh cracked pepper to taste',
 'Canola oil, for frying',
 'Parsley or cilantro for garnish',
 'Goya Corn Oil, for frying',
 'Salt and pepper to taste',
 'Sriracha 

Define noun phrase rule

In [562]:
NP_REGEX = r"""
  ADJP: {<RB>?<JJ|JJR|JJS|RBR|RBS>}    # Adjectives may have comparative and superlative, and come after adverbs like very
  NP: {<DT|WDT|WP$>?<CD>?<AdjP>*<NN|NNS|NNP|NNPS><POS>*<NN|NNS|NNP|NNPS|PP|CD>*<VBG>?}    # Determiner, number and adjectives come before nouns and nouns may have possessive -s and followed by another noun
  NP: {<NP><,>*<NP>*<,>*<NP>*<CC>?<NP>}    # Multiple nouns can come in comma and 'and'
"""

chunk("salt and pepper", NP_REGEX, "NP")
# pos_tag("salt and pepper")

['salt and pepper']

These are the results of the chunking. Some chunks are used more than once

In [563]:
noun_phrases = []

for name in ingres_nn_tokens:
    noun_phrases = noun_phrases + (chunk(name, NP_REGEX, "NP"))
    
noun_phrases

['salt and ground',
 'coarse kosher salt',
 "confectioners ' sugar",
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'spicy cilantro chutney',
 'Salt and pepper',
 "confectioners ' sugar",
 'sea salt and ground',
 'Goya Ground Black Pepper',
 'cream or half-and-half',
 'mustard or Kikkoman Sweet',
 'Chopped Italian parsley',
 'Parsley or cilantro',
 'Goya Corn Oil',
 'salt and ground',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'Salt and pepper',
 'salt and ground',
 'paper candy cups',
 'Salt and ground',
 'cheesecloth and kitchen string',
 'coarse sea salt',
 'plain bread crumbs',
 'salt and pepper',
 'salt and pepper',
 'salt and ground pepper',
 'margarita or kosher salt',
 'Goya Hot Sauce',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'oil cooking spray',
 'kosher salt and ground',
 'salt and pepper',
 'buttons ,/, oyster ,/, shiitake',
 'portobello and crimini']

Fix typo

In [564]:
noun_phrases = ["buttons, oyster, shitake" if x=="buttons ,/, oyster ,/, shiitake" else x for x in noun_phrases]

noun_phrases

['salt and ground',
 'coarse kosher salt',
 "confectioners ' sugar",
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'spicy cilantro chutney',
 'Salt and pepper',
 "confectioners ' sugar",
 'sea salt and ground',
 'Goya Ground Black Pepper',
 'cream or half-and-half',
 'mustard or Kikkoman Sweet',
 'Chopped Italian parsley',
 'Parsley or cilantro',
 'Goya Corn Oil',
 'salt and ground',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'Salt and pepper',
 'salt and ground',
 'paper candy cups',
 'Salt and ground',
 'cheesecloth and kitchen string',
 'coarse sea salt',
 'plain bread crumbs',
 'salt and pepper',
 'salt and pepper',
 'salt and ground pepper',
 'margarita or kosher salt',
 'Goya Hot Sauce',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'oil cooking spray',
 'kosher salt and ground',
 'salt and pepper',
 'buttons, oyster, shitake',
 'portobello and crimini']

In [565]:
for name in ingres_nnp_tokens:
    noun_phrases = noun_phrases + (chunk(name, NP_REGEX, "NP"))
    
noun_phrases

['salt and ground',
 'coarse kosher salt',
 "confectioners ' sugar",
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'spicy cilantro chutney',
 'Salt and pepper',
 "confectioners ' sugar",
 'sea salt and ground',
 'Goya Ground Black Pepper',
 'cream or half-and-half',
 'mustard or Kikkoman Sweet',
 'Chopped Italian parsley',
 'Parsley or cilantro',
 'Goya Corn Oil',
 'salt and ground',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'Salt and pepper',
 'salt and ground',
 'paper candy cups',
 'Salt and ground',
 'cheesecloth and kitchen string',
 'coarse sea salt',
 'plain bread crumbs',
 'salt and pepper',
 'salt and pepper',
 'salt and ground pepper',
 'margarita or kosher salt',
 'Goya Hot Sauce',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'oil cooking spray',
 'kosher salt and ground',
 'salt and pepper',
 'buttons, oyster, shitake',
 'portobello and crimini',
 "confectioners ' sugar",
 'Salt and pepper',
 "confectioners ' sugar",
 'Goya Ground Black Pep

In [566]:
noun_phrases = sort_unique_list(noun_phrases)

noun_phrases

['Chopped Italian parsley',
 'Goya Corn Oil',
 'Goya Ground Black Pepper',
 'Goya Hot Sauce',
 'Parsley or cilantro',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Salt and ground',
 'Salt and pepper',
 'buttons, oyster, shitake',
 'cheesecloth and kitchen string',
 'clam juice cocktail',
 'coarse kosher salt',
 'coarse sea salt',
 "confectioners ' sugar",
 'cream or half-and-half',
 'kosher salt and ground',
 'margarita or kosher salt',
 'mustard or Kikkoman Sweet',
 'oil cooking spray',
 'paper candy cups',
 'plain bread crumbs',
 'portobello and crimini',
 'salt and ground',
 'salt and ground pepper',
 'salt and pepper',
 'sea salt and ground',
 'spicy cilantro chutney',
 'tomato and clam juice cocktail']

Fix typos

In [567]:
prepositional_phrases = sort_unique_list(prepositional_phrases + prepositional_phrases2)

prepositional_phrases = ["as Bull's-Eye Texas-Style Bold Barbeque Sauce" if x=="(PP\n  as\n  Bull's-Eye Texas-Style Bold Barbeque Sauce" else x for x in prepositional_phrases]
prepositional_phrases = ["as Grill Mates Montreal Chicken Seasoning" if x=="(PP\n  as\n  Grill Mates Montreal Chicken Seasoning" else x for x in prepositional_phrases]

prepositional_phrases

["as Bull's-Eye Texas-Style Bold Barbeque Sauce",
 'as Grill Mates Montreal Chicken Seasoning',
 'Of This World Spaghetti',
 'Under a Brick',
 'about 4 inches',
 'about 4 inches thick',
 'across the grain',
 'against the grain',
 'ancho chile powder',
 'arroz con Pollo',
 'as Aloha™ Shoyu',
 'as Archer Farms',
 'as Bacardi Coconut™',
 'as Badia Complete Seasoning',
 'as Badia Tropical',
 "as Baker 's Angel Flake",
 "as Baker 's German",
 'as Baker Fine Dessert Filling',
 'as Barilla Napoletana',
 'as Betty Crocker',
 'as Beyond Meat',
 'as Beyond Meat Beyond Beef',
 "as Bob 's Red Mill",
 'as Bob Evans',
 'as Cabernet Sauvignon',
 'as Cabot Seriously Sharp',
 "as Campbell 's",
 "as Campbell 's Healthy Request",
 "as Cavender 's",
 'as Chantaboon Rice Noodles',
 'as Chocolate Ibarra',
 'as Classico Cabernet Marinara',
 'as Coco Lopez',
 'as Cool Whip',
 'as Country Crock',
 'as De Cecco',
 'as Diamond Crystal',
 'as Diet Sprite',
 'as Duncan Hines',
 'as El Paso',
 'as El Pato',
 'as FA

Save all the phrases as a txt file

In [568]:
all_phrases = sort_unique_list(prepositional_phrases + noun_phrases)

with open('all_phrases.txt', 'w') as filehandle:
    for listitem in all_phrases:
        filehandle.write('%s\n' % listitem)

## Data merging and creating bigram

In [569]:
all_recipe_names2[:10]

['',
 'curry pineapple fried rice',
 'Mexican spaghetti',
 'chanterelle risotto',
 'italian eggplant parmigiana',
 'dolmathes',
 'pudding',
 'baklava',
 'classic clafouti',
 'steamed garlic prawns Chinese-Style']

In [570]:
p_ingredients[:10]

['fresh mushrooms, sliced',
 '4 slice mild Cheddar cheese',
 '4 (4.4 ounce) cans no-salt-added tomatoes, diced or crushed',
 '4 quarts hot water',
 '4 pounds pork tenderloin, cut into 4 4.4-inch cubes',
 'superfine sugar as needed',
 '4.4 cup finely chopped dry roasted peanuts',
 '4.4 pounds cod',
 '4 tablespoon finely chopped Chinese chives',
 '4 chicken tenderloins']

Generate bigram from each entry, rather than directly as a whole chunk of text, since they were not joined originally in the source

In [571]:
def generate_bigram_from_entry(entry):
    bigrams = nltk.bigrams(entry.split(' '))
    frequence = nltk.FreqDist(bigrams)
    return dict(sorted(frequence.items(), key=lambda item: item[0]))

generate_bigram_from_entry("4 tablespoons grated orange peel")

{('4', 'tablespoons'): 1,
 ('grated', 'orange'): 1,
 ('orange', 'peel'): 1,
 ('tablespoons', 'grated'): 1}

Combine individual bigram

In [572]:
from collections import Counter

name_bigrams = {}

for name in all_recipe_names2:
    name_bigrams = dict(Counter(name_bigrams)+Counter(generate_bigram_from_entry(name)))
    
name_bigrams

{('curry', 'pineapple'): 1,
 ('fried', 'rice'): 43,
 ('pineapple', 'fried'): 1,
 ('Mexican', 'spaghetti'): 1,
 ('chanterelle', 'risotto'): 1,
 ('eggplant', 'parmigiana'): 2,
 ('italian', 'eggplant'): 1,
 ('classic', 'clafouti'): 1,
 ('garlic', 'prawns'): 2,
 ('prawns', 'Chinese-Style'): 1,
 ('steamed', 'garlic'): 1,
 ('kecap', 'manis'): 1,
 ('Spanish', 'flan'): 1,
 ('Italian', 'subs'): 1,
 ('Belgi', 'galettes'): 1,
 ('Japanese', 'cheesecake'): 2,
 ('spongy', 'Japanese'): 1,
 ('maja', 'blanca'): 1,
 ('Korean-style', 'seaweed'): 1,
 ('seaweed', 'soup'): 2,
 ('and', 'pepper'): 6,
 ('pepper', 'penne'): 1,
 ('sausage', 'and'): 14,
 ('Cooker', 'pork'): 3,
 ('pork', 'chops'): 15,
 ('savory', 'slow'): 1,
 ('slow', 'Cooker'): 86,
 ('de', 'tres'): 1,
 ('pastel', 'de'): 3,
 ('tres', 'leches'): 7,
 ('traditional', 'gyros'): 1,
 ('spinach', 'casserole'): 1,
 ('Ukrainian', 'apple'): 1,
 ('apple', 'cake'): 8,
 ('Korean', 'cucumber'): 2,
 ('cucumber', 'salad'): 13,
 ('German', 'Potato'): 1,
 ("Grammy'

In [573]:
ingre_bigrams = {}

for ingre in p_ingredients:
    ingre_bigrams = dict(Counter(ingre_bigrams)+Counter(generate_bigram_from_entry(ingre)))
    
ingre_bigrams

{('fresh', 'mushrooms,'): 34,
 ('mushrooms,', 'sliced'): 51,
 ('4', 'slice'): 17,
 ('Cheddar', 'cheese'): 56,
 ('mild', 'Cheddar'): 7,
 ('slice', 'mild'): 1,
 ('(4.4', 'ounce)'): 308,
 ('4', '(4.4'): 347,
 ('cans', 'no-salt-added'): 1,
 ('diced', 'or'): 2,
 ('no-salt-added', 'tomatoes,'): 1,
 ('or', 'crushed'): 1,
 ('ounce)', 'cans'): 213,
 ('tomatoes,', 'diced'): 26,
 ('4', 'quarts'): 32,
 ('hot', 'water'): 24,
 ('quarts', 'hot'): 1,
 ('4', '4.4-inch'): 30,
 ('4', 'pounds'): 407,
 ('4.4-inch', 'cubes'): 36,
 ('cut', 'into'): 990,
 ('into', '4'): 146,
 ('pork', 'tenderloin,'): 15,
 ('pounds', 'pork'): 43,
 ('tenderloin,', 'cut'): 14,
 ('as', 'needed'): 366,
 ('sugar', 'as'): 1,
 ('superfine', 'sugar'): 10,
 ('4.4', 'cup'): 2395,
 ('chopped', 'dry'): 3,
 ('cup', 'finely'): 91,
 ('dry', 'roasted'): 5,
 ('finely', 'chopped'): 278,
 ('roasted', 'peanuts'): 13,
 ('4.4', 'pounds'): 263,
 ('pounds', 'cod'): 5,
 ('4', 'tablespoon'): 785,
 ('Chinese', 'chives'): 2,
 ('chopped', 'Chinese'): 3,
 

In [574]:
all_bigrams = {}

all_bigrams = dict(Counter(name_bigrams)+Counter(ingre_bigrams))

all_bigrams

{('curry', 'pineapple'): 1,
 ('fried', 'rice'): 43,
 ('pineapple', 'fried'): 1,
 ('Mexican', 'spaghetti'): 1,
 ('chanterelle', 'risotto'): 1,
 ('eggplant', 'parmigiana'): 2,
 ('italian', 'eggplant'): 1,
 ('classic', 'clafouti'): 1,
 ('garlic', 'prawns'): 2,
 ('prawns', 'Chinese-Style'): 1,
 ('steamed', 'garlic'): 1,
 ('kecap', 'manis'): 2,
 ('Spanish', 'flan'): 1,
 ('Italian', 'subs'): 1,
 ('Belgi', 'galettes'): 1,
 ('Japanese', 'cheesecake'): 2,
 ('spongy', 'Japanese'): 1,
 ('maja', 'blanca'): 1,
 ('Korean-style', 'seaweed'): 1,
 ('seaweed', 'soup'): 2,
 ('and', 'pepper'): 14,
 ('pepper', 'penne'): 1,
 ('sausage', 'and'): 14,
 ('Cooker', 'pork'): 3,
 ('pork', 'chops'): 34,
 ('savory', 'slow'): 1,
 ('slow', 'Cooker'): 86,
 ('de', 'tres'): 1,
 ('pastel', 'de'): 3,
 ('tres', 'leches'): 7,
 ('traditional', 'gyros'): 1,
 ('spinach', 'casserole'): 1,
 ('Ukrainian', 'apple'): 1,
 ('apple', 'cake'): 8,
 ('Korean', 'cucumber'): 2,
 ('cucumber', 'salad'): 13,
 ('German', 'Potato'): 1,
 ("Grammy

Group bigrams with same first word together into a dictionary

In [575]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: all_bigrams[x] for x in all_bigrams.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('spicy')

{'token': 'spicy',
 'bigrams': [{'yogurt': 1},
  {'tuna': 2},
  {'Peruvian': 1},
  {'banana': 1},
  {'Indian': 4},
  {'chicken': 4},
  {'Vietnamese': 2},
  {'Korean': 3},
  {'orange': 4},
  {'and': 1},
  {'Mexican-American': 1},
  {'crispy': 1},
  {'thai': 4},
  {'beef': 3},
  {'eggplant': 3},
  {'sushi': 1},
  {'African': 1},
  {'basil': 2},
  {'peach': 1},
  {'rice': 2},
  {'bok': 1},
  {'stir-fry': 1},
  {'mango': 1},
  {'stir': 1},
  {'fried': 1},
  {'Chinese': 2},
  {'yellowtail': 1},
  {'pork': 5},
  {'marinated': 1},
  {'Italian': 4},
  {'szechuan': 1},
  {'shrimp': 3},
  {'himalayan': 1},
  {'penyet': 1},
  {'cabbage': 1},
  {'avocado': 1},
  {'salmon': 1},
  {'green': 1},
  {'Asian-Style': 1},
  {'dipping': 1},
  {'vegan': 1},
  {'Asian': 1},
  {'Sinterklass': 1},
  {'red': 2},
  {'tomato': 1},
  {'noodles': 1},
  {'feta': 1},
  {'calabrian': 1},
  {'Southwest': 1},
  {'pesto': 1},
  {'refried': 1},
  {'cilantro': 1},
  {'brown': 2},
  {'Portuguese': 1},
  {'curry': 1},
  {'se

In [576]:
all_tokens = recipe_tokens + ingre_tokens

len(all_tokens)

6504

In [577]:
bigram_in_list = []
for value in all_tokens:
    bigram_in_list.append(find_dict_tuple_key(value))
    
bigram_in_list

[{'token': 'pig-shaped', 'bigrams': [{'cookies': 1}]},
 {'token': 'ii', 'bigrams': []},
 {'token': 'super', 'bigrams': [{'easy': 2}, {'savory': 1}, {'fine': 1}]},
 {'token': 'bredie', 'bigrams': []},
 {'token': 'arrabbiata', 'bigrams': [{'sauce': 1}]},
 {'token': 'number', 'bigrams': [{'one': 1}]},
 {'token': 'Japan', 'bigrams': []},
 {'token': 'breakfast',
  'bigrams': [{'tacos': 1},
   {'puffs': 1},
   {'enchiladas': 1},
   {'kidney': 1},
   {'custard': 1},
   {'skillet': 1},
   {'crepes': 1},
   {'rice': 1},
   {'burritos': 2},
   {'dish': 1},
   {'egg': 1},
   {'chops': 1},
   {'sausage': 1},
   {'pork': 1}]},
 {'token': 'flesh-keek-luh', 'bigrams': []},
 {'token': 'apricot',
  'bigrams': [{'rugelach': 1},
   {'chutney': 1},
   {'jelly': 2},
   {'preserves': 5},
   {'or': 1},
   {'fruit': 1},
   {'jam': 3}]},
 {'token': 'Yia', 'bigrams': [{"Yia's": 2}]},
 {'token': 'Captain', 'bigrams': [{"Duarte's": 1}]},
 {'token': 'lemak', 'bigrams': []},
 {'token': 'callaloo', 'bigrams': [{'and

## Add Phonetics

In [578]:
!pipenv install eng-to-ipa 

Installing eng-to-ipa...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing...
[=   ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[====] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=   ] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[=   ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[====] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=   ] Installing eng-to-ipa...
[    ] Installing e

In [579]:
import eng_to_ipa as eng_to_ipa

eng_to_ipa.convert("hey!")

'heɪ!'

In [580]:
for bigram in bigram_in_list:
    try:
        bigram["ipa"] = eng_to_ipa.convert(eng_to_ipa.convert(bigram["token"]))
    except Exception as e:
        pass

bigram_in_list

[{'token': 'pig-shaped', 'bigrams': [{'cookies': 1}], 'ipa': 'pig-shaped**'},
 {'token': 'ii', 'bigrams': [], 'ipa': 'ii**'},
 {'token': 'super',
  'bigrams': [{'easy': 2}, {'savory': 1}, {'fine': 1}],
  'ipa': 'ˈˈsupər*'},
 {'token': 'bredie', 'bigrams': [], 'ipa': 'bredie**'},
 {'token': 'arrabbiata', 'bigrams': [{'sauce': 1}], 'ipa': 'arrabbiata**'},
 {'token': 'number', 'bigrams': [{'one': 1}], 'ipa': 'ˈˈnəmbər*'},
 {'token': 'Japan', 'bigrams': [], 'ipa': 'ʤəˈʤəˈpæn*'},
 {'token': 'breakfast',
  'bigrams': [{'tacos': 1},
   {'puffs': 1},
   {'enchiladas': 1},
   {'kidney': 1},
   {'custard': 1},
   {'skillet': 1},
   {'crepes': 1},
   {'rice': 1},
   {'burritos': 2},
   {'dish': 1},
   {'egg': 1},
   {'chops': 1},
   {'sausage': 1},
   {'pork': 1}],
  'ipa': 'ˈˈbrɛkfəst*'},
 {'token': 'flesh-keek-luh', 'bigrams': [], 'ipa': 'flesh-keek-luh**'},
 {'token': 'apricot',
  'bigrams': [{'rugelach': 1},
   {'chutney': 1},
   {'jelly': 2},
   {'preserves': 5},
   {'or': 1},
   {'fruit': 1

Save bigram list, which contain IPA symbol and bigrams of each token into a json file

In [581]:
import json

with open('bigrams.json', 'w') as f:
    json.dump(bigram_in_list, f)

# Create edit distance