# Steamboat Squad

Import and load data

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import json

with open("recipes_ingredients.json", "r") as json_file:
    recipes = json.load(json_file)
    
len(recipes)

4702

Overview of data structure. This is a list of dictionary, where each dictionary is a recipe with its name, ingredients and url

In [3]:
recipes[0]

{'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/',
 'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

Deleting url key

In [4]:
for recipe in recipes:
    del recipe['url']
recipes[0]

{'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

# Preprocessing Recipe Names
- Lower-casing (normalise words by using POS tagging)
- Change numbers to fix number (place holder)

NLTK has a help function that explains its POS tags.

In [5]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [6]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Using %%capture, save the NLTK help text as a string

In [7]:
%%capture cap --no-stderr

nltk.help.upenn_tagset()

In [8]:
cap.stdout

'$: dollar\n    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n\'\': closing quotation mark\n    \' \'\'\n(: opening parenthesis\n    ( [ {\n): closing parenthesis\n    ) ] }\n,: comma\n    ,\n--: dash\n    --\n.: sentence terminator\n    . ! ?\n:: colon or ellipsis\n    : ; ...\nCC: conjunction, coordinating\n    & \'n and both but either et for less minus neither nor or plus so\n    therefore times v. versus vs. whether yet\nCD: numeral, cardinal\n    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n    seven 1987 twenty \'79 zero two 78-degrees eighty-four IX \'60s .025\n    fifteen 271,124 dozen quintillion DM2,000 ...\nDT: determiner\n    all an another any both del each either every half la many much nary\n    neither no some such that the them these this those\nEX: existential there\n    there\nFW: foreign word\n    gemeinschaft hund ich jeux habeas Haementeria Herr K\'ang-si vous\n    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n    terram 

Using RE, get all the tag names

In [9]:
import re

ALL_POS = re.findall(".*: +", cap.stdout)

for i, pos in enumerate(ALL_POS):
  ALL_POS[i] = pos.replace(': ', '')


ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '    ',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

In [10]:
ALL_POS.remove('    ')
ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

Create a function to pos tag a text

In [11]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def tag_pos(corpus):
    text=word_tokenize(corpus)
    return nltk.pos_tag(text)

tag_pos("This is a test sentence.")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('.', '.')]

Create a function that POS tag and returns words with specific POS

In [12]:
def get_words_with_pos(text, pos):
  tagged = tag_pos(text)
  return [t for t in tagged if t[1].startswith(pos)]

get_words_with_pos("This is a test sentence.", "NN")

[('test', 'NN'), ('sentence', 'NN')]

POS tag all recipe names

In [13]:
tagged_recipe_names = []

for i, recipe in enumerate(recipes):
  try:
    tagged_recipe_names.append(tag_pos(recipes[i]['name']))
  except Exception as e:
    pass

len(tagged_recipe_names)

4701

## Data cleaning for names based on POS tagging

Looking at the first 10 tagged recipe names, there is a need for pre-processing, as NLTK's tagging is confused by the letter casing.

In [14]:
tagged_recipe_names[:10]

[[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')],
 [('Pan', 'NNP'),
  ('de', 'FW'),
  ('Muertos', 'NNP'),
  ('(', '('),
  ('Mexican', 'NNP'),
  ('Bread', 'NNP'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Dead', 'NNP'),
  (')', ')')],
 [('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')],
 [('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')],
 [('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')],
 [('Spicy', 'JJ'),
  ('Korean', 'NNP'),
  ('Fried', 'NNP'),
  ('Chicken', 'NNP'),
  ('with', 'IN'),
  ('Gochujang', 'NNP'),
  ('Sauce', 'NNP')],
 [('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')],
 [('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')],
 [('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')],
 [('Tres', 'NNS'),
  ('Leches', 'NNP'),
  ('(', '('),
  ('Milk', 'NNP'),
  ('Cake', 'NNP'),
  (')', ')')]]

Create a function that returns all tagged words with the same tag. NLTK's POS tagging assumes that capitalized noun means proper noun (name).

In [15]:
def list_words_with_tag(tuple_list, pos):
  results = []
  for name in tuple_list:
    for tag in name:
      if tag[1] == pos:
        results.append(tag[0])
  return results

list_words_with_tag(tagged_recipe_names, "NNP")

['Asparagus',
 'Pan',
 'Muertos',
 'Mexican',
 'Bread',
 'Dead',
 'Creamy',
 'Au',
 'Gratin',
 'Potatoes',
 'Zuppa',
 'Toscana',
 'Teriyaki',
 'Sauce',
 'Korean',
 'Fried',
 'Chicken',
 'Gochujang',
 'Sauce',
 'Spaghetti',
 'Aglio',
 'Olio',
 'Garam',
 'Masala',
 'Easy',
 'Chorizo',
 'Street',
 'Tacos',
 'Leches',
 'Milk',
 'Cake',
 'Cabbage',
 'Rolls',
 'Gravy',
 'Shrimp',
 'Scampi',
 'Pasta',
 'Lemon',
 'Chicken',
 'Potato',
 'Bake',
 'Mexican',
 'Casserole',
 'Caldo',
 'Res',
 'Mexican',
 'Beef',
 'Soup',
 'Nogada',
 'Mexican',
 'Stuffed',
 'Poblano',
 'Peppers',
 'Walnut',
 'Sauce',
 'Apple',
 'Cake',
 'Flan',
 'Pork',
 'Chops',
 'Sauerkraut',
 'Spicy',
 'Thai',
 'Basil',
 'Chicken',
 'Pad',
 'Krapow',
 'Gai',
 'Spaghetti',
 'Cacio',
 'Pepe',
 'Chef',
 'John',
 'Chicken',
 'Kiev',
 'Chicken',
 'Onions',
 'Fajita',
 'Perfect',
 'Sushi',
 'Rice',
 'Baked',
 'Chicken',
 'German',
 'Potato',
 'Salad',
 'Miso',
 'Soup',
 'Mexican',
 'Rice',
 'II',
 'Haluski',
 'Labneh',
 'Lebanese',
 'Y

Get the number of each POS tag

In [16]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

In [17]:
def get_tag_number(tag_list):
  tag_numbers = []
  for tag in tag_list:
    for key, value in tag.items(): 
      new_dict = {key: len(value)}
    tag_numbers.append(new_dict)
  return tag_numbers

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 529},
 {')': 529},
 {',': 63},
 {'--': 0},
 {'.': 10},
 {':': 98},
 {'CC': 555},
 {'CD': 74},
 {'DT': 104},
 {'EX': 0},
 {'FW': 47},
 {'IN': 482},
 {'JJ': 1822},
 {'JJR': 4},
 {'JJS': 27},
 {'LS': 0},
 {'MD': 2},
 {'NN': 571},
 {'NNP': 13139},
 {'NNPS': 46},
 {'NNS': 307},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 72},
 {'PRP$': 20},
 {'RB': 33},
 {'RBR': 0},
 {'RBS': 1},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 24},
 {'VBD': 39},
 {'VBG': 50},
 {'VBN': 133},
 {'VBP': 10},
 {'VBZ': 22},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

Some names have numbers (CD). Some are obviously not numbers, like 'Figgy'

In [18]:
def get_values_from_dict_list(dict_list, key):
  values = []
  for d in dict_list:
    if key in d:
      values.append(d[key])
  return values

cd_tokens = get_values_from_dict_list(all_name_tags, 'CD')[0]
cd_tokens

['5',
 '16',
 '2',
 '13',
 '300',
 'Figgy',
 '3',
 '9',
 'Two',
 '9',
 '22',
 '10',
 '15',
 'One',
 '18',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 '21',
 'Four',
 '9',
 '65',
 '17',
 '14',
 '10',
 "'n",
 '15',
 '8',
 'Minestrone',
 'Four',
 '35',
 'Fly',
 '15',
 '23',
 '8',
 '15',
 '21',
 "That's-a",
 'Tex-Mex',
 '14',
 '17',
 'Five',
 '10',
 '18',
 '5',
 "'Otai",
 '17',
 '3',
 '17',
 '75',
 '17',
 '20',
 'Take-Out',
 '16',
 '12',
 'Three',
 "'Three",
 '15',
 '20',
 '16',
 '12',
 '15',
 '22',
 '12',
 'Three',
 '21',
 '21',
 '25',
 '7',
 '10',
 '19',
 '20']

Create a function that searches for recipe name with specific string

In [19]:
def find_value_with_char(dic_list, key, char):
  matches = []
  for recipe in dic_list:
    try:
      if char in recipe[key]:
        matches.append(recipe[key])
    except Exception as e:
      pass
  return matches

find_value_with_char(recipes, 'name', 'Figgy')

['Figgy Pudding']

'Three cup chicken' is indeed a name. On the other hand, numerics, such as 9 and 13 are not part of the actual names of dishes. So, numerics, instead of NLTK's CD, should be treated. This treatment should be done using regex.

In [20]:
for cd in cd_tokens:
  print(find_value_with_char(recipes, 'name', cd))

['Our 5 Best Avgolemono Soup Recipes', '5-Ingredient Mexican Casserole', '15 Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Chicken 65', 'Pan-Roasted 5-Spice Pork Loin', 'The 15 Most Iconic French Desserts', '35 Quick and Easy Chinese Dinners You Can Make at Home', '15 Essential North Indian Recipes', '15 Essential North Indian Recipes', '18 Easy Mexican Dishes With 5 Ingredients or Less', 'French 75 Cocktail', '15 Top-Rated Traditional German Christmas Cookies', '15 Traditional Italian Christmas Dinner Recipes', "25 Italian Cookies You'll Love"]
['16 German Recipes That Are Comfort Food Favorites', '16 Mexican-Inspired Casseroles for Family-Pleasing Dinners', '16 Essential Puerto Rican Recipes']
['2 Minute Cheese Quesadillas', "22 Recipes Using a Whole Baguette (That Aren't Sandwiches)", 'Our 21 Best Authentic Mexican Recipes', '23 Delicious Ways the World Cooks Pork Shoulder', '21 Easy Dinners That Start with Packaged Gnocchi', 'Our 20 B

Create a function that searches a regex pattern from a text

In [21]:
def searchWordsPatt(text, patt):
    array = re.findall(patt, text)
    return array

NUMPATTERN = r'[0-9]+'
searchWordsPatt("I want 1 cup of tea", NUMPATTERN)

['1']

Create a function that substitutes regex patterns with a given value

In [22]:
def searchReplacePatt(text, patt, new_val):
  return re.sub(patt, new_val, text)

NUMSPACEPATTERN = r'(\d+\s)'
searchReplacePatt("I want 1 cup of tea", NUMSPACEPATTERN, "")

'I want cup of tea'

searchReplacePatt, except it iterates recipe list

In [23]:
def searchReplacePattList(dict_list, patt, new_val, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
        except Exception as e:
            pass

searchReplacePattList, but adds a substring at given index

In [24]:
def searchReplaceAddPattList(dict_list, patt, new_val, substring, index=0, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
            added_string = list(dict_list[i][key]).insert(index, substring)
            dict_list[i][key]=''.join(added_string)
        except Exception as e:
            pass

Remove numerics from name

In [25]:
import re

p_recipes = recipes

searchReplacePattList(p_recipes, NUMSPACEPATTERN, "")

def retag(text_list, key):
  new_list = []
  for i, recipe in enumerate(text_list):
    try:
      new_list.append(tag_pos(recipes[i][key]))
    except Exception as e:
      pass
  return new_list

tagged_recipe_names = retag(p_recipes, "name")

Get the new remaining CD

In [26]:
new_cd_tokens = list_words_with_tag(tagged_recipe_names, "CD")
new_cd_tokens

['Figgy',
 'Two',
 'One',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 'Four',
 '65',
 "'n",
 'Minestrone',
 'Four',
 'Fly',
 "That's-a",
 'Tex-Mex',
 'Five',
 "'Otai",
 'Take-Out',
 'Three',
 "'Three",
 'Three']

The remaining numbers (CD) are part of actual recipe names

In [27]:
for cd in new_cd_tokens:
  print(find_value_with_char(p_recipes, 'name', cd))

['Figgy Pudding']
['Two-Ingredient Naan', 'Pollo alla Birra for Two']
['A Number One Egg Bread', 'One-Egg Egg Drop Soup', 'One Pot Thai-Style Rice Noodles', 'One-Pot Vegan Potato-Lentil Curry', 'One-Bite Thai "Flavor Bomb" Salad Wraps (Miang Kham)', 'Easy One-Skillet Ground Beef Burrito', 'One-Pot Greek Lemon Chicken and Rice']
['Tender Italian Baked Chicken', 'Tuscan Pork Tenderloin', 'Asian Pork Tenderloin', 'Italian Pork Tenderloin', 'Sweet and Sour Pork Tenderloin', 'Chipotle Crusted Pork Tenderloin', 'Ten Minute Szechuan Chicken', 'Thai Quivering Tenderloins', 'Spicy Pork Tenderloin', 'Chinese Pork Tenderloin', 'Grecian Pork Tenderloin', 'Havana Slow Cooker Pork Tenderloin', 'Curry Pork Tenderloin', 'Tender Juicy Skirt Steak  (Churrasco)', 'Spicy and Tender Corned Beef', 'Pan Roasted Pork Tenderloin with a Blue Cheese and Olive Stuffing']
['Flounder Mediterranean']
['Pastel de Tres Leches (Three Milk Cake)', 'Three-Meat Italian Meatballs', 'Three Cheese Manicotti II', 'Taiwanese-S

In [28]:
new_all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  new_all_name_tags.append(new_dic)

Can and 'll are the modal verbs found

In [29]:
md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
md_tokens

['Can', "'ll"]

'can' is caused by words such as Canadian, which is processed in next section. But, 'you'll love' is not part of recipe name and more of an expression

In [30]:
for md in md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Removing "You'll" and retagging new list

In [31]:
searchReplacePattList(p_recipes, r"(You'll Love)", "")
tagged_recipe_names = retag(p_recipes, "name")

'll' removed

In [32]:
new_md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
new_md_tokens

['Can']

In [33]:
for md in new_md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Replacing any "/" with "or" word

In [34]:
searchReplacePattList(p_recipes, r"\/", " or ")
tagged_recipe_names = retag(p_recipes, "name")

In [35]:
bracket_tokens = list(set(list_words_with_tag(tagged_recipe_names, "(")))
bracket_tokens

['(']

Examining brackers in names. Most of the words in brackets are translations

In [36]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

"(no red sauce here...golden)" needs to be removed

In [37]:
# Redundant descriptions
searchReplacePattList(p_recipes,  r"(no red sauce here...golden)", "")
searchReplacePattList(p_recipes, r"(From a Swede!)", "")
searchReplacePattList(p_recipes, r"(from a Chinese person)", "")
searchReplacePattList(p_recipes, r"(Now Vegetarian!)", "")
searchReplacePattList(p_recipes, r"a.k.a. ", "")
searchReplacePattList(p_recipes, r"(That Aren't Sandwiches)", "")

# Remove copyright symbol
searchReplacePattList(p_recipes, r"&reg;", "")
# Asian Sesame Seared or Grilled Tuna (Gluten Free) => Gluten Free Asian Sesame Seared or Grilled Tuna
searchReplaceAddPattList(p_recipes, r"(Gluten Free)", "", "glutten-free")
tagged_recipe_names = retag(p_recipes, "name")

In [38]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

Only three foreign words detected by NLTK, which is not true

In [39]:
fw_tokens = list(set(list_words_with_tag(tagged_recipe_names, "FW")))
fw_tokens

['et', 'de', 'Rassolnik']

From the three unique foreign words, these are the names

In [40]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Authentic Vietnamese Spring Rolls (Nem Ran Hay Cha Gio)', 'Kotlet Schabowy (Polish Breaded Pork Chop)', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Conchas (Mexican Sweet Bread)', 'Skillet Chicken Picante', 'Spaghetti Sauce', 'Roasted Pork Banh Mi (Vietnamese Sa

In [41]:
fw_names

['Easy German Apple Sheet Cake',
 'Vegetarian Sushi',
 'Slow Cooker Guisado Verde',
 'Sweet Curry Pumpkin Seeds',
 'Gourmet Pastelillos (Meat Pies)',
 'K&auml;sesahnetorte (German Yogurt Mousse Cake)',
 'Timballo Spaghetti Casserole',
 "Ninabell's Appetizer Meatballs",
 'Spaghetti Aglio, Olio, e Peperoncini',
 'Sheet Pan Shrimp Fajitas',
 'Mexican Chicken Soup with Rice (Caldo de Pollo con Arroz)',
 'Homemade Pork Fried Rice',
 'Grecian Pork Tenderloin',
 'Havana Slow Cooker Pork Tenderloin',
 'Chow Mein with Chicken and Vegetables',
 "Traditional Spaghetti all'Amatriciana",
 'Tuscan Pork Tenderloin',
 'Turmeric Golden Milk with Turmeric Paste',
 'Chinese Sweet and Sour Chicken',
 'Amaretti',
 'Chicken Lettuce Wraps',
 'Harissa Powder',
 'Rib Eye Steaks with a Soy and Ginger Marinade',
 'Indian Vegetable Bhaji',
 'Refreshing Oatmeal Drink (Agua de Avena)',
 'Loaded Greek Burgers',
 'Godeungeo Jorim (Korean Braised Mackerel with Radish)',
 'Slow Cooker Mexican Recipes Under Calories',
 

Names that both have foreign words and bracket

In [42]:
bracket_and_fw = [name for name in bracketed_names if name in fw_names]
bracket_and_fw

['Gourmet Pastelillos (Meat Pies)',
 'K&auml;sesahnetorte (German Yogurt Mousse Cake)',
 'Mexican Chicken Soup with Rice (Caldo de Pollo con Arroz)',
 'Refreshing Oatmeal Drink (Agua de Avena)',
 'Godeungeo Jorim (Korean Braised Mackerel with Radish)',
 'Semmelknoedel (Bread Dumplings)',
 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)',
 'Caldo Verde (Portuguese Sausage Kale Soup)',
 'Cazuela de Vaca (Beef and Pumpkin Stew)',
 "Bucatini Cacio e Pepe (Roman Sheep Herder's Pasta)",
 'Recipes Using a Whole Baguette ()',
 'Brazilian Cheese Rolls (Pao de Queijo)',
 'Poblano and Cheese Tamales (Tamales de Rajas con Queso)',
 'Birria de Res Tacos (Beef Birria Tacos)',
 'Homemade Irish (Whiskey) Cream',
 'Kwek Kwek (Filipino Street Food)',
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Feta Cheese Burek (Phyllo Dough)',
 'Lithuanian Saltibarsciai (Cold Beet Soup)',
 'Horchata de Arroz (Rice Drink)',
 'Tonkatsu Shoyu Ramen (Pork Cutlet Soy Sauce Ramen)',
 'Spaghetti a

Split the names into two names, one outside and one inside

In [43]:
BRACKET_REGEX = " \(.*\)"
def break_fw_bracket(name):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name2 = re.sub(BRACKET_REGEX, "", name)
    return name1, name2

print(break_fw_bracket("Hearty Caldo de Res (Mexican Beef Soup)"))
print(break_fw_bracket("Ukha (Russian Fish Soup)"))

('Mexican Beef Soup', 'Hearty Caldo de Res')
('Russian Fish Soup', 'Ukha')


Apply the split function. Delete old recipe with bracket and foreign words. In both of the new recipes, duplicate old ingredients.

In [44]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in bracket_and_fw:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

There are still remaining names with bracket, mostly due to the foreign words not being recognized.

In [45]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Tres Leches (Milk Cake)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', 'Kalbi (Korean BBQ Short Ribs)', 'Macaron (French Macaroon)', 'Atsara (Papaya Relish)', 'Authentic Chinese Egg Rolls ()', 'Greek Le

In [46]:
bracketed_names

['Tuscan Onion Soup (Carabaccia)',
 'Bulgogi (Korean Barbecued Beef)',
 'Korean Saewoo Bokkeumbap (Shrimp Fried Rice)',
 'Onigiri (Japanese Rice Balls)',
 'Japanese Egg Salad Sandwich (Tamago Sando)',
 'Arroz con Leche (Mexican Rice Pudding)',
 "Tim Perry's Soup (Creamy Curry Cauliflower and Broccoli Soup)",
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Marranitos (Mexican Pig-Shaped Cookies)',
 'Feijoada (Brazilian Black Bean Stew)',
 'Blaukraut (German Red Cabbage)',
 'Cauliflower Rice (Biryani-Style)',
 'Indonesian Fried Rice (Nasi Goreng)',
 'Brazilian Passion Fruit Mousse (Maracuja)',
 'Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)',
 'Drozdzowka (Polish Yeast Plum Cake)',
 'Mahalabia (Middle Eastern-Style Milk Pudding)',
 'Persian Cucumber Yogurt (Maast-o Khiar)',
 'Swedish Chocolate Balls (Chokladbollar)',
 'Ethiopian Firfir with Dried Beef (Quanta Firfir)',
 'Mutton Varuval (Malaysian Indian-Style Goat Curry)',
 'Embutido (Filipino Meatloaf)',
 'Choe

Most of the brackets are at the end of each name. For those that are in the middle, they are translations of one of the words in the name.

In [47]:
b_name_end = []
b_name_mid = []
for b_name in bracketed_names:
    if b_name.endswith(')'):
        b_name_end.append(b_name)
    else:
        b_name_mid.append(b_name)
        
b_name_end

['Tuscan Onion Soup (Carabaccia)',
 'Bulgogi (Korean Barbecued Beef)',
 'Korean Saewoo Bokkeumbap (Shrimp Fried Rice)',
 'Onigiri (Japanese Rice Balls)',
 'Japanese Egg Salad Sandwich (Tamago Sando)',
 'Arroz con Leche (Mexican Rice Pudding)',
 "Tim Perry's Soup (Creamy Curry Cauliflower and Broccoli Soup)",
 'Marranitos (Mexican Pig-Shaped Cookies)',
 'Feijoada (Brazilian Black Bean Stew)',
 'Blaukraut (German Red Cabbage)',
 'Cauliflower Rice (Biryani-Style)',
 'Indonesian Fried Rice (Nasi Goreng)',
 'Brazilian Passion Fruit Mousse (Maracuja)',
 'Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)',
 'Drozdzowka (Polish Yeast Plum Cake)',
 'Mahalabia (Middle Eastern-Style Milk Pudding)',
 'Persian Cucumber Yogurt (Maast-o Khiar)',
 'Swedish Chocolate Balls (Chokladbollar)',
 'Ethiopian Firfir with Dried Beef (Quanta Firfir)',
 'Mutton Varuval (Malaysian Indian-Style Goat Curry)',
 'Embutido (Filipino Meatloaf)',
 'Choereg (Armenian Easter Bread)',
 'Kimchi Fried Rice

In [48]:
b_name_mid

['Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Korean Bean Curd (Miso) Soup',
 'Lengua (Beef Tongue) Stew',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Jeera (Cumin) Rice',
 'Seaweed (Nori) Soup',
 'Besan (Gram Flour) Halwa',
 'Lamb (Gosht) Biryani',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Bee Sting Cake (Bienenstich) II',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Ulu (Breadfruit) Pancakes',
 "World's Best () Lasagna",
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Albondigas (Meatballs) en Chipotle',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Spicy Indian (Gujarati) Green Beans',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Coconut (Haupia) and Chocolate Pie']

On the other hand, without parenthesis anymore, names with foregin words tagged are now clean

In [49]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Spaghetti Aglio e Olio', 'Easy Chorizo Street Tacos', 'Spaghetti Cacio e Pepe', 'Make-Ahead Vegetarian Moroccan Stew', "'Chinese Buffet' Green Beans", 'Sweet and Sour Chicken I', 'Braised Corned Beef Brisket', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Fabulous Wet Burritos', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Sheet Pan Chicken Fajitas', 'Sheet Pan Fried Rice', 'Vegetarian Chinese Fried Noodles', "Papa Drexler's Bavarian Pretzels", 'Quick Bruschetta Chicken Bake', 'Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Stir-Fry Chicken and Vegetables', 'Vegetarian Moussaka', 'French Baguettes', 'Shrimp Fettuccine Alfredo', 'Skillet Chicken Picante', 'Spaghetti Sauce', 'Vegetarian Korma', 'Fettuccini Carbonara', 'Kaese Spaetzle', 'Beef and Beet Borscht', 'Addictive Sweet Potato Burritos', "Chef John's French Omelette", 'Sweet and Spicy Stir Fry with Chicken and Broccoli', 'Simple Sweet and Spicy Chicken Wraps', 'Johnny Marzetti Casserole', 'Th

In [50]:
fw_names

['Double Ka Meeta',
 'Easy German Apple Sheet Cake',
 'Vegetarian Sushi',
 'Slow Cooker Guisado Verde',
 'Sweet Curry Pumpkin Seeds',
 'Pan de Muertos',
 'Timballo Spaghetti Casserole',
 "Ninabell's Appetizer Meatballs",
 'Spaghetti Aglio, Olio, e Peperoncini',
 'Sheet Pan Shrimp Fajitas',
 'Homemade Pork Fried Rice',
 'Grecian Pork Tenderloin',
 'Havana Slow Cooker Pork Tenderloin',
 'Chow Mein with Chicken and Vegetables',
 "Traditional Spaghetti all'Amatriciana",
 'Homemade Manti',
 'Chile Verde',
 'Tuscan Pork Tenderloin',
 'Turmeric Golden Milk with Turmeric Paste',
 'Chinese Sweet and Sour Chicken',
 'Amaretti',
 'Chicken Lettuce Wraps',
 'Harissa Powder',
 'Pasta Con Sarde',
 'Rib Eye Steaks with a Soy and Ginger Marinade',
 'Beet Salad',
 'Indian Vegetable Bhaji',
 'Mexican Spaghetti',
 'Loaded Greek Burgers',
 'Slow Cooker Mexican Recipes Under Calories',
 'Simple Sweet and Sour Sauce',
 'Homemade Muesli',
 'Hawaiian Sausage Skillet',
 'Vegetarian Chinese Fried Noodles',
 "Che

For the remaining names with bracket at the end, split into two new recipe names

In [51]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Tres Leches (Milk Cake)
Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)
Spicy Thai Basil Chicken (Pad Krapow Gai)
Labneh (Lebanese Yogurt)
Indian Chicken Curry (Murgh Kari)
Keema Aloo (Ground Beef and Potatoes)
Turkish Eggs (Cilbir)
South African Melktert (Milk Tart)
Ukrainian Apple Cake (Yabluchnyk)
Spanish Garlic Shrimp (Gambas al Ajillo)
German Potato Dumplings (Kartoffelkloesse)
Apfelkuchen (Apple Cake)
Eggplant Caponata (Sicilian Version)
Chana Masala (Savory Indian Chick Peas)
Ricotta Pie (Old Italian Recipe)
Easy Blini (Russian Pancake)
Easy Bulgogi (Korean BBQ Beef)
Carne en su Jugo (Meat in its Juices)
Ghormeh Sabzi (Persian Herb Stew)
Puerto Rican Tostones (Fried Plantains)
Kalbi (Korean BBQ Short Ribs)
Macaron (French Macaroon)
Atsara (Papaya Relish)
Authentic Chinese Egg Rolls ()
Greek Lentil Soup (Fakes)
Lumpia (Shanghai version)
Northern Ontario Partridge (Ruffed Grouse)
Vampiros Mexicanos (Mexican Vampires)
Jamaican Saltfish Fritters (Stamp and Go)
Slo

For some reasons, need to run the cell twice

In [52]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Polish Noodles (Cottage Cheese and Noodles)
Oyakodon (Japanese Chicken and Egg Rice Bowl)
Papas Rellenas (Fried Stuffed Potatoes)
Blaukraut (German Red Cabbage)
Irish Boiled Dinner (Corned Beef)
True Dominican Sancocho (Latin 7-Meat Stew)
Blini (Russian Pancakes)
Oeufs Cocotte (Baked Eggs)
Ropa Vieja (Cuban Beef)
Lace Cookies (Florentine Cookies)
Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)
Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)
Roti Canai or Paratha (Indian Pancake)
Melanzana alla Parmigiana (Perfect Eggplant Parmigiana)
Pierogi (Traditional Polish Dumplings)
Nipples of Venus (Capezzoli di Venere)
Samosadilla (Samosa Quesadilla)
Bulgogi (Korean Barbecued Beef)
Sabaayad (Somali Flatbread)
Filipino Baked Milkfish (Baked Bangus)
Ash-e Reshteh (Persian Legume Soup)
Lentil and Cactus Soup (Mom's Recipe)
Ethiopian Cabbage and Potato Dish (Atkilt)
Finnish Kropser (Baked Pancakes)
Oma's Griessnockerlsuppe (Beef and Semolina Dumpling Soup)
Kewa Datshi 

Only the names with bracket in the middle of their names remain

In [53]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Classic Cuban Midnight (Medianoche) Sandwich', 'Spicy Indian (Gujarati) Green Beans', "World's Best () Lasagna", 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce', 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce', 'Bee Sting Cake (Bienenstich) II', 'Coconut (Haupia) and Chocolate Pie', 'Lamb (Gosht) Biryani', 'Jeera (Cumin) Rice', 'Pollo (Chicken) Fricassee from Puerto Rico', 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish', 'Lazy Golumpki (Stuffed Cabbage) Soup', 'Ulu (Breadfruit) Pancakes', 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican', 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce', 'Seaweed (Nori) Soup', 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms', 'Hawaiian Bruddah Potato Mac (Macaroni) Salad', 'Korean Bean Curd (Miso) Soup', 'Lengua (Beef Tongue) Stew', 'Albondigas (Meatballs) en Chipotle', 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding', 'Besan (Gram Flour) Halwa']


Mac and rapini is only synonymous the the one word before them. Otherwise, the bracketed words are synonymous to all the words before them combined.

In [54]:
bracketed_names

['Albondigas (Meatballs) en Chipotle',
 'Lamb (Gosht) Biryani',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 "World's Best () Lasagna",
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Jeera (Cumin) Rice',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Seaweed (Nori) Soup',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Spicy Indian (Gujarati) Green Beans',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Besan (Gram Flour) Halwa',
 'Bee Sting Cake (Bienenstich) II',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Korean Bean Curd (Miso) Soup',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Lengua (Beef Tongue) Stew',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Ulu (Breadfruit) Pancakes',
 'Coconut (Haupia) and Chocolate Pie']

The names can still be duplicated into 2, except that the bracketed word replaces the words before in the second new name, treating them as synonyms.

In [55]:
def convert_bracket_synonym(name, num=0):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name1_suffix = name.split(')')[1]
    if num==0:
        name1 = name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, "", name)
    else:
        name1_prefix = name.split('(')[0]
        name1_prefix = name1_prefix[:-num]
        name1 = name1_prefix + name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, " ", name)
    return name1, name2

print(convert_bracket_synonym("Lamb (Gosht) Biryani"))
print(convert_bracket_synonym("Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce", 1))
print(convert_bracket_synonym("Hawaiian Bruddah Potato Mac (Macaroni) Salad", 1))

('Gosht Biryani', 'Lamb Biryani')
('Fusilli with RapiniBroccoli Rabe, Garlic, and Tomato Wine Sauce', 'Fusilli with Rapini , Garlic, and Tomato Wine Sauce')
('Hawaiian Bruddah Potato MacMacaroni Salad', 'Hawaiian Bruddah Potato Mac  Salad')


In [56]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_mid:
            newname1, newname2 = convert_bracket_synonym(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Classic Cuban Midnight (Medianoche) Sandwich
Spicy Indian (Gujarati) Green Beans
World's Best () Lasagna
Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce
Kimchi Jun (Kimchi Pancake) and Dipping Sauce
Bee Sting Cake (Bienenstich) II
Coconut (Haupia) and Chocolate Pie
Lamb (Gosht) Biryani
Jeera (Cumin) Rice
Pollo (Chicken) Fricassee from Puerto Rico
Fish Sinigang (Tilapia) - Filipino Sour Broth Dish
Lazy Golumpki (Stuffed Cabbage) Soup
Ulu (Breadfruit) Pancakes
Fried Chicken Chunks (Chicharrones De Pollo) Dominican
Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce
Seaweed (Nori) Soup
Vareniki (Russian Pierogi) with Potatoes and Mushrooms
Hawaiian Bruddah Potato Mac (Macaroni) Salad
Korean Bean Curd (Miso) Soup
Lengua (Beef Tongue) Stew
Albondigas (Meatballs) en Chipotle
Zito (Zhito or Koljivo) - Serbian Wheat Pudding
Besan (Gram Flour) Halwa


Successfully removed all brackets from recipe names

In [57]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))
bracketed_names

[]


[]

Dashes are mostly adjectives, but things like semi colon need to be removed. As for colons, its mostly translation. Semicolons are caused by K&auml;, which are dishes with special characters or German words.

In [58]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

[':', '-', ';']

In [59]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Doro Wat: Ethiopian Chicken Dish', "Grandma's Focaccia: Baraise Style"]
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chic

In [60]:
def remove_entry_with(dict_list, target, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            if target in dict_list[i]["name"]:
                dict_list.remove(dict_list[i])
        except Exception as e:
            pass

In [61]:
for semicolon in ["Quorn&trade;", "Sp&auml;tzle", "Tamales Oaxaque&ntilde;os", "K&auml;sesahnetorte", "Salte&ntilde;as"]:
    remove_entry_with(p_recipes, semicolon)
tagged_recipe_names = retag(p_recipes, "name")

Semi colons cleaned

In [62]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

[':', '-']

In [63]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Spaghetti alla Carbonara: the Traditional Italian Recipe', 'Doro Wat: Ethiopian Chicken Dish', "Grandma's Focaccia: Baraise Style"]
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chic

For these 2 names, colons are used for describing

In [64]:
# Spaghetti alla Carbonara: the Traditional Italian Recipe => traditional Italian Spaghetti alla Carbonara
searchReplaceAddPattList(p_recipes, r": the Traditional Italian Recipe", "", "traditional Italian ")
# Grandma's Focaccia: Baraise Style => Grandma's Baraise Style Focaccia
searchReplaceAddPattList(p_recipes, r": Baraise Style", "", "Baraise Style ", index=10)
tagged_recipe_names = retag(p_recipes, "name")

Cleaned 2 names with colon. If the dashes are between a word, they are either part of a word's spelling or joining two words together, typically as an adjective. However, if it is between spaces, they are translations.

In [65]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

[':', '-']

In [66]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Doro Wat: Ethiopian Chicken Dish']
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, C

['Doro Wat: Ethiopian Chicken Dish',
 'Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Chicken French - Rochester, NY Style',
 

But in some cases, they are words after the dashes describe the dish, such as Rochester, NY Style and Restaurant Style

In [67]:
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)

Doro Wat: Ethiopian Chicken Dish
Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Chicken French - Rochester, NY Style
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Italian Subs - Restaurant Style
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Eggplant Parmesan - Gluten-Free
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Hot Pepper Sauce - A Trinidadian Staple
The Sarge's Goetta - German Breakfast Treat
Italian Sausage - Tuscan Style
Honey Milk Tea - Hong Kong Style
Mexican Lasagna - No Lasagna Noodles!
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Curry Pasta - Pakistani Style
Cauliflower and Potato Stir-Fry - East Indian Recipe
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - Ger

Replace or remove the remaining dashes that are surrounded by spaces

In [68]:
# Chicken French - Rochester, NY Style => Rochester, NY Style Chicken French
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Carnitas - Pressure Cooker => pressure cooker carnitas
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Italian Subs - Restaurant Style => restaurant style Italian subs
searchReplaceAddPattList(p_recipes, r" - Restaurant Style", "", "restaurant style ")
# Eggplant Parmesan - Gluten-Free => glutten-free Eggplant Parmesan
searchReplaceAddPattList(p_recipes, r" - Gluten-Free", "", "glutten-free ")
# Italian Sausage - Tuscan Style => Tuscan style Italian Sausage
searchReplaceAddPattList(p_recipes, r" - Tuscan Style", "", "Tuscan style ")
# Honey Milk Tea - Hong Kong Style => Hong Kong style Honey Milk Tea
searchReplaceAddPattList(p_recipes, r" - Hong Kong Style", "", "Hong Kong style ")
# Curry Pasta - Pakistani Style => Pakistani style Curry Pasta
searchReplaceAddPattList(p_recipes, r" - Pakistani Style", "", "Pakistani style ")
# Cauliflower and Potato Stir-Fry - East Indian Recipe => East Indian style Cauliflower and Potato Stir-Fry
searchReplaceAddPattList(p_recipes, r" - East Indian Recipe", "", "East Indian style ")
# German Potato Salad - Schwabisch Style => Schwabisch style German Potato Salad
searchReplaceAddPattList(p_recipes, r" - Schwabisch Style", "", "Schwabisch style ")
# Tilapia - Filipino Sour Broth Dish => Filipino Sour Broth tilapia
searchReplaceAddPattList(p_recipes, r"Tilapia - ", "", "tilapia", index=20)
# Fish Sinigang - Filipino Sour Broth Dish - Schwabisch Style => Filipino Sour Broth Sinigang fish
searchReplaceAddPattList(p_recipes, r"Fish Sinigang - ", "", "Sinigang fish", index=20)

# remove  - A Trinidadian Staple from Hot Pepper Sauce - A Trinidadian Staple
searchReplacePattList(p_recipes, r" - A Trinidadian Staple", "")
# remove  - German Breakfast Treat from The Sarge's Goetta - German Breakfast Treat
searchReplacePattList(p_recipes, r" - German Breakfast Treat", "")
# remove  - No Lasagna Noodles! from Mexican Lasagna - No Lasagna Noodles!
searchReplacePattList(p_recipes, r" - No Lasagna Noodles!", "")
# remove  - Not Just for Chicken from Sweet and Sour Jam - Not Just for Chicken
searchReplacePattList(p_recipes, r" - Not Just for Chicken", "")
                      
tagged_recipe_names = retag(p_recipes, "name")

In [69]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Doro Wat: Ethiopian Chicken Dish']
['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb L

['Doro Wat: Ethiopian Chicken Dish',
 'Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restau

The remaining names with dashes surrounded by dashes are translations, which can be split into two names

In [70]:
colnames_to_split = []
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)
        colnames_to_split.append(colname)

Doro Wat: Ethiopian Chicken Dish
Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco - Coconut Tembleque
Kroppkakor - Swedish Potato Dumplings
Ladolemono - Lemon Oil Sauce for Fish or Chicken
Mie Goreng - Indonesian Fried Noodles
Vaselopita - Greek New Years Cake
Knedliky - Czech Dumpling with Sauerkraut
Zhito or Koljivo - Serbian Wheat Pudding
Zito - Serbian Wheat Pudding


In [71]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in colnames_to_split:
            splits = re.split("( - )|(: )", p_recipes[i]["name"])
            newname1 = splits[0]
            newname2 = splits[len(splits)-1]
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

The remaining names with dash are those in words

In [72]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-']

In [73]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup', 'Ube-Macapuno Cake', 'Cuban-Style Yuca', 'Japanese-Style Cabbage Salad', "Jorge's Indian-Spice

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',
 'Korean-style Seaweed Soup',
 'Ube-Macapuno Cake',
 'Cuban-Style Yuca',
 'Japanese-Style Cabbage 

!, ? and . are found, which are odd for recipe names

In [74]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['!', '!', '!', '!', '.', '?']

The punctuations are mostly slang abbreviations and exclamations

In [75]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

['Real Canadian Butter Tarts, eh?']
["Our Top P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]
['Sangria! Sangria!', 'Oatmeal Apple Crisp To Die For!', "Sushi House Salad Dressing, It's ORANGE!"]


Remove the exclamations

In [76]:
searchReplacePattList(p_recipes, r"! Sangria!", "")
searchReplacePattList(p_recipes, r" To Die For!", "")
searchReplacePattList(p_recipes, r", It's ORANGE!", "")
searchReplacePattList(p_recipes, r", eh\?", "")
searchReplacePattList(p_recipes, r"Our Top ", "")

tagged_recipe_names = retag(p_recipes, "name")

Fullstops that remain are part of recipe names

In [77]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['.']

In [78]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

["P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]


Some 'that' can be found

In [79]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

['That', 'That', 'That', 'That']

The 'that's are used to add details, but not actual recipe name

In [80]:
for wdt in list(set(wdt_tokens)):
  print(find_value_with_char(p_recipes, 'name', wdt))

['German Recipes That Are Comfort Food Favorites', 'Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Tuscan Recipes That Reveal the Best of Italian Cooking', 'Easy Dinners That Start with Packaged Gnocchi', "That's-a Meatloaf", 'Favorite Recipes That Show Off Armenian Cuisine', 'Our Best Stir-Fry Recipes That Are Even Better Than Take-Out', 'Comforting Polish Cabbage Recipes That Are Family Favorites']


Remove

In [81]:
searchReplacePattList(p_recipes, r" That Are Comfort Food Favorites", "")
searchReplacePattList(p_recipes, r" That Deliver Big Flavor With Every Satisfying Bite", "")
searchReplacePattList(p_recipes, r" That Reveal the Best of Italian Cooking", "")
searchReplacePattList(p_recipes, r"That's-a ", "")
searchReplacePattList(p_recipes, r"Favorite Recipes That Show Off ", "")
searchReplacePattList(p_recipes, r" That Are Even Better Than Take-Out", "")
searchReplacePattList(p_recipes, r" That Are Family Favorites", "")

searchReplaceAddPattList(p_recipes, r" That Start with Packaged Gnocchi", "", "packaged gnocchi ", index=5)
tagged_recipe_names = retag(p_recipes, "name")

That removed

In [82]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

[]

There's some 'how's

In [83]:
wrb_tokens = list_words_with_tag(tagged_recipe_names, "WRB")
wrb_tokens

['How', 'How', 'How', 'How', 'How', 'How', 'How']

In [84]:
for wrb in list(set(wrb_tokens)):
  print(find_value_with_char(p_recipes, 'name', wrb))

['How to Make Coquilles Saint-Jacques', 'How to Make Bolognese Sauce', 'How to Make Beef Satay', 'How to Make Peanut Dipping Sauce', 'How to Make Tres Leches Cake', 'How to Make Cassoulet', 'How to Make Turkey Manicotti']


Remove the 'how's and keep only the name

In [85]:
searchReplacePattList(p_recipes, r"How to Make ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [86]:
list_words_with_tag(tagged_recipe_names, "WRB")

[]

There's some personal pronouns (possessive)

In [87]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'My',
 'Your',
 'Our',
 'Our',
 'Our',
 'My',
 'its']

In [88]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Sweet Recipes to Complete Your Indian Dinner', 'Melt-in-Your-Mouth Beef Cacciatore', 'Polish Recipes to Make Your Grandmother Proud']
['My Own Famous Stuffed Grape Leaves', 'My Best Chicken Piccata', 'My Favorite Sesame Noodles', 'My Chicken Parmesan', "My Mom's Greek Lemon Rice", 'My Fly Stir-Fry', 'My Chicken Pho Recipe', 'My Tangy German Potato Salad', 'My Big Fat Greek Baked Beans', "My Grandmother's French Dressing"]
['Our Best Avgolemono Soup Recipes', 'Our Best Authentic Mexican Recipes', 'Our Best Empanada Recipes', 'Our Best Indian Recipes for Beginner Cooks', 'Our Best Stir-Fry Recipes', 'Our Favorite German Potato Recipes', 'Say Aloha to Our Best Hawaiian Recipes']
['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


Most can be removed

In [89]:
searchReplacePattList(p_recipes, r"Our ", "")
searchReplacePattList(p_recipes, r"Your ", "")
searchReplacePattList(p_recipes, r"Melt-in-Your-Mouth ", "")
searchReplacePattList(p_recipes, r"My Own ", "")
searchReplacePattList(p_recipes, r"My Best ", "")
searchReplacePattList(p_recipes, r"My Favorite ", "")
searchReplacePattList(p_recipes, r"My Mom's ", "")
searchReplacePattList(p_recipes, r"My Grandmother's ", "")
searchReplacePattList(p_recipes, r"My ", "")

tagged_recipe_names = retag(p_recipes, "name")

The remaining ones are misclassified tags by nltk

In [90]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['its']

In [91]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


There's some personal pronouns

In [92]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP")
prp_tokens

['I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'We',
 'I',
 'I',
 'I']

In [93]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Shrimp Egg Foo Young', 'Good for You Greek Salad', 'Egg Foo Young', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Young Coconut Jelly', 'Keto Egg Foo Young']
['West African Peanut Stew', 'Real Welsh Rarebit', 'Fabulous Wet Burritos', 'Mexican Wedding Cookies', 'Italian Wedding Cookies III', 'Beef Wellington', 'West African-Style Peanut Stew with Chicken', 'Party Italian Wedding Soup', 'West Coast Trail Cookies', 'Italian Wedding Cake', 'Weeknight Mexican Chicken Lasagna', 'Comforting Russian Soups for Fall and Winter Weather', 'Comforting Russian Soups for Fall and Winter Weather', 'West Indian Curried Chicken', 'Welsh Cakes', "Mrs Welch's Butter Tarts", 'Italian Wedding Cake Martini', 'West African Lime Cake', 'Hawaiian Wedding Cake II', 'Weeknight Wonton Soup', 'Traditional Welsh Rarebit', 'West African Peanut Soup', "We Be Jammin' Jamaican Banana Bread", 'Italian Wedding Soup II', 'Chocolate Mexican Wedding Cookies', 'Traditional Welsh Broth']
['German Apple Cake I', 'In

Not much to remove, since most are misclassified POS

In [94]:
searchReplacePattList(p_recipes, r" You Can Make at Home", "")

tagged_recipe_names = retag(p_recipes, "name")

Some base verbs can be removed

In [95]:
vb_tokens = list_words_with_tag(tagged_recipe_names, "VB")
vb_tokens

['Take',
 'Make',
 'Take',
 'Kedgeree',
 'Swordfish',
 'Serve',
 'Make',
 'Celebrate',
 'Chicken',
 'Pata',
 'aux',
 'Poulet',
 'Papa',
 'Tarte',
 'Pollo',
 'Pancake',
 'Dutch',
 'Kransekake',
 'Dish',
 'Pannekaken']

In [96]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Dutch Apple Pie with Oatmeal Streusel', 'Dutch Apple Cake', 'Dutch Apple Pie', 'Dutch Croquetten', 'Dutch Apple Berry Pie', 'Dutch Leek Casserole', 'Dutch Pancakes', 'Dutch Butter Cake', 'Dutch Doughnuts', 'Dutch Apple Tart', 'Dutch Meatballs', 'Dutch Mini Pancakes']
['Dishes to Serve With Chapati']
['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken St

Remove recipe names with instruction

In [97]:
searchReplacePattList(p_recipes, r" to Make at Home", "")
searchReplacePattList(p_recipes, r" to Make Grandmother Proud", "")
searchReplacePattList(p_recipes, r"Ways The World Makes Chicken And ", "")

searchReplaceAddPattList(p_recipes, r"Make Ahead ", "", "packaged gnocchi ")

tagged_recipe_names = retag(p_recipes, "name")

In [98]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Dutch Apple Pie with Oatmeal Streusel', 'Dutch Apple Cake', 'Dutch Apple Pie', 'Dutch Croquetten', 'Dutch Apple Berry Pie', 'Dutch Leek Casserole', 'Dutch Pancakes', 'Dutch Butter Cake', 'Dutch Doughnuts', 'Dutch Apple Tart', 'Dutch Meatballs', 'Dutch Mini Pancakes']
['Dishes to Serve With Chapati']
['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken St

Words like best and most can be removed

In [99]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

['Best', 'Most', 'Best']

In [100]:
for rbs in list(set(rbs_tokens)):
  print(find_value_with_char(p_recipes, 'name', rbs))

['The Most Iconic French Desserts', 'Alfredo Mostaccioli']
['Best Bobotie', 'Best Fried Walleye', 'Best Avgolemono Soup Recipes', "Chef John's Best German Recipes", 'The Best Thai Peanut Sauce', 'Best Ever Russian Beef Stroganoff', "Grandma's Best Ever Sour Cream Lasagna", 'Best Guacamole', 'Best Ever Slow Cooker Italian Beef Roast', 'The Best Pavlova', "Savannah's Best Marinated Portobello Mushrooms", 'Best Peanut Sauce', 'Best Ever Carne Asada Marinade', "Mom's Best Spaghetti Sauce", 'The Best Korean Chicken Recipes', 'Best Instant Pot Chicken Cacciatore', 'Best Ziti Ever', 'Best Authentic Mexican Recipes', 'Best Empanada Recipes', 'Best Ziti Ever with Sausage', 'Best Chicken Parmesan', 'Best Pernil Ever', 'The Best Ricotta Pancakes', 'Best Indian Recipes for Beginner Cooks', 'Best Hot Sauce', 'Best Ever Irish Soda Bread', 'Best Hummus', 'The Best Thai Tom Kha Soup Recipe', 'Best French Macarons', 'Best Falafel', "Gordo's Best of the Best Lasagna", 'The Best Classic Beef Stroganoff',

In [101]:
searchReplacePattList(p_recipes, r"Best Ever ", "")
searchReplacePattList(p_recipes, r"Best ", "")
searchReplacePattList(p_recipes, r" ever", "")
searchReplacePattList(p_recipes, r"The Most Iconic ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [102]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

[]

Adverbs with -ly can be removed, except for the misclassified ones mainly caused by foreign recipe names

In [103]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Absolutely',
 'Aebleskiver',
 'Incredibly',
 'Perfectly',
 'Absolutely',
 'Oven',
 'Perfectly',
 'Absolutely',
 'Heavenly',
 'Asiago',
 'Philly',
 'Family',
 'Deadly',
 'Yet',
 'Absolutely',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Here',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [104]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Yet Turkey Chili']
['No Tomato Paste Here']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Deadly Delicious Lasagna']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Absolutely Fabulous Greek or House Dressing', 'Absolutely Amazing Ahi', 'Absolutely Delicious Stuffed Calamari', 'Absolutely Perfect Palak Paneer']
['Perfectly Moist Irish Wheaten Bread', 'Perfectly Dry Roasted Chickpeas']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Soon Du Bu Jigae']
['Philly Cheesesteak Quesadillas']
['French Canadian Tourtiere', 'Traditional French Canadia

In [105]:
searchReplacePattList(p_recipes, r"Deadly Delicious ", "")
searchReplacePattList(p_recipes, r"Heavenly ", "")
searchReplacePattList(p_recipes, r"Perfectly ", "")
searchReplacePattList(p_recipes, r"Absolutely Fabulous ", "")
searchReplacePattList(p_recipes, r"Absolutely Amazing  ", "")
searchReplacePattList(p_recipes, r"Absolutely Delicious ", "")
searchReplacePattList(p_recipes, r"Absolutely Perfect ", "")

searchReplaceAddPattList(p_recipes, r"No Tomato Paste Here", "", "tomato paste")

tagged_recipe_names = retag(p_recipes, "name")

In [106]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Aebleskiver',
 'Incredibly',
 'Absolutely',
 'Oven',
 'Asiago',
 'Philly',
 'Family',
 'Yet',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [107]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Yet Turkey Chili']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Absolutely Amazing Ahi']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Soon Du Bu Jigae']
['Philly Cheesesteak Quesadillas']
['French Canadian Tourtiere', 'Traditional French Canadian Tourtiere', 'Reveillon Tourtiere', 'Tourtiere Spices', 'Tourtiere', 'Tourtiere', 'Tourtiere', 'Tourtiere']
['Aebleskiver', 'Dansk Aebleskiver']
['Willard Family German Chocolate Cake', 'Mexican-Inspired Casseroles for Family-Pleasing Dinners', 'Chine

In [108]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 62},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 506},
 {'CD': 23},
 {'DT': 96},
 {'EX': 0},
 {'FW': 67},
 {'IN': 464},
 {'JJ': 1897},
 {'JJR': 2},
 {'JJS': 1},
 {'LS': 0},
 {'MD': 0},
 {'NN': 659},
 {'NNP': 12712},
 {'NNPS': 36},
 {'NNS': 389},
 {'PDT': 0},
 {'POS': 346},
 {'PRP': 69},
 {'PRP$': 1},
 {'RB': 15},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 10},
 {'UH': 0},
 {'VB': 18},
 {'VBD': 39},
 {'VBG': 59},
 {'VBN': 139},
 {'VBP': 9},
 {'VBZ': 29},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Examining other POS in names

So as to get an idea of POS tagging in the later section

In [109]:
vbz_tokens = list_words_with_tag(tagged_recipe_names, "VBZ")
vbz_tokens

['Ties',
 'el',
 'Leaves',
 'al',
 'al',
 'Leaves',
 'au',
 'di',
 'Ways',
 'de',
 'al',
 'Breasts',
 'en',
 'e',
 'al',
 'Leaves',
 'Breasts',
 'Squares',
 'al',
 'di',
 'aux',
 'di',
 'Leaves',
 'au',
 'di',
 'di',
 'al',
 'en',
 'en']

In [110]:
vbp_tokens = list_words_with_tag(tagged_recipe_names, "VBP")
vbp_tokens

['Rellenos',
 'Greek',
 'Divine',
 'Wat',
 'Be',
 'en',
 'Mexicanos',
 'Rellenos',
 'en']

In [111]:
vbg_tokens = list_words_with_tag(tagged_recipe_names, "VBG")
vbg_tokens

['Seasoning',
 'Dressing',
 'Pudding',
 'Using',
 'Canning',
 'Pudding',
 'Velveting',
 'Pudding',
 'Pudding',
 'Pudding',
 'Seasoning',
 'Comforting',
 'Seasoning',
 'Pouding',
 'Pudding',
 'Amazing',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Dressing',
 'Comforting',
 'Pudding',
 'Making',
 'Comforting',
 'Pudding',
 'Dumpling',
 'Dipping',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Seasoning',
 'Filling',
 'Thanksgiving',
 'Stuffing',
 'Pudding',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Sizzling',
 'Topping',
 'Amazing',
 'Refreshing',
 'Comforting',
 'Dressing',
 'Using',
 'Seasoning',
 'Refreshing',
 'Pudding',
 'Pudding',
 'Pudding',
 'Ping',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Dumpling',
 'Pudding']

In [112]:
vbd_tokens = list_words_with_tag(tagged_recipe_names, "VBD")
vbd_tokens

['Braised',
 'Corned',
 'Corned',
 'Pickled',
 'Shredded',
 'Braised',
 'Fashioned',
 'Filled',
 'Corned',
 'Fashioned',
 'Pickled',
 'Braised',
 'Breaded',
 'Fried',
 'Grilled',
 'Braised',
 'Pickled',
 'Braised',
 'Braised',
 'Planked',
 'Corned',
 'Corned',
 'Braised',
 'Infused',
 'Corned',
 'Obsessed',
 'Pickled',
 'Pulled',
 'Roasted',
 'Broiled',
 'Pickled',
 'Roasted',
 'di',
 'Braised',
 'Braised',
 'Pickled',
 'Mulled',
 'Pickled',
 'Boiled']

In [113]:
rp_tokens = set(list(list_words_with_tag(tagged_recipe_names, "RP")))
rp_tokens

{'Hanout', 'Over'}

In [114]:
comma_tokens = set(list(list_words_with_tag(tagged_recipe_names, ",")))
comma_tokens

{','}

In [115]:
for c in list(set(comma_tokens)):
  print(find_value_with_char(p_recipes, 'name', c))

['Bow Ties with Sausage, Tomatoes and Cream', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Chicken, Spinach, and Cheese Pasta Bake', 'Super-Simple, Super-Spicy Mongolian Beef', 'Creamy Potato, Carrot, and Leek Soup', 'Beef, Mushroom and Guinness Pie', 'Easy, Chewy Flourless Peanut Butter Cookies', 'Filipino Steamed Rice, Cebu Style', 'Orange, Honey and Soy Chicken', 'Chicken Francese, Italian-Style', 'Duck with Honey, Soy, and Ginger', 'Steak, Onion, and Pepper Fajitas', 'Indian Carrots, Peas and Potatoes', 'Simple, Baked Finnan Haddie', 'Indian-Style Rice with Cashews, Raisins and Turmeric', 'Serbian Ground Beef, Veggie, and Potato Bake', 'Fried Rice with Ginger, Hoisin, and Sesame', 'Chard Lentil Soup, Lebanese-Style', 'Easy, Cheesy Tortellini Bake', 'Curried Cashew, Pear, and Grape Salad', 'Pork, Sauerkraut and Dumplings', 'Spinach, Feta, and Pine Nut Ravioli Filling', 'Bell Pepper, Tomato, and Potato Indian Curry', 'Mascarpone Pasta with Chicken, Bacon and Spinach', 'Past

In [116]:
jjr_tokens = list_words_with_tag(tagged_recipe_names, "JJR")
jjr_tokens

['Healthier', 'Lighter']

In [117]:
for j in list(set(jjr_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Lighter Mexican Meatloaf']
['Healthier Bang Bang Chicken in the Air Fryer', 'Healthier Swedish Meatballs', 'Healthier Pan-Fried Honey-Sesame Chicken', 'Healthier Chicken Enchiladas I', 'Healthier Honey-Sesame Chicken']


In [118]:
jjs_tokens = list_words_with_tag(tagged_recipe_names, "JJS")
jjs_tokens

['Oktoberfest']

In [119]:
for j in list(set(jjs_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Oktoberfest Chicken and Red Cabbage', 'Oktoberfest Potato Salad', 'Oktoberfest Chili', 'The Recipes to Celebrate Oktoberfest']


In [120]:
dt_tokens = list_words_with_tag(tagged_recipe_names, "DT")
dt_tokens

['a',
 'The',
 'No',
 'The',
 'the',
 'a',
 'the',
 'The',
 'the',
 'the',
 'a',
 'the',
 'the',
 'A',
 'a',
 'The',
 'a',
 'the',
 'the',
 'a',
 'a',
 'A',
 'The',
 'A',
 'the',
 'a',
 'a',
 'The',
 'a',
 'a',
 'The',
 'the',
 'The',
 'This',
 'The',
 'a',
 'a',
 'the',
 'The',
 'a',
 'a',
 'The',
 'a',
 'A',
 'the',
 'the',
 'No',
 'the',
 'a',
 'a',
 'The',
 'The',
 'a',
 'The',
 'the',
 'the',
 'The',
 'the',
 'a',
 'a',
 'The',
 'a',
 'the',
 'a',
 'The',
 'All',
 'The',
 'a',
 'the',
 'the',
 'the',
 'The',
 'The',
 'A',
 'a',
 'the',
 'a',
 'the',
 'The',
 'the',
 'a',
 'a',
 'a',
 'the',
 'a',
 'a',
 'the',
 'a',
 'An',
 'the',
 'a',
 'a',
 'a',
 'No',
 'a',
 'No']

In [121]:
for dt in list(set(dt_tokens)):
  print(find_value_with_char(p_recipes, 'name', dt))

['Pan-Fried Asparagus', 'Creamy Au Gratin Potatoes', 'Super-Delicious Zuppa Toscana', 'Simple Teriyaki Sauce', 'Spicy Korean Fried Chicken with Gochujang Sauce', 'Spaghetti Aglio e Olio', 'Easy Garam Masala', 'Easy Chorizo Street Tacos', 'Russian Cabbage Rolls with Gravy', 'Shrimp Scampi with Pasta', 'Greek Lemon Chicken and Potato Bake', 'Easy Mexican Casserole', 'German Apple Cake I', 'Spanish Flan', 'German Pork Chops and Sauerkraut', 'Spaghetti Cacio e Pepe', 'Indian-Style Chicken and Onions', 'Fajita Seasoning', 'Tender Italian Baked Chicken', 'Authentic German Potato Salad', 'Mexican Rice II', 'Spongy Japanese Cheesecake', 'Chicken Katsu', 'Easy Authentic Mexican Rice', 'French Bread', 'Focaccia Bread', 'Jamaican Fried Dumplings', 'Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Rosemary Braised Lamb Shanks', 'Make-Ahead Vegetarian Moroccan Stew', 'Curry Stand Chicken Tikka Masala Sauce', 'Easy Pavlova', "Angela's Awesome Enchiladas", 'Sausage and Sauerkraut', 

In [122]:
to_tokens = list_words_with_tag(tagged_recipe_names, "TO")
to_tokens

['to', 'na', 'to', 'to', 'to', 'To', 'to', 'na', 'na', 'na']

In [123]:
for to in list(set(to_tokens)):
  print(find_value_with_char(p_recipes, 'name', to))

['Super-Delicious Zuppa Toscana', 'Canadian Yellow Split Pea Soup with Ham', "Randy's Slow Cooker Ravioli Lasagna", 'Spinach Tomato Tortellini', 'Traditional Gyros', 'Cheese Lasagna', 'Creamy Chicken Lasagna', 'Chicken Parmigiana', 'French Canadian Tourtiere', 'Pipirrana', "Dash's Donair", 'American Lasagna', 'Taco Lasagna', "Bob's Stuffed Banana Peppers", 'Spaghetti alla Carbonara', 'Kalamata Olive Tapenade', 'Lyonnaise Potatoes', "Chef John's Lasagna", 'Original Homemade Italian Beef', 'Fettuccini Carbonara', 'Simply Traditional Lasagna', 'Spinach Cheese Manicotti', 'The Original Donair From the East Coast of Canada', 'Chicken and Shrimp Carbonara', 'Fried Empanadas', 'Authentic Paella Valenciana', 'Danish Cinnamon Snails', 'Panang Curry with Chicken', 'Easy Lasagna I', 'Eggplant Lasagna', 'Jamaican Fried Snapper', 'Mushrooms and Spinach Italian Style', 'Deep Dish Lasagna', 'Simple Spinach Lasagna', 'Quesadilla Salvadorena', 'Italian Chicken Marinade', 'Korean BBQ Chicken Marinade', 

Chicken is considered dollar?

In [124]:
dol_tokens = list_words_with_tag(tagged_recipe_names, "$")
dol_tokens

['Chicken']

It's a tagging error, so this can be ignored

In [125]:
for dol in dol_tokens:
  print(find_value_with_char(p_recipes, 'name', dol))

['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Enchilada Slow-Cooker Casserole', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Chicken Milanese', 'Chicken Massaman Curry', "Chef J

There are some quotation marks

In [126]:
quote_tokens = list_words_with_tag(tagged_recipe_names, "''")
quote_tokens

["''", "''", "'", "''", "''", "''", "''"]

Quotation marks are caused by possessive -'s

In [127]:
for quote in quote_tokens:
  print(find_value_with_char(p_recipes, 'name', quote))

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

 For now, leave the preprocessing of the recipe names first.

## Preprocessing of ingredients

Ingriendts are a lot more straightforward to preprocess, since recipe names have to be attractive to encourage user to click in

In [128]:
p_ingredients = []

for recipe in p_recipes:
    p_ingredients = p_ingredients + recipe['ingredients']
    
p_ingredients = list(set(p_ingredients))
len(p_ingredients)

19342

In [129]:
p_ingredients[:10]

['½ cup butter ',
 '2 teaspoons garlic powder, divided ',
 '2 green bell peppers, cut into chunks ',
 '3 sprigs fresh thyme leaves ',
 '1\u2009¼ cups shredded mozzarella cheese ',
 '1 teaspoon mayonnaise ',
 '4 teaspoons heavy cream ',
 '2 onions, quartered ',
 '4 potatoes, peeled and cut into 1 inch cubes ',
 '1 cucumber, chopped ']

In [130]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = p_ingredients[i].strip()

p_ingredients[:10]

['½ cup butter',
 '2 teaspoons garlic powder, divided',
 '2 green bell peppers, cut into chunks',
 '3 sprigs fresh thyme leaves',
 '1\u2009¼ cups shredded mozzarella cheese',
 '1 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '2 onions, quartered',
 '4 potatoes, peeled and cut into 1 inch cubes',
 '1 cucumber, chopped']

A reusable function that re-tags ingredients

In [131]:
def retag_ingredients():
    tagged_recipe_ingredients = []

    for ingredient in p_ingredients:
        tagged_recipe_ingredients.append(tag_pos(ingredient))
        
    return tagged_recipe_ingredients

tagged_recipe_ingredients = retag_ingredients()
tagged_recipe_ingredients[:10]

[[('½', 'JJ'), ('cup', 'NN'), ('butter', 'NN')],
 [('2', 'CD'),
  ('teaspoons', 'NNS'),
  ('garlic', 'JJ'),
  ('powder', 'NN'),
  (',', ','),
  ('divided', 'VBD')],
 [('2', 'CD'),
  ('green', 'JJ'),
  ('bell', 'NN'),
  ('peppers', 'NNS'),
  (',', ','),
  ('cut', 'VBN'),
  ('into', 'IN'),
  ('chunks', 'NNS')],
 [('3', 'CD'),
  ('sprigs', 'NNS'),
  ('fresh', 'JJ'),
  ('thyme', 'NN'),
  ('leaves', 'NNS')],
 [('1', 'CD'),
  ('¼', 'JJ'),
  ('cups', 'NNS'),
  ('shredded', 'VBD'),
  ('mozzarella', 'NN'),
  ('cheese', 'NN')],
 [('1', 'CD'), ('teaspoon', 'NN'), ('mayonnaise', 'NN')],
 [('4', 'CD'), ('teaspoons', 'NNS'), ('heavy', 'JJ'), ('cream', 'NN')],
 [('2', 'CD'), ('onions', 'NNS'), (',', ','), ('quartered', 'VBD')],
 [('4', 'CD'),
  ('potatoes', 'NNS'),
  (',', ','),
  ('peeled', 'VBD'),
  ('and', 'CC'),
  ('cut', 'VBD'),
  ('into', 'IN'),
  ('1', 'CD'),
  ('inch', 'NN'),
  ('cubes', 'NNS')],
 [('1', 'CD'), ('cucumber', 'NN'), (',', ','), ('chopped', 'VBD')]]

Numbers need a placeholder

In [132]:
list_words_with_tag(tagged_recipe_ingredients, "CD")

['2',
 '2',
 '3',
 '1',
 '1',
 '4',
 '2',
 '4',
 '1',
 '1',
 '4',
 '1',
 '8',
 '2',
 '2',
 '1',
 '8',
 '8',
 '8',
 '2',
 '5',
 '10',
 '2',
 '1',
 '3',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '3',
 '2',
 '2',
 '3',
 '2',
 '16',
 '6',
 '2',
 '2',
 '5',
 '1',
 '1',
 '3',
 '1',
 '14.5',
 '2',
 '36',
 '1',
 '4',
 '1',
 'one',
 '4',
 '1',
 '2',
 '2',
 '2',
 '3',
 '1',
 '2',
 '4',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '2',
 '1',
 '4',
 '2',
 '4',
 '1',
 '1',
 '2',
 '15',
 '2',
 '3',
 '1',
 '1/4',
 '3',
 '2',
 '1/4',
 '3',
 "za'atar",
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '8',
 '12',
 '1',
 '14',
 '1',
 '4',
 '1',
 '2',
 '11',
 '4',
 '1/2',
 '3',
 '2',
 '8',
 '1',
 '1',
 '1',
 '2',
 '1',
 '1',
 '5',
 '1',
 '4',
 '1/4',
 '1',
 '16',
 '1',
 '6',
 '2',
 '1',
 '2',
 '2',
 '1',
 '1',
 '4',
 '2',
 '1',
 '8',
 '15',
 '3',
 '1',
 '1',
 '1',
 '8',
 '1',
 '1/2',
 '1',
 '1/2',
 '8',
 '1',
 '1',
 '2',
 '1',
 '26',
 '2',
 '1',
 '1',
 '1',
 '6',
 '1',
 '2',
 '1',
 '1',
 '12.5',
 '2'

NLTK assumes fractions as JJ (adjectives)

In [133]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['½',
 'garlic',
 'green',
 'fresh',
 '¼',
 'heavy',
 '¼',
 '¾',
 '¼',
 'Italian',
 'unsalted',
 'frozen',
 'Jamaican-style',
 '½',
 'iceberg',
 '¾',
 'hot',
 '½',
 '¼',
 '1/2-inch',
 '¾',
 'coarse',
 '½',
 '½',
 'green',
 'olive',
 '½',
 '½',
 'lemon',
 '¼',
 'large',
 'yellow',
 '1-inch',
 '¼',
 'hot',
 'strong',
 'cooked',
 '½',
 'all-purpose',
 '½',
 '½',
 '½',
 '½',
 '⅔',
 'fresh-squeezed',
 '½',
 'red',
 'fresh',
 '½',
 'fresh',
 '⅔',
 'no-salt-added',
 'diced',
 'Irish',
 'empanada',
 'such',
 '½',
 'black',
 'grated',
 'red',
 '½',
 'unblanched',
 'thick',
 'hickory-smoked',
 'cube',
 'bite-sized',
 'large',
 'sesame',
 'Japanese',
 'sweet',
 'Japanese',
 '1-inch',
 'small',
 'white',
 '½',
 'all-purpose',
 'pinch',
 'Thai-style',
 'extra-large',
 'Spanish',
 'sweet',
 'Italian',
 '½',
 'sparkling',
 '½',
 'large',
 '¾',
 'kidney',
 'whole',
 'fresh',
 'large',
 'inch',
 'wide',
 '½',
 'black',
 'small',
 '½',
 'bite-sized',
 '½',
 'fresh',
 'French',
 'rosemary',
 'small',
 '¼

Create a function that converts any fraction in a text to integer

In [134]:
import unicodedata
from decimal import Decimal

def fraction_to_int(text):
  for i, char in enumerate(text):
    try:
      # unicode.numeric converts fractions such as ½ to decimal place, 0.25
      # remove trailing decimals, otherwise keep decimals
      text = text[:i] + str(Decimal(unicodedata.numeric(char)).normalize()) + text[i + 1:]
    except Exception as e:
      pass
  # Because number + fraction, such as 1 1/4 may be converted to 1 0, so use re.sub to remove
  text = re.sub("([0-9]+ [0])+", "4", text)
  return text

for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = fraction_to_int(p_ingredients[i])

tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['0.5 cup butter',
 '2 teaspoons garlic powder, divided',
 '2 green bell peppers, cut into chunks',
 '3 sprigs fresh thyme leaves',
 '4.25 cups shredded mozzarella cheese',
 '1 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '2 onions, quartered',
 '4 potatoes, peeled and cut into 1 inch cubes',
 '1 cucumber, chopped',
 '0.25 cup walnut pieces',
 '4 teaspoons sea salt',
 '1 (8 ounce) can sliced water chestnuts, halved',
 '0.75 cup dried apricots, chopped',
 '0.25 teaspoon Italian seasoning',
 '2 sticks unsalted butter, sliced, frozen',
 '2 tablespoons Jamaican-style curry powder',
 '1 pinch Asafoetida',
 '0.5 medium onion, halved and separated',
 '8 leaves iceberg lettuce']

By converting fractions into integers, NLTK stops seeing them as adjectives (JJ) and instead, they are considered numbers (CD)

In [135]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['garlic',
 'green',
 'fresh',
 'heavy',
 'Italian',
 'unsalted',
 'frozen',
 'Jamaican-style',
 'iceberg',
 'hot',
 '1/2-inch',
 'cup',
 'coarse',
 'fluid',
 'green',
 'olive',
 'lemon',
 'large',
 'yellow',
 '1-inch',
 'hot',
 'strong',
 'cooked',
 'all-purpose',
 'brown',
 'cup',
 'fresh-squeezed',
 'red',
 'fresh',
 'fresh',
 'no-salt-added',
 'diced',
 'Irish',
 'empanada',
 'such',
 'black',
 'grated',
 'red',
 'unblanched',
 'thick',
 'hickory-smoked',
 'cube',
 'bite-sized',
 'large',
 'sesame',
 'Japanese',
 'sweet',
 'Japanese',
 '1-inch',
 'small',
 'white',
 'cup',
 'all-purpose',
 'pinch',
 'Thai-style',
 'extra-large',
 'Spanish',
 'sweet',
 'Italian',
 'sparkling',
 'large',
 'kidney',
 'whole',
 'fresh',
 'large',
 'inch',
 'wide',
 'cup',
 'black',
 'small',
 'skinless',
 'boneless',
 'bite-sized',
 'fresh',
 'French',
 'rosemary',
 'small',
 'broccoli',
 'flat',
 'fresh',
 'red',
 'chicken',
 'inch',
 'bow',
 'stalk',
 '1/4-inch',
 'low-carb',
 'high-fiber',
 'such',


Replace all the numbers with placeholder of 4

In [136]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = searchReplacePatt(p_ingredients[i], NUMPATTERN, "4")
    
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4.4 cup butter',
 '4 teaspoons garlic powder, divided',
 '4 green bell peppers, cut into chunks',
 '4 sprigs fresh thyme leaves',
 '4.4 cups shredded mozzarella cheese',
 '4 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '4 onions, quartered',
 '4 potatoes, peeled and cut into 4 inch cubes',
 '4 cucumber, chopped',
 '4.4 cup walnut pieces',
 '4 teaspoons sea salt',
 '4 (4 ounce) can sliced water chestnuts, halved',
 '4.4 cup dried apricots, chopped',
 '4.4 teaspoon Italian seasoning',
 '4 sticks unsalted butter, sliced, frozen',
 '4 tablespoons Jamaican-style curry powder',
 '4 pinch Asafoetida',
 '4.4 medium onion, halved and separated',
 '4 leaves iceberg lettuce']

In [137]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens.remove('4')
new_cd_tokens

['zapallo',
 'xanthan',
 'bleu',
 'mascarpone',
 '4p',
 'mozzarella',
 '4/4x4/4',
 "za'atar",
 'beef4',
 'one',
 'four',
 'mostaccioli',
 'kalamata',
 'yellow',
 'fontina',
 'seven',
 '4.4',
 'ziti',
 'zucchini',
 '4/4',
 'marinara',
 '4.4.4',
 'millet',
 'provolone',
 '4up',
 'yum']

Define a function that returns ingredient with specific substring

In [138]:
def find_ingre_with_substring(sub):
    ingres = []
    for ingre in p_ingredients:
        matches = searchWordsPatt(ingre, sub)
        if len(matches)  > 0:
            ingres.append(ingre)
    return ingres

find_ingre_with_substring('4/4')

['4 medium sweet potato, cut into 4/4-inch pieces',
 '4 (4/4 inch x 4 inch) strip lime peel',
 '4 large onions, cut into 4/4 inch wide strips',
 '4 skinless, boneless chicken breast halves - cut into 4/4 inch strips',
 '4 stalks stalk celery, chopped into 4/4-inch pieces',
 '4 russet potatoes, sliced into 4/4 inch slices',
 '4 green bell pepper, cut into 4/4 inch wide strips',
 '4 French baguette, cut into diagonal 4/4 inch slices',
 '4.4 red bell pepper, sliced int4 4/4-inch strips',
 '4 daikon radish, halved lengthwise and sliced into 4/4-inch thick pieces',
 '4 French baguette, cut into 4/4 inch slices',
 '4 red bell pepper, cut into 4/4-inch strips',
 '4 inches sausage casing, 4 4/4 inches wide',
 '4 pound boneless beef sirloin steak (4/4 inch thick)',
 '4/4 cup braising liquid',
 '4 pound skinless, boneless chicken breasts, cut into 4/4-inch pieces',
 '4 pounds Korean-style short ribs (beef chuck flanken, cut 4/4 to 4/4 inch thick across bones)',
 '4 medium yellow onion, cut into 

Define a function that searches and replace specific regex pattern from ingredients

In [139]:
def search_edit_ingredient(regex, new_val):
    for i, ingre in enumerate(p_ingredients):
        p_ingredients[i] = searchReplacePatt(p_ingredients[i], regex, new_val)
        
search_edit_ingredient(r"4/4", "4.4")

find_ingre_with_substring('4/4')

[]

Remove copyright symbols

In [140]:
search_edit_ingredient(r"®", "")

find_ingre_with_substring('®')

[]

Remove 4p

In [141]:
find_ingre_with_substring('4p')

['4.4 4p warm milk (4 degrees F/4 degrees C)',
 '4.4 c4p4.4-inch long vermicelli']

In [142]:
search_edit_ingredient(r"c4p", "")
search_edit_ingredient(r"4p", "")

find_ingre_with_substring('4p')

[]

Change 4up back to 7up

In [143]:
find_ingre_with_substring('4up')

['4.4 4up 4% milk']

In [144]:
search_edit_ingredient(r"4up", "7up")

find_ingre_with_substring('7up')

['4.4 7up 4% milk']

Define a function that splits a list element into two new elements and deletes it

In [145]:
def split_ingre_to_two(target, search, retain_target=False):
    for i, ingre in enumerate(p_ingredients):
        if p_ingredients[i] == target:
            splits = re.split(search, p_ingredients[i])
            new_ingre1 = splits[0].strip()
            new_ingre2 = splits[1].strip()
            if retain_target:
                new_ingre2 = search.strip()
            del p_ingredients[i]
            p_ingredients.append(new_ingre1)
            p_ingredients.append(new_ingre2)

split_ingre_to_two('4.4 7up 4% milk', " 4% milk", retain_target=True)

find_ingre_with_substring('7up')

['4.4 7up']

In [146]:
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4.4 cup butter',
 '4 teaspoons garlic powder, divided',
 '4 green bell peppers, cut into chunks',
 '4 sprigs fresh thyme leaves',
 '4.4 cups shredded mozzarella cheese',
 '4 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '4 onions, quartered',
 '4 potatoes, peeled and cut into 4 inch cubes',
 '4 cucumber, chopped',
 '4.4 cup walnut pieces',
 '4 teaspoons sea salt',
 '4 (4 ounce) can sliced water chestnuts, halved',
 '4.4 cup dried apricots, chopped',
 '4.4 teaspoon Italian seasoning',
 '4 sticks unsalted butter, sliced, frozen',
 '4 tablespoons Jamaican-style curry powder',
 '4 pinch Asafoetida',
 '4.4 medium onion, halved and separated',
 '4 leaves iceberg lettuce']

Numbers are mostly cleaned

In [147]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens

['zapallo',
 'xanthan',
 '7up',
 'bleu',
 'mascarpone',
 '4',
 'mozzarella',
 "za'atar",
 'beef4',
 'one',
 'four',
 'mostaccioli',
 'kalamata',
 'yellow',
 'fontina',
 'seven',
 '4.4',
 'ziti',
 'zucchini',
 'marinara',
 '4.4.4',
 'millet',
 '4.4x4.4',
 'provolone',
 'yum']

Looking at the number of each POS tag for ingredient list

In [148]:
tagged_recipe_ingredients = retag_ingredients()

all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3744},
 {')': 3828},
 {',': 8512},
 {'--': 0},
 {'.': 23},
 {':': 304},
 {'CC': 3074},
 {'CD': 21788},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13401},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32987},
 {'NNP': 2411},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 2},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1725},
 {'VBD': 8949},
 {'VBG': 354},
 {'VBN': 3434},
 {'VBP': 646},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

In [149]:
colon_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ":")))
colon_tags

['--', ':', '-', ';']

In [150]:
for c in colon_tags:
    print(find_ingre_with_substring(c))

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']
['Gravy:', 'Spice Blend:', 'Dipping Sauce:', 'Chipotle Mayonnaise:', 'Meatballs:', 'Fillings:', 'Caramel:']
['4 tablespoons Jamaican-style curry powder', '4 medium sweet potato, cut into 4.4-inch pieces', '4 skinless, boneless chicken breasts, or as needed, cut into 4-inch pieces', '4.4 cups all-purpose flour, or more as needed', '4.4 cup fresh-squeezed lemon juice', '4 (4.4 ounce) can no-salt-added diced tomatoes, undrained', '4 thick slices hickory-smoked bacon', '4 pounds cube steaks, pounded thin and cut into bite-sized pieces', '4 Japanese eggplants, cut into 4-inch cubes', '4.4 cup all-purpose flour for dusting', '4 tablespoons Thai-style chile sauce', '4 extra-large Spanish onion, chopped', '4 skinless, boneless chicken breast halves - cut into cubes', '4.4 pounds skinless, boneless chicken breast halves - cut into bite-sized pieces', '4 skinless, boneless chicken breast halves - cut into 

In [151]:
find_ingre_with_substring("--")

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']

In [152]:
search_edit_ingredient(r"--", ",")

find_ingre_with_substring('--')

[]

Remove the hanging colons

In [153]:
find_ingre_with_substring(":")

['Gravy:',
 'Spice Blend:',
 'Dipping Sauce:',
 'Chipotle Mayonnaise:',
 'Meatballs:',
 'Fillings:',
 'Caramel:']

In [154]:
search_edit_ingredient(r":", "")

find_ingre_with_substring(':')

[]

In [155]:
find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 (4 ounce) can black beans; drain and reserve liquid',
 '4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick']

In [156]:
find_ingre_with_substring(', 4 g')

['4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick']

Remove the \(blanlk\) typo

In [157]:
search_edit_ingredient(r", 4 g; \(blank\)", ", 4g")

find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 (4 ounce) can black beans; drain and reserve liquid']

In [158]:
split_ingre_to_two('4 raw chop with refuse, 4g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick', "; ")

find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 (4 ounce) can black beans; drain and reserve liquid']

In [159]:
split_ingre_to_two("4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)", r"\(I like ")

find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 'white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)']

In [160]:
find_ingre_with_substring("/")

['4 tablespoons warm water (4 degrees F/4 degrees C)',
 '4 tablespoons warm milk (4 degrees F/4 degrees C)',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4 tablespoons warm water (4 degrees F/4 degrees C)',
 '4 (4 ounce) package round gyoza/potsticker wrappers',
 '4.4 cups warm wat4(4 degree4F/4 degrees C)',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4.4  warm milk (4 degrees F/4 degrees C)',
 '4.4 cu4 warm water (4 degrees F/4 degrees C)',
 '4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4 (4.4 ounce) package corn bread/muffin mix',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4.4 tablespoon Guacamole, salsa, and/or sour cream',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4 cup warm milk (4 degrees F/4 degrees C)',
 '4 cup shredded Cheddar/Monterey Jack cheese blend']

Replace / with or

In [161]:
search_edit_ingredient(r"\/", " or ")
find_ingre_with_substring("/")

[]

In [162]:
tagged_recipe_ingredients = retag_ingredients()

tagged_recipe_ingredients[:20]

[[('4.4', 'CD'), ('cup', 'NN'), ('butter', 'NN')],
 [('4', 'CD'),
  ('teaspoons', 'NNS'),
  ('garlic', 'JJ'),
  ('powder', 'NN'),
  (',', ','),
  ('divided', 'VBD')],
 [('4', 'CD'),
  ('green', 'JJ'),
  ('bell', 'NN'),
  ('peppers', 'NNS'),
  (',', ','),
  ('cut', 'VBN'),
  ('into', 'IN'),
  ('chunks', 'NNS')],
 [('4', 'CD'),
  ('sprigs', 'NNS'),
  ('fresh', 'JJ'),
  ('thyme', 'NN'),
  ('leaves', 'NNS')],
 [('4.4', 'CD'),
  ('cups', 'NNS'),
  ('shredded', 'VBD'),
  ('mozzarella', 'NN'),
  ('cheese', 'NN')],
 [('4', 'CD'), ('teaspoon', 'NN'), ('mayonnaise', 'NN')],
 [('4', 'CD'), ('teaspoons', 'NNS'), ('heavy', 'JJ'), ('cream', 'NN')],
 [('4', 'CD'), ('onions', 'NNS'), (',', ','), ('quartered', 'VBD')],
 [('4', 'CD'),
  ('potatoes', 'NNS'),
  (',', ','),
  ('peeled', 'VBD'),
  ('and', 'CC'),
  ('cut', 'VBD'),
  ('into', 'IN'),
  ('4', 'CD'),
  ('inch', 'NN'),
  ('cubes', 'NNS')],
 [('4', 'CD'), ('cucumber', 'NN'), (',', ','), ('chopped', 'VBD')],
 [('4.4', 'CD'), ('cup', 'NN'), ('walnut

## Examining other POS in ingredients

So as to get an idea of POS tagging in the later section

In [163]:
fw_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "FW")))
fw_tags

['s',
 'di',
 'de',
 'arbol',
 'herbes',
 'kalamansi',
 'pico',
 'miso',
 'kielbasa',
 'gallo',
 'bilbao',
 'vanilla',
 'skin',
 'paprika',
 'kalonji',
 'mirin']

In [164]:
rp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RP")))
rp_tags

['dashi', 'out', 'aside', 'off', 'up', 'tomato']

In [165]:
for rp in rp_tags:
    print(find_ingre_with_substring(" " + rp))

['4 cups dashi stock, made with dashi powder', '4.4 cups prepared dashi stock', '4 teaspoon dashi granules', '4.4 cup prepared dashi stock', '4 (4 inch) piece dashi kombu (dried kelp) (Optional)', '4.4 tablespoon dashi granules', '4 ounce dashi kombu (dried kelp)', '4.4 teaspoons dashi no moto (instant dashi or fish-broth powder), available at Asian markets', '4.4 teaspoon dashi granules', '4.4 teaspoon white miso paste with dashi', '4 cups prepared dashi stock', '4 tablespoon dashi granules', '4 cups prepared dashi stock', '4 teaspoons dashi granules', '4 (4 inch) piece dashi kombu (dried kelp)']
['4 medium eggplants, cut in half lengthwise and hollowed out to 4.4 inch flesh rim', '4 cucumbers - halved lengthwise, seeds scraped out with a spoon, and thinly sliced', '4 stalk lemongrass, tough outer parts removed, thinly sliced', '4.4 cup water (to rinse out can of tomatoes)', '4 pounds leeks, trimmed and outer leaves removed', '4 large cucumbers - sliced lengthwise, seeds scooped out, 

Lamb, lobster and leeks are supposed to be nouns!

In [166]:
rbr_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RBR")))
rbr_tags

['lobster', 'lamb', 'leeks']

In [167]:
wdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "WDT")))
wdt_tags

['whole']

In [168]:
pdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PDT")))
pdt_tags

['half']

In [169]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP")))
prp_tags

['you']

In [170]:
find_ingre_with_substring("you ")

['4 (4 ounce) packages garlic and herb couscous mix (or any flavor you prefer)']

In [171]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP$")))
prp_tags

['your']

In [172]:
find_ingre_with_substring("your")

['4 (4 ounce) package pasta, your choice of shape']

In [173]:
punc_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ".")))
punc_tags

['.', '!']

In [174]:
find_ingre_with_substring("!")

['4.4 cup Greek salad dressing, such as Yazzo!']

In [175]:
quote_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "''")))
quote_tags

["''"]

In [176]:
for q in quote_tags:
    print(find_ingre_with_substring("'"))

["sifted confectioners' sugar", "4 tablespoons za'atar seasoning, divided", "4 (4 ounce) container crushed tomatoes (such as Hunt's)", "4 Thai bird's eye chiles, halved lengthwise", "4.4 cup confectioners' sugar", "4 tablespoons confectioners' sugar, or to taste", "4.4 cup sifted confectioners' sugar", "4.4 cup confectioners' sugar for rolling", "4 (4.4 ounce) package UNCLE BEN'S Jasmine READY RICE", "4.4 tablespoons confectioners' sugar", "4.4 cups sifted confectioners' sugar", "4 cups confectioners' sugar", "4 teaspoons confectioners' sugar", "4 cups vegetable broth (such as Penzy's)", "4 tablespoon unsalted butter (such as Land O'Lakes)", "4.4 teaspoon seasoned salt (such as LAWRY'S)", "4 cup barbeque sauce (such as Bull's-Eye Texas-Style Bold Barbeque Sauce)", "4 (4.4 ounce) carton Campbell's Mexican Style Tomato Soup", "4 tablespoon golden syrup (such as Lyle's)", "4 pound smoked sausage (such as farmer's sausage), sliced", "4 (4 ounce) can cream of coconut (such as Trader Joe's E

How can these words be symbols?

In [177]:
list(set(list_words_with_tag(tagged_recipe_ingredients, "SYM")))

['sauerkraut',
 'mangoes',
 'avocados',
 'beaten',
 'thighs',
 'cucumber',
 'squash',
 'lemon',
 'mangos',
 'mango',
 'avocado',
 'tomato',
 'spinach',
 'lettuce',
 'choy',
 'shrimp',
 'breast',
 'leeks',
 'kale',
 'basil']

## Casing of recipe names

Because almost all words are capitalized by default in recipe name, need to correct the casing

In [178]:
all_recipe_names = []

for recipe in p_recipes:
    try:
        all_recipe_names.append(recipe['name'])
    except Exception as e:
        pass
    
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Create a corpus by joining all recipe names with \n, because the names were not literally a single text originally. Othwewise it will confuse the tokenisation

In [179]:
all_recipe_names_corpus = ("\n").join(all_recipe_names)

all_recipe_names_corpus

'Pan-Fried Asparagus\nCreamy Au Gratin Potatoes\nSuper-Delicious Zuppa Toscana\nSimple Teriyaki Sauce\nSpicy Korean Fried Chicken with Gochujang Sauce\nSpaghetti Aglio e Olio\nEasy Garam Masala\nEasy Chorizo Street Tacos\nRussian Cabbage Rolls with Gravy\nShrimp Scampi with Pasta\nGreek Lemon Chicken and Potato Bake\nEasy Mexican Casserole\nGerman Apple Cake I\nSpanish Flan\nGerman Pork Chops and Sauerkraut\nSpaghetti Cacio e Pepe\nChef John\'s Chicken Kiev\nIndian-Style Chicken and Onions\nFajita Seasoning\nPerfect Sushi Rice\nTender Italian Baked Chicken\nAuthentic German Potato Salad\nMiso Soup\nMexican Rice II\nSpongy Japanese Cheesecake\nChicken Katsu\nChicken Stir-Fry\nQuick Beef Stir-Fry\nEasy Authentic Mexican Rice\nHerbs de Provence\nGreek or House Dressing\nFrench Bread\nFocaccia Bread\nJamaican Fried Dumplings\nGluehwein\nCoquilles Saint-Jacques\nMexican-Style Chicken Taco Casserole\nRosemary Braised Lamb Shanks\nMake-Ahead Vegetarian Moroccan Stew\nCurry Stand Chicken Tikka

Tokenize

In [180]:
import nltk

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))
recipe_tokens[:10]

['Raspberry',
 'Farmer',
 'Edition',
 'Mojo',
 'Company',
 'Naan',
 'World',
 'Malian',
 'Crescent',
 'the']

In [181]:
len(recipe_tokens)

3271

Join ingredients into a text with \n and tokenize

In [182]:
ingredients_corpus = ("\n").join(p_ingredients)

ingredients_corpus

'4.4 cup butter\n4 teaspoons garlic powder, divided\n4 green bell peppers, cut into chunks\n4 sprigs fresh thyme leaves\n4.4 cups shredded mozzarella cheese\n4 teaspoon mayonnaise\n4 teaspoons heavy cream\n4 onions, quartered\n4 potatoes, peeled and cut into 4 inch cubes\n4 cucumber, chopped\n4.4 cup walnut pieces\n4 teaspoons sea salt\n4 (4 ounce) can sliced water chestnuts, halved\n4.4 cup dried apricots, chopped\n4.4 teaspoon Italian seasoning\n4 sticks unsalted butter, sliced, frozen\n4 tablespoons Jamaican-style curry powder\n4 pinch Asafoetida\n4.4 medium onion, halved and separated\n4 leaves iceberg lettuce\n4.4 cup hot water\n4.4 teaspoon dried sage\n4 baby corn ears, sliced\n4.4 cup freshly squeezed lemon juice\n4 Roma tomatoes, sliced\n4 medium sweet potato, cut into 4.4-inch pieces\n4 (4 inch) flour tortillas\n4 dill pickles, diced\n4.4 cup pinole (coarse ground maize flour)\n4 cup blanched almonds\n4.4 fluid ounce beer\n4.4 cups shredded green cabbage\n4 tablespoons olive o

In [183]:
ingre_tokens = list(set(nltk.word_tokenize(ingredients_corpus)))
ingre_tokens[:10]

['huckleberries',
 'glaces',
 'pasta',
 'rigatoni',
 'whole-milk',
 'achiote',
 'cultures',
 'petite',
 'marshmallows',
 'Misto']

In [184]:
len(ingre_tokens)

2815

Most words in recipe tokens are capitalized

In [185]:
lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['the',
 'e',
 'la',
 'of',
 'a',
 'laziale',
 'z',
 'et',
 'powder',
 'on',
 'des',
 'over',
 'au',
 'by',
 'nach',
 'chili',
 'aux',
 'y',
 'its',
 'alla',
 'en',
 'con',
 'su',
 "l'Oignon",
 'di',
 'without',
 'el',
 'le',
 'aka',
 'in',
 'sa',
 'to',
 'version',
 "all'Amatriciana",
 'de',
 'from',
 'al',
 'or',
 'bil',
 'with',
 'na',
 'for',
 'and']

Number of words that are not capitalized increased significantly crosschecking with lowercase words in ingredient tokens

In [186]:
for i, name in enumerate(recipe_tokens):
    for ingre in ingre_tokens:
        if recipe_tokens[i].lower() == ingre:
            recipe_tokens[i] = recipe_tokens[i].lower()

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
len(lower_recipe_tokens)

923

In [187]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

2314

In [188]:
upper_recipe_tokens[:20]

['Edition',
 'Mojo',
 'Company',
 'World',
 'Malian',
 'Iced',
 'Brennan',
 'Jell-O',
 'Bollen',
 'Japanese-Style',
 'Sancocho',
 'Teena',
 'Malva',
 "L'Ossa",
 'Bazlama',
 'Aloha',
 'Makhani',
 'Whit',
 'Asada',
 'Fairy']

Use country names to get the words related to country names for capitalization

In [189]:
!pipenv install country_list

Installing country_list...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
[  ==] Installing country_list...
[ ===] Installing country_list...
[====] Installing country_list...
[=== ] Installing country_list...
[==  ] Installing country_list...
[=   ] Installing country_list...
[    ] Installing country_list...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
[  ==] Installing country_list...
[ ===] Installing country_list...
[====] Installing country_list...
[=== ] Installing country_list...
[==  ] Installing country_list.

In [190]:
from country_list import countries_for_language

countries = dict(countries_for_language('en'))
countries = list(countries.values())

countries

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua & Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia & Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean Netherlands',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo - Brazzaville',
 'Congo - Kinshasa',
 'Cook Islands',
 'Costa Rica',
 'Côte d’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egyp

Not all words are captured in the country names library, added some more.

In [191]:
countries = ' '.join([elem for elem in countries])
countries = countries.replace('&', '')
countries = countries.split(" ")
countries = [i.strip() for i in countries]
countries = [string for string in countries if string != ""]
countries = [string for string in countries if string != "-"]

countries = countries + ["Filipino", "Malay", "Spanish", "Danish", "Welsh", "Polish", "Schwabisch", "Rochester", "Asia",
                         "Aussie", "Greek", "German", "Mexica", "Hawaii", "Irish", "Mediterranean", "Middle", "East",
                        "Norwegian", "Persian", "Pollo", "Thai", "West"]

countries

['Afghanistan',
 'Åland',
 'Islands',
 'Albania',
 'Algeria',
 'American',
 'Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua',
 'Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia',
 'Herzegovina',
 'Botswana',
 'Bouvet',
 'Island',
 'Brazil',
 'British',
 'Indian',
 'Ocean',
 'Territory',
 'British',
 'Virgin',
 'Islands',
 'Brunei',
 'Bulgaria',
 'Burkina',
 'Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape',
 'Verde',
 'Caribbean',
 'Netherlands',
 'Cayman',
 'Islands',
 'Central',
 'African',
 'Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas',
 'Island',
 'Cocos',
 '(Keeling)',
 'Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Brazzaville',
 'Congo',
 'Kinshasa',
 'Cook',
 'Islands',
 'Costa',
 'Rica',
 'Côte',
 'd’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'C

Then use stemmer to get the stem of the words in the country names. But if the stem is too short, just use the first 5 characters of the word

In [192]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter = PorterStemmer()
lancaster=LancasterStemmer()

porter_c = []
lancester_c = []

for c in countries:
    port = porter.stem(c.split(' ').pop(0))
    if len(port) < 5:
        port = c[:4]
    porter_c.append(port.capitalize())
    lan = lancaster.stem(c.split(' ').pop(0))
    if len(lan) < 5:
        lan = c[:4]
    lancester_c.append(lan.capitalize())

print(porter_c[:10])
print(lancester_c[:10])

['Afghanistan', 'Åland', 'Island', 'Albania', 'Algeria', 'American', 'Samoa', 'Andorra', 'Angola', 'Anguilla']
['Afgh', 'Åland', 'Island', 'Alban', 'Alger', 'Amer', 'Samo', 'Andorr', 'Angol', 'Anguill']


In [193]:
lancester_c.append("Victoria")
lancester_c

['Afgh',
 'Åland',
 'Island',
 'Alban',
 'Alger',
 'Amer',
 'Samo',
 'Andorr',
 'Angol',
 'Anguill',
 'Antarctic',
 'Antigu',
 'Barbud',
 'Argentin',
 'Armen',
 'Arub',
 'Austral',
 'Austr',
 'Azerbaid',
 'Bahama',
 'Bahrain',
 'Bangladesh',
 'Barbado',
 'Belar',
 'Belg',
 'Beli',
 'Benin',
 'Bermud',
 'Bhut',
 'Boliv',
 'Bosn',
 'Herzegovin',
 'Botswan',
 'Bouvet',
 'Island',
 'Brazil',
 'Brit',
 'Indi',
 'Ocea',
 'Territ',
 'Brit',
 'Virgin',
 'Island',
 'Brune',
 'Bulgar',
 'Burkin',
 'Faso',
 'Burund',
 'Cambod',
 'Cameroon',
 'Canad',
 'Cape',
 'Verd',
 'Carib',
 'Netherland',
 'Caym',
 'Island',
 'Cent',
 'Afri',
 'Republ',
 'Chad',
 'Chil',
 'Chin',
 'Christmas',
 'Island',
 'Coco',
 '(keeling)',
 'Island',
 'Colomb',
 'Comoro',
 'Congo',
 'Brazzavil',
 'Congo',
 'Kinshas',
 'Cook',
 'Island',
 'Cost',
 'Rica',
 'Côte',
 'D’ivoire',
 'Croat',
 'Cuba',
 'Curaçao',
 'Cypr',
 'Czech',
 'Denmark',
 'Djibout',
 'Dominic',
 'Domin',
 'Republ',
 'Ecuad',
 'Egypt',
 'El',
 'Salvad',
 'E

Get all the recipe tokens that have the country names stem and remove the unrelated tokens

In [194]:
token_with_country_prefix = []
for rt in recipe_tokens:
    for lan in lancester_c:
        if lan in rt:
            token_with_country_prefix.append(rt)

token_with_country_prefix = sorted(list(set(token_with_country_prefix)))
token_with_country_prefix.remove("No-Cook")
token_with_country_prefix.remove("Man")
token_with_country_prefix.remove("Slow-Cooked")
token_with_country_prefix.remove("Slow-Cooker")
token_with_country_prefix.remove("Garlic-Anchovy-Sardine")
token_with_country_prefix

["'Chinese",
 'Afghan',
 'Afghani',
 'African',
 'African-Style',
 'Afritada',
 'Algerian',
 'Almond-Ricotta',
 'American',
 'Americano',
 'Arabic',
 'Argentine',
 'Argentinean',
 'Armenian',
 'Asiago',
 'Asian',
 'Asian-Inspired',
 'Asian-Style',
 'Asian-Themed',
 'Australian',
 'Bangladeshi',
 'Belgi',
 'Belgian',
 'Belizean',
 'Bermuda',
 'Bhutanese',
 'Bolivian',
 'Brazilian',
 'Brazilian-Style',
 'British',
 'Bulgarian',
 'Cambodian',
 'Canada',
 'Canadian',
 'Cape',
 'Capezzoli',
 'Caribbean',
 'Caribbean-Spiced',
 'Chad',
 'Chilaquiles',
 'Chilean',
 'Chilean-Style',
 'Chinese',
 'Chinese-Style',
 'Christmas',
 'Coco',
 'Coconut-Lentil',
 'Coconut-Lime',
 'Cocotte',
 'Colombian',
 'Cooker',
 'Cooks',
 'Cookup',
 'Costa',
 'Croatian',
 'Cuban',
 'Cuban-Inspired',
 'Cuban-Style',
 'Cubanos',
 'Curry-Coconut',
 'Czech',
 'Czechoslovakian',
 'Danielle',
 'Danish',
 'Dominican',
 'Dominican-Style',
 'East',
 'Easter',
 'Eastern',
 'Eastern-Style',
 'Egyptian',
 'Elizabeth',
 'Ellen',

In [195]:
token_with_country_prefix

["'Chinese",
 'Afghan',
 'Afghani',
 'African',
 'African-Style',
 'Afritada',
 'Algerian',
 'Almond-Ricotta',
 'American',
 'Americano',
 'Arabic',
 'Argentine',
 'Argentinean',
 'Armenian',
 'Asiago',
 'Asian',
 'Asian-Inspired',
 'Asian-Style',
 'Asian-Themed',
 'Australian',
 'Bangladeshi',
 'Belgi',
 'Belgian',
 'Belizean',
 'Bermuda',
 'Bhutanese',
 'Bolivian',
 'Brazilian',
 'Brazilian-Style',
 'British',
 'Bulgarian',
 'Cambodian',
 'Canada',
 'Canadian',
 'Cape',
 'Capezzoli',
 'Caribbean',
 'Caribbean-Spiced',
 'Chad',
 'Chilaquiles',
 'Chilean',
 'Chilean-Style',
 'Chinese',
 'Chinese-Style',
 'Christmas',
 'Coco',
 'Coconut-Lentil',
 'Coconut-Lime',
 'Cocotte',
 'Colombian',
 'Cooker',
 'Cooks',
 'Cookup',
 'Costa',
 'Croatian',
 'Cuban',
 'Cuban-Inspired',
 'Cuban-Style',
 'Cubanos',
 'Curry-Coconut',
 'Czech',
 'Czechoslovakian',
 'Danielle',
 'Danish',
 'Dominican',
 'Dominican-Style',
 'East',
 'Easter',
 'Eastern',
 'Eastern-Style',
 'Egyptian',
 'Elizabeth',
 'Ellen',

Possessives can also be used for capitalizing, since proper names like Chef John's occur a lot

In [196]:
possesive_tokens = list_words_with_tag(tagged_recipe_names, "''")
possesive_tokens

["''", "''", "'", "''", "''", "''", "''"]

In [197]:
possessive_names = []
for ps in possesive_tokens:
    print(find_value_with_char(p_recipes, 'name', ps))
    possessive_names = possessive_names + find_value_with_char(p_recipes, 'name', ps)

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

In [198]:
possessive_names

["Chef John's Chicken Kiev",
 "Angela's Awesome Enchiladas",
 "Randy's Slow Cooker Ravioli Lasagna",
 "'Chinese Buffet' Green Beans",
 "Chef John's Beef Rouladen",
 "Corned Beef and Cabbage Shepherd's Pie",
 "Gramma's Date Squares",
 "Authentic Russian Salad 'Olivye'",
 "Chef John's Meatless Meatballs",
 "Chef John's Beef Goulash",
 "Grandma's Noodles II",
 "Chef John's Clotted Cream",
 "Newfoundland Jigg's Dinner",
 "Chef John's Coq Au Vin",
 "Chef John's Loco Moco",
 "Dash's Donair",
 "Turkey Shepherd's Pie",
 "Papa Drexler's Bavarian Pretzels",
 "Bob's Stuffed Banana Peppers",
 "Chef John's Swedish Meatballs",
 "Chef John's German Recipes",
 "Chef John's Chicken Tikka Masala",
 "Maria's Mexican Rice",
 "Mom's Buttermilk Pancakes",
 "Geneva's Ultimate Hungarian Mushroom Soup",
 "Charley's Slow Cooker Mexican Style Meat",
 "Ingrid's Rouladen",
 "Chef John's Lasagna",
 "Lola's Horchata",
 "Chef John's Italian Sausage Chili",
 "Kid's Favorite Pizza Casserole",
 "Traci's Adobo Seasoning"

Chef John's Lasagna, but how about just lasagna? Saving both

In [199]:
non_possessive = []
for ps in possessive_names:
    if "'s " in ps:
        non_possessive.append(ps.split("'s ",1)[1].lower())

non_possessive

['chicken kiev',
 'awesome enchiladas',
 'slow cooker ravioli lasagna',
 'beef rouladen',
 'pie',
 'date squares',
 'meatless meatballs',
 'beef goulash',
 'noodles ii',
 'clotted cream',
 'dinner',
 'coq au vin',
 'loco moco',
 'donair',
 'pie',
 'bavarian pretzels',
 'stuffed banana peppers',
 'swedish meatballs',
 'german recipes',
 'chicken tikka masala',
 'mexican rice',
 'buttermilk pancakes',
 'ultimate hungarian mushroom soup',
 'slow cooker mexican style meat',
 'rouladen',
 'lasagna',
 'horchata',
 'italian sausage chili',
 'favorite pizza casserole',
 'adobo seasoning',
 'favorite slow-cooker thai chicken',
 'shrimp fra diavolo',
 'chicken paprikash',
 'french omelette',
 'pie',
 'hazelnut christmas cookies',
 'patatas bravas',
 'italian bread',
 'cuban bread',
 'pie',
 'chimichurri sauce',
 'easy german sauerbraten',
 'pie',
 'german marble cake',
 'steak pizzaiola',
 'sour cream lasagna',
 'beef shish kabobs',
 'polish perogies',
 'indian-spiced tomato lentil soup',
 'shep

In [200]:
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Create a copy of all_recipe_names as backup

In [201]:
all_recipe_names2 = all_recipe_names.copy()
all_recipe_names2[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

Drop the recipe names that have possessives temporarily

In [202]:
print(len(all_recipe_names))
  
all_recipe_names2 = [ele for ele in all_recipe_names2 if ele not in possessive_names] 
print(len(all_recipe_names2))

5249
4890


If a word in a recipe does not belong to the tokens with country prefix, lowercase it by default

In [203]:
# https://stackoverflow.com/questions/40291443/python-convert-a-string-to-lowercase-except-some-special-strings/40291577
lowerAllExcept = lambda x: " ".join( a if a in token_with_country_prefix else a.lower()
                                    for a in x.split() )

for i, recipe in enumerate(all_recipe_names2):
    for t in token_with_country_prefix:
        all_recipe_names2[i] = lowerAllExcept(all_recipe_names2[i])

Join the names with possessives back to the list

In [204]:
all_recipe_names2 = all_recipe_names2 +  possessive_names
print(len(all_recipe_names2))
all_recipe_names2 = all_recipe_names2 +  non_possessive
print(len(all_recipe_names2))
all_recipe_names2 = list(set(all_recipe_names2))
print(len(all_recipe_names2))

5249
5577
5362


For some reasons, 'Thai' is saved as 'thai'

In [205]:
all_recipe_names_corpus = ("\n").join(all_recipe_names2)

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))

recipe_tokens = [i.replace('thai','Thai') for i in recipe_tokens]

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['krupnikas',
 'garbanzos',
 'griessnockerlsuppe',
 'pasta',
 'piccata',
 'rigatoni',
 'achiote',
 'peri',
 'stovetop',
 'snert',
 'nut',
 'cowpeas',
 'macaron',
 'the',
 'gringa',
 'pearl',
 'juleskringle',
 'veracruz-style',
 'grouse',
 'bacon',
 'tamarind',
 'brewis',
 'churros',
 'nakkileipa',
 'aphrodisiac',
 'pipirrana',
 'autenticos',
 'vampires',
 'baiana',
 'fudge',
 'deviled',
 'panna',
 'zucchini',
 'om',
 'phujia',
 'champurrado',
 'churrasco',
 'scrumptious',
 'pate',
 'himalayan',
 'parchment',
 'brasciole',
 'feta',
 'tteokbokki',
 'burger',
 'saltado',
 'macaroon',
 'really',
 'vegetables',
 'semolina',
 'caneles',
 'tofu',
 'e',
 'saffron',
 'rum-spiked',
 'german',
 'stuffed',
 'stone',
 'thanksgiving',
 'beans',
 'kladdkaka',
 'sarde',
 'dinners',
 'frittata',
 'moist',
 'turnips',
 'rouladen',
 'maduro',
 'brown',
 'green',
 'calabrian',
 'pho',
 'kombu',
 'jam',
 'cornmeal',
 'montreal',
 'spiced',
 'devils',
 'stewed',
 'bialys',
 'schweinebraten',
 'chive',
 'spa

In [206]:
len(lower_recipe_tokens)

2811

In [207]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

853

In [208]:
upper_recipe_tokens

['Farmer',
 'Mojo',
 'Malian',
 'World',
 'Brennan',
 'Gnocchi',
 'Japanese-Style',
 'Sancocho',
 'Teena',
 "L'Ossa",
 'Whit',
 'Liver',
 'Harissa',
 'Fruit',
 'Cream',
 'Halloween',
 'Mrs',
 'Charlie',
 'Lance',
 "Guisada'",
 'Afghani',
 'Ukrainian',
 'Cape',
 'Date',
 'Tim',
 'Cobbler',
 'Perry',
 'Steaks',
 'Asian-Style',
 'Tonno',
 'Chang',
 'Welsh',
 'Mange',
 'Indian-Style',
 'Shredded',
 'Diavolo',
 'Polish',
 'Lanka',
 'Nonna',
 "'Three",
 'Mango-Habanero',
 'Romesco',
 'Egyptian',
 'Nancy',
 'Mike',
 'Pancakes',
 'P.F',
 'Indian',
 'Tso',
 'The',
 'Angela',
 'Melissa',
 'Scallops',
 'Haitian',
 'Kid',
 'Amazing',
 'Mushroom',
 'Recipe',
 'Cakes',
 'Picadillo',
 'Sans',
 'Sarita',
 'Savory',
 'Chris',
 'Favorite',
 'Classic',
 'Tiramisu',
 'Rolls',
 'Ravioli',
 'Lasagna',
 'Copycat',
 'Colombian',
 'Chilaquiles',
 'Oma',
 'Onions',
 'Carnitas',
 'Schwabischer',
 'Afghan',
 'Matthew',
 'Lazy',
 'Sesame',
 'Seasoning',
 'Sandwiches',
 'Seared',
 'Feijoada',
 'Prizewinning',
 'Mah

## Updating POS tags in names after changing casing

Previously, almost all the words belong to NNP or NNPS, due to capitalization. By fixing the letter casings, now most of the words are NN (common nouns)

In [209]:
final_tagged_names = []

for recipe in all_recipe_names2:
    final_tagged_names.append(tag_pos(recipe))

all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(final_tagged_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 0},
 {"''": 8},
 {'(': 0},
 {')': 0},
 {',': 63},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 509},
 {'CD': 28},
 {'DT': 101},
 {'EX': 3},
 {'FW': 46},
 {'IN': 523},
 {'JJ': 3132},
 {'JJR': 10},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 2},
 {'NN': 9070},
 {'NNP': 1724},
 {'NNPS': 5},
 {'NNS': 1426},
 {'PDT': 0},
 {'POS': 344},
 {'PRP': 3},
 {'PRP$': 1},
 {'RB': 61},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 4},
 {'SYM': 1},
 {'TO': 10},
 {'UH': 0},
 {'VB': 52},
 {'VBD': 261},
 {'VBG': 101},
 {'VBN': 232},
 {'VBP': 211},
 {'VBZ': 20},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Chunking (recipe names)

If the number of words in recipes are more than 2 (since bigram can deal with 2-word names), then it can be treated as a recipe name chunk

In [210]:
def sort_unique_list(old_list):
    return sorted(list(set(old_list)))

In [211]:
recipe_name_chunk = []

for recipe in all_recipe_names2:
    if len(recipe.split()) > 2:
        recipe_name_chunk.append(recipe)

recipe_name_chunk = sort_unique_list(recipe_name_chunk)

for n in recipe_name_chunk:
    print(n)

"million dollar" Chinese cabbage salad
"pantry raid" chicken enchilada casserole
"skinny" chicken tacos
'Chinese Buffet' Green Beans
3-ingredient lemon scones
5-ingredient Mexican casserole
A Firefighter's Meatloaf
A Scotsman's Shepherd Pie
Adriel's Chinese Curry Chicken
Afghan beef raviolis
Afghani kabli pulao
African cabbage stew
African chicken stew
African sweet potato and peanut soup
African sweet potato stew
African-Style oxtail stew
Al's Baked Swiss Steak
Al's Burmese Chicken Curry
Ali's Amazing Bruschetta
Alicia's Aloo Gobi
Allie's Mushroom Pizza
Alysia's Basic Meat Lasagna
Amanda's Stuffed Peppers
Andy's Spicy Green Chile Pork
Angela's Asian-Inspired Chicken Noodle Soup
Angela's Awesome Enchiladas
Anne's Chicken Chilaquiles Rojas
Arabic fattoush salad
Argentine chimichurri bread
Argentine meat empanadas
Argentinean cheese bread
Armenian Easter bread
Armenian shish kabob
Armenian stuffed eggplant
Asiago sun-dried tomato pasta
Asian beef with snow peas
Asian chicken salad
Asian 

instant pot Puerto Rican arroz con Pollo
instant pot Thai-Style green curry chicken
instant pot baked beans
instant pot barbacoa
instant pot caldillo
instant pot chicken biryani
instant pot chicken cacciatore
instant pot chicken enchiladas
instant pot chicken fried rice
instant pot chicken posole verde
instant pot chicken tagine with apricots and chickpeas
instant pot chicken tagine with butternut squash and spinach
instant pot crispy chicken carnitas
instant pot dumpling soup
instant pot galbi
instant pot ginataang salmon
instant pot haluski with kielbasa
instant pot lebanese lentil soup
instant pot moroccan chicken tagine
instant pot pasta with Italian sausage
instant pot provencal honey-lemon chicken
instant pot red posole
instant pot red thai curry chicken
instant pot risotto
instant pot sweet and sour pork
instant pot tacos al pastor
instant pot tonkotsu ramen broth
irish apple pie
irish cream brownies
irish soda bread
irresistible Irish soda bread
italian butter cookies
italian e

Get all the prepositions found by NLTK

In [212]:
in_tokens = sort_unique_list(get_values_from_dict_list(all_name_tags, 'IN')[0])

in_tokens

['Of',
 'Under',
 'arroz',
 'bayrischer',
 'before',
 'beyond',
 'brown',
 'by',
 'de',
 'dough',
 'en',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'pina',
 'so',
 'trout',
 'under',
 'with',
 'without',
 'worth']

Keep only the actual prepositions

In [213]:
in_tokens = ['Of',
 'Under',
 'before',
 'beyond',
 'by',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'so',
 'under',
 'with',
 'without']

in_tokens

['Of',
 'Under',
 'before',
 'beyond',
 'by',
 'for',
 'from',
 'in',
 'of',
 'on',
 'out',
 'over',
 'so',
 'under',
 'with',
 'without']

Get all the recipe names with prepositions

In [214]:
names_in_tokens = [s for s in all_recipe_names2 if any(xs in s for xs in in_tokens)]

names_in_tokens

['moroccan lentil soup with veggies',
 'macaron',
 'onion soup gratinee',
 'moong dal',
 'Canadian pork loin chops',
 'calabaza con Pollo',
 'Greek pizza with spinach, feta and olives',
 'simple Mexican quinoa',
 'instant pot crispy chicken carnitas',
 'black bean and corn pasta with chicken',
 'easy baked chicken cordon bleu',
 'Italian lemon coffee cake',
 "Diana's Guinness Chocolate Cake with Guinness Chocolate Icing",
 'gingerbread biscotti',
 'tofu',
 'beef bourguignon i',
 'chicken chimi in the oven',
 'fresh spring rolls',
 'sticky honey garlic wings',
 'Irish root soup',
 'soy eggs',
 'scottish mince pie',
 'chicken tetrazzini iv',
 'linzer torte cookies',
 'empanada pork filling',
 'sweet and sour pork tenderloin',
 'air fryer steak for fajitas',
 "Chef John's Ham and Cheese Calzones",
 'authentic Puerto Rican sofrito',
 'Greek seasoning',
 'sopa de lima',
 'caramelized chicken wings',
 'refreshing limoncello cake',
 'Mexican chicken and rice soup',
 'lamb stew with green bean

Define a function that chunks based on grammar given, but only return chunk that have more than 2 words, since bigrams already can deal with phrases with 2 words anyway

In [215]:
from nltk import RegexpParser

def chunk(corpus, grammar, target):
    chunker = RegexpParser(grammar)
    tagged = pos_tag(word_tokenize(corpus))
    output = chunker.parse(tagged)
    outputs = []
    for subtree in output.subtrees(filter=lambda t: t.label() == target):
        result = re.sub("(\([A-Z]+ )|(\/[A-Z]+)|(\))+", "", str(subtree))
        if len(result.split()) > 2:
            outputs.append(result)
    return outputs

# https://github.com/nopynospy/pos_tagging/blob/main/pos.ipynb

PP_REGEX = r"""
  ADJP: {<RB>?<JJ|JJR|JJS|RBR|RBS>}    # Adjectives may have comparative and superlative, and come after adverbs like very
  NP: {<DT|WDT|WP$>?<CD>?<AdjP>*<NN|NNS|NNP|NNPS><POS>*<NN|NNS|NNP|NNPS|PP|CD>*<VBG>?}    # Determiner, number and adjectives come before nouns and nouns may have possessive -s and followed by another noun
  NP: {<PRP|EX|CD|WP|WRB|PRP$|WP$>}    # Pronouns and numbers can also replace nouns and function as one
  PP: {<IN>?<IN>?<IN|TO><NP>}    # Prepositions come before nouns and sometimes two prepositions come together
"""

chunk("chicken marsala with portobello mushrooms", PP_REGEX, "PP")

['with portobello mushrooms']

Get prepositional phrases from recipe names

In [216]:
prepositional_phrases = []

for name in names_in_tokens:
    prepositional_phrases = prepositional_phrases + (chunk(name, PP_REGEX, "PP"))
    
prepositional_phrases

['with Guinness Chocolate Icing',
 'in the oven',
 'with homemade beef stock',
 'de Pollo con arroz',
 'with rice wine',
 'on the rocks',
 'in chicken broth',
 'with zucchini pasta',
 'de Coco y queso',
 'with Meat Sauce',
 'in oyster sauce',
 'with coconut rice',
 'with tzatziki sauce',
 'with hollandaise sauce',
 'in a bag',
 'with mango salsa',
 'with sweet potatoes',
 'under a brick',
 'with ground beef',
 'Under a Brick',
 'on a pizza stone',
 'with Asiago cream sauce',
 'with a soy',
 'with coconut milk',
 'with sake butter',
 'with Mango-Habanero sauce',
 'in tamarind broth',
 'with jasmine rice',
 'with tomato sauce',
 'with crumbly topping',
 'on the rocks',
 'with chanterelle mushrooms',
 'with meat sauce',
 'with Kimchee Fried Rice',
 'with ramen noodles',
 'with artichoke hearts',
 'with coconut milk',
 'with coconut curry sauce',
 'with ginger-orange sauce',
 'with miso ranch',
 'in coconut milk',
 'with cream cheese',
 'with homemade taco seasoning',
 'with coconut milk',

## Chunking (ingredients)

In [217]:
all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3742},
 {')': 3827},
 {',': 8513},
 {'--': 0},
 {'.': 23},
 {':': 295},
 {'CC': 3094},
 {'CD': 21802},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13400},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32984},
 {'NNP': 2416},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 1},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1724},
 {'VBD': 8947},
 {'VBG': 354},
 {'VBN': 3436},
 {'VBP': 645},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

Get all the prepositions detected by NLTK from ingredients

In [218]:
in_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'IN')[0])

in_tokens

['OF',
 'about',
 'across',
 'against',
 'aji',
 'almond',
 'ancho',
 'aonori',
 'as',
 'at',
 'brown',
 'by',
 'de',
 'dough',
 'for',
 'from',
 'if',
 'in',
 'into',
 'nonfat',
 'nutmeg',
 'of',
 'on',
 'orzo',
 'out',
 'over',
 'pepper',
 'per',
 'pimento',
 'pinto',
 'taco',
 'tamarind',
 'through',
 'trout',
 'until',
 'with',
 'without',
 'wrapper']

Keep only actual prepositions

In [219]:
in_tokens = ['OF',
 'about',
 'across',
 'against',
 'as',
 'at',
 'by',
 'for',
 'from',
 'if',
 'in',
 'into',
 'of',
 'on',
 'out',
 'over',
 'per',
 'through',
 'until',
 'with',
 'without']

in_tokens

['OF',
 'about',
 'across',
 'against',
 'as',
 'at',
 'by',
 'for',
 'from',
 'if',
 'in',
 'into',
 'of',
 'on',
 'out',
 'over',
 'per',
 'through',
 'until',
 'with',
 'without']

Get all the ingredients with prepositions

In [220]:
ingres_in_tokens = [s for s in p_ingredients if any(xs in s for xs in in_tokens)]

ingres_in_tokens

['4 teaspoons garlic powder, divided',
 '4 green bell peppers, cut into chunks',
 '4 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '4 onions, quartered',
 '4 potatoes, peeled and cut into 4 inch cubes',
 '4 teaspoons sea salt',
 '4 (4 ounce) can sliced water chestnuts, halved',
 '4.4 teaspoon Italian seasoning',
 '4 tablespoons Jamaican-style curry powder',
 '4 pinch Asafoetida',
 '4.4 medium onion, halved and separated',
 '4.4 cup hot water',
 '4.4 teaspoon dried sage',
 '4 baby corn ears, sliced',
 '4.4 cup freshly squeezed lemon juice',
 '4 Roma tomatoes, sliced',
 '4 medium sweet potato, cut into 4.4-inch pieces',
 '4 (4 inch) flour tortillas',
 '4.4 cup pinole (coarse ground maize flour)',
 '4 cup blanched almonds',
 '4 tablespoons olive oil, or as needed',
 '4 cinnamon stick, broken',
 '4 cube chicken bouillon (Optional)',
 '4 teaspoon mustard seeds',
 '4.4 teaspoons lemon juice',
 '4 large yellow onions',
 '4 skinless, boneless chicken breasts, or as needed, cut into 4-inch

In [221]:
prepositional_phrases2 = []

for name in ingres_in_tokens:
    prepositional_phrases2 = prepositional_phrases2 + (chunk(name, PP_REGEX, "PP"))
    
prepositional_phrases2

['into 4 inch cubes',
 'of one lime',
 'into 4 inch cubes',
 'as La Tortilla Factory',
 'of Goya Chipotle Chile',
 'in Adobo Sauce',
 'into 4.4 inch slices',
 'as Cabot Seriously Sharp',
 "as Hunt 's",
 'into 4 slices',
 'into 4.4 inch slices',
 'as FAGE Total',
 'at room temperature',
 'as El Pato',
 'into 4 pieces',
 'into 4 inch cubes',
 'de arbol peppers',
 'into 4 inch pieces',
 'to 4.4 inch thick',
 'at room temperature',
 'as Marshmallow Fluff',
 'to 4.4 inch thick',
 'to 4 pound',
 'for 4 minutes',
 'into 4 portions',
 'at room temperature',
 'into 4.4 inch slices',
 'into 4 inch pieces',
 'for osso buco',
 'into 4 pieces',
 'through a sieve',
 'into julienne strips',
 'for a 4 inch',
 'for a 4 inch',
 'of celery soup',
 'into 4 inch pieces',
 'as Phil Supreme',
 'in half crosswise',
 'as Pepperidge Farm',
 'as Old El Paso',
 'of round roast',
 'as Hodgson Mill',
 'on paper towels',
 'into 4 inch pieces',
 'of one orange',
 'into 4.4 inch pieces',
 'into 4.4 inch slices',
 'int

Fix typo

In [222]:
prepositional_phrases2 = ["as SuzyQ's Santa Maria Valley Style Seasoning" if x=="(PP\n  as\n  (NP\n    SuzyQ\n    's\n    Santa\n    Maria\n    Valley\n    Style\n    Seasoning" else x for x in prepositional_phrases2]

prepositional_phrases2

['into 4 inch cubes',
 'of one lime',
 'into 4 inch cubes',
 'as La Tortilla Factory',
 'of Goya Chipotle Chile',
 'in Adobo Sauce',
 'into 4.4 inch slices',
 'as Cabot Seriously Sharp',
 "as Hunt 's",
 'into 4 slices',
 'into 4.4 inch slices',
 'as FAGE Total',
 'at room temperature',
 'as El Pato',
 'into 4 pieces',
 'into 4 inch cubes',
 'de arbol peppers',
 'into 4 inch pieces',
 'to 4.4 inch thick',
 'at room temperature',
 'as Marshmallow Fluff',
 'to 4.4 inch thick',
 'to 4 pound',
 'for 4 minutes',
 'into 4 portions',
 'at room temperature',
 'into 4.4 inch slices',
 'into 4 inch pieces',
 'for osso buco',
 'into 4 pieces',
 'through a sieve',
 'into julienne strips',
 'for a 4 inch',
 'for a 4 inch',
 'of celery soup',
 'into 4 inch pieces',
 'as Phil Supreme',
 'in half crosswise',
 'as Pepperidge Farm',
 'as Old El Paso',
 'of round roast',
 'as Hodgson Mill',
 'on paper towels',
 'into 4 inch pieces',
 'of one orange',
 'into 4.4 inch pieces',
 'into 4.4 inch slices',
 'int

Get all the singular common nouns detected by NLTK from ingredients

In [223]:
nn_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'NN')[0])

nn_tokens

['%',
 '4.4-pound',
 'Caramel',
 'Class',
 'Italian',
 'Moist',
 'Oil',
 'SHAKE-N-BAKE',
 'TOUCH',
 'Yazzo',
 'acacia',
 'achiote',
 'acid',
 'acini',
 'adobo',
 'advieh',
 'agave',
 'ahi',
 'aisle',
 'alcohol',
 'ale',
 'allspice',
 'almond',
 'aluminum',
 'amani',
 'amaretto',
 'amarillo',
 'amber',
 'ammonia',
 'amount',
 'ancho',
 'anchovy',
 'angel',
 'anise',
 'annato',
 'annatto',
 'aperitif',
 'apple',
 'applesauce',
 'apricot',
 'arbol',
 'arborio',
 'arrachera',
 'arrowroot',
 'artichoke',
 'arugula',
 'asadero',
 'asafoetida',
 'asparagus',
 'au',
 'avocado',
 'avocados',
 'baby',
 'bacon',
 'bag',
 'baguette',
 'baking',
 'ball',
 'balsamic',
 'bamboo',
 'banana',
 'bananas',
 'bangus',
 'bar',
 'barbecue',
 'barbeque',
 'barley',
 'base',
 'basil',
 'basmati',
 'bass',
 'batter',
 'bay',
 'bean',
 'beaten',
 'bechamel',
 'bee4',
 'beech',
 'beef',
 'beer',
 'beeswax',
 'beet',
 'bell',
 'bella',
 'bellas',
 'beluga',
 'berry',
 'besan',
 'beverage',
 'bhaji',
 'bias',
 'bi

Get all the ingredients with the common nouns

In [224]:
ingres_nn_tokens = [s for s in p_ingredients if any(xs in s for xs in nn_tokens)]

ingres_nn_tokens

['4.4 cup butter',
 '4 teaspoons garlic powder, divided',
 '4 green bell peppers, cut into chunks',
 '4 sprigs fresh thyme leaves',
 '4.4 cups shredded mozzarella cheese',
 '4 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '4 onions, quartered',
 '4 potatoes, peeled and cut into 4 inch cubes',
 '4 cucumber, chopped',
 '4.4 cup walnut pieces',
 '4 teaspoons sea salt',
 '4 (4 ounce) can sliced water chestnuts, halved',
 '4.4 cup dried apricots, chopped',
 '4.4 teaspoon Italian seasoning',
 '4 sticks unsalted butter, sliced, frozen',
 '4 tablespoons Jamaican-style curry powder',
 '4 pinch Asafoetida',
 '4.4 medium onion, halved and separated',
 '4 leaves iceberg lettuce',
 '4.4 cup hot water',
 '4.4 teaspoon dried sage',
 '4 baby corn ears, sliced',
 '4.4 cup freshly squeezed lemon juice',
 '4 Roma tomatoes, sliced',
 '4 medium sweet potato, cut into 4.4-inch pieces',
 '4 (4 inch) flour tortillas',
 '4 dill pickles, diced',
 '4.4 cup pinole (coarse ground maize flour)',
 '4 cup blanch

Filter for those without numbers at the beginning and make sure that each has at least 3 words

In [225]:
ingres_nn_tokens = [s for s in ingres_nn_tokens if not any(xs in s for xs in ["4", "4.4"]) and len(s.split()) > 2]

ingres_nn_tokens

["sifted confectioners' sugar",
 'Salt to taste',
 'Goya Corn Oil, for frying',
 'chopped fresh cilantro, for garnish',
 'crushed red pepper flakes to taste',
 'jalapeno pepper sauce, to taste',
 'thinly sliced onion',
 'sea salt and ground black pepper to taste',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'German stone ground mustard, to taste',
 'ground black pepper to taste',
 'salt, to taste',
 'sea salt and coarsely ground black pepper to taste',
 'lemon juice, to taste',
 'chopped fresh parsley for garnish',
 'extra-virgin olive oil for frying',
 'boiling water as needed',
 'grated zest of one orange',
 'seasoned salt to taste',
 'kosher salt, or to taste',
 'Goya Hot Sauce',
 'chopped fresh cilantro',
 'Salt and ground black pepper to taste',
 'vegetable oil, or as needed, divided',
 'freshly ground black pepper',
 'paprika, or to taste',
 'tortilla chips, for topping',
 'margarita or kosher salt for the rims',
 'Hog casing, rinsed well',
 'hot sauce to taste',
 'bean sprouts (

Get all the proper nouns from ingredients

In [226]:
nnp_tokens = sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'NNP')[0])

nnp_tokens

['*',
 "Ac'cent",
 'Accent',
 'Adobo',
 'Agave',
 'Aji-No-Moto',
 'Ajinomoto',
 'Alcaparrado',
 'Aleppo',
 'Alfredo',
 'All-Purpose',
 'Aloha™',
 'Aluminum',
 'Anaheim',
 'Ancho',
 'Angel',
 'Angeli',
 'Angostura',
 'Annatto',
 'Arborio',
 'Archer',
 'Arthur',
 'Asafoetida',
 'Asiago',
 'Asian',
 'Authentic',
 'Azafran',
 'B',
 'BC',
 'BEN',
 'BOCA',
 'Bacardi',
 'Badia',
 'Baileys',
 'Baker',
 'Balance',
 'Barbeque',
 'Barilla',
 'Barolo',
 'Base',
 'Basics',
 'Basil',
 'Basmati',
 'Bavarian-style',
 'Bay',
 'Bay™',
 'Beaujolais',
 'Beef',
 'Ben',
 'Bengal',
 'Betty',
 'Beyond',
 'Bing',
 'Bisquick',
 'Black',
 'Blanc',
 'Blend',
 'Blue',
 'Bob',
 'Bold',
 'Bosc',
 'Boston',
 'Bouillon',
 'Bouquet',
 'Bragg',
 'Brand',
 'Branzino',
 'Bread',
 'Brie',
 'Broth',
 'Brown',
 'Brussels',
 'Buffalo',
 'Buitoni',
 "Bull's-Eye",
 'Buns',
 'Burgundy',
 'Butter',
 'Buttercream',
 'C',
 'Cabernet',
 'Cabot',
 'Cajun',
 'California',
 'Calimyrna',
 'Campari',
 'Campbell',
 'Canilla',
 'Canola',
 

Get all the ingredients with the proper noun

In [227]:
ingres_nnp_tokens = [s for s in p_ingredients if any(xs in s for xs in nnp_tokens)]

ingres_nnp_tokens

['4.4 teaspoon Italian seasoning',
 '4 tablespoons Jamaican-style curry powder',
 '4 pinch Asafoetida',
 '4 Roma tomatoes, sliced',
 '4 cube chicken bouillon (Optional)',
 '4.4 teaspoons brown sugar',
 "sifted confectioners' sugar",
 '4 tablespoons Irish whiskey',
 '4 cup recaito (such as Goya)',
 '4 grated zest of one lime',
 '4 tablespoon Chinese rice vinegar',
 '4 tablespoons Thai-style chile sauce',
 '4 extra-large Spanish onion, chopped',
 '4 pound sweet Italian sausage',
 '4.4 pound Cheddar cheese, shredded',
 '4 (4 pound) loaf French bread, cut into 4 inch cubes',
 '4 small chile peppers, diced (Optional)',
 'Salt to taste',
 '4 low-carb high-fiber tortillas (such as La Tortilla Factory)',
 '4 teaspoon monosodium glutamate (MSG) (Optional)',
 '4 chipotle pepper from a can of Goya Chipotle Chile in Adobo Sauce',
 '4 whole Dungeness crabs',
 '4.4 cup fresh Thai basil leaves',
 '4 French baguette, cut into diagonal 4.4 inch slices',
 '4 cup shredded sharp Cheddar cheese (such as Ca

Filter for those without numbers at the beginning and make sure that each has at least 3 words

In [228]:
ingres_nnp_tokens = [s for s in ingres_nnp_tokens if not any(xs in s for xs in ["4", "4.4"]) and len(s.split()) > 2]

ingres_nnp_tokens

["sifted confectioners' sugar",
 'Salt to taste',
 'Goya Corn Oil, for frying',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'German stone ground mustard, to taste',
 'grated zest of one orange',
 'Goya Hot Sauce',
 'Salt and ground black pepper to taste',
 'Hog casing, rinsed well',
 'Lime wedges for serving',
 'finely grated Parmigiano-Reggiano cheese',
 'Kosher salt and fresh cracked pepper to taste',
 'Canola oil, for frying',
 'sliced French bread',
 'Kosher salt, to taste',
 'superfine sugar as needed',
 'sweet Thai basil',
 'Salt and pepper, to taste',
 'Chopped Italian parsley',
 'Salt and freshly ground pepper, to taste',
 'Hot cooked regular long-grain white rice',
 'Tomato ketchup and hot mustard or Kikkoman Sweet & Sour Sauce',
 'Red pepper flakes to taste',
 'Curry powder to taste',
 'Freshly ground black pepper to taste',
 'Water to spray tops of loaves',
 'Parsley or cilantro for garnish',
 'Salt and freshly ground pepper to taste',
 'Water to cover',
 'white sugar for de

Define noun phrase rule

In [229]:
NP_REGEX = r"""
  ADJP: {<RB>?<JJ|JJR|JJS|RBR|RBS>}    # Adjectives may have comparative and superlative, and come after adverbs like very
  NP: {<DT|WDT|WP$>?<CD>?<AdjP>*<NN|NNS|NNP|NNPS><POS>*<NN|NNS|NNP|NNPS|PP|CD>*<VBG>?}    # Determiner, number and adjectives come before nouns and nouns may have possessive -s and followed by another noun
  NP: {<NP><,>*<NP>*<,>*<NP>*<CC>?<NP>}    # Multiple nouns can come in comma and 'and'
"""

chunk("salt and pepper", NP_REGEX, "NP")
# pos_tag("salt and pepper")

['salt and pepper']

These are the results of the chunking. Some chunks are used more than once

In [230]:
noun_phrases = []

for name in ingres_nn_tokens:
    noun_phrases = noun_phrases + (chunk(name, NP_REGEX, "NP"))
    
noun_phrases

["confectioners ' sugar",
 'Goya Corn Oil',
 'sea salt and ground',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Goya Hot Sauce',
 'Salt and ground',
 'margarita or kosher salt',
 'salt and pepper',
 'oil cooking spray',
 'salt and ground',
 'spicy cilantro chutney',
 'plain bread crumbs',
 'Salt and pepper',
 'Chopped Italian parsley',
 'mustard or Kikkoman Sweet',
 'salt and pepper',
 'Parsley or cilantro',
 'cream or half-and-half',
 'coarse kosher salt',
 'salt and pepper',
 "confectioners ' sugar",
 'salt and ground pepper',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'salt and ground',
 'kosher salt and ground',
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'coarse sea salt',
 'salt and ground',
 'cheesecloth and kitchen string',
 'Goya Ground Black Pepper',
 'paper candy cups',
 'Salt and pepper',
 'buttons ,/, oyster ,/, shiitake',
 'portobello and crimini']

Fix typo

In [231]:
noun_phrases = ["buttons, oyster, shitake" if x=="buttons ,/, oyster ,/, shiitake" else x for x in noun_phrases]

noun_phrases

["confectioners ' sugar",
 'Goya Corn Oil',
 'sea salt and ground',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Goya Hot Sauce',
 'Salt and ground',
 'margarita or kosher salt',
 'salt and pepper',
 'oil cooking spray',
 'salt and ground',
 'spicy cilantro chutney',
 'plain bread crumbs',
 'Salt and pepper',
 'Chopped Italian parsley',
 'mustard or Kikkoman Sweet',
 'salt and pepper',
 'Parsley or cilantro',
 'cream or half-and-half',
 'coarse kosher salt',
 'salt and pepper',
 "confectioners ' sugar",
 'salt and ground pepper',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'salt and ground',
 'kosher salt and ground',
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'coarse sea salt',
 'salt and ground',
 'cheesecloth and kitchen string',
 'Goya Ground Black Pepper',
 'paper candy cups',
 'Salt and pepper',
 'buttons, oyster, shitake',
 'portobello and crimini']

In [232]:
for name in ingres_nnp_tokens:
    noun_phrases = noun_phrases + (chunk(name, NP_REGEX, "NP"))
    
noun_phrases

["confectioners ' sugar",
 'Goya Corn Oil',
 'sea salt and ground',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Goya Hot Sauce',
 'Salt and ground',
 'margarita or kosher salt',
 'salt and pepper',
 'oil cooking spray',
 'salt and ground',
 'spicy cilantro chutney',
 'plain bread crumbs',
 'Salt and pepper',
 'Chopped Italian parsley',
 'mustard or Kikkoman Sweet',
 'salt and pepper',
 'Parsley or cilantro',
 'cream or half-and-half',
 'coarse kosher salt',
 'salt and pepper',
 "confectioners ' sugar",
 'salt and ground pepper',
 'tomato and clam juice cocktail',
 'clam juice cocktail',
 'salt and ground',
 'kosher salt and ground',
 'salt and ground',
 'salt and ground',
 'oil cooking spray',
 'coarse sea salt',
 'salt and ground',
 'cheesecloth and kitchen string',
 'Goya Ground Black Pepper',
 'paper candy cups',
 'Salt and pepper',
 'buttons, oyster, shitake',
 'portobello and crimini',
 "confectioners ' sugar",
 'Goya Corn Oil',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Goya H

In [233]:
noun_phrases = sort_unique_list(noun_phrases)

noun_phrases

['Chopped Italian parsley',
 'Goya Corn Oil',
 'Goya Ground Black Pepper',
 'Goya Hot Sauce',
 'Parsley or cilantro',
 'Reynolds Wrap Heavy Duty Aluminum Foil',
 'Salt and ground',
 'Salt and pepper',
 'buttons, oyster, shitake',
 'cheesecloth and kitchen string',
 'clam juice cocktail',
 'coarse kosher salt',
 'coarse sea salt',
 "confectioners ' sugar",
 'cream or half-and-half',
 'kosher salt and ground',
 'margarita or kosher salt',
 'mustard or Kikkoman Sweet',
 'oil cooking spray',
 'paper candy cups',
 'plain bread crumbs',
 'portobello and crimini',
 'salt and ground',
 'salt and ground pepper',
 'salt and pepper',
 'sea salt and ground',
 'spicy cilantro chutney',
 'tomato and clam juice cocktail']

Fix typos

In [234]:
prepositional_phrases = sort_unique_list(prepositional_phrases + prepositional_phrases2)

prepositional_phrases = ["as Bull's-Eye Texas-Style Bold Barbeque Sauce" if x=="(PP\n  as\n  Bull's-Eye Texas-Style Bold Barbeque Sauce" else x for x in prepositional_phrases]
prepositional_phrases = ["as Grill Mates Montreal Chicken Seasoning" if x=="(PP\n  as\n  Grill Mates Montreal Chicken Seasoning" else x for x in prepositional_phrases]

prepositional_phrases

["as Bull's-Eye Texas-Style Bold Barbeque Sauce",
 'as Grill Mates Montreal Chicken Seasoning',
 'Of This World Spaghetti',
 'Under a Brick',
 'about 4 inches',
 'about 4 inches thick',
 'across the grain',
 'against the grain',
 'ancho chile powder',
 'arroz con Pollo',
 'as Aloha™ Shoyu',
 'as Archer Farms',
 'as Bacardi Coconut™',
 'as Badia Complete Seasoning',
 'as Badia Tropical',
 "as Baker 's Angel Flake",
 "as Baker 's German",
 'as Baker Fine Dessert Filling',
 'as Barilla Napoletana',
 'as Betty Crocker',
 'as Beyond Meat',
 'as Beyond Meat Beyond Beef',
 "as Bob 's Red Mill",
 'as Bob Evans',
 'as Cabernet Sauvignon',
 'as Cabot Seriously Sharp',
 "as Campbell 's",
 "as Campbell 's Healthy Request",
 "as Cavender 's",
 'as Chantaboon Rice Noodles',
 'as Chocolate Ibarra',
 'as Classico Cabernet Marinara',
 'as Coco Lopez',
 'as Cool Whip',
 'as Country Crock',
 'as De Cecco',
 'as Diamond Crystal',
 'as Diet Sprite',
 'as Duncan Hines',
 'as El Paso',
 'as El Pato',
 'as FA

Save all the phrases as a txt file

In [235]:
all_phrases = sort_unique_list(prepositional_phrases + noun_phrases)

with open('all_phrases.txt', 'w') as filehandle:
    for listitem in all_phrases:
        filehandle.write('%s\n' % listitem)

## Data merging and creating bigram

In [236]:
all_recipe_names2[:10]

['',
 'Irish potato cake',
 'jeera rice',
 'dessert nachos',
 'moroccan lentil soup with veggies',
 'beef cacciatore',
 'sweet potato hummus',
 'macaron',
 'chili casserole',
 'Italian cookies ii']

In [237]:
p_ingredients[:10]

['4.4 cup butter',
 '4 teaspoons garlic powder, divided',
 '4 green bell peppers, cut into chunks',
 '4 sprigs fresh thyme leaves',
 '4.4 cups shredded mozzarella cheese',
 '4 teaspoon mayonnaise',
 '4 teaspoons heavy cream',
 '4 onions, quartered',
 '4 potatoes, peeled and cut into 4 inch cubes',
 '4 cucumber, chopped']

Generate bigram from each entry, rather than directly as a whole chunk of text, since they were not joined originally in the source

In [238]:
ingres_and_names = all_recipe_names2 + p_ingredients

In [239]:
all_tokens = []

for entry in ingres_and_names:
  tokens = nltk.word_tokenize(entry)
  all_tokens = all_tokens + tokens

all_tokens = list(set(all_tokens))
all_tokens[:20]

['krupnikas',
 'garbanzos',
 'Farmer',
 'cultures',
 'macaron',
 'Brennan',
 'bite-sized',
 'Gnocchi',
 'linguica',
 'natural-style',
 'aminos',
 'panna',
 'Halloween',
 'really',
 'caneles',
 'tofu',
 'husked',
 'Lance',
 'Fiori',
 'Hellman']

In [240]:
len(all_tokens)

5513

In [241]:
from nltk.util import ngrams
from collections import Counter

def generate_bigram_from_entry(entry):
  tokens = nltk.word_tokenize(entry)
  bigrams = ngrams(tokens,2)
  return bigrams

Counter(generate_bigram_from_entry("4 tablespoons grated orange peel"))

Counter({('4', 'tablespoons'): 1,
         ('tablespoons', 'grated'): 1,
         ('grated', 'orange'): 1,
         ('orange', 'peel'): 1})

Combine individual bigram

In [242]:
from collections import Counter

all_bigrams = {}

for name in ingres_and_names:
  try:
    all_bigrams = Counter(all_bigrams)+Counter(generate_bigram_from_entry(name))
  except Exception as e:
    pass
    
all_bigrams

Counter({('Irish', 'potato'): 6,
         ('potato', 'cake'): 1,
         ('jeera', 'rice'): 1,
         ('dessert', 'nachos'): 1,
         ('moroccan', 'lentil'): 2,
         ('lentil', 'soup'): 15,
         ('soup', 'with'): 15,
         ('with', 'veggies'): 1,
         ('beef', 'cacciatore'): 1,
         ('sweet', 'potato'): 26,
         ('potato', 'hummus'): 1,
         ('chili', 'casserole'): 1,
         ('Italian', 'cookies'): 4,
         ('cookies', 'ii'): 4,
         ('onion', 'soup'): 20,
         ('soup', 'gratinee'): 1,
         ('moong', 'dal'): 2,
         ('Canadian', 'pork'): 1,
         ('pork', 'loin'): 54,
         ('loin', 'chops'): 12,
         ('flavorful', 'beef'): 1,
         ('beef', 'stir-fry'): 6,
         ('vegetarian', 'black'): 1,
         ('black', 'bean'): 34,
         ('bean', 'enchiladas'): 2,
         ('pastitsio', 'iv'): 1,
         ('Irish', 'heritage'): 1,
         ('heritage', 'cabbage'): 1,
         ('homemade', 'pizza'): 1,
         ('calabaza', 

Group bigrams with same first word together into a dictionary

In [243]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: all_bigrams[x] for x in all_bigrams.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('spicy')

{'token': 'spicy',
 'bigrams': [{'tomato': 1},
  {'yogurt': 1},
  {'orange': 4},
  {'thai': 4},
  {'sushi': 1},
  {'dipping': 1},
  {'Indian': 4},
  {'basil': 2},
  {'chicken': 4},
  {'Southwest': 1},
  {'pork': 5},
  {'shrimp': 3},
  {'Sinterklass': 1},
  {'pesto': 1},
  {'tuna': 2},
  {'cabbage': 1},
  {'noodles': 1},
  {'Vietnamese': 2},
  {'Mexican-American': 1},
  {'avocado': 1},
  {'African': 1},
  {'beef': 3},
  {'Chinese': 2},
  {'salmon': 1},
  {'Asian-Style': 1},
  {'eggplant': 3},
  {'himalayan': 1},
  {'bok': 1},
  {'Korean': 3},
  {'and': 1},
  {'yellowtail': 1},
  {'stir': 1},
  {'mango': 1},
  {'Peruvian': 1},
  {'rice': 2},
  {'green': 1},
  {'Asian': 1},
  {'marinated': 1},
  {'calabrian': 1},
  {'szechuan': 1},
  {'vegan': 1},
  {'penyet': 1},
  {'crispy': 1},
  {'stir-fry': 1},
  {'feta': 1},
  {'fried': 1},
  {'red': 2},
  {'Italian': 4},
  {'banana': 1},
  {'peach': 1},
  {'brown': 2},
  {'refried': 1},
  {'cilantro': 1},
  {'curry': 1},
  {'Spanish': 3},
  {'Portu

In [244]:
bigram_in_list = []
for value in all_tokens:
    bigram_in_list.append(find_dict_tuple_key(value))
    
bigram_in_list

[{'token': 'krupnikas', 'bigrams': []},
 {'token': 'garbanzos', 'bigrams': []},
 {'token': 'Farmer', 'bigrams': [{"'s": 1}]},
 {'token': 'cultures', 'bigrams': []},
 {'token': 'macaron', 'bigrams': []},
 {'token': 'Brennan', 'bigrams': [{"'s": 1}]},
 {'token': 'bite-sized',
  'bigrams': [{'pieces': 27}, {'chunks': 3}, {'potato': 1}, {'cubes': 2}]},
 {'token': 'Gnocchi', 'bigrams': []},
 {'token': 'linguica', 'bigrams': [{'sausage': 5}]},
 {'token': 'natural-style', 'bigrams': [{'peanut': 1}]},
 {'token': 'aminos', 'bigrams': []},
 {'token': 'panna', 'bigrams': [{'cotta': 4}]},
 {'token': 'Halloween', 'bigrams': [{'Quesadillas': 1}]},
 {'token': 'really', 'bigrams': [{'real': 1}]},
 {'token': 'caneles', 'bigrams': [{'de': 1}]},
 {'token': 'tofu',
  'bigrams': [{'and': 4},
   {'stir': 1},
   {'salad': 2},
   {'parmigiana': 1},
   {'stew': 1},
   {'with': 1},
   {'soup': 1},
   {'slices': 1},
   {'hiyayakko': 1},
   {'lasagna': 1},
   {'bites': 1},
   {'stir-fry': 1},
   {',': 33},
   {'o

## Add Phonetics

In [245]:
!pipenv install eng-to-ipa

Installing eng-to-ipa...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing...
[=   ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[====] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=   ] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[=   ] Installing eng-to-ipa...
[==  ] Installing eng-to-ipa...
[=== ] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[    ] Installing eng-to-ipa...
[   =] Installing eng-to-ipa...
[  ==] Installing eng-to-ipa...
[ ===] Installing eng-to-ipa...
[====] Installing eng-to-ipa...
Adding eng-to-ipa to Pipfile's [packages]...
Installation Succeeded 


In [246]:
import eng_to_ipa as eng_to_ipa

eng_to_ipa.convert("hey!")

'heɪ!'

In some cases, when ipa conversion fails, the original spelling is used instead. So, for file efficiency, only those that are converted successfully should be kept.

In [249]:
eng_to_ipa.convert("bite-sized")

'bite-sized*'

In [250]:
for bigram in bigram_in_list:
    try:
        ipa =  eng_to_ipa.convert(eng_to_ipa.convert(bigram["token"]))
        if bigram["token"] not in ipa:
            bigram["ipa"] = eng_to_ipa.convert(eng_to_ipa.convert(bigram["token"]))
    except Exception as e:
        pass

bigram_in_list

[{'token': 'krupnikas', 'bigrams': [], 'ipa': 'krupnikas**'},
 {'token': 'garbanzos', 'bigrams': [], 'ipa': 'garbanzos**'},
 {'token': 'Farmer', 'bigrams': [{"'s": 1}], 'ipa': 'ˈˈfɑrmər*'},
 {'token': 'cultures', 'bigrams': [], 'ipa': 'ˈˈkəlʧərz*'},
 {'token': 'macaron', 'bigrams': [], 'ipa': 'macaron**'},
 {'token': 'Brennan', 'bigrams': [{"'s": 1}], 'ipa': 'ˈˈbrɛnən*'},
 {'token': 'bite-sized',
  'bigrams': [{'pieces': 27}, {'chunks': 3}, {'potato': 1}, {'cubes': 2}],
  'ipa': 'bite-sized**'},
 {'token': 'Gnocchi', 'bigrams': [], 'ipa': 'ˈˈnoʊki*'},
 {'token': 'linguica', 'bigrams': [{'sausage': 5}], 'ipa': 'linguica**'},
 {'token': 'natural-style',
  'bigrams': [{'peanut': 1}],
  'ipa': 'natural-style**'},
 {'token': 'aminos', 'bigrams': [], 'ipa': 'aminos**'},
 {'token': 'panna', 'bigrams': [{'cotta': 4}], 'ipa': 'panna**'},
 {'token': 'Halloween', 'bigrams': [{'Quesadillas': 1}], 'ipa': 'ˌˌhæləˈwin*'},
 {'token': 'really', 'bigrams': [{'real': 1}], 'ipa': 'ˈˈrɪli*'},
 {'token': 'c

Save bigram list, which contain IPA symbol and bigrams of each token into a json file

In [251]:
import json

with open('bigrams.json', 'w') as f:
    json.dump(bigram_in_list, f)

# Create edit distance