# Steamboat Squad

Import and load data

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import json

with open("recipes_ingredients.json", "r") as json_file:
    recipes = json.load(json_file)
    
len(recipes)

4702

Overview of data structure. This is a list of dictionary, where each dictionary is a recipe with its name, ingredients and url

In [3]:
recipes[0]

{'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/',
 'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

Deleting url key

In [4]:
for recipe in recipes:
    del recipe['url']
recipes[0]

{'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

# Preprocessing Recipe Names
- Lower-casing (normalise words by using POS tagging)
- Change numbers to fix number (place holder)

NLTK has a help function that explains its POS tags.

In [5]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [6]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Using %%capture, save the NLTK help text as a string

In [7]:
%%capture cap --no-stderr

nltk.help.upenn_tagset()

In [8]:
cap.stdout

'$: dollar\n    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n\'\': closing quotation mark\n    \' \'\'\n(: opening parenthesis\n    ( [ {\n): closing parenthesis\n    ) ] }\n,: comma\n    ,\n--: dash\n    --\n.: sentence terminator\n    . ! ?\n:: colon or ellipsis\n    : ; ...\nCC: conjunction, coordinating\n    & \'n and both but either et for less minus neither nor or plus so\n    therefore times v. versus vs. whether yet\nCD: numeral, cardinal\n    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n    seven 1987 twenty \'79 zero two 78-degrees eighty-four IX \'60s .025\n    fifteen 271,124 dozen quintillion DM2,000 ...\nDT: determiner\n    all an another any both del each either every half la many much nary\n    neither no some such that the them these this those\nEX: existential there\n    there\nFW: foreign word\n    gemeinschaft hund ich jeux habeas Haementeria Herr K\'ang-si vous\n    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n    terram 

Using RE, get all the tag names

In [9]:
import re

ALL_POS = re.findall(".*: +", cap.stdout)

for i, pos in enumerate(ALL_POS):
  ALL_POS[i] = pos.replace(': ', '')


ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '    ',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

In [10]:
ALL_POS.remove('    ')
ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

Create a function to pos tag a text

In [11]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def tag_pos(corpus):
    text=word_tokenize(corpus)
    return nltk.pos_tag(text)

tag_pos("This is a test sentence.")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('.', '.')]

Create a function that POS tag and returns words with specific POS

In [12]:
def get_words_with_pos(text, pos):
  tagged = tag_pos(text)
  return [t for t in tagged if t[1].startswith(pos)]

get_words_with_pos("This is a test sentence.", "NN")

[('test', 'NN'), ('sentence', 'NN')]

POS tag all recipe names

In [13]:
tagged_recipe_names = []

for i, recipe in enumerate(recipes):
  try:
    tagged_recipe_names.append(tag_pos(recipes[i]['name']))
  except Exception as e:
    pass

len(tagged_recipe_names)

4701

## Data cleaning for names based on POS tagging

Looking at the first 10 tagged recipe names, there is a need for pre-processing, as NLTK's tagging is confused by the letter casing.

In [14]:
tagged_recipe_names[:10]

[[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')],
 [('Pan', 'NNP'),
  ('de', 'FW'),
  ('Muertos', 'NNP'),
  ('(', '('),
  ('Mexican', 'NNP'),
  ('Bread', 'NNP'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Dead', 'NNP'),
  (')', ')')],
 [('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')],
 [('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')],
 [('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')],
 [('Spicy', 'JJ'),
  ('Korean', 'NNP'),
  ('Fried', 'NNP'),
  ('Chicken', 'NNP'),
  ('with', 'IN'),
  ('Gochujang', 'NNP'),
  ('Sauce', 'NNP')],
 [('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')],
 [('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')],
 [('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')],
 [('Tres', 'NNS'),
  ('Leches', 'NNP'),
  ('(', '('),
  ('Milk', 'NNP'),
  ('Cake', 'NNP'),
  (')', ')')]]

Create a function that returns all tagged words with the same tag. NLTK's POS tagging assumes that capitalized noun means proper noun (name).

In [15]:
def list_words_with_tag(tuple_list, pos):
  results = []
  for name in tuple_list:
    for tag in name:
      if tag[1] == pos:
        results.append(tag[0])
  return results

list_words_with_tag(tagged_recipe_names, "NNP")

['Asparagus',
 'Pan',
 'Muertos',
 'Mexican',
 'Bread',
 'Dead',
 'Creamy',
 'Au',
 'Gratin',
 'Potatoes',
 'Zuppa',
 'Toscana',
 'Teriyaki',
 'Sauce',
 'Korean',
 'Fried',
 'Chicken',
 'Gochujang',
 'Sauce',
 'Spaghetti',
 'Aglio',
 'Olio',
 'Garam',
 'Masala',
 'Easy',
 'Chorizo',
 'Street',
 'Tacos',
 'Leches',
 'Milk',
 'Cake',
 'Cabbage',
 'Rolls',
 'Gravy',
 'Shrimp',
 'Scampi',
 'Pasta',
 'Lemon',
 'Chicken',
 'Potato',
 'Bake',
 'Mexican',
 'Casserole',
 'Caldo',
 'Res',
 'Mexican',
 'Beef',
 'Soup',
 'Nogada',
 'Mexican',
 'Stuffed',
 'Poblano',
 'Peppers',
 'Walnut',
 'Sauce',
 'Apple',
 'Cake',
 'Flan',
 'Pork',
 'Chops',
 'Sauerkraut',
 'Spicy',
 'Thai',
 'Basil',
 'Chicken',
 'Pad',
 'Krapow',
 'Gai',
 'Spaghetti',
 'Cacio',
 'Pepe',
 'Chef',
 'John',
 'Chicken',
 'Kiev',
 'Chicken',
 'Onions',
 'Fajita',
 'Perfect',
 'Sushi',
 'Rice',
 'Baked',
 'Chicken',
 'German',
 'Potato',
 'Salad',
 'Miso',
 'Soup',
 'Mexican',
 'Rice',
 'II',
 'Haluski',
 'Labneh',
 'Lebanese',
 'Y

In [16]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

In [17]:
def get_tag_number(tag_list):
  tag_numbers = []
  for tag in tag_list:
    for key, value in tag.items(): 
      new_dict = {key: len(value)}
    tag_numbers.append(new_dict)
  return tag_numbers

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 529},
 {')': 529},
 {',': 63},
 {'--': 0},
 {'.': 10},
 {':': 98},
 {'CC': 555},
 {'CD': 74},
 {'DT': 104},
 {'EX': 0},
 {'FW': 47},
 {'IN': 482},
 {'JJ': 1822},
 {'JJR': 4},
 {'JJS': 27},
 {'LS': 0},
 {'MD': 2},
 {'NN': 571},
 {'NNP': 13139},
 {'NNPS': 46},
 {'NNS': 307},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 72},
 {'PRP$': 20},
 {'RB': 33},
 {'RBR': 0},
 {'RBS': 1},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 24},
 {'VBD': 39},
 {'VBG': 50},
 {'VBN': 133},
 {'VBP': 10},
 {'VBZ': 22},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

Some names have numbers (CD). Some are obviously not numbers, like 'Figgy'

In [18]:
def get_values_from_dict_list(dict_list, key):
  values = []
  for d in dict_list:
    if key in d:
      values.append(d[key])
  return values

cd_tokens = get_values_from_dict_list(all_name_tags, 'CD')[0]
cd_tokens

['5',
 '16',
 '2',
 '13',
 '300',
 'Figgy',
 '3',
 '9',
 'Two',
 '9',
 '22',
 '10',
 '15',
 'One',
 '18',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 '21',
 'Four',
 '9',
 '65',
 '17',
 '14',
 '10',
 "'n",
 '15',
 '8',
 'Minestrone',
 'Four',
 '35',
 'Fly',
 '15',
 '23',
 '8',
 '15',
 '21',
 "That's-a",
 'Tex-Mex',
 '14',
 '17',
 'Five',
 '10',
 '18',
 '5',
 "'Otai",
 '17',
 '3',
 '17',
 '75',
 '17',
 '20',
 'Take-Out',
 '16',
 '12',
 'Three',
 "'Three",
 '15',
 '20',
 '16',
 '12',
 '15',
 '22',
 '12',
 'Three',
 '21',
 '21',
 '25',
 '7',
 '10',
 '19',
 '20']

Create a function that searches for recipe name with specific string

In [19]:
def find_value_with_char(dic_list, key, char):
  matches = []
  for recipe in dic_list:
    try:
      if char in recipe[key]:
        matches.append(recipe[key])
    except Exception as e:
      pass
  return matches

find_value_with_char(recipes, 'name', 'Figgy')

['Figgy Pudding']

'Three cup chicken' is indeed a name. On the other hand, numerics, such as 9 and 13 are not part of the actual names of dishes. So, numerics, instead of NLTK's CD, should be treated. This treatment should be done using regex.

In [20]:
for cd in cd_tokens:
  print(find_value_with_char(recipes, 'name', cd))

['Our 5 Best Avgolemono Soup Recipes', '5-Ingredient Mexican Casserole', '15 Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Chicken 65', 'Pan-Roasted 5-Spice Pork Loin', 'The 15 Most Iconic French Desserts', '35 Quick and Easy Chinese Dinners You Can Make at Home', '15 Essential North Indian Recipes', '15 Essential North Indian Recipes', '18 Easy Mexican Dishes With 5 Ingredients or Less', 'French 75 Cocktail', '15 Top-Rated Traditional German Christmas Cookies', '15 Traditional Italian Christmas Dinner Recipes', "25 Italian Cookies You'll Love"]
['16 German Recipes That Are Comfort Food Favorites', '16 Mexican-Inspired Casseroles for Family-Pleasing Dinners', '16 Essential Puerto Rican Recipes']
['2 Minute Cheese Quesadillas', "22 Recipes Using a Whole Baguette (That Aren't Sandwiches)", 'Our 21 Best Authentic Mexican Recipes', '23 Delicious Ways the World Cooks Pork Shoulder', '21 Easy Dinners That Start with Packaged Gnocchi', 'Our 20 B

Create a function that searches a regex pattern from a text

In [21]:
def searchWordsPatt(text, patt):
    array = re.findall(patt, text)
    return array

NUMPATTERN = r'[0-9]+'
searchWordsPatt("I want 1 cup of tea", NUMPATTERN)

['1']

Create a function that substitutes regex patterns with a given value

In [22]:
def searchReplacePatt(text, patt, new_val):
  return re.sub(patt, new_val, text)

NUMSPACEPATTERN = r'(\d+\s)'
searchReplacePatt("I want 1 cup of tea", NUMSPACEPATTERN, "")

'I want cup of tea'

searchReplacePatt, except it iterates recipe list

In [23]:
def searchReplacePattList(dict_list, patt, new_val, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
        except Exception as e:
            pass

searchReplacePattList, but adds a substring at given index

In [24]:
def searchReplaceAddPattList(dict_list, patt, new_val, substring, index=0, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
            added_string = list(dict_list[i][key]).insert(index, substring)
            dict_list[i][key]=''.join(added_string)
        except Exception as e:
            pass

Remove numerics from name

In [25]:
import re

p_recipes = recipes

searchReplacePattList(p_recipes, NUMSPACEPATTERN, "")

def retag(text_list, key):
  new_list = []
  for i, recipe in enumerate(text_list):
    try:
      new_list.append(tag_pos(recipes[i][key]))
    except Exception as e:
      pass
  return new_list

tagged_recipe_names = retag(p_recipes, "name")

Get the new remaining CD

In [26]:
new_cd_tokens = list_words_with_tag(tagged_recipe_names, "CD")
new_cd_tokens

['Figgy',
 'Two',
 'One',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 'Four',
 '65',
 "'n",
 'Minestrone',
 'Four',
 'Fly',
 "That's-a",
 'Tex-Mex',
 'Five',
 "'Otai",
 'Take-Out',
 'Three',
 "'Three",
 'Three']

The remaining numbers (CD) are part of actual recipe names

In [27]:
for cd in new_cd_tokens:
  print(find_value_with_char(p_recipes, 'name', cd))

['Figgy Pudding']
['Two-Ingredient Naan', 'Pollo alla Birra for Two']
['A Number One Egg Bread', 'One-Egg Egg Drop Soup', 'One Pot Thai-Style Rice Noodles', 'One-Pot Vegan Potato-Lentil Curry', 'One-Bite Thai "Flavor Bomb" Salad Wraps (Miang Kham)', 'Easy One-Skillet Ground Beef Burrito', 'One-Pot Greek Lemon Chicken and Rice']
['Tender Italian Baked Chicken', 'Tuscan Pork Tenderloin', 'Asian Pork Tenderloin', 'Italian Pork Tenderloin', 'Sweet and Sour Pork Tenderloin', 'Chipotle Crusted Pork Tenderloin', 'Ten Minute Szechuan Chicken', 'Thai Quivering Tenderloins', 'Spicy Pork Tenderloin', 'Chinese Pork Tenderloin', 'Grecian Pork Tenderloin', 'Havana Slow Cooker Pork Tenderloin', 'Curry Pork Tenderloin', 'Tender Juicy Skirt Steak  (Churrasco)', 'Spicy and Tender Corned Beef', 'Pan Roasted Pork Tenderloin with a Blue Cheese and Olive Stuffing']
['Flounder Mediterranean']
['Pastel de Tres Leches (Three Milk Cake)', 'Three-Meat Italian Meatballs', 'Three Cheese Manicotti II', 'Taiwanese-S

In [28]:
new_all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  new_all_name_tags.append(new_dic)

Can and 'll are the modal verbs found

In [29]:
md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
md_tokens

['Can', "'ll"]

'can' is caused by words such as Canadian, which is processed in next section. But, 'you'll love' is not part of recipe name and more of an expression

In [30]:
for md in md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Removing "You'll" and retagging new list

In [31]:
searchReplacePattList(p_recipes, r"(You'll Love)", "")
tagged_recipe_names = retag(p_recipes, "name")

'll' removed

In [32]:
new_md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
new_md_tokens

['Can']

In [33]:
for md in new_md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

In [34]:
bracket_tokens = list(set(list_words_with_tag(tagged_recipe_names, "(")))
bracket_tokens

['(']

Examining brackers in names. Most of the words in brackets are translations

In [35]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

"(no red sauce here...golden)" needs to be removed

In [36]:
# Redundant descriptions
searchReplacePattList(p_recipes,  r"(no red sauce here...golden)", "")
searchReplacePattList(p_recipes, r"(From a Swede!)", "")
searchReplacePattList(p_recipes, r"(from a Chinese person)", "")
searchReplacePattList(p_recipes, r"(Now Vegetarian!)", "")
searchReplacePattList(p_recipes, r"a.k.a. ", "")
searchReplacePattList(p_recipes, r"(That Aren't Sandwiches)", "")

# Remove copyright symbol
searchReplacePattList(p_recipes, r"&reg;", "")
# Asian Sesame Seared or Grilled Tuna (Gluten Free) => Gluten Free Asian Sesame Seared or Grilled Tuna
searchReplaceAddPattList(p_recipes, r"(Gluten Free)", "", "glutten-free")
tagged_recipe_names = retag(p_recipes, "name")

In [37]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

In [38]:
fw_tokens = list(set(list_words_with_tag(tagged_recipe_names, "FW")))
fw_tokens

['Rassolnik', 'de', 'et']

In [39]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Rassolnik with Rice (Russian Pickle Soup)']
['Pan de Muertos (Mexican Bread of the Dead)', 'Caldo de Res (Mexican Beef Soup)', 'Tender Italian Baked Chicken', 'Herbs de Provence', "Chef John's Beef Rouladen", 'Fideo', 'Tomatillo Salsa Verde', 'Ground Beef with Homemade Taco Seasoning Mix', 'German Beef Rouladen', 'Buche de Noel', 'Tuscan Pork Tenderloin', 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)', 'Homemade Mozzarella Cheese', 'Kotlet Schabowy (Polish Breaded Pork Chop)', 'Semmelknoedel (Bread Dumplings)', 'Homemade Manti (Traditional Turkish Dumplings)', 'Kalamata Olive Tapenade', 'Barbacoa-Style Shredded Beef', "Ingrid's Rouladen", 'Original Homemade Italian Beef', 'Slow Cooker Chile Verde', 'Chicken and Sliders', 'Caldo Verde (Portuguese Sausage Kale Soup)', 'German Hamburgers (Frikadellen)', 'Slow Cooker Mexican Recipes Under Calories', 'Asian Pork Tenderloin', 'Harissa Powder', 'Colorado Green Chili (Chile Verde)', 'Schupfnudeln (German Fried Potato Dumplings)', 'F

In [40]:
fw_names

['Spaghetti Squash Pad Thai',
 'Chicken, Feta Cheese, and Sun-Dried Tomato Wraps',
 'Knodel',
 'Cuban Inspired Millet',
 'Pastelon de Platano Maduro (Dominican-Style Yellow Plantain Pie)',
 "Homemade Za'atar",
 'Bruschetta al Pomodoro',
 'Cuban Shredded Pork',
 'Italian Chicken Marinade',
 "Kris' Amazing Shredded Mexican Beef",
 'Authentic Enchiladas Verdes',
 'Sweet Sesame Slaw',
 'Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Classic Carbonara with Pancetta',
 'Instant Pot Chicken Posole Verde',
 'Real Homemade Bologna',
 'Curry Pork Tenderloin',
 'Portuguese Sweet Rice',
 'Homemade Chicken Enchiladas',
 "Bon Appetit's Meatballs",
 'Sicilian Spaghetti',
 'Easy Shrimp Vegetable Stir Fry',
 'Vegetarian Pad Thai',
 'Tajine de Poulet aux Carottes et Patates Douces (Chicken and Sweet Potato Tagine)',
 'Amaretti',
 'Italian Braised Pork Shoulder',
 'Dutch Croquetten',
 'Spaghetti Alla Carbonara Tradizionali',
 "Ninabell's Appetizer Meatballs",
 'Vegetarian Black Bean Enchiladas',
 

Names that both have foreign words and bracket

In [41]:
bracket_and_fw = [name for name in bracketed_names if name in fw_names]
bracket_and_fw

['Pastelon de Platano Maduro (Dominican-Style Yellow Plantain Pie)',
 'Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Tajine de Poulet aux Carottes et Patates Douces (Chicken and Sweet Potato Tagine)',
 'Key Sir Alicha (Ethiopian Beets and Potatoes)',
 'Brazilian Style Flan (Pudim de Leite Condensado)',
 'Poblano and Cheese Tamales (Tamales de Rajas con Queso)',
 'Indian Bread Pudding (Double Ka Meeta)',
 'Beef Barbacoa (Barbacoa de Res)',
 'Polvorones de Canele (Cinnamon Cookies)',
 'Carlota de Limon (Mexican Lemon Icebox Cake)',
 "Bucatini Cacio e Pepe (Roman Sheep Herder's Pasta)",
 'French Cookies (Belgi Galettes)',
 'Refreshing Oatmeal Drink (Agua de Avena)',
 'Banana-Dulce de Leche Pie (Banana-Caramel Pie)',
 'Yogurt-Marinated Salmon Fillets (Dahi Machhali Masaledar)',
 'Homemade Irish (Whiskey) Cream',
 'Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Upside Down (Maqluba)',
 'Bagna Calda (Italian Garlic-Anchovy-Sardine Appetizer)',
 'Homemade Manti (Tradi

Split the names into two names, one outside and one inside

In [42]:
BRACKET_REGEX = " \(.*\)"
def break_fw_bracket(name):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name2 = re.sub(BRACKET_REGEX, "", name)
    return name1, name2

print(break_fw_bracket("Hearty Caldo de Res (Mexican Beef Soup)"))
print(break_fw_bracket("Ukha (Russian Fish Soup)"))

('Mexican Beef Soup', 'Hearty Caldo de Res')
('Russian Fish Soup', 'Ukha')


Apply the split function. Delete old recipe with bracket and foreign words. In both of the new recipes, duplicate old ingredients.

In [43]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in bracket_and_fw:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

There are still remaining names with bracket, mostly due to the foreign words not being recognized.

In [44]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Tres Leches (Milk Cake)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', 'Kalbi (Korean BBQ Short Ribs)', 'Macaron (French Macaroon)', 'Atsara (Papaya Relish)', 'Authentic Chinese Egg Rolls ()', 'Greek Le

In [45]:
bracketed_names

['Indian Butter Chicken (Chicken Makhani)',
 'Slow Cooker Lengua (Beef Tongue)',
 'Bee Sting Cake (Bienenstich) II',
 "Bigos (Polish Hunter's Stew)",
 'Oyakodon (Japanese Chicken and Egg Rice Bowl)',
 'Berbere (Ethiopian Spice)',
 'Steamed Egg (Chawan Mushi)',
 'Fleischkuechle (Flesh-Keek-Luh)',
 'Swedish Chocolate Balls (Chokladbollar)',
 'Papa a la Huancaina (Huancayo-Style Potatoes)',
 'Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)',
 'Ethiopian Cabbage and Potato Dish (Atkilt)',
 'Lentils and Rice with Fried Onions (Mujadarrah)',
 'Kelewele (Spicy Fried Plantains)',
 'Bionicos (Mexican Fruit Bowls)',
 'Swedish Meatballs ()',
 'Rajma (Kidney Bean Curry)',
 'Knedliky - Czech Dumpling with Sauerkraut (Zeli)',
 'Quick Chinese-Style Vermicelli (Rice Noodles)',
 'Chicken Adobo with Coconut Milk (Adobo sa Gata)',
 'Laotian Grilled Chicken (Ping Gai)',
 'Korean Spicy Chicken and Potato (Tak Toritang)',
 'Tuscan Onion Soup (Carabaccia)',
 'Spinach and Tomato Dal (Indian Lentil So

Most of the brackets are at the end of each name. For those that are in the middle, they are translations of one of the words in the name.

In [46]:
b_name_end = []
b_name_mid = []
for b_name in bracketed_names:
    if b_name.endswith(')'):
        b_name_end.append(b_name)
    else:
        b_name_mid.append(b_name)
        
b_name_end

['Indian Butter Chicken (Chicken Makhani)',
 'Slow Cooker Lengua (Beef Tongue)',
 "Bigos (Polish Hunter's Stew)",
 'Oyakodon (Japanese Chicken and Egg Rice Bowl)',
 'Berbere (Ethiopian Spice)',
 'Steamed Egg (Chawan Mushi)',
 'Fleischkuechle (Flesh-Keek-Luh)',
 'Swedish Chocolate Balls (Chokladbollar)',
 'Papa a la Huancaina (Huancayo-Style Potatoes)',
 'Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)',
 'Ethiopian Cabbage and Potato Dish (Atkilt)',
 'Lentils and Rice with Fried Onions (Mujadarrah)',
 'Kelewele (Spicy Fried Plantains)',
 'Bionicos (Mexican Fruit Bowls)',
 'Swedish Meatballs ()',
 'Rajma (Kidney Bean Curry)',
 'Knedliky - Czech Dumpling with Sauerkraut (Zeli)',
 'Quick Chinese-Style Vermicelli (Rice Noodles)',
 'Chicken Adobo with Coconut Milk (Adobo sa Gata)',
 'Laotian Grilled Chicken (Ping Gai)',
 'Korean Spicy Chicken and Potato (Tak Toritang)',
 'Tuscan Onion Soup (Carabaccia)',
 'Spinach and Tomato Dal (Indian Lentil Soup)',
 'Mandazi (African Donuts)',
 

In [47]:
b_name_mid

['Bee Sting Cake (Bienenstich) II',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Lengua (Beef Tongue) Stew',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Albondigas (Meatballs) en Chipotle',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Spicy Indian (Gujarati) Green Beans',
 'Lamb (Gosht) Biryani',
 'Coconut (Haupia) and Chocolate Pie',
 'Jeera (Cumin) Rice',
 'Ulu (Breadfruit) Pancakes',
 'Korean Bean Curd (Miso) Soup',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Besan (Gram Flour) Halwa',
 "World's Best () Lasagna",
 'Zito (Zhito/Koljivo) - Serbian Wheat Pudding',
 'Seaweed (Nori) Soup']

On the other hand, without parenthesis anymore, names with foregin words tagged are now clean

In [48]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Rassolnik with Rice']
['Tender Italian Baked Chicken', 'Herbs de Provence', "Chef John's Beef Rouladen", 'Fideo', 'Tomatillo Salsa Verde', 'Ground Beef with Homemade Taco Seasoning Mix', 'German Beef Rouladen', 'Buche de Noel', 'Tuscan Pork Tenderloin', 'Homemade Mozzarella Cheese', 'Kalamata Olive Tapenade', 'Barbacoa-Style Shredded Beef', "Ingrid's Rouladen", 'Original Homemade Italian Beef', 'Slow Cooker Chile Verde', 'Chicken and Sliders', 'Slow Cooker Mexican Recipes Under Calories', 'Asian Pork Tenderloin', 'Harissa Powder', 'Italian Chicken Marinade', 'Cinder Toffee', 'Enchiladas Verdes', 'Authentic Enchiladas Verdes', 'Korean BBQ Chicken Marinade', 'Homemade Lasagna Sheets', 'Elk Steak Marinade', 'Modenese Pork Chops', 'Italian Pork Tenderloin', 'German Rouladen', 'Brazilian Lemonade', 'Shredded Beef Enchiladas', 'Brigadeiro', 'Homemade Hoisin Sauce', 'Caneles de Bordeaux', 'Homemade Portuguese Chicken', 'Homemade Spaghetti Sauce', 'Pasta de Sardine', 'Sweet and Sour Pork Ten

In [49]:
fw_names

['Spaghetti Squash Pad Thai',
 'Chicken, Feta Cheese, and Sun-Dried Tomato Wraps',
 'Knodel',
 'Cuban Inspired Millet',
 'Vegan Spaghetti and Meatballs',
 'Tamales Oaxaque&ntilde;os',
 "Homemade Za'atar",
 'Bruschetta al Pomodoro',
 'Cuban Shredded Pork',
 'Italian Chicken Marinade',
 'Pao de Queijo',
 'Schupfnudeln',
 "Kris' Amazing Shredded Mexican Beef",
 'Authentic Enchiladas Verdes',
 'Sweet Sesame Slaw',
 'Classic Carbonara with Pancetta',
 'Instant Pot Chicken Posole Verde',
 'Real Homemade Bologna',
 'Curry Pork Tenderloin',
 'Pastel de Leches',
 'Portuguese Sweet Rice',
 'Homemade Chicken Enchiladas',
 "Bon Appetit's Meatballs",
 'Sicilian Spaghetti',
 'Easy Shrimp Vegetable Stir Fry',
 'Vegetarian Pad Thai',
 'Amaretti',
 'Italian Braised Pork Shoulder',
 'Dutch Croquetten',
 'Spaghetti Alla Carbonara Tradizionali',
 "Roman Sheep Herder's Pasta",
 'Pasta Con Sarde',
 "Ninabell's Appetizer Meatballs",
 'Vegetarian Black Bean Enchiladas',
 'Bermuda Fish Chowder',
 'Polish Bread

For the remaining names with bracket at the end, split into two new recipe names

In [50]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Tres Leches (Milk Cake)
Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)
Spicy Thai Basil Chicken (Pad Krapow Gai)
Labneh (Lebanese Yogurt)
Indian Chicken Curry (Murgh Kari)
Keema Aloo (Ground Beef and Potatoes)
Turkish Eggs (Cilbir)
South African Melktert (Milk Tart)
Ukrainian Apple Cake (Yabluchnyk)
Spanish Garlic Shrimp (Gambas al Ajillo)
German Potato Dumplings (Kartoffelkloesse)
Apfelkuchen (Apple Cake)
Eggplant Caponata (Sicilian Version)
Chana Masala (Savory Indian Chick Peas)
Ricotta Pie (Old Italian Recipe)
Easy Blini (Russian Pancake)
Easy Bulgogi (Korean BBQ Beef)
Carne en su Jugo (Meat in its Juices)
Ghormeh Sabzi (Persian Herb Stew)
Puerto Rican Tostones (Fried Plantains)
Kalbi (Korean BBQ Short Ribs)
Macaron (French Macaroon)
Atsara (Papaya Relish)
Authentic Chinese Egg Rolls ()
Greek Lentil Soup (Fakes)
Lumpia (Shanghai version)
Northern Ontario Partridge (Ruffed Grouse)
Vampiros Mexicanos (Mexican Vampires)
Jamaican Saltfish Fritters (Stamp and Go)
Slo

For some reasons, need to run the cell twice

In [51]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Polish Noodles (Cottage Cheese and Noodles)
Oyakodon (Japanese Chicken and Egg Rice Bowl)
Papas Rellenas (Fried Stuffed Potatoes)
Blaukraut (German Red Cabbage)
Irish Boiled Dinner (Corned Beef)
True Dominican Sancocho (Latin 7-Meat Stew)
Blini (Russian Pancakes)
Oeufs Cocotte (Baked Eggs)
Ropa Vieja (Cuban Beef)
Lace Cookies (Florentine Cookies)
Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)
Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)
Roti Canai/Paratha (Indian Pancake)
Melanzana alla Parmigiana (Perfect Eggplant Parmigiana)
Pierogi (Traditional Polish Dumplings)
Nipples of Venus (Capezzoli di Venere)
Samosadilla (Samosa Quesadilla)
Bulgogi (Korean Barbecued Beef)
Sabaayad (Somali Flatbread)
Filipino Baked Milkfish (Baked Bangus)
Ash-e Reshteh (Persian Legume Soup)
Lentil and Cactus Soup (Mom's Recipe)
Ethiopian Cabbage and Potato Dish (Atkilt)
Finnish Kropser (Baked Pancakes)
Oma's Griessnockerlsuppe (Beef and Semolina Dumpling Soup)
Kewa Datshi (Bh

Only the names with bracket in the middle of their names remain

In [52]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Classic Cuban Midnight (Medianoche) Sandwich', 'Spicy Indian (Gujarati) Green Beans', "World's Best () Lasagna", 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce', 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce', 'Bee Sting Cake (Bienenstich) II', 'Coconut (Haupia) and Chocolate Pie', 'Lamb (Gosht) Biryani', 'Jeera (Cumin) Rice', 'Pollo (Chicken) Fricassee from Puerto Rico', 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish', 'Lazy Golumpki (Stuffed Cabbage) Soup', 'Ulu (Breadfruit) Pancakes', 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican', 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce', 'Seaweed (Nori) Soup', 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms', 'Hawaiian Bruddah Potato Mac (Macaroni) Salad', 'Korean Bean Curd (Miso) Soup', 'Lengua (Beef Tongue) Stew', 'Albondigas (Meatballs) en Chipotle', 'Zito (Zhito/Koljivo) - Serbian Wheat Pudding', 'Besan (Gram Flour) Halwa']


Mac and rapini is only synonymous the the one word before them. Otherwise, the bracketed words are synonymous to all the words before them combined.

In [53]:
bracketed_names

['Zito (Zhito/Koljivo) - Serbian Wheat Pudding',
 'Bee Sting Cake (Bienenstich) II',
 'Coconut (Haupia) and Chocolate Pie',
 'Jeera (Cumin) Rice',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Besan (Gram Flour) Halwa',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Lengua (Beef Tongue) Stew',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Ulu (Breadfruit) Pancakes',
 'Albondigas (Meatballs) en Chipotle',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Korean Bean Curd (Miso) Soup',
 'Spicy Indian (Gujarati) Green Beans',
 'Lamb (Gosht) Biryani',
 "World's Best () Lasagna",
 'Seaweed (Nori) Soup',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms']

The names can still be duplicated into 2, except that the bracketed word replaces the words before in the second new name, treating them as synonyms.

In [54]:
def convert_bracket_synonym(name, num=0):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name1_suffix = name.split(')')[1]
    if num==0:
        name1 = name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, "", name)
    else:
        name1_prefix = name.split('(')[0]
        name1_prefix = name1_prefix[:-num]
        name1 = name1_prefix + name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, " ", name)
    return name1, name2

print(convert_bracket_synonym("Lamb (Gosht) Biryani"))
print(convert_bracket_synonym("Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce", 1))
print(convert_bracket_synonym("Hawaiian Bruddah Potato Mac (Macaroni) Salad", 1))

('Gosht Biryani', 'Lamb Biryani')
('Fusilli with RapiniBroccoli Rabe, Garlic, and Tomato Wine Sauce', 'Fusilli with Rapini , Garlic, and Tomato Wine Sauce')
('Hawaiian Bruddah Potato MacMacaroni Salad', 'Hawaiian Bruddah Potato Mac  Salad')


In [55]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_mid:
            newname1, newname2 = convert_bracket_synonym(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Classic Cuban Midnight (Medianoche) Sandwich
Spicy Indian (Gujarati) Green Beans
World's Best () Lasagna
Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce
Kimchi Jun (Kimchi Pancake) and Dipping Sauce
Bee Sting Cake (Bienenstich) II
Coconut (Haupia) and Chocolate Pie
Lamb (Gosht) Biryani
Jeera (Cumin) Rice
Pollo (Chicken) Fricassee from Puerto Rico
Fish Sinigang (Tilapia) - Filipino Sour Broth Dish
Lazy Golumpki (Stuffed Cabbage) Soup
Ulu (Breadfruit) Pancakes
Fried Chicken Chunks (Chicharrones De Pollo) Dominican
Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce
Seaweed (Nori) Soup
Vareniki (Russian Pierogi) with Potatoes and Mushrooms
Hawaiian Bruddah Potato Mac (Macaroni) Salad
Korean Bean Curd (Miso) Soup
Lengua (Beef Tongue) Stew
Albondigas (Meatballs) en Chipotle
Zito (Zhito/Koljivo) - Serbian Wheat Pudding
Besan (Gram Flour) Halwa


Successfully removed all brackets from recipe names

In [56]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))
bracketed_names

[]


[]

Dashes are mostly adjectives, but things like semi colon need to be removed. As for colons, its mostly translation. Semicolons are caused by K&auml;, which are dishes with special characters or German words.

In [57]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ';', ':']

In [58]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

In [59]:
def remove_entry_with(dict_list, target, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            if target in dict_list[i]["name"]:
                dict_list.remove(dict_list[i])
        except Exception as e:
            pass

In [60]:
for semicolon in ["Quorn&trade;", "Sp&auml;tzle", "Tamales Oaxaque&ntilde;os", "K&auml;sesahnetorte", "Salte&ntilde;as"]:
    remove_entry_with(p_recipes, semicolon)
tagged_recipe_names = retag(p_recipes, "name")

Semi colons cleaned

In [61]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [62]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

For these 2 names, colons are used for describing

In [63]:
# Spaghetti alla Carbonara: the Traditional Italian Recipe => traditional Italian Spaghetti alla Carbonara
searchReplaceAddPattList(p_recipes, r": the Traditional Italian Recipe", "", "traditional Italian ")
# Grandma's Focaccia: Baraise Style => Grandma's Baraise Style Focaccia
searchReplaceAddPattList(p_recipes, r": Baraise Style", "", "Baraise style ", index=10)
tagged_recipe_names = retag(p_recipes, "name")

Cleaned 2 names with colon. If the dashes are between a word, they are either part of a word's spelling or joining two words together, typically as an adjective. However, if it is between spaces, they are translations.

In [64]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [65]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Chicken French - Rochester, NY Style',
 'Velveting Chicken Breast, Chinese Re

But in some cases, they are words after the dashes describe the dish, such as Rochester, NY Style and Restaurant Style

In [66]:
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Chicken French - Rochester, NY Style
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Italian Subs - Restaurant Style
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Eggplant Parmesan - Gluten-Free
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Hot Pepper Sauce - A Trinidadian Staple
The Sarge's Goetta - German Breakfast Treat
Italian Sausage - Tuscan Style
Honey Milk Tea - Hong Kong Style
Mexican Lasagna - No Lasagna Noodles!
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Curry Pasta - Pakistani Style
Cauliflower and Potato Stir-Fry - East Indian Recipe
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco -

Replace or remove the remaining dashes that are surrounded by spaces

In [67]:
# Chicken French - Rochester, NY Style => Rochester, NY Style Chicken French
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Carnitas - Pressure Cooker => pressure cooker carnitas
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Italian Subs - Restaurant Style => restaurant style Italian subs
searchReplaceAddPattList(p_recipes, r" - Restaurant Style", "", "restaurant style ")
# Eggplant Parmesan - Gluten-Free => glutten-free Eggplant Parmesan
searchReplaceAddPattList(p_recipes, r" - Gluten-Free", "", "glutten-free ")
# Italian Sausage - Tuscan Style => Tuscan style Italian Sausage
searchReplaceAddPattList(p_recipes, r" - Tuscan Style", "", "Tuscan style ")
# Honey Milk Tea - Hong Kong Style => Hong Kong style Honey Milk Tea
searchReplaceAddPattList(p_recipes, r" - Hong Kong Style", "", "Hong Kong style ")
# Curry Pasta - Pakistani Style => Pakistani style Curry Pasta
searchReplaceAddPattList(p_recipes, r" - Pakistani Style", "", "Pakistani style ")
# Cauliflower and Potato Stir-Fry - East Indian Recipe => East Indian style Cauliflower and Potato Stir-Fry
searchReplaceAddPattList(p_recipes, r" - East Indian Recipe", "", "East Indian style ")
# German Potato Salad - Schwabisch Style => Schwabisch style German Potato Salad
searchReplaceAddPattList(p_recipes, r" - Schwabisch Style", "", "Schwabisch style ")
# Tilapia - Filipino Sour Broth Dish => Filipino Sour Broth tilapia
searchReplaceAddPattList(p_recipes, r"Tilapia - ", "", "tilapia", index=20)
# Fish Sinigang - Filipino Sour Broth Dish - Schwabisch Style => Filipino Sour Broth Sinigang fish
searchReplaceAddPattList(p_recipes, r"Fish Sinigang - ", "", "Sinigang fish", index=20)

# remove  - A Trinidadian Staple from Hot Pepper Sauce - A Trinidadian Staple
searchReplacePattList(p_recipes, r" - A Trinidadian Staple", "")
# remove  - German Breakfast Treat from The Sarge's Goetta - German Breakfast Treat
searchReplacePattList(p_recipes, r" - German Breakfast Treat", "")
# remove  - No Lasagna Noodles! from Mexican Lasagna - No Lasagna Noodles!
searchReplacePattList(p_recipes, r" - No Lasagna Noodles!", "")
# remove  - Not Just for Chicken from Sweet and Sour Jam - Not Just for Chicken
searchReplacePattList(p_recipes, r" - Not Just for Chicken", "")
                      
tagged_recipe_names = retag(p_recipes, "name")

In [68]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup'

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',

The remaining names with dashes surrounded by dashes are translations, which can be split into two names

In [69]:
colnames_to_split = []
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)
        colnames_to_split.append(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco - Coconut Tembleque
Kroppkakor - Swedish Potato Dumplings
Ladolemono - Lemon Oil Sauce for Fish or Chicken
Mie Goreng - Indonesian Fried Noodles
Vaselopita - Greek New Years Cake
Knedliky - Czech Dumpling with Sauerkraut
Zhito/Koljivo - Serbian Wheat Pudding
Zito - Serbian Wheat Pudding
Doro Wat: Ethiopian Chicken Dish


In [70]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in colnames_to_split:
            splits = re.split("( - )|(: )", p_recipes[i]["name"])
            newname1 = splits[0]
            newname2 = splits[len(splits)-1]
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

The remaining names with dash are those in words

In [71]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-']

In [72]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup', 'Ube-Macapuno Cake', 'Cuban-Style Yuca', 'Japanese-Style Cabbage Salad', "Jorge's Indian-Spice

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',
 'Korean-style Seaweed Soup',
 'Ube-Macapuno Cake',
 'Cuban-Style Yuca',
 'Japanese-Style Cabbage 

!, ? and . are found, which are odd for recipe names

In [73]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['!', '!', '!', '!', '.', '?']

The punctuations are mostly slang abbreviations and exclamations

In [74]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

['Real Canadian Butter Tarts, eh?']
["Our Top P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]
['Sangria! Sangria!', 'Oatmeal Apple Crisp To Die For!', "Sushi House Salad Dressing, It's ORANGE!"]


Remove the exclamations

In [75]:
searchReplacePattList(p_recipes, r"! Sangria!", "")
searchReplacePattList(p_recipes, r" To Die For!", "")
searchReplacePattList(p_recipes, r", It's ORANGE!", "")
searchReplacePattList(p_recipes, r", eh\?", "")
searchReplacePattList(p_recipes, r"Our Top ", "")

tagged_recipe_names = retag(p_recipes, "name")

Fullstops that remain are part of recipe names

In [77]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['.']

In [78]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

["P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]


Some 'that' can be found

In [80]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

['That', 'That', 'That', 'That']

The 'that's are used to add details, but not actual recipe name

In [81]:
for wdt in list(set(wdt_tokens)):
  print(find_value_with_char(p_recipes, 'name', wdt))

['German Recipes That Are Comfort Food Favorites', 'Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Tuscan Recipes That Reveal the Best of Italian Cooking', 'Easy Dinners That Start with Packaged Gnocchi', "That's-a Meatloaf", 'Favorite Recipes That Show Off Armenian Cuisine', 'Our Best Stir-Fry Recipes That Are Even Better Than Take-Out', 'Comforting Polish Cabbage Recipes That Are Family Favorites']


Remove

In [82]:
searchReplacePattList(p_recipes, r" That Are Comfort Food Favorites", "")
searchReplacePattList(p_recipes, r" That Deliver Big Flavor With Every Satisfying Bite", "")
searchReplacePattList(p_recipes, r" That Reveal the Best of Italian Cooking", "")
searchReplacePattList(p_recipes, r"That's-a ", "")
searchReplacePattList(p_recipes, r"Favorite Recipes That Show Off ", "")
searchReplacePattList(p_recipes, r" That Are Even Better Than Take-Out", "")
searchReplacePattList(p_recipes, r" That Are Family Favorites", "")

searchReplaceAddPattList(p_recipes, r" That Start with Packaged Gnocchi", "", "packaged gnocchi ", index=5)
tagged_recipe_names = retag(p_recipes, "name")

That removed

In [83]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

[]

There's some 'how's

In [85]:
wrb_tokens = list_words_with_tag(tagged_recipe_names, "WRB")
wrb_tokens

['How', 'How', 'How', 'How', 'How', 'How', 'How']

In [86]:
for wrb in list(set(wrb_tokens)):
  print(find_value_with_char(p_recipes, 'name', wrb))

['How to Make Coquilles Saint-Jacques', 'How to Make Bolognese Sauce', 'How to Make Beef Satay', 'How to Make Peanut Dipping Sauce', 'How to Make Tres Leches Cake', 'How to Make Cassoulet', 'How to Make Turkey Manicotti']


Remove the 'how's and keep only the name

In [87]:
searchReplacePattList(p_recipes, r"How to Make ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [88]:
list_words_with_tag(tagged_recipe_names, "WRB")

[]

There's some personal pronouns (possessive)

In [95]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'My',
 'Your',
 'Our',
 'Our',
 'Our',
 'My',
 'its']

In [97]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Our Best Avgolemono Soup Recipes', 'Our Best Authentic Mexican Recipes', 'Our Best Empanada Recipes', 'Our Best Indian Recipes for Beginner Cooks', 'Our Best Stir-Fry Recipes', 'Our Favorite German Potato Recipes', 'Say Aloha to Our Best Hawaiian Recipes']
['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']
['Sweet Recipes to Complete Your Indian Dinner', 'Melt-in-Your-Mouth Beef Cacciatore', 'Polish Recipes to Make Your Grandmother Proud']
['My Own Famous Stuffed Grape Leaves', 'My Best Chicken Piccata', 'My Favorite Sesame Noodles', 'My Chicken Parmesan', "My Mom's Greek Lemon Rice", 'My Fly Stir-Fry', 'My Chicken Pho Recipe', 'My Tangy German Potato Salad', 'My Big Fat Greek Baked Beans', "My Grandmother's French Dressing"]


Most can be removed

In [98]:
searchReplacePattList(p_recipes, r"Our ", "")
searchReplacePattList(p_recipes, r"Your ", "")
searchReplacePattList(p_recipes, r"Melt-in-Your-Mouth ", "")
searchReplacePattList(p_recipes, r"My Own ", "")
searchReplacePattList(p_recipes, r"My Best ", "")
searchReplacePattList(p_recipes, r"My Favorite ", "")
searchReplacePattList(p_recipes, r"My Mom's ", "")
searchReplacePattList(p_recipes, r"My Grandmother's ", "")
searchReplacePattList(p_recipes, r"My ", "")

tagged_recipe_names = retag(p_recipes, "name")

The remaining ones are misclassified tags by nltk

In [99]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['its']

In [100]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


There's some personal pronouns

In [103]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP")
prp_tokens

['I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'We',
 'I',
 'I',
 'I']

In [104]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['German Apple Cake I', 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Mexican Rice II', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Hot German Potato Salad III', 'Stuffed Shells I', 'Baked Penne with Italian Sausage', 'German Pancakes II', "Grandma's Noodles II", 'Hot Italian Giardiniera', 'Russian Tea Cakes I', 'Fried Irish Cabbage with Bacon', 'Greek Salad I', 'Irish Boxty', 'Italian Sausage Stuffed Shells', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Indian Saag', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Quiche Lorraine I', 'Italian Cream Cheese and Ricotta Cheesecake', 'Taco Seasoning II', "Ingrid's Rouladen", 'Baked Italian Chicken Thighs', 'Original Homemade Italian Beef', 'Pork Chops Italiano', 'Pizzelles III', 'Eclairs II', 'Real Italian Calzones', 'Old Italian Meat Sauce', "Chef John's Italian Sausage Chili", 'Italian Bread Using a Bread Machine', 'Incredibly Delicious Italian Cream Cake', 'Italian Wedding Cookies III', 'B

Not much to remove, since most are misclassified POS

In [105]:
searchReplacePattList(p_recipes, r" You Can Make at Home", "")

tagged_recipe_names = retag(p_recipes, "name")

Some base verbs can be removed

In [108]:
vb_tokens = list_words_with_tag(tagged_recipe_names, "VB")
vb_tokens

['Take',
 'Make',
 'Take',
 'Kedgeree',
 'Swordfish',
 'Serve',
 'Make',
 'Celebrate',
 'Chicken',
 'Pata',
 'aux',
 'Poulet',
 'Papa',
 'Tarte',
 'Pollo',
 'Pancake',
 'Dutch',
 'Kransekake',
 'Dish',
 'Pannekaken']

In [109]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Pannekaken']
['Dutch Apple Pie with Oatmeal Streusel', 'Dutch Apple Cake', 'Dutch Apple Pie', 'Dutch Croquetten', 'Dutch Apple Berry Pie', 'Dutch Leek Casserole', 'Dutch Pancakes', 'Dutch Butter Cake', 'Dutch Doughnuts', 'Dutch Apple Tart', 'Dutch Meatballs', 'Dutch Mini Pancakes']
['Caneles de Bordeaux', 'Tajine de Poulet aux Carottes et Patates Douces', 'Tarte aux Moutarde', 'Clafoutis aux Cerises']
['Kransekake']
['Poulet de Provencal', 'Tajine de Poulet aux Carottes et Patates Douces', 'Poulet a la Moutarde']
["Chef John's Patatas Bravas", 'Authentic Patatas Bravas', 'Paksiw na Pata', 'Tajine de Poulet aux Carottes et Patates Douces']
['Kedgeree']
["Papa Drexler's Bavarian Pretzels", 'Carne Con Papas', "Papa Oriold's Spaetzle", 'Papaya Relish', 'Papa a la Huancaina', 'Papas con Chorizo', 'Papas Rellenas']
['The Best Recipes to Celebrate Oktoberfest']
['Mediterranean Stuffed Swordfish', 'Swordfish a la Siciliana', "Doreen's Asian-Inspired Swordfish Steaks"]
['Make-Ahead Vegetarian

Remove recipe names with instruction

In [110]:
searchReplacePattList(p_recipes, r" to Make at Home", "")
searchReplacePattList(p_recipes, r" to Make Grandmother Proud", "")
searchReplacePattList(p_recipes, r"Ways The World Makes Chicken And ", "")

searchReplaceAddPattList(p_recipes, r"Make Ahead ", "", "packaged gnocchi ")

tagged_recipe_names = retag(p_recipes, "name")

In [111]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

['Pannekaken']
['Dutch Apple Pie with Oatmeal Streusel', 'Dutch Apple Cake', 'Dutch Apple Pie', 'Dutch Croquetten', 'Dutch Apple Berry Pie', 'Dutch Leek Casserole', 'Dutch Pancakes', 'Dutch Butter Cake', 'Dutch Doughnuts', 'Dutch Apple Tart', 'Dutch Meatballs', 'Dutch Mini Pancakes']
['Caneles de Bordeaux', 'Tajine de Poulet aux Carottes et Patates Douces', 'Tarte aux Moutarde', 'Clafoutis aux Cerises']
['Kransekake']
['Poulet de Provencal', 'Tajine de Poulet aux Carottes et Patates Douces', 'Poulet a la Moutarde']
["Chef John's Patatas Bravas", 'Authentic Patatas Bravas', 'Paksiw na Pata', 'Tajine de Poulet aux Carottes et Patates Douces']
['Kedgeree']
["Papa Drexler's Bavarian Pretzels", 'Carne Con Papas', "Papa Oriold's Spaetzle", 'Papaya Relish', 'Papa a la Huancaina', 'Papas con Chorizo', 'Papas Rellenas']
['The Best Recipes to Celebrate Oktoberfest']
['Mediterranean Stuffed Swordfish', 'Swordfish a la Siciliana', "Doreen's Asian-Inspired Swordfish Steaks"]
['Make-Ahead Vegetarian

Words like best and most can be removed

In [117]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

['Best', 'Most', 'Best']

In [118]:
for rbs in list(set(rbs_tokens)):
  print(find_value_with_char(p_recipes, 'name', rbs))

['Best Bobotie', 'Best Fried Walleye', 'Best Avgolemono Soup Recipes', "Chef John's Best German Recipes", 'The Best Thai Peanut Sauce', 'Best Ever Russian Beef Stroganoff', "Grandma's Best Ever Sour Cream Lasagna", 'Best Guacamole', 'Best Ever Slow Cooker Italian Beef Roast', 'The Best Pavlova', "Savannah's Best Marinated Portobello Mushrooms", 'Best Peanut Sauce', 'Best Ever Carne Asada Marinade', "Mom's Best Spaghetti Sauce", 'The Best Korean Chicken Recipes', 'Best Instant Pot Chicken Cacciatore', 'Best Ziti Ever', 'Best Authentic Mexican Recipes', 'Best Empanada Recipes', 'Best Ziti Ever with Sausage', 'Best Chicken Parmesan', 'Best Pernil Ever', 'The Best Ricotta Pancakes', 'Best Indian Recipes for Beginner Cooks', 'Best Hot Sauce', 'Best Ever Irish Soda Bread', 'Best Hummus', 'The Best Thai Tom Kha Soup Recipe', 'Best French Macarons', 'Best Falafel', "Gordo's Best of the Best Lasagna", 'The Best Classic Beef Stroganoff', 'Best Asian Slow Cooker Recipes', 'Best Cheesy Broccoli So

In [119]:
searchReplacePattList(p_recipes, r"Grandma's Best Ever ", "")
searchReplacePattList(p_recipes, r"Best Ever ", "")
searchReplacePattList(p_recipes, r"Best ", "")
searchReplacePattList(p_recipes, r" ever", "")
searchReplacePattList(p_recipes, r"The Most Iconic ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [121]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

[]

Adverbs with -ly can be removed, except for the misclassified ones mainly caused by foreign recipe names

In [124]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Absolutely',
 'Aebleskiver',
 'Incredibly',
 'Perfectly',
 'Absolutely',
 'Oven',
 'Perfectly',
 'Absolutely',
 'Heavenly',
 'Asiago',
 'Philly',
 'Family',
 'Deadly',
 'Yet',
 'Absolutely',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Here',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [125]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['No Tomato Paste Here']
['Deadly Delicious Lasagna']
['Heavenly Raspberry Dessert']
['Philly Cheesesteak Quesadillas']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Perfectly Moist Irish Wheaten Bread', 'Perfectly Dry Roasted Chickpeas']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['French Canadian Tourtiere', 'Traditional French Canadian Tourtiere', 'Reveillon Tourtiere', 'Tourtiere Spices', 'Tourtiere', 'Tourtiere', 'Tourtiere', 'Tourtiere']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Soon Du Bu Jigae']
['Yet Turkey Chili']
['Absolute

In [129]:
searchReplacePattList(p_recipes, r"Deadly Delicious ", "")
searchReplacePattList(p_recipes, r"Heavenly ", "")
searchReplacePattList(p_recipes, r"Perfectly ", "")
searchReplacePattList(p_recipes, r"Absolutely Fabulous ", "")
searchReplacePattList(p_recipes, r"Absolutely Amazing  ", "")
searchReplacePattList(p_recipes, r"Absolutely Delicious ", "")
searchReplacePattList(p_recipes, r"Absolutely Perfect ", "")

searchReplaceAddPattList(p_recipes, r"No Tomato Paste Here", "", "tomato paste")

tagged_recipe_names = retag(p_recipes, "name")

In [130]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Aebleskiver',
 'Incredibly',
 'Absolutely',
 'Oven',
 'Asiago',
 'Philly',
 'Family',
 'Yet',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [131]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Philly Cheesesteak Quesadillas']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['French Canadian Tourtiere', 'Traditional French Canadian Tourtiere', 'Reveillon Tourtiere', 'Tourtiere Spices', 'Tourtiere', 'Tourtiere', 'Tourtiere', 'Tourtiere']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Soon Du Bu Jigae']
['Yet Turkey Chili']
['Absolutely Amazing Ahi']
['Aebleskiver', 'Dansk Aebleskiver']
['Incredibly Delicious Italian Cream Cake']
['Willard Family German Chocolate Cake', 'Mexican-Inspired Cas

In [133]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 62},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 497},
 {'CD': 23},
 {'DT': 96},
 {'EX': 0},
 {'FW': 67},
 {'IN': 464},
 {'JJ': 1895},
 {'JJR': 2},
 {'JJS': 1},
 {'LS': 0},
 {'MD': 0},
 {'NN': 660},
 {'NNP': 12705},
 {'NNPS': 36},
 {'NNS': 389},
 {'PDT': 0},
 {'POS': 345},
 {'PRP': 69},
 {'PRP$': 1},
 {'RB': 15},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 10},
 {'UH': 0},
 {'VB': 18},
 {'VBD': 39},
 {'VBG': 59},
 {'VBN': 139},
 {'VBP': 9},
 {'VBZ': 29},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Examining other POS in names

In [116]:
vbz_tokens = list_words_with_tag(tagged_recipe_names, "VBZ")
vbz_tokens

['Ties',
 'el',
 'Leaves',
 'al',
 'al',
 'Leaves',
 'au',
 'di',
 'Ways',
 'de',
 'al',
 'Breasts',
 'en',
 'e',
 'al',
 'Leaves',
 'Breasts',
 'al',
 'di',
 'aux',
 'di',
 'Leaves',
 'au',
 'di',
 'di',
 'al',
 'en',
 'en']

In [115]:
vbp_tokens = list_words_with_tag(tagged_recipe_names, "VBP")
vbp_tokens

['Rellenos',
 'Greek',
 'Divine',
 'Wat',
 'Be',
 'en',
 'Mexicanos',
 'Rellenos',
 'en']

In [114]:
vbg_tokens = list_words_with_tag(tagged_recipe_names, "VBG")
vbg_tokens

['Seasoning',
 'Pudding',
 'Using',
 'Canning',
 'Pudding',
 'Velveting',
 'Pudding',
 'Pudding',
 'Pudding',
 'Seasoning',
 'Comforting',
 'Seasoning',
 'Pouding',
 'Pudding',
 'Amazing',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Dressing',
 'Comforting',
 'Pudding',
 'Making',
 'Comforting',
 'Pudding',
 'Dumpling',
 'Dipping',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Seasoning',
 'Filling',
 'Thanksgiving',
 'Stuffing',
 'Pudding',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Sizzling',
 'Topping',
 'Amazing',
 'Refreshing',
 'Comforting',
 'Dressing',
 'Using',
 'Seasoning',
 'Refreshing',
 'Pudding',
 'Pudding',
 'Pudding',
 'Ping',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Dumpling',
 'Pudding']

In [113]:
vbd_tokens = list_words_with_tag(tagged_recipe_names, "VBD")
vbd_tokens

['Braised',
 'Fried',
 'Corned',
 'Corned',
 'Pickled',
 'Marinated',
 'Shredded',
 'Braised',
 'Fashioned',
 'Filled',
 'Corned',
 'Fashioned',
 'Pickled',
 'Braised',
 'Breaded',
 'Fried',
 'Grilled',
 'Braised',
 'Pickled',
 'Braised',
 'Braised',
 'Planked',
 'Corned',
 'Corned',
 'Braised',
 'Infused',
 'Corned',
 'Obsessed',
 'Pickled',
 'Pulled',
 'Roasted',
 'Broiled',
 'Pickled',
 'Roasted',
 'di',
 'Braised',
 'Braised',
 'Pickled',
 'Mulled',
 'Pickled',
 'Boiled']

In [107]:
rp_tokens = list_words_with_tag(tagged_recipe_names, "RP")
rp_tokens

['Hanout', 'Over']

In [101]:
comma_tokens = list(set(list_words_with_tag(tagged_recipe_names, ",")))
comma_tokens

[',']

In [90]:
for c in list(set(comma_tokens)):
  print(find_value_with_char(p_recipes, 'name', c))

['Bow Ties with Sausage, Tomatoes and Cream', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Chicken, Spinach, and Cheese Pasta Bake', 'Super-Simple, Super-Spicy Mongolian Beef', 'Creamy Potato, Carrot, and Leek Soup', 'Beef, Mushroom and Guinness Pie', 'Easy, Chewy Flourless Peanut Butter Cookies', 'Filipino Steamed Rice, Cebu Style', 'Orange, Honey and Soy Chicken', 'Chicken Francese, Italian-Style', 'Duck with Honey, Soy, and Ginger', 'Steak, Onion, and Pepper Fajitas', 'Indian Carrots, Peas and Potatoes', 'Simple, Baked Finnan Haddie', 'Indian-Style Rice with Cashews, Raisins and Turmeric', 'Serbian Ground Beef, Veggie, and Potato Bake', 'Fried Rice with Ginger, Hoisin, and Sesame', 'Chard Lentil Soup, Lebanese-Style', 'Easy, Cheesy Tortellini Bake', 'Curried Cashew, Pear, and Grape Salad', 'Pork, Sauerkraut and Dumplings', 'Spinach, Feta, and Pine Nut Ravioli Filling', 'Bell Pepper, Tomato, and Potato Indian Curry', 'Mascarpone Pasta with Chicken, Bacon and Spinach', 'Past

In [91]:
jjr_tokens = list(set(list_words_with_tag(tagged_recipe_names, "JJR")))
jjr_tokens

['Healthier', 'Lighter']

In [92]:
for j in list(set(jjr_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Healthier Bang Bang Chicken in the Air Fryer', 'Healthier Swedish Meatballs', 'Healthier Pan-Fried Honey-Sesame Chicken', 'Healthier Chicken Enchiladas I', 'Healthier Honey-Sesame Chicken']
['Lighter Mexican Meatloaf']


In [93]:
jjs_tokens = list(set(list_words_with_tag(tagged_recipe_names, "JJS")))
jjs_tokens

['Best', 'Oktoberfest']

In [94]:
for j in list(set(jjs_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Best Bobotie', 'Best Fried Walleye', 'Our Best Avgolemono Soup Recipes', "Chef John's Best German Recipes", 'The Best Thai Peanut Sauce', 'Best Ever Russian Beef Stroganoff', "Grandma's Best Ever Sour Cream Lasagna", 'Best Guacamole', 'Best Ever Slow Cooker Italian Beef Roast', 'The Best Pavlova', 'My Best Chicken Piccata', "Savannah's Best Marinated Portobello Mushrooms", 'Best Peanut Sauce', 'Best Ever Carne Asada Marinade', "Mom's Best Spaghetti Sauce", 'The Best Korean Chicken Recipes', 'Best Instant Pot Chicken Cacciatore', 'Best Ziti Ever', 'Our Best Authentic Mexican Recipes', 'Our Best Empanada Recipes', 'Best Ziti Ever with Sausage', 'Best Chicken Parmesan', 'Best Pernil Ever', 'The Best Ricotta Pancakes', 'Our Best Indian Recipes for Beginner Cooks', 'Best Hot Sauce', 'Best Ever Irish Soda Bread', 'Best Hummus', 'The Best Thai Tom Kha Soup Recipe', 'Best French Macarons', 'Best Falafel', "Gordo's Best of the Best Lasagna", 'The Best Classic Beef Stroganoff', 'Best Asian Slo

In [88]:
dt_tokens = list_words_with_tag(tagged_recipe_names, "DT")
dt_tokens

['a',
 'The',
 'No',
 'The',
 'the',
 'a',
 'the',
 'The',
 'the',
 'the',
 'a',
 'the',
 'the',
 'A',
 'a',
 'The',
 'a',
 'the',
 'the',
 'a',
 'a',
 'A',
 'The',
 'A',
 'the',
 'a',
 'a',
 'The',
 'a',
 'a',
 'The',
 'the',
 'The',
 'This',
 'The',
 'a',
 'a',
 'the',
 'The',
 'a',
 'a',
 'The',
 'The',
 'a',
 'A',
 'the',
 'the',
 'No',
 'the',
 'a',
 'a',
 'The',
 'The',
 'a',
 'The',
 'the',
 'the',
 'The',
 'the',
 'a',
 'a',
 'The',
 'a',
 'the',
 'a',
 'The',
 'All',
 'The',
 'a',
 'the',
 'the',
 'the',
 'The',
 'The',
 'A',
 'a',
 'the',
 'a',
 'the',
 'The',
 'the',
 'a',
 'The',
 'a',
 'a',
 'the',
 'a',
 'a',
 'the',
 'a',
 'An',
 'the',
 'a',
 'a',
 'No',
 'a',
 'No',
 'a',
 'No']

In [635]:
for dt in list(set(dt_tokens)):
  print(find_value_with_char(p_recipes, 'name', dt))

["Angela's Awesome Enchiladas", 'Antipasto Pasta Salad', 'Chicken Piccata with Angel Hair Pasta', "Anne's Chicken Chilaquiles Rojas", 'Irish Bacon And Cabbage Soup', 'Italian Anisette Cookies', 'Anzac Biscuits I', "Andy's Spicy Green Chile Pork", "Aunt Anne's Sesame Cookies", 'Creme Anglaise Sauce', 'Italian Cookies with Anise', 'Chocolate Beer Cupcakes With Whiskey Filling And Irish Cream Icing', 'Anisette Toast', 'Recipes for Anyone Obsessed With the Great British Baking Show', 'Ways The World Makes Chicken And Rice', 'Italian Anise Cookies', "Angela's Asian-Inspired Chicken Noodle Soup", 'Brodetto Ancona-Style', 'Italian Garlic-Anchovy-Sardine Appetizer', 'An Easy-as-Pie Sweet and Sour Sauce', 'Anko', 'Ancient Roman Cheesecake', 'Mexican Pulled Pork in Annatto Sauce', 'Belgian Fries with Andalouse Sauce']
["Grandma's Noodles II", 'Buche de Noel', 'Norwegian Lefse', 'Vegetarian Chinese Fried Noodles', 'No Fail Bean Pie', 'Vermicelli Noodle Bowl', 'Chicken Udon Noodle Soup', 'Peanut B

In [160]:
to_tokens = list_words_with_tag(tagged_recipe_names, "TO")
to_tokens

['to',
 'to',
 'to',
 'to',
 'to',
 'to',
 'To',
 'to',
 'to',
 'na',
 'to',
 'to',
 'to',
 'to',
 'to',
 'To',
 'to',
 'na',
 'na',
 'na']

In [161]:
for to in list(set(to_tokens)):
  print(find_value_with_char(p_recipes, 'name', to))

['Creamy Au Gratin Potatoes', 'Greek Lemon Chicken and Potato Bake', 'Authentic German Potato Salad', 'How to Make Coquilles Saint-Jacques', 'Wonton Wrappers', 'Spinach Tomato Tortellini', 'Hot German Potato Salad III', 'Hot German Potato Salad Casserole', 'Chicken Cacciatore in a Slow Cooker', 'Wonton Soup', 'Pesto Cream Sauce', 'Creamy Pesto Shrimp', 'How to Make Bolognese Sauce', 'Fabulous Wet Burritos', 'Sofrito', 'Russian Mushroom and Potato Soup', 'Bow Ties with Sausage, Tomatoes and Cream', 'Chicken Wonton Tacos', 'Tomato Basil Salmon', 'Spanish Octopus', 'Potato Scones', 'Lyonnaise Potatoes', 'Pesto', 'Chicken Pesto Pizza', 'Victoria Sponge Cake', 'Ratatouille', 'Addictive Sweet Potato Burritos', 'Hasselback Potatoes', 'Tembleque Puerto Rican Coconut Pudding', 'Pesto Grilled Cheese Sandwich', 'Delicious Black Bean Burritos', 'Italian Stewed Tomatoes', 'Thai Sweet Potato Soup', 'Antipasto Pasta Salad', 'Gelato', 'Rigatoni alla Genovese', 'Oktoberfest Chicken and Red Cabbage', 'G

Chicken is considered dollar?

In [162]:
dol_tokens = list_words_with_tag(tagged_recipe_names, "$")
dol_tokens

['Chicken']

It's a tagging error, so this can be ignored

In [163]:
for dol in dol_tokens:
  print(find_value_with_char(p_recipes, 'name', dol))

['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Enchilada Slow-Cooker Casserole', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Chicken Milanese', 'Chicken Massaman Curry', "Chef J

There are some quotation marks

In [164]:
quote_tokens = list_words_with_tag(tagged_recipe_names, "''")
quote_tokens

["''", "''", "'", "''", "''", "''", "''"]

Quotation marks are caused by possessive -'s

In [165]:
for quote in quote_tokens:
  print(find_value_with_char(p_recipes, 'name', quote))

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's Best German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite 

Some commas were found

In [166]:
recipe_names_pos[","]

NameError: name 'recipe_names_pos' is not defined

In [None]:
for com in recipe_names_pos[","]:
  print(find_name_with_char(com[0]))

 For now, leave the preprocessing of the recipe names first.

## Preprocessing of ingredients

In [211]:
p_ingredients = []

for recipe in p_recipes:
    p_ingredients = p_ingredients + recipe['ingredients']
    
p_ingredients = list(set(p_ingredients))
len(p_ingredients)

19342

In [212]:
p_ingredients[:10]

['4 eggs, well beaten ',
 '3 tablespoons chopped green onion ',
 '2 jalapeno peppers, seeded and thinly sliced ',
 '12 fresh mint leaves ',
 '4 eggs, beaten ',
 '⅓ cup soy sauce ',
 '½ cup diced fresh mushrooms ',
 '2 pounds fresh corn masa dough ',
 '2 green bell peppers ',
 ' Finely chopped white onions ']

In [213]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = p_ingredients[i].strip()

p_ingredients[:10]

['4 eggs, well beaten',
 '3 tablespoons chopped green onion',
 '2 jalapeno peppers, seeded and thinly sliced',
 '12 fresh mint leaves',
 '4 eggs, beaten',
 '⅓ cup soy sauce',
 '½ cup diced fresh mushrooms',
 '2 pounds fresh corn masa dough',
 '2 green bell peppers',
 'Finely chopped white onions']

A reusable function that re-tags ingredients

In [214]:
def retag_ingredients():
    tagged_recipe_ingredients = []

    for ingredient in p_ingredients:
        tagged_recipe_ingredients.append(tag_pos(ingredient))
        
    return tagged_recipe_ingredients

tagged_recipe_ingredients = retag_ingredients()
tagged_recipe_ingredients[:10]

[[('4', 'CD'), ('eggs', 'NNS'), (',', ','), ('well', 'RB'), ('beaten', 'VB')],
 [('3', 'CD'),
  ('tablespoons', 'NNS'),
  ('chopped', 'VBD'),
  ('green', 'JJ'),
  ('onion', 'NN')],
 [('2', 'CD'),
  ('jalapeno', 'NN'),
  ('peppers', 'NNS'),
  (',', ','),
  ('seeded', 'VBD'),
  ('and', 'CC'),
  ('thinly', 'RB'),
  ('sliced', 'VBD')],
 [('12', 'CD'), ('fresh', 'JJ'), ('mint', 'NN'), ('leaves', 'NNS')],
 [('4', 'CD'), ('eggs', 'NNS'), (',', ','), ('beaten', 'VB')],
 [('⅓', 'JJ'), ('cup', 'NN'), ('soy', 'NN'), ('sauce', 'NN')],
 [('½', 'JJ'),
  ('cup', 'NN'),
  ('diced', 'VBD'),
  ('fresh', 'JJ'),
  ('mushrooms', 'NNS')],
 [('2', 'CD'),
  ('pounds', 'NNS'),
  ('fresh', 'JJ'),
  ('corn', 'NN'),
  ('masa', 'NNS'),
  ('dough', 'IN')],
 [('2', 'CD'), ('green', 'JJ'), ('bell', 'NN'), ('peppers', 'NNS')],
 [('Finely', 'RB'), ('chopped', 'VBN'), ('white', 'JJ'), ('onions', 'NNS')]]

Numbers need a placeholder

In [215]:
list_words_with_tag(tagged_recipe_ingredients, "CD")

['4',
 '3',
 '2',
 '12',
 '4',
 '2',
 '2',
 '1',
 '2',
 '2',
 '6',
 '4',
 '2',
 '1',
 '8',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '3',
 '1',
 '3',
 '8',
 '2',
 '2',
 '3',
 '1',
 '1',
 '16',
 '1',
 '1',
 'kalamata',
 '2',
 '1',
 '1',
 '2',
 '1',
 '12',
 '3',
 '2',
 '1',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '8',
 '1',
 '14',
 '2',
 '3',
 '2',
 '2',
 '1',
 '1',
 '3',
 '1',
 '1',
 '2',
 '4',
 '1',
 '1',
 '4',
 '5',
 '1',
 '1.75',
 '1',
 '2',
 '2',
 '1',
 '6',
 '2',
 '3',
 '1',
 '2',
 '15',
 '1',
 '1',
 '7',
 '1',
 '1',
 '9',
 '1',
 '8',
 '2',
 '4',
 '6',
 '2',
 '28',
 '4',
 '2',
 '4',
 '5',
 '2',
 '1',
 '8',
 '4',
 '1',
 '2',
 '15',
 '2',
 '2',
 '1',
 '2',
 '11',
 '1',
 '2',
 '1',
 '4',
 '4',
 '2',
 '14',
 '1',
 '9',
 '1',
 '1',
 '1',
 '4',
 '1',
 '8',
 '2',
 '1',
 '1',
 '1',
 '15',
 '1',
 '6',
 '4',
 '3',
 '12',
 '3',
 '3',
 '4',
 '2',
 '2',
 '1',
 '8',
 '4',
 '1',
 '2',
 '1',
 '6',
 '1',
 '2',
 '3',
 '1',
 '3',
 '1',
 '1',
 '1',
 '5.3',
 '1',
 '1',
 '1',
 '14',
 '1',
 '32',
 '3',
 '1

NLTK assumes fractions as JJ (adjectives)

In [216]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['green',
 'fresh',
 '⅓',
 '½',
 'fresh',
 'fresh',
 'green',
 'white',
 '½',
 'fresh',
 'all-purpose',
 'additional',
 '½',
 'green',
 'fresh',
 '¼',
 'mixed',
 'fresh',
 'fresh',
 'small',
 'curd',
 'vegetable',
 '⅔',
 'shrimp-flavored',
 'instant',
 'jasmine',
 '¼',
 '1/2-inch',
 'long',
 'Korean',
 'red',
 '⅝',
 'white',
 '½',
 'vegetable',
 'uncooked',
 'angel',
 '¼',
 'Italian',
 'jumbo',
 'green',
 'unsweetened',
 '¾',
 'green',
 'dried',
 'glutinous',
 'napa',
 'large',
 'black',
 'all-purpose',
 'pork',
 '1-inch',
 'Italian',
 'cheese',
 'extra',
 'olive',
 'yellow',
 'Chinese',
 'five-spice',
 'garlic',
 '½',
 'fresh',
 'large',
 'fresh',
 'such',
 'teaspoon',
 'fresh-squeezed',
 'avocados',
 'green',
 '⅞',
 'olive',
 'large',
 'white',
 'chickpeas',
 'drained',
 'salsa',
 'small',
 'white',
 '½',
 'unsalted',
 'multi-grain',
 '½',
 'cold',
 'boneless',
 '¾',
 'dry',
 'small',
 'kidney',
 'uncooked',
 'quart',
 'hot',
 'large',
 '½',
 '¾',
 'Italian',
 'large',
 'yellow',
 '½

Create a function that converts any fraction in a text to integer

In [217]:
import unicodedata

def fraction_to_int(text):
  for i, char in enumerate(text):
    try:
      # unicode.numeric converts fractions such as ½ to decimal place, 0.25
      # int() removes decimal places, str() allows it to be joined with original text
      text = text[:i] + str(int(unicodedata.numeric(char))) + text[i + 1:]
    except Exception as e:
      pass
  # Because number + fraction, such as 1 1/4 may be converted to 1 0, so use re.sub to remove
  text = re.sub("([0-9]+ [0])+", "4", text)
  return text

for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = fraction_to_int(p_ingredients[i])

tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4 eggs, well beaten',
 '3 tablespoons chopped green onion',
 '2 jalapeno peppers, seeded and thinly sliced',
 '12 fresh mint leaves',
 '4 eggs, beaten',
 '0 cup soy sauce',
 '0 cup diced fresh mushrooms',
 '2 pounds fresh corn masa dough',
 '2 green bell peppers',
 'Finely chopped white onions',
 '4 cups fresh lemon juice',
 '2 cups all-purpose flour, plus additional for dusting',
 '4 cups beef broth',
 '6 tablespoons chopped green onions, divided',
 '4 tablespoons chopped fresh dill weed',
 '4 cups chicken broth',
 '1 (8 ounce) package mixed candied fruit',
 '1 cup butter, room temperature',
 '1 teaspoon chopped fresh ginger',
 '1 (1 inch) piece fresh ginger root, minced']

By converting fractions into integers, NLTK stops seeing them as adjectives (JJ) and instead, they are considered numbers (CD)

In [218]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['green',
 'fresh',
 'fresh',
 'fresh',
 'green',
 'white',
 'fresh',
 'all-purpose',
 'additional',
 'green',
 'fresh',
 'mixed',
 'fresh',
 'fresh',
 'small',
 'curd',
 'vegetable',
 'shrimp-flavored',
 'instant',
 'jasmine',
 'cup',
 '1/2-inch',
 'long',
 'Korean',
 'red',
 'cup',
 'white',
 'vegetable',
 'uncooked',
 'angel',
 'Italian',
 'jumbo',
 'green',
 'unsweetened',
 'green',
 'dried',
 'glutinous',
 'napa',
 'large',
 'black',
 'all-purpose',
 'pork',
 '1-inch',
 'Italian',
 'cheese',
 'extra',
 'olive',
 'yellow',
 'Chinese',
 'five-spice',
 'garlic',
 'fresh',
 'large',
 'fresh',
 'such',
 'teaspoon',
 'fresh-squeezed',
 'avocados',
 'green',
 'olive',
 'large',
 'white',
 'chickpeas',
 'drained',
 'salsa',
 'small',
 'white',
 'unsalted',
 'multi-grain',
 'cold',
 'boneless',
 'dry',
 'small',
 'kidney',
 'uncooked',
 'quart',
 'hot',
 'large',
 'Italian',
 'large',
 'yellow',
 'beef',
 'white',
 'hard',
 'fresh',
 'unsalted',
 'large',
 'small',
 'small',
 'sweet',
 'ye

Replace all the numbers with placeholder of 4

In [219]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = searchReplacePatt(p_ingredients[i], NUMPATTERN, "4")
    
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4 eggs, well beaten',
 '4 tablespoons chopped green onion',
 '4 jalapeno peppers, seeded and thinly sliced',
 '4 fresh mint leaves',
 '4 eggs, beaten',
 '4 cup soy sauce',
 '4 cup diced fresh mushrooms',
 '4 pounds fresh corn masa dough',
 '4 green bell peppers',
 'Finely chopped white onions',
 '4 cups fresh lemon juice',
 '4 cups all-purpose flour, plus additional for dusting',
 '4 cups beef broth',
 '4 tablespoons chopped green onions, divided',
 '4 tablespoons chopped fresh dill weed',
 '4 cups chicken broth',
 '4 (4 ounce) package mixed candied fruit',
 '4 cup butter, room temperature',
 '4 teaspoon chopped fresh ginger',
 '4 (4 inch) piece fresh ginger root, minced']

In [239]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens.remove('4')
new_cd_tokens

['four',
 'seven',
 'millet',
 'zapallo',
 'bleu',
 'marinara',
 'fontina',
 '4/4',
 'ziti',
 'xanthan',
 'yum',
 '4.4',
 "za'atar",
 'mozzarella',
 'zucchini',
 'yellow',
 'provolone',
 'kalamata',
 'mascarpone',
 'one',
 '4/4x4/4',
 'mostaccioli']

In [241]:
def find_ingre_with_substring(sub):
    matches = []
    for ingre in p_ingredients:
        if sub in ingre:
            matches.append(ingre)
    return matches


find_ingre_with_substring('4/4')

['4 cup 4/4-inch long vermicelli',
 '4 pound sweet potatoes, peeled and cut into 4 4/4-inch chunks',
 '4 cups apples, peeled and sliced into 4/4-inch wedges',
 '4 kaffir lime leaves, cut into 4/4-inch pieces',
 '4 (4 pound) pork tenderloin, cut into thin 4 4/4 inch strips',
 '4 ounces halloumi cheese, cut into 4/4 inch thick sticks',
 '4 red bell pepper, cut into 4/4-inch strips',
 '4 cup red bell pepper, cut into 4/4 inch strips',
 '4 pound hot Italian turkey sausage, cut into 4/4 inch slices',
 '4 eggplant, cut into 4/4 inch cubes',
 '4 skinless, boneless chicken breast halves - pounded to 4/4 inch thickness',
 '4 pounds beef stew meat, cut into 4/4 inch pieces',
 '4 (4 4/4) pound chicken, cut into 4 pieces',
 '4 fresh green beans, cut into 4/4 inch pieces',
 '4 large cucumbers, peeled, halved lengthwise, seeded, and cut into 4/4-inch slices',
 '4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4/4 inch thick',
 '4 (4 4/4 inch) piece fresh ginger, sliced

In [242]:
find_ingre_with_substring('4.4')

['4 (4.4 ounce) package pepperoni (such as Hormel®)',
 '4 (4.4 ounce) container Greek yogurt',
 '4 (4.4 ounce) container aji nori furikake (seasoned seaweed and sesame rice topping)',
 '4 (4.4 ounce) cans coconut milk',
 '4 (4.4 ounce) package corn bread mix',
 '4 (4.4 ounce) can whole kernel corn, with liquid',
 '4 (4.4 ounce) package beef top sirloin, thinly sliced and cut into bite-size pieces',
 '4 (4.4 ounce) can water',
 '4 (4.4 ounce) package dry taco seasoning mix',
 '4 (4.4 ounce) can white corn kernels, drained',
 '4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4/4 inch thick',
 '4 (4.4 ounce) boxes raisins',
 '4 (4.4 ounce) can whole kernel corn, drained',
 '4 (4.4 inch square) wonton wrappers',
 '4 (4.4 ounce) can condensed cream of celery soup',
 '4 (4.4 ounce) cans white beans, rinsed and drained',
 '4 (4.4 ounce) can chicken broth',
 '4 (4.4 ounce) package spaghetti sauce mix',
 '4 (4.4 ounce) cans navy beans, rinsed and drained',
 '4 (4.

In [243]:
all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3829},
 {')': 3828},
 {',': 8512},
 {'--': 0},
 {'.': 18},
 {':': 304},
 {'CC': 3075},
 {'CD': 21862},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2931},
 {'JJ': 13324},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32978},
 {'NNP': 2434},
 {'NNPS': 2},
 {'NNS': 13610},
 {'PDT': 1},
 {'POS': 101},
 {'PRP': 2},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1046},
 {'UH': 0},
 {'VB': 1700},
 {'VBD': 8944},
 {'VBG': 354},
 {'VBN': 3464},
 {'VBP': 646},
 {'VBZ': 587},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

In [245]:
colon_tags = set(list(list_words_with_tag(tagged_recipe_ingredients, ":")))
colon_tags

{'-', '--', ':', ';'}

In [251]:
colon_ingres = []

for c in colon_tags:
    print(find_ingre_with_substring(c))
    colon_ingres = colon_ingres + find_ingre_with_substring(c)

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']
['4 cups all-purpose flour, plus additional for dusting', '4 cup freshly shredded Parmigiano-Reggiano cheese', '4 (4 ounce) package shrimp-flavored instant ramen noodles', '4 cup 4/4-inch long vermicelli', '4 (4 ounce) can reduced-sodium black beans, rinsed and drained', '4 cups sifted all-purpose flour', '4 pounds pork shoulder, cut into 4-inch cubes', '4 teaspoon Chinese five-spice powder', '4 teaspoon fresh-squeezed lemon juice', '4 avocados - peeled, pitted, and cubed', '4 multi-grain tortillas, or more to taste', '4 cups fat-free, reduced-sodium chicken broth', '4 skinless, boneless chicken breast halves - cubed', '4 skinless, boneless chicken breast halves - cut into 4 inch strips', '4 tablespoon minced fresh flat-leaf parsley', '4 medium eggplant, chopped into bite-size pieces', '4 cups low-sodium vegetable broth', '4 hard-boiled eggs, sliced', '4 hard-boiled eggs, peeled', '4 (4 ounce) pa

In [249]:
find_ingre_with_substring("--")

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']

In [250]:
find_ingre_with_substring(":")

['Gravy:',
 'Meatballs:',
 'Spice Blend:',
 'Chipotle Mayonnaise:',
 'Dipping Sauce:',
 'Caramel:',
 'Fillings:']

## Data merging

In [135]:
all_recipe_names = []

for recipe in p_recipes:
    try:
        all_recipe_names.append(recipe['name'])
    except Exception as e:
        pass
    
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

In [137]:
all_recipe_names_corpus = ("\n").join(all_recipe_names)

all_recipe_names_corpus

'Pan-Fried Asparagus\nCreamy Au Gratin Potatoes\nSuper-Delicious Zuppa Toscana\nSimple Teriyaki Sauce\nSpicy Korean Fried Chicken with Gochujang Sauce\nSpaghetti Aglio e Olio\nEasy Garam Masala\nEasy Chorizo Street Tacos\nRussian Cabbage Rolls with Gravy\nShrimp Scampi with Pasta\nGreek Lemon Chicken and Potato Bake\nEasy Mexican Casserole\nGerman Apple Cake I\nSpanish Flan\nGerman Pork Chops and Sauerkraut\nSpaghetti Cacio e Pepe\nChef John\'s Chicken Kiev\nIndian-Style Chicken and Onions\nFajita Seasoning\nPerfect Sushi Rice\nTender Italian Baked Chicken\nAuthentic German Potato Salad\nMiso Soup\nMexican Rice II\nSpongy Japanese Cheesecake\nChicken Katsu\nChicken Stir-Fry\nQuick Beef Stir-Fry\nEasy Authentic Mexican Rice\nHerbs de Provence\nGreek/House Dressing\nFrench Bread\nFocaccia Bread\nJamaican Fried Dumplings\nGluehwein\nCoquilles Saint-Jacques\nMexican-Style Chicken Taco Casserole\nRosemary Braised Lamb Shanks\nMake-Ahead Vegetarian Moroccan Stew\nCurry Stand Chicken Tikka Ma

## Old code (need to be changed / replaced later on)

Flatten each data entry into a string

In [None]:
corpus_list = []
for item in recipes:
    item['ingredients']=','.join(item['ingredients'])
    try:
        item['text'] = item['name'] + " " + item["ingredients"]
    except Exception as e:
        item['name'] = ""
        item['text'] = item['name'] + " " + item["ingredients"]
        print(e)
    corpus_list.append(item['text'])
    
corpus_list[:3]

'name'


['Pan-Fried Asparagus ¼ cup butter ,2 tablespoons olive oil ,1 teaspoon coarse salt ,¼ teaspoon ground black pepper ,3 cloves garlic, minced ,1 pound fresh asparagus spears, trimmed ',
 'Pan de Muertos (Mexican Bread of the Dead) ¼ cup margarine ,¼ cup milk ,¼ cup warm water (110 degrees F/45 degrees C) ,3 cups all-purpose flour ,1\u2009¼ teaspoons active dry yeast ,½ teaspoon salt ,2 teaspoons anise seed ,¼ cup white sugar ,2 eggs, beaten ,2 teaspoons orange zest ,¼ cup white sugar ,¼ cup orange juice ,1 tablespoon orange zest ,2 tablespoons white sugar ',
 'Creamy Au Gratin Potatoes 4 russet potatoes, sliced into 1/4 inch slices ,1 onion, sliced into rings , salt and pepper to taste ,3 tablespoons butter ,3 tablespoons all-purpose flour ,½ teaspoon salt ,2 cups milk ,1\u2009½ cups shredded Cheddar cheese ']

Convert entire flattened list into a string

In [None]:
corpus = ','.join(corpus_list)

Compute bigram

In [None]:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(corpus)
bigrams = nltk.bigrams(tokens)
frequence = nltk.FreqDist(bigrams)
for key,value in frequence.items():
    print(key,value)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('taste', ',7') 1
('and', 'Tangy') 2
('Tangy', 'Thai') 1
('French', '75') 1
('75', 'Cocktail') 1
('ounces', 'lemon') 1
('ounces', 'gin') 1
('gin', ',2') 2
('teaspoons', 'simple') 1
('chilled', 'Champagne') 1
('Champagne', ',1') 1
('or', 'orange') 2
('orange', 'slice') 1
('slice', 'for') 2
('Aunt', 'Bert') 1
('Bert', "'s") 1
("'s", 'Fruitcake') 1
('Fruitcake', 'Cookies') 1
('white', 'raisins') 1
('pound', 'dates') 1
('slices', 'candied') 1
('cinnamon', ',28') 1
(',28', 'ounces') 1
('ounces', 'pecans') 1
(',', 'Kerala') 1
('Kerala', 'Chicken') 1
('root', ',8') 1
(',', 'Adriel') 1
('Adriel', "'s") 1
('Chinese', 'Curry') 1
('Portuguese', 'Kale') 1
('white', 'pea') 1
('pea', 'beans') 1
('beef', 'soup') 1
('soup', 'bones') 1
('bunches', 'kale') 2
('quart', 'hot') 2
('water', 'or') 1
('de', 'Mariscos') 1
('Mariscos', '(') 1
('(', 'Seafood') 1
('Seafood', 'Soup') 2
('stock', '(') 2
('as', 'Swanson®') 1
('Swanson®', ')') 1
('fresh

In [None]:
len(tokens)

299068

Convert bigrams into dictionaries, with bigram as key, frequency as value

In [None]:
result = dict(sorted(frequence.items(), key=lambda item: item[0]))
result

{('!', ')'): 2,
 ('!', ','): 1,
 ('!', '1'): 1,
 ('!', '3'): 1,
 ('!', 'Sangria'): 1,
 ('!', 'cooking'): 1,
 ('!', '½'): 1,
 ('#', '1'): 1,
 ('%', ')'): 2,
 ('%', '-lean'): 1,
 ('%', 'Blue'): 1,
 ('%', 'agave'): 1,
 ('%', 'alcohol'): 1,
 ('%', 'cocao'): 1,
 ('%', 'fat'): 1,
 ('%', 'lean'): 4,
 ('%', 'milk'): 4,
 ('%', 'reduced'): 1,
 ('&', 'Beans'): 1,
 ('&', 'B®'): 1,
 ('&', 'Onion'): 1,
 ('&', 'Salad'): 1,
 ('&', 'Sour'): 1,
 ('&', 'auml'): 3,
 ('&', 'ntilde'): 2,
 ('&', 'reg'): 51,
 ('&', 'trade'): 1,
 ("'", '('): 1,
 ("'", '1'): 1,
 ("'", '5'): 1,
 ("'", '6'): 1,
 ("'", 'Amazing'): 1,
 ("'", 'Fajitas'): 1,
 ("'", 'Green'): 1,
 ("'", 'Hungarian'): 1,
 ("'", 'Jamaican'): 1,
 ("'", 'Mince'): 1,
 ("'", 'Own®'): 1,
 ("'", 'Posole'): 1,
 ("'", 'Salmon'): 1,
 ("'", 'Soup'): 1,
 ("'", 'Stormy'): 2,
 ("'", 'ammonia'): 1,
 ("'", 'coating'): 2,
 ("'", 'liver'): 1,
 ("'", 'sugar'): 153,
 ("'", 'yeast'): 2,
 ("''", '('): 1,
 ("''", ')'): 1,
 ("''", '-long'): 1,
 ("''", 'Chicken'): 2,
 ("''", 'C

Get unique tokens and sort them in an ascending order

In [None]:
unique_tokens = sorted(list(set(tokens)))
unique_tokens

['!',
 '#',
 '%',
 '&',
 "'",
 "''",
 "'Bride",
 "'Calabacitas",
 "'Chinese",
 "'Fricot",
 "'Olivye",
 "'Otai",
 "'Three",
 "'ll",
 "'n",
 "'s",
 '(',
 ')',
 '*',
 ',',
 ',1',
 ',10',
 ',11',
 ',12',
 ',13',
 ',14',
 ',15',
 ',16',
 ',17',
 ',18',
 ',19',
 ',2',
 ',20',
 ',21',
 ',22',
 ',23',
 ',24',
 ',25',
 ',26',
 ',27',
 ',28',
 ',29',
 ',3',
 ',3-Ingredient',
 ',30',
 ',32',
 ',34',
 ',35',
 ',36',
 ',38',
 ',4',
 ',40',
 ',48',
 ',5',
 ',5-Ingredient',
 ',50',
 ',6',
 ',60',
 ',7',
 ',8',
 ',80',
 ',9',
 '-',
 '--',
 '-lean',
 '-long',
 '.',
 '...',
 '.063',
 '.18',
 '.24',
 '.25',
 '.7',
 '.75',
 '/',
 '0.6',
 '00',
 '1',
 '1-1/2',
 '1-1/2-inch',
 '1-inch',
 '1-inch-thick',
 '1-pound',
 '1.063',
 '1.12',
 '1.2',
 '1.25',
 '1.27',
 '1.41',
 '1.5',
 '1.75',
 '1.9',
 '1/2',
 '1/2-',
 '1/2-inch',
 '1/2-inch-long',
 '1/2-inch-thick',
 '1/2-pound',
 '1/2x1/4',
 '1/3',
 '1/3-inch',
 '1/4',
 '1/4-inch',
 '1/4-inch-thick',
 '1/8',
 '1/8-inch',
 '1/8-inch-thick',
 '10',
 '10.25',
 '10.5'

Combine bigrams of the same first word into a dictionary

In [None]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: result[x] for x in result.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('Garlic')

{'token': 'Garlic',
 'bigrams': [{',': 2},
  {',1': 4},
  {',3': 1},
  {'1': 1},
  {'10': 1},
  {'2': 2},
  {'Alfredo': 1},
  {'Beef': 1},
  {'Butter': 1},
  {'Cheddar': 1},
  {'Chicken': 4},
  {'Dill': 1},
  {'Fried': 2},
  {'Mashed': 1},
  {'Parmesan': 1},
  {'Paste': 1},
  {'Pizza': 1},
  {'Pork': 1},
  {'Potato': 1},
  {'Potatoes': 1},
  {'Prawns': 2},
  {'Rice': 1},
  {'Salsa': 1},
  {'Sauce': 5},
  {'Scalloped': 2},
  {'Shrimp': 2},
  {'Soup': 2},
  {'Spinach': 1},
  {'Teriyaki': 1},
  {'Tzatziki': 1},
  {'Wine': 1},
  {'Wings': 1},
  {'and': 3},
  {'without': 1},
  {'¼': 1}]}

Do the same to all the tokens to create a list of dictionaries

In [None]:
bigram_list = []
for value in unique_tokens:
    bigram_list.append(find_dict_tuple_key(value))
    
bigram_list

[{'token': '!',
  'bigrams': [{')': 2},
   {',': 1},
   {'1': 1},
   {'3': 1},
   {'Sangria': 1},
   {'cooking': 1},
   {'½': 1}]},
 {'token': '#', 'bigrams': [{'1': 1}]},
 {'token': '%',
  'bigrams': [{')': 2},
   {'-lean': 1},
   {'Blue': 1},
   {'agave': 1},
   {'alcohol': 1},
   {'cocao': 1},
   {'fat': 1},
   {'lean': 4},
   {'milk': 4},
   {'reduced': 1}]},
 {'token': '&',
  'bigrams': [{'Beans': 1},
   {'B®': 1},
   {'Onion': 1},
   {'Salad': 1},
   {'Sour': 1},
   {'auml': 3},
   {'ntilde': 2},
   {'reg': 51},
   {'trade': 1}]},
 {'token': "'",
  'bigrams': [{'(': 1},
   {'1': 1},
   {'5': 1},
   {'6': 1},
   {'Amazing': 1},
   {'Fajitas': 1},
   {'Green': 1},
   {'Hungarian': 1},
   {'Jamaican': 1},
   {'Mince': 1},
   {'Own®': 1},
   {'Posole': 1},
   {'Salmon': 1},
   {'Soup': 1},
   {'Stormy': 2},
   {'ammonia': 1},
   {'coating': 2},
   {'liver': 1},
   {'sugar': 153},
   {'yeast': 2}]},
 {'token': "''",
  'bigrams': [{'(': 1},
   {')': 1},
   {'-long': 1},
   {'Chicken': 

In [None]:
len(bigram_list)

6107

In [None]:
len(unique_tokens)

6107

## Numbers and placeholder

## POS tagging

# Create edit distance

# Create bigram

#Chunking/Phrases
