# Steamboat Squad

Import and load data

In [211]:
# from google.colab import drive
# drive.mount('/content/drive')

In [212]:
import json

with open("recipes_ingredients.json", "r") as json_file:
    recipes = json.load(json_file)
    
len(recipes)

4702

Overview of data structure. This is a list of dictionary, where each dictionary is a recipe with its name, ingredients and url

In [213]:
recipes[0]

{'url': 'https://www.allrecipes.com/recipe/18318/pan-fried-asparagus/',
 'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

Deleting url key

In [214]:
for recipe in recipes:
    del recipe['url']
recipes[0]

{'name': 'Pan-Fried Asparagus',
 'ingredients': ['¼ cup butter ',
  '2 tablespoons olive oil ',
  '1 teaspoon coarse salt ',
  '¼ teaspoon ground black pepper ',
  '3 cloves garlic, minced ',
  '1 pound fresh asparagus spears, trimmed ']}

# Preprocessing Recipe Names
- Lower-casing (normalise words by using POS tagging)
- Change numbers to fix number (place holder)

NLTK has a help function that explains its POS tags.

In [215]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [216]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

Using %%capture, save the NLTK help text as a string

In [217]:
%%capture cap --no-stderr

nltk.help.upenn_tagset()

In [218]:
cap.stdout

'$: dollar\n    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n\'\': closing quotation mark\n    \' \'\'\n(: opening parenthesis\n    ( [ {\n): closing parenthesis\n    ) ] }\n,: comma\n    ,\n--: dash\n    --\n.: sentence terminator\n    . ! ?\n:: colon or ellipsis\n    : ; ...\nCC: conjunction, coordinating\n    & \'n and both but either et for less minus neither nor or plus so\n    therefore times v. versus vs. whether yet\nCD: numeral, cardinal\n    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n    seven 1987 twenty \'79 zero two 78-degrees eighty-four IX \'60s .025\n    fifteen 271,124 dozen quintillion DM2,000 ...\nDT: determiner\n    all an another any both del each either every half la many much nary\n    neither no some such that the them these this those\nEX: existential there\n    there\nFW: foreign word\n    gemeinschaft hund ich jeux habeas Haementeria Herr K\'ang-si vous\n    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n    terram 

Using RE, get all the tag names

In [219]:
import re

ALL_POS = re.findall(".*: +", cap.stdout)

for i, pos in enumerate(ALL_POS):
  ALL_POS[i] = pos.replace(': ', '')


ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '    ',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

In [220]:
ALL_POS.remove('    ')
ALL_POS

['$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``']

Create a function to pos tag a text

In [221]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def tag_pos(corpus):
    text=word_tokenize(corpus)
    return nltk.pos_tag(text)

tag_pos("This is a test sentence.")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('.', '.')]

Create a function that POS tag and returns words with specific POS

In [222]:
def get_words_with_pos(text, pos):
  tagged = tag_pos(text)
  return [t for t in tagged if t[1].startswith(pos)]

get_words_with_pos("This is a test sentence.", "NN")

[('test', 'NN'), ('sentence', 'NN')]

POS tag all recipe names

In [223]:
tagged_recipe_names = []

for i, recipe in enumerate(recipes):
  try:
    tagged_recipe_names.append(tag_pos(recipes[i]['name']))
  except Exception as e:
    pass

len(tagged_recipe_names)

4701

## Data cleaning for names based on POS tagging

Looking at the first 10 tagged recipe names, there is a need for pre-processing, as NLTK's tagging is confused by the letter casing.

In [224]:
tagged_recipe_names[:10]

[[('Pan-Fried', 'JJ'), ('Asparagus', 'NNP')],
 [('Pan', 'NNP'),
  ('de', 'FW'),
  ('Muertos', 'NNP'),
  ('(', '('),
  ('Mexican', 'NNP'),
  ('Bread', 'NNP'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Dead', 'NNP'),
  (')', ')')],
 [('Creamy', 'NNP'), ('Au', 'NNP'), ('Gratin', 'NNP'), ('Potatoes', 'NNP')],
 [('Super-Delicious', 'JJ'), ('Zuppa', 'NNP'), ('Toscana', 'NNP')],
 [('Simple', 'JJ'), ('Teriyaki', 'NNP'), ('Sauce', 'NNP')],
 [('Spicy', 'JJ'),
  ('Korean', 'NNP'),
  ('Fried', 'NNP'),
  ('Chicken', 'NNP'),
  ('with', 'IN'),
  ('Gochujang', 'NNP'),
  ('Sauce', 'NNP')],
 [('Spaghetti', 'NNP'), ('Aglio', 'NNP'), ('e', 'NN'), ('Olio', 'NNP')],
 [('Easy', 'JJ'), ('Garam', 'NNP'), ('Masala', 'NNP')],
 [('Easy', 'NNP'), ('Chorizo', 'NNP'), ('Street', 'NNP'), ('Tacos', 'NNP')],
 [('Tres', 'NNS'),
  ('Leches', 'NNP'),
  ('(', '('),
  ('Milk', 'NNP'),
  ('Cake', 'NNP'),
  (')', ')')]]

Create a function that returns all tagged words with the same tag. NLTK's POS tagging assumes that capitalized noun means proper noun (name).

In [225]:
def list_words_with_tag(tuple_list, pos):
  results = []
  for name in tuple_list:
    for tag in name:
      if tag[1] == pos:
        results.append(tag[0])
  return results

list_words_with_tag(tagged_recipe_names, "NNP")

['Asparagus',
 'Pan',
 'Muertos',
 'Mexican',
 'Bread',
 'Dead',
 'Creamy',
 'Au',
 'Gratin',
 'Potatoes',
 'Zuppa',
 'Toscana',
 'Teriyaki',
 'Sauce',
 'Korean',
 'Fried',
 'Chicken',
 'Gochujang',
 'Sauce',
 'Spaghetti',
 'Aglio',
 'Olio',
 'Garam',
 'Masala',
 'Easy',
 'Chorizo',
 'Street',
 'Tacos',
 'Leches',
 'Milk',
 'Cake',
 'Cabbage',
 'Rolls',
 'Gravy',
 'Shrimp',
 'Scampi',
 'Pasta',
 'Lemon',
 'Chicken',
 'Potato',
 'Bake',
 'Mexican',
 'Casserole',
 'Caldo',
 'Res',
 'Mexican',
 'Beef',
 'Soup',
 'Nogada',
 'Mexican',
 'Stuffed',
 'Poblano',
 'Peppers',
 'Walnut',
 'Sauce',
 'Apple',
 'Cake',
 'Flan',
 'Pork',
 'Chops',
 'Sauerkraut',
 'Spicy',
 'Thai',
 'Basil',
 'Chicken',
 'Pad',
 'Krapow',
 'Gai',
 'Spaghetti',
 'Cacio',
 'Pepe',
 'Chef',
 'John',
 'Chicken',
 'Kiev',
 'Chicken',
 'Onions',
 'Fajita',
 'Perfect',
 'Sushi',
 'Rice',
 'Baked',
 'Chicken',
 'German',
 'Potato',
 'Salad',
 'Miso',
 'Soup',
 'Mexican',
 'Rice',
 'II',
 'Haluski',
 'Labneh',
 'Lebanese',
 'Y

In [226]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

In [227]:
def get_tag_number(tag_list):
  tag_numbers = []
  for tag in tag_list:
    for key, value in tag.items(): 
      new_dict = {key: len(value)}
    tag_numbers.append(new_dict)
  return tag_numbers

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 529},
 {')': 529},
 {',': 63},
 {'--': 0},
 {'.': 10},
 {':': 98},
 {'CC': 555},
 {'CD': 74},
 {'DT': 104},
 {'EX': 0},
 {'FW': 47},
 {'IN': 482},
 {'JJ': 1822},
 {'JJR': 4},
 {'JJS': 27},
 {'LS': 0},
 {'MD': 2},
 {'NN': 571},
 {'NNP': 13139},
 {'NNPS': 46},
 {'NNS': 307},
 {'PDT': 0},
 {'POS': 348},
 {'PRP': 72},
 {'PRP$': 20},
 {'RB': 33},
 {'RBR': 0},
 {'RBS': 1},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 20},
 {'UH': 0},
 {'VB': 24},
 {'VBD': 39},
 {'VBG': 50},
 {'VBN': 133},
 {'VBP': 10},
 {'VBZ': 22},
 {'WDT': 4},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 7},
 {'``': 6}]

Some names have numbers (CD). Some are obviously not numbers, like 'Figgy'

In [228]:
def get_values_from_dict_list(dict_list, key):
  values = []
  for d in dict_list:
    if key in d:
      values.append(d[key])
  return values

cd_tokens = get_values_from_dict_list(all_name_tags, 'CD')[0]
cd_tokens

['5',
 '16',
 '2',
 '13',
 '300',
 'Figgy',
 '3',
 '9',
 'Two',
 '9',
 '22',
 '10',
 '15',
 'One',
 '18',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 '21',
 'Four',
 '9',
 '65',
 '17',
 '14',
 '10',
 "'n",
 '15',
 '8',
 'Minestrone',
 'Four',
 '35',
 'Fly',
 '15',
 '23',
 '8',
 '15',
 '21',
 "That's-a",
 'Tex-Mex',
 '14',
 '17',
 'Five',
 '10',
 '18',
 '5',
 "'Otai",
 '17',
 '3',
 '17',
 '75',
 '17',
 '20',
 'Take-Out',
 '16',
 '12',
 'Three',
 "'Three",
 '15',
 '20',
 '16',
 '12',
 '15',
 '22',
 '12',
 'Three',
 '21',
 '21',
 '25',
 '7',
 '10',
 '19',
 '20']

Create a function that searches for recipe name with specific string

In [229]:
def find_value_with_char(dic_list, key, char):
  matches = []
  for recipe in dic_list:
    try:
      if char in recipe[key]:
        matches.append(recipe[key])
    except Exception as e:
      pass
  return matches

find_value_with_char(recipes, 'name', 'Figgy')

['Figgy Pudding']

'Three cup chicken' is indeed a name. On the other hand, numerics, such as 9 and 13 are not part of the actual names of dishes. So, numerics, instead of NLTK's CD, should be treated. This treatment should be done using regex.

In [230]:
for cd in cd_tokens:
  print(find_value_with_char(recipes, 'name', cd))

['Our 5 Best Avgolemono Soup Recipes', '5-Ingredient Mexican Casserole', '15 Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Chicken 65', 'Pan-Roasted 5-Spice Pork Loin', 'The 15 Most Iconic French Desserts', '35 Quick and Easy Chinese Dinners You Can Make at Home', '15 Essential North Indian Recipes', '15 Essential North Indian Recipes', '18 Easy Mexican Dishes With 5 Ingredients or Less', 'French 75 Cocktail', '15 Top-Rated Traditional German Christmas Cookies', '15 Traditional Italian Christmas Dinner Recipes', "25 Italian Cookies You'll Love"]
['16 German Recipes That Are Comfort Food Favorites', '16 Mexican-Inspired Casseroles for Family-Pleasing Dinners', '16 Essential Puerto Rican Recipes']
['2 Minute Cheese Quesadillas', "22 Recipes Using a Whole Baguette (That Aren't Sandwiches)", 'Our 21 Best Authentic Mexican Recipes', '23 Delicious Ways the World Cooks Pork Shoulder', '21 Easy Dinners That Start with Packaged Gnocchi', 'Our 20 B

Create a function that searches a regex pattern from a text

In [231]:
def searchWordsPatt(text, patt):
    array = re.findall(patt, text)
    return array

NUMPATTERN = r'[0-9]+'
searchWordsPatt("I want 1 cup of tea", NUMPATTERN)

['1']

Create a function that substitutes regex patterns with a given value

In [232]:
def searchReplacePatt(text, patt, new_val):
  return re.sub(patt, new_val, text)

NUMSPACEPATTERN = r'(\d+\s)'
searchReplacePatt("I want 1 cup of tea", NUMSPACEPATTERN, "")

'I want cup of tea'

searchReplacePatt, except it iterates recipe list

In [233]:
def searchReplacePattList(dict_list, patt, new_val, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
        except Exception as e:
            pass

searchReplacePattList, but adds a substring at given index

In [234]:
def searchReplaceAddPattList(dict_list, patt, new_val, substring, index=0, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            dict_list[i][key] = searchReplacePatt(dict_list[i][key], patt, new_val)
            added_string = list(dict_list[i][key]).insert(index, substring)
            dict_list[i][key]=''.join(added_string)
        except Exception as e:
            pass

Remove numerics from name

In [235]:
import re

p_recipes = recipes

searchReplacePattList(p_recipes, NUMSPACEPATTERN, "")

def retag(text_list, key):
  new_list = []
  for i, recipe in enumerate(text_list):
    try:
      new_list.append(tag_pos(recipes[i][key]))
    except Exception as e:
      pass
  return new_list

tagged_recipe_names = retag(p_recipes, "name")

Get the new remaining CD

In [236]:
new_cd_tokens = list_words_with_tag(tagged_recipe_names, "CD")
new_cd_tokens

['Figgy',
 'Two',
 'One',
 'Ten',
 'Flounder',
 'Three',
 'Ziti',
 'One',
 'Four',
 '65',
 "'n",
 'Minestrone',
 'Four',
 'Fly',
 "That's-a",
 'Tex-Mex',
 'Five',
 "'Otai",
 'Take-Out',
 'Three',
 "'Three",
 'Three']

The remaining numbers (CD) are part of actual recipe names

In [237]:
for cd in new_cd_tokens:
  print(find_value_with_char(p_recipes, 'name', cd))

['Figgy Pudding']
['Two-Ingredient Naan', 'Pollo alla Birra for Two']
['A Number One Egg Bread', 'One-Egg Egg Drop Soup', 'One Pot Thai-Style Rice Noodles', 'One-Pot Vegan Potato-Lentil Curry', 'One-Bite Thai "Flavor Bomb" Salad Wraps (Miang Kham)', 'Easy One-Skillet Ground Beef Burrito', 'One-Pot Greek Lemon Chicken and Rice']
['Tender Italian Baked Chicken', 'Tuscan Pork Tenderloin', 'Asian Pork Tenderloin', 'Italian Pork Tenderloin', 'Sweet and Sour Pork Tenderloin', 'Chipotle Crusted Pork Tenderloin', 'Ten Minute Szechuan Chicken', 'Thai Quivering Tenderloins', 'Spicy Pork Tenderloin', 'Chinese Pork Tenderloin', 'Grecian Pork Tenderloin', 'Havana Slow Cooker Pork Tenderloin', 'Curry Pork Tenderloin', 'Tender Juicy Skirt Steak  (Churrasco)', 'Spicy and Tender Corned Beef', 'Pan Roasted Pork Tenderloin with a Blue Cheese and Olive Stuffing']
['Flounder Mediterranean']
['Pastel de Tres Leches (Three Milk Cake)', 'Three-Meat Italian Meatballs', 'Three Cheese Manicotti II', 'Taiwanese-S

In [238]:
new_all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  new_all_name_tags.append(new_dic)

Can and 'll are the modal verbs found

In [239]:
md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
md_tokens

['Can', "'ll"]

'can' is caused by words such as Canadian, which is processed in next section. But, 'you'll love' is not part of recipe name and more of an expression

In [240]:
for md in md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

Removing "You'll" and retagging new list

In [241]:
searchReplacePattList(p_recipes, r"(You'll Love)", "")
tagged_recipe_names = retag(p_recipes, "name")

'll' removed

In [242]:
new_md_tokens = list_words_with_tag(tagged_recipe_names, "MD")
new_md_tokens

['Can']

In [243]:
for md in new_md_tokens:
  print(find_value_with_char(p_recipes, 'name', md))

['Canadian Yellow Split Pea Soup with Ham', 'French Canadian Tourtiere', 'Pure Maple Candy', 'Cannoli', 'The Original Donair From the East Coast of Canada', 'Sauerkraut for Canning', 'Tourtiere (French Canadian Meat Pie)', 'Pumpkin Cannoli', 'Puerto Rican Canned Corned Beef Stew', 'Canadian Pork Loin Chops', 'Caneles de Bordeaux', 'Canadian Walleye (Pickerel)', "Thera's Canadian Fried Dough", 'Italian Baked Cannelloni', 'Canary Island Red Mojo Sauce', 'Mexican Tamarind Candy', 'Cantonese Chicken Chow Mein', 'Roti Canai/Paratha (Indian Pancake)', 'Polvorones de Canele (Cinnamon Cookies)', 'Miraculous Canadian Sugar Pie', 'Canadian Tea Biscuits', 'Peanut Butter Potato Candy', 'Irish Potato Candy', 'Filipino Pancit Bihon with Canton', 'Gorton (French-Canadian Pork Spread)', 'Quick and Easy Chinese Dinners You Can Make at Home', 'Chocolate Cantucci', 'Cantonese Style Lobster', 'Real Canadian Poutine', 'French Canadian Meatball Stew', 'Canadian Butter Tarts', 'Canadian Apple Pie', 'Cantones

In [244]:
searchReplacePattList(p_recipes, r"\/", " or ")
tagged_recipe_names = retag(p_recipes, "name")

In [245]:
bracket_tokens = list(set(list_words_with_tag(tagged_recipe_names, "(")))
bracket_tokens

['(']

Examining brackers in names. Most of the words in brackets are translations

In [246]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

"(no red sauce here...golden)" needs to be removed

In [247]:
# Redundant descriptions
searchReplacePattList(p_recipes,  r"(no red sauce here...golden)", "")
searchReplacePattList(p_recipes, r"(From a Swede!)", "")
searchReplacePattList(p_recipes, r"(from a Chinese person)", "")
searchReplacePattList(p_recipes, r"(Now Vegetarian!)", "")
searchReplacePattList(p_recipes, r"a.k.a. ", "")
searchReplacePattList(p_recipes, r"(That Aren't Sandwiches)", "")

# Remove copyright symbol
searchReplacePattList(p_recipes, r"&reg;", "")
# Asian Sesame Seared or Grilled Tuna (Gluten Free) => Gluten Free Asian Sesame Seared or Grilled Tuna
searchReplaceAddPattList(p_recipes, r"(Gluten Free)", "", "glutten-free")
tagged_recipe_names = retag(p_recipes, "name")

In [248]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Tres Leches (Milk Cake)', 'Caldo de Res (Mexican Beef Soup)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Bibimbap (Korean Rice With Mixed Vegetables)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', '

In [249]:
fw_tokens = list(set(list_words_with_tag(tagged_recipe_names, "FW")))
fw_tokens

['de', 'Rassolnik', 'et']

In [250]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Pan de Muertos (Mexican Bread of the Dead)', 'Caldo de Res (Mexican Beef Soup)', 'Tender Italian Baked Chicken', 'Herbs de Provence', "Chef John's Beef Rouladen", 'Fideo', 'Tomatillo Salsa Verde', 'Ground Beef with Homemade Taco Seasoning Mix', 'German Beef Rouladen', 'Buche de Noel', 'Tuscan Pork Tenderloin', 'Sauteed Sweet Plantains (Tajaditas Dulces de Platano)', 'Homemade Mozzarella Cheese', 'Kotlet Schabowy (Polish Breaded Pork Chop)', 'Semmelknoedel (Bread Dumplings)', 'Homemade Manti (Traditional Turkish Dumplings)', 'Kalamata Olive Tapenade', 'Barbacoa-Style Shredded Beef', "Ingrid's Rouladen", 'Original Homemade Italian Beef', 'Slow Cooker Chile Verde', 'Chicken and Sliders', 'Caldo Verde (Portuguese Sausage Kale Soup)', 'German Hamburgers (Frikadellen)', 'Slow Cooker Mexican Recipes Under Calories', 'Asian Pork Tenderloin', 'Harissa Powder', 'Colorado Green Chili (Chile Verde)', 'Schupfnudeln (German Fried Potato Dumplings)', 'French Butter Cakes (Madeleines)', 'Italian Chi

In [251]:
fw_names

['Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Easy Sheet Pan Greek Chicken',
 'Moroccan Breakfast Skillet',
 'Scandinavian Sweetheart Waffles',
 "Papa Oriold's Spaetzle",
 'Filipino Spaghetti',
 'Stir-Fry Chicken and Vegetables',
 'Loukoumades',
 'Agua de Jamaica (Hibiscus Water)',
 "Suki's Spinach and Feta Pasta",
 'Keto Beef Egg Roll Slaw',
 'Pan de Muertos (Mexican Bread of the Dead)',
 'Stir-Fried Chicken with Tofu and Mixed Vegetables',
 'Cabbage Pico de Gallo',
 'Caldo Verde (Portuguese Green Soup)',
 'Italian-Style Bruschetta',
 'Thai Barbeque Marinade',
 'Sweet and Sour Sauce II',
 'Vegetable Curry',
 'Chicken and Chinese Vegetable Stir-Fry',
 'Sweet and Tangy Thai Cucumber Salad',
 'Spicy Italian Pork Cutlets',
 'Vietnamese Salad Rolls',
 'Homemade Chicken Fajitas',
 'Copycat Olive Garden Recipes to Make at Home',
 'French Baguettes',
 "Chef John's Baby Porchetta",
 'Indian Sweet Bread',
 'Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Colorful Veget

Names that both have foreign words and bracket

In [252]:
bracket_and_fw = [name for name in bracketed_names if name in fw_names]
bracket_and_fw

['Mexican Chicken Meatball Soup (Sopa de Albondigas de Pollo)',
 'Agua de Jamaica (Hibiscus Water)',
 'Pan de Muertos (Mexican Bread of the Dead)',
 'Caldo Verde (Portuguese Green Soup)',
 'Tamales Oaxaque&ntilde;os (Oaxacan-Style Tamales)',
 'Pastelon de Platano Maduro (Dominican-Style Yellow Plantain Pie)',
 'Polvorones de Canele (Cinnamon Cookies)',
 'Rigatoni al Segreto (Rigatoni with Secret Sauce)',
 'Semmelknoedel (Bread Dumplings)',
 'Mini Molletes de Frijoles (Mexican Bruschetta with Beans)',
 'Cazuela de Vaca (Beef and Pumpkin Stew)',
 'Authentic Chicken Empanadas (Empanadas de Pollo)',
 'Bibimbap (Korean Rice With Mixed Vegetables)',
 'Red Chicken Tamales (Tamales Rojos de Pollo)',
 'Chicken Enchiladas with Green Chile Sauce (Salsa Verde)',
 "Paksiw na Pata (Pig's Feet Stew)",
 'Anko (Sweet Red Bean Paste)',
 'French Cookies (Belgi Galettes)',
 "Chef John's Spanish Garlic Soup (Sopa de Ajo)",
 'Schupfnudeln (German Fried Potato Dumplings)',
 'Hearty Caldo de Res (Mexican Beef

Split the names into two names, one outside and one inside

In [253]:
BRACKET_REGEX = " \(.*\)"
def break_fw_bracket(name):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name2 = re.sub(BRACKET_REGEX, "", name)
    return name1, name2

print(break_fw_bracket("Hearty Caldo de Res (Mexican Beef Soup)"))
print(break_fw_bracket("Ukha (Russian Fish Soup)"))

('Mexican Beef Soup', 'Hearty Caldo de Res')
('Russian Fish Soup', 'Ukha')


Apply the split function. Delete old recipe with bracket and foreign words. In both of the new recipes, duplicate old ingredients.

In [254]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in bracket_and_fw:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

There are still remaining names with bracket, mostly due to the foreign words not being recognized.

In [255]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names = bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Tres Leches (Milk Cake)', 'Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)', 'Spicy Thai Basil Chicken (Pad Krapow Gai)', 'Labneh (Lebanese Yogurt)', 'Indian Chicken Curry (Murgh Kari)', 'Keema Aloo (Ground Beef and Potatoes)', 'Turkish Eggs (Cilbir)', 'South African Melktert (Milk Tart)', 'Ukrainian Apple Cake (Yabluchnyk)', 'Spanish Garlic Shrimp (Gambas al Ajillo)', 'Polish Noodles (Cottage Cheese and Noodles)', 'German Potato Dumplings (Kartoffelkloesse)', 'Apfelkuchen (Apple Cake)', 'Oyakodon (Japanese Chicken and Egg Rice Bowl)', 'Eggplant Caponata (Sicilian Version)', 'Chana Masala (Savory Indian Chick Peas)', 'Ricotta Pie (Old Italian Recipe)', 'Easy Blini (Russian Pancake)', 'Easy Bulgogi (Korean BBQ Beef)', 'Carne en su Jugo (Meat in its Juices)', 'Ghormeh Sabzi (Persian Herb Stew)', 'Puerto Rican Tostones (Fried Plantains)', 'Kalbi (Korean BBQ Short Ribs)', 'Macaron (French Macaroon)', 'Atsara (Papaya Relish)', 'Authentic Chinese Egg Rolls ()', 'Greek Le

In [256]:
bracketed_names

['Grillhaxe (Grilled Eisbein, Pork Shanks)',
 'Arroz con Leche (Mexican Rice Pudding)',
 'Ropa Vieja (Cuban Meat Stew)',
 'Ukha (Russian Fish Soup)',
 'Mizeria (Polish Cucumber Salad)',
 'Arroz Tapado (Rice-On-Top)',
 'Albondigas (Meatballs) en Chipotle',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Empanadas (Beef Turnovers)',
 'Quick Chinese-Style Vermicelli (Rice Noodles)',
 'Korean Mandu (Egg Roll)',
 'Chana Masala (Savory Indian Chick Peas)',
 'Rice & Beans (Haitian Style)',
 'Kalimotxo (Calimocho)',
 'Pancit Molo (Filipino Wonton Soup)',
 'Ginataang Manok (Chicken Cooked in Coconut Milk)',
 'Golabki (Stuffed Cabbage Rolls)',
 'Sago Pudding (Gula Melaka)',
 'Filipino Baked Milkfish (Baked Bangus)',
 'Thai Dipping Sauce for Spring Rolls (Nam Jim Po Piah)',
 'Instant Pot Galbi (Korean-Style Short Ribs)',
 'Quick and Simple Korean Doenjang Chigae (Bean Paste or Tofu Soup)',
 'Yummy Korean Glass Noodles (Jap Chae)',
 'Lahanosalata (Greek Cabbage Salad)',
 'Jamaican Saltfish Fritte

Most of the brackets are at the end of each name. For those that are in the middle, they are translations of one of the words in the name.

In [257]:
b_name_end = []
b_name_mid = []
for b_name in bracketed_names:
    if b_name.endswith(')'):
        b_name_end.append(b_name)
    else:
        b_name_mid.append(b_name)
        
b_name_end

['Grillhaxe (Grilled Eisbein, Pork Shanks)',
 'Arroz con Leche (Mexican Rice Pudding)',
 'Ropa Vieja (Cuban Meat Stew)',
 'Ukha (Russian Fish Soup)',
 'Mizeria (Polish Cucumber Salad)',
 'Arroz Tapado (Rice-On-Top)',
 'Empanadas (Beef Turnovers)',
 'Quick Chinese-Style Vermicelli (Rice Noodles)',
 'Korean Mandu (Egg Roll)',
 'Chana Masala (Savory Indian Chick Peas)',
 'Rice & Beans (Haitian Style)',
 'Kalimotxo (Calimocho)',
 'Pancit Molo (Filipino Wonton Soup)',
 'Ginataang Manok (Chicken Cooked in Coconut Milk)',
 'Golabki (Stuffed Cabbage Rolls)',
 'Sago Pudding (Gula Melaka)',
 'Filipino Baked Milkfish (Baked Bangus)',
 'Thai Dipping Sauce for Spring Rolls (Nam Jim Po Piah)',
 'Instant Pot Galbi (Korean-Style Short Ribs)',
 'Quick and Simple Korean Doenjang Chigae (Bean Paste or Tofu Soup)',
 'Yummy Korean Glass Noodles (Jap Chae)',
 'Lahanosalata (Greek Cabbage Salad)',
 'Jamaican Saltfish Fritters (Stamp and Go)',
 'Tzatziki Sauce (Yogurt and Cucumber Dip)',
 'Fleischkuechle (Fle

In [258]:
b_name_mid

['Albondigas (Meatballs) en Chipotle',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Seaweed (Nori) Soup',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Bee Sting Cake (Bienenstich) II',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Spicy Indian (Gujarati) Green Beans',
 'Korean Bean Curd (Miso) Soup',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish',
 'Besan (Gram Flour) Halwa',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Lamb (Gosht) Biryani',
 'Jeera (Cumin) Rice',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Ulu (Breadfruit) Pancakes',
 'Coconut (Haupia) and Chocolate Pie',
 "World's Best () Lasagna",
 'Lengua (Beef Tongue) Stew']

On the other hand, without parenthesis anymore, names with foregin words tagged are now clean

In [259]:
fw_names = []
for fw in fw_tokens:
    names = find_value_with_char(p_recipes, 'name', fw)
    print(names)
    fw_names = fw_names + names
fw_names = list(set(fw_names))

['Tender Italian Baked Chicken', 'Herbs de Provence', "Chef John's Beef Rouladen", 'Fideo', 'Tomatillo Salsa Verde', 'Ground Beef with Homemade Taco Seasoning Mix', 'German Beef Rouladen', 'Buche de Noel', 'Tuscan Pork Tenderloin', 'Homemade Mozzarella Cheese', 'Kalamata Olive Tapenade', 'Barbacoa-Style Shredded Beef', "Ingrid's Rouladen", 'Original Homemade Italian Beef', 'Slow Cooker Chile Verde', 'Chicken and Sliders', 'Slow Cooker Mexican Recipes Under Calories', 'Asian Pork Tenderloin', 'Harissa Powder', 'Italian Chicken Marinade', 'Cinder Toffee', 'Enchiladas Verdes', 'Authentic Enchiladas Verdes', 'Korean BBQ Chicken Marinade', 'Homemade Lasagna Sheets', 'Elk Steak Marinade', 'Modenese Pork Chops', 'Italian Pork Tenderloin', 'German Rouladen', 'Brazilian Lemonade', 'Shredded Beef Enchiladas', 'Brigadeiro', 'Homemade Hoisin Sauce', 'Caneles de Bordeaux', 'Homemade Portuguese Chicken', 'Homemade Spaghetti Sauce', 'Pasta de Sardine', 'Sweet and Sour Pork Tenderloin', 'Instant Pot C

In [260]:
fw_names

['Moroccan Breakfast Skillet',
 'Easy Sheet Pan Greek Chicken',
 'Scandinavian Sweetheart Waffles',
 "Papa Oriold's Spaetzle",
 'Filipino Spaghetti',
 'Stir-Fry Chicken and Vegetables',
 'Loukoumades',
 "Suki's Spinach and Feta Pasta",
 'Keto Beef Egg Roll Slaw',
 'Cazuela de Vaca',
 'Stir-Fried Chicken with Tofu and Mixed Vegetables',
 'Cabbage Pico de Gallo',
 'Italian-Style Bruschetta',
 'Thai Barbeque Marinade',
 'Sweet and Sour Sauce II',
 'Vegetable Curry',
 'Chicken and Chinese Vegetable Stir-Fry',
 'Sweet and Tangy Thai Cucumber Salad',
 'Spicy Italian Pork Cutlets',
 'Vietnamese Salad Rolls',
 'Polvorones de Canele',
 'Homemade Chicken Fajitas',
 'Copycat Olive Garden Recipes to Make at Home',
 'French Baguettes',
 'Banana-Dulce de Leche Pie',
 "Chef John's Baby Porchetta",
 'Indian Sweet Bread',
 'Double Ka Meeta',
 'Colorful Vegetable Fajitas',
 'Slow Cooker Chile Verde',
 'Stir Fried Wok Vegetables',
 'Frittata Alle Erbette',
 "Mom's Best Spaghetti Sauce",
 'Sweet Sausage',

For the remaining names with bracket at the end, split into two new recipe names

In [261]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Tres Leches (Milk Cake)
Chiles en Nogada (Mexican Stuffed Poblano Peppers in Walnut Sauce)
Spicy Thai Basil Chicken (Pad Krapow Gai)
Labneh (Lebanese Yogurt)
Indian Chicken Curry (Murgh Kari)
Keema Aloo (Ground Beef and Potatoes)
Turkish Eggs (Cilbir)
South African Melktert (Milk Tart)
Ukrainian Apple Cake (Yabluchnyk)
Spanish Garlic Shrimp (Gambas al Ajillo)
German Potato Dumplings (Kartoffelkloesse)
Apfelkuchen (Apple Cake)
Eggplant Caponata (Sicilian Version)
Chana Masala (Savory Indian Chick Peas)
Ricotta Pie (Old Italian Recipe)
Easy Blini (Russian Pancake)
Easy Bulgogi (Korean BBQ Beef)
Carne en su Jugo (Meat in its Juices)
Ghormeh Sabzi (Persian Herb Stew)
Puerto Rican Tostones (Fried Plantains)
Kalbi (Korean BBQ Short Ribs)
Macaron (French Macaroon)
Atsara (Papaya Relish)
Authentic Chinese Egg Rolls ()
Greek Lentil Soup (Fakes)
Lumpia (Shanghai version)
Northern Ontario Partridge (Ruffed Grouse)
Vampiros Mexicanos (Mexican Vampires)
Jamaican Saltfish Fritters (Stamp and Go)
Slo

For some reasons, need to run the cell twice

In [262]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_end:
            newname1, newname2 = break_fw_bracket(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Polish Noodles (Cottage Cheese and Noodles)
Oyakodon (Japanese Chicken and Egg Rice Bowl)
Papas Rellenas (Fried Stuffed Potatoes)
Blaukraut (German Red Cabbage)
Irish Boiled Dinner (Corned Beef)
True Dominican Sancocho (Latin 7-Meat Stew)
Blini (Russian Pancakes)
Oeufs Cocotte (Baked Eggs)
Ropa Vieja (Cuban Beef)
Lace Cookies (Florentine Cookies)
Sinigang na Bangus (Filipino Milkfish in Tamarind Broth)
Schwabischer Kartoffelsalat (German Potato Salad - Schwabisch Style)
Roti Canai or Paratha (Indian Pancake)
Melanzana alla Parmigiana (Perfect Eggplant Parmigiana)
Pierogi (Traditional Polish Dumplings)
Nipples of Venus (Capezzoli di Venere)
Samosadilla (Samosa Quesadilla)
Bulgogi (Korean Barbecued Beef)
Sabaayad (Somali Flatbread)
Filipino Baked Milkfish (Baked Bangus)
Ash-e Reshteh (Persian Legume Soup)
Lentil and Cactus Soup (Mom's Recipe)
Ethiopian Cabbage and Potato Dish (Atkilt)
Finnish Kropser (Baked Pancakes)
Oma's Griessnockerlsuppe (Beef and Semolina Dumpling Soup)
Kewa Datshi 

Only the names with bracket in the middle of their names remain

In [263]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))

['Classic Cuban Midnight (Medianoche) Sandwich', 'Spicy Indian (Gujarati) Green Beans', "World's Best () Lasagna", 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce', 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce', 'Bee Sting Cake (Bienenstich) II', 'Coconut (Haupia) and Chocolate Pie', 'Lamb (Gosht) Biryani', 'Jeera (Cumin) Rice', 'Pollo (Chicken) Fricassee from Puerto Rico', 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish', 'Lazy Golumpki (Stuffed Cabbage) Soup', 'Ulu (Breadfruit) Pancakes', 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican', 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce', 'Seaweed (Nori) Soup', 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms', 'Hawaiian Bruddah Potato Mac (Macaroni) Salad', 'Korean Bean Curd (Miso) Soup', 'Lengua (Beef Tongue) Stew', 'Albondigas (Meatballs) en Chipotle', 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding', 'Besan (Gram Flour) Halwa']


Mac and rapini is only synonymous the the one word before them. Otherwise, the bracketed words are synonymous to all the words before them combined.

In [264]:
bracketed_names

["World's Best () Lasagna",
 'Bee Sting Cake (Bienenstich) II',
 'Vareniki (Russian Pierogi) with Potatoes and Mushrooms',
 'Albondigas (Meatballs) en Chipotle',
 'Pollo (Chicken) Fricassee from Puerto Rico',
 'Besan (Gram Flour) Halwa',
 'Korean Bean Curd (Miso) Soup',
 'Lazy Golumpki (Stuffed Cabbage) Soup',
 'Lamb (Gosht) Biryani',
 'Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce',
 'Coconut (Haupia) and Chocolate Pie',
 'Jeera (Cumin) Rice',
 'Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce',
 'Fried Chicken Chunks (Chicharrones De Pollo) Dominican',
 'Spicy Indian (Gujarati) Green Beans',
 'Lengua (Beef Tongue) Stew',
 'Seaweed (Nori) Soup',
 'Kimchi Jun (Kimchi Pancake) and Dipping Sauce',
 'Ulu (Breadfruit) Pancakes',
 'Hawaiian Bruddah Potato Mac (Macaroni) Salad',
 'Classic Cuban Midnight (Medianoche) Sandwich',
 'Zito (Zhito or Koljivo) - Serbian Wheat Pudding',
 'Fish Sinigang (Tilapia) - Filipino Sour Broth Dish']

The names can still be duplicated into 2, except that the bracketed word replaces the words before in the second new name, treating them as synonyms.

In [265]:
def convert_bracket_synonym(name, num=0):
    name1 = re.findall(BRACKET_REGEX, name)[0]
    name1 = name1[name1.find("(")+1:name1.find(")")]
    name1_suffix = name.split(')')[1]
    if num==0:
        name1 = name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, "", name)
    else:
        name1_prefix = name.split('(')[0]
        name1_prefix = name1_prefix[:-num]
        name1 = name1_prefix + name1 + name1_suffix
        name2 = re.sub(BRACKET_REGEX, " ", name)
    return name1, name2

print(convert_bracket_synonym("Lamb (Gosht) Biryani"))
print(convert_bracket_synonym("Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce", 1))
print(convert_bracket_synonym("Hawaiian Bruddah Potato Mac (Macaroni) Salad", 1))

('Gosht Biryani', 'Lamb Biryani')
('Fusilli with RapiniBroccoli Rabe, Garlic, and Tomato Wine Sauce', 'Fusilli with Rapini , Garlic, and Tomato Wine Sauce')
('Hawaiian Bruddah Potato MacMacaroni Salad', 'Hawaiian Bruddah Potato Mac  Salad')


In [266]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in b_name_mid:
            newname1, newname2 = convert_bracket_synonym(p_recipes[i]["name"])
            print(p_recipes[i]["name"])
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

Classic Cuban Midnight (Medianoche) Sandwich
Spicy Indian (Gujarati) Green Beans
World's Best () Lasagna
Karaage (Japanese Fried Chicken) with Honey Mayoster Sauce
Kimchi Jun (Kimchi Pancake) and Dipping Sauce
Bee Sting Cake (Bienenstich) II
Coconut (Haupia) and Chocolate Pie
Lamb (Gosht) Biryani
Jeera (Cumin) Rice
Pollo (Chicken) Fricassee from Puerto Rico
Fish Sinigang (Tilapia) - Filipino Sour Broth Dish
Lazy Golumpki (Stuffed Cabbage) Soup
Ulu (Breadfruit) Pancakes
Fried Chicken Chunks (Chicharrones De Pollo) Dominican
Fusilli with Rapini (Broccoli Rabe), Garlic, and Tomato Wine Sauce
Seaweed (Nori) Soup
Vareniki (Russian Pierogi) with Potatoes and Mushrooms
Hawaiian Bruddah Potato Mac (Macaroni) Salad
Korean Bean Curd (Miso) Soup
Lengua (Beef Tongue) Stew
Albondigas (Meatballs) en Chipotle
Zito (Zhito or Koljivo) - Serbian Wheat Pudding
Besan (Gram Flour) Halwa


Successfully removed all brackets from recipe names

In [267]:
bracketed_names = []
for bracket in bracket_tokens:
    names = find_value_with_char(p_recipes, 'name', bracket)
    print(names)
    bracketed_names= bracketed_names + names

bracketed_names = list(set(bracketed_names))
bracketed_names

[]


[]

Dashes are mostly adjectives, but things like semi colon need to be removed. As for colons, its mostly translation. Semicolons are caused by K&auml;, which are dishes with special characters or German words.

In [268]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':', ';']

In [269]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

In [270]:
def remove_entry_with(dict_list, target, key="name"):
    for i, recipe in enumerate(dict_list):
        try:
            if target in dict_list[i]["name"]:
                dict_list.remove(dict_list[i])
        except Exception as e:
            pass

In [271]:
for semicolon in ["Quorn&trade;", "Sp&auml;tzle", "Tamales Oaxaque&ntilde;os", "K&auml;sesahnetorte", "Salte&ntilde;as"]:
    remove_entry_with(p_recipes, semicolon)
tagged_recipe_names = retag(p_recipes, "name")

Semi colons cleaned

In [272]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [273]:
for colon in colon_tokens:
  print(find_value_with_char(p_recipes, 'name', colon))

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

For these 2 names, colons are used for describing

In [274]:
# Spaghetti alla Carbonara: the Traditional Italian Recipe => traditional Italian Spaghetti alla Carbonara
searchReplaceAddPattList(p_recipes, r": the Traditional Italian Recipe", "", "traditional Italian ")
# Grandma's Focaccia: Baraise Style => Grandma's Baraise Style Focaccia
searchReplaceAddPattList(p_recipes, r": Baraise Style", "", "Baraise Style ", index=10)
tagged_recipe_names = retag(p_recipes, "name")

Cleaned 2 names with colon. If the dashes are between a word, they are either part of a word's spelling or joining two words together, typically as an adjective. However, if it is between spaces, they are translations.

In [275]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-', ':']

In [276]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Chicken French - Rochester, NY Style', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Her

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Chicken French - Rochester, NY Style',
 'Velveting Chicken Breast, Chinese Re

But in some cases, they are words after the dashes describe the dish, such as Rochester, NY Style and Restaurant Style

In [277]:
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Chicken French - Rochester, NY Style
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Italian Subs - Restaurant Style
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Eggplant Parmesan - Gluten-Free
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Hot Pepper Sauce - A Trinidadian Staple
The Sarge's Goetta - German Breakfast Treat
Italian Sausage - Tuscan Style
Honey Milk Tea - Hong Kong Style
Mexican Lasagna - No Lasagna Noodles!
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Curry Pasta - Pakistani Style
Cauliflower and Potato Stir-Fry - East Indian Recipe
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco -

Replace or remove the remaining dashes that are surrounded by spaces

In [278]:
# Chicken French - Rochester, NY Style => Rochester, NY Style Chicken French
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Carnitas - Pressure Cooker => pressure cooker carnitas
searchReplaceAddPattList(p_recipes, r" - Rochester, NY Style", "", "Rochester, NY Style ")
# Italian Subs - Restaurant Style => restaurant style Italian subs
searchReplaceAddPattList(p_recipes, r" - Restaurant Style", "", "restaurant style ")
# Eggplant Parmesan - Gluten-Free => glutten-free Eggplant Parmesan
searchReplaceAddPattList(p_recipes, r" - Gluten-Free", "", "glutten-free ")
# Italian Sausage - Tuscan Style => Tuscan style Italian Sausage
searchReplaceAddPattList(p_recipes, r" - Tuscan Style", "", "Tuscan style ")
# Honey Milk Tea - Hong Kong Style => Hong Kong style Honey Milk Tea
searchReplaceAddPattList(p_recipes, r" - Hong Kong Style", "", "Hong Kong style ")
# Curry Pasta - Pakistani Style => Pakistani style Curry Pasta
searchReplaceAddPattList(p_recipes, r" - Pakistani Style", "", "Pakistani style ")
# Cauliflower and Potato Stir-Fry - East Indian Recipe => East Indian style Cauliflower and Potato Stir-Fry
searchReplaceAddPattList(p_recipes, r" - East Indian Recipe", "", "East Indian style ")
# German Potato Salad - Schwabisch Style => Schwabisch style German Potato Salad
searchReplaceAddPattList(p_recipes, r" - Schwabisch Style", "", "Schwabisch style ")
# Tilapia - Filipino Sour Broth Dish => Filipino Sour Broth tilapia
searchReplaceAddPattList(p_recipes, r"Tilapia - ", "", "tilapia", index=20)
# Fish Sinigang - Filipino Sour Broth Dish - Schwabisch Style => Filipino Sour Broth Sinigang fish
searchReplaceAddPattList(p_recipes, r"Fish Sinigang - ", "", "Sinigang fish", index=20)

# remove  - A Trinidadian Staple from Hot Pepper Sauce - A Trinidadian Staple
searchReplacePattList(p_recipes, r" - A Trinidadian Staple", "")
# remove  - German Breakfast Treat from The Sarge's Goetta - German Breakfast Treat
searchReplacePattList(p_recipes, r" - German Breakfast Treat", "")
# remove  - No Lasagna Noodles! from Mexican Lasagna - No Lasagna Noodles!
searchReplacePattList(p_recipes, r" - No Lasagna Noodles!", "")
# remove  - Not Just for Chicken from Sweet and Sour Jam - Not Just for Chicken
searchReplacePattList(p_recipes, r" - Not Just for Chicken", "")
                      
tagged_recipe_names = retag(p_recipes, "name")

In [279]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Haluski - Cabbage and Noodles', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Carnitas - Pressure Cooker', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', 'Onigiri - Japanese Rice Balls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup'

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Haluski - Cabbage and Noodles',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Carnitas - Pressure Cooker',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 'Onigiri - Japanese Rice Balls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',

The remaining names with dashes surrounded by dashes are translations, which can be split into two names

In [280]:
colnames_to_split = []
for colname in new_colon_names:
    if len(re.findall("( - )|(: )", colname)) > 0:
        print(colname)
        colnames_to_split.append(colname)

Haluski - Cabbage and Noodles
Carnitas - Pressure Cooker
Onigiri - Japanese Rice Balls
Taqueria Style Tacos - Carne Asada
Al Kabsa - Traditional Saudi Rice and Chicken
Bazlama - Turkish Flat Bread
Norwegian Pancakes - Pannekaken
Pain de Campagne - Country French Bread
Flemish Frites - Belgian Fries with Andalouse Sauce
Portuguese Custard Tarts - Pasteis de Nata
Tonkatsu - Asian-Style Pork Chop
Indian Eggplant - Bhurtha
Lumpia - Filipino Shrimp and Pork Egg Rolls
Portuguese Muffins - Bolo Levedo
Keftedes - Greek Meatballs
Brasato al Barolo - Braised Chuck Roast in Red Wine
Potato Salad - German Kartoffel
Tembleque de Coco - Coconut Tembleque
Kroppkakor - Swedish Potato Dumplings
Ladolemono - Lemon Oil Sauce for Fish or Chicken
Mie Goreng - Indonesian Fried Noodles
Vaselopita - Greek New Years Cake
Knedliky - Czech Dumpling with Sauerkraut
Zhito or Koljivo - Serbian Wheat Pudding
Zito - Serbian Wheat Pudding
Doro Wat: Ethiopian Chicken Dish


In [281]:
for i, recipe in enumerate(p_recipes):
    try:
        if p_recipes[i]["name"] in colnames_to_split:
            splits = re.split("( - )|(: )", p_recipes[i]["name"])
            newname1 = splits[0]
            newname2 = splits[len(splits)-1]
            new_recipe1 = {'name': newname1, 'ingredients': p_recipes[i]["ingredients"]}
            new_recipe2 = {'name': newname2, 'ingredients': p_recipes[i]["ingredients"]}
            p_recipes.append(new_recipe1)
            p_recipes.append(new_recipe2)
            p_recipes.remove(p_recipes[i])
    except Exception as e:
        pass

tagged_recipe_names = retag(p_recipes, "name")

The remaining names with dash are those in words

In [282]:
colon_tokens = list(set(list_words_with_tag(tagged_recipe_names, ":")))
colon_tokens

['-']

In [283]:
new_colon_names = []
for colon in colon_tokens:
    print(find_value_with_char(p_recipes, 'name', colon))
    new_colon_names=new_colon_names+find_value_with_char(p_recipes, 'name', colon)
new_colon_names

['Pan-Fried Asparagus', 'Super-Delicious Zuppa Toscana', 'Indian-Style Chicken and Onions', 'Chicken Stir-Fry', 'Quick Beef Stir-Fry', 'How to Make Coquilles Saint-Jacques', 'Mexican-Style Chicken Taco Casserole', 'Make-Ahead Vegetarian Moroccan Stew', 'Japanese-Style Deep-Fried Shrimp', 'Chicken and Broccoli Stir-Fry', 'Broccoli and Chicken Stir-Fry', 'Ginger Veggie Stir-Fry', 'White Chicken Enchilada Slow-Cooker Casserole', 'Old-Fashioned Swedish Glogg', 'Stir-Fry Chicken and Vegetables', 'Barbacoa-Style Shredded Beef', 'Simple Slow-Cooked Korean Beef Soft Tacos', 'Air-Fried Korean Chicken Wings', 'Kouign-Amann', 'Gnocchi with Sage-Butter Sauce', 'Giant Bacon-Wrapped Meatballs', 'Low-Carb Cauliflower Rice Sushi Rolls', "Frank's Favorite Slow-Cooker Thai Chicken", 'Two-Ingredient Naan', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Garlic-Herb Linguine', 'Korean-style Seaweed Soup', 'Ube-Macapuno Cake', 'Cuban-Style Yuca', 'Japanese-Style Cabbage Salad', "Jorge's Indian-Spice

['Pan-Fried Asparagus',
 'Super-Delicious Zuppa Toscana',
 'Indian-Style Chicken and Onions',
 'Chicken Stir-Fry',
 'Quick Beef Stir-Fry',
 'How to Make Coquilles Saint-Jacques',
 'Mexican-Style Chicken Taco Casserole',
 'Make-Ahead Vegetarian Moroccan Stew',
 'Japanese-Style Deep-Fried Shrimp',
 'Chicken and Broccoli Stir-Fry',
 'Broccoli and Chicken Stir-Fry',
 'Ginger Veggie Stir-Fry',
 'White Chicken Enchilada Slow-Cooker Casserole',
 'Old-Fashioned Swedish Glogg',
 'Stir-Fry Chicken and Vegetables',
 'Barbacoa-Style Shredded Beef',
 'Simple Slow-Cooked Korean Beef Soft Tacos',
 'Air-Fried Korean Chicken Wings',
 'Kouign-Amann',
 'Gnocchi with Sage-Butter Sauce',
 'Giant Bacon-Wrapped Meatballs',
 'Low-Carb Cauliflower Rice Sushi Rolls',
 "Frank's Favorite Slow-Cooker Thai Chicken",
 'Two-Ingredient Naan',
 'Velveting Chicken Breast, Chinese Restaurant-Style',
 'Garlic-Herb Linguine',
 'Korean-style Seaweed Soup',
 'Ube-Macapuno Cake',
 'Cuban-Style Yuca',
 'Japanese-Style Cabbage 

!, ? and . are found, which are odd for recipe names

In [284]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['!', '!', '!', '!', '.', '?']

The punctuations are mostly slang abbreviations and exclamations

In [285]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

["Our Top P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]
['Real Canadian Butter Tarts, eh?']
['Sangria! Sangria!', 'Oatmeal Apple Crisp To Die For!', "Sushi House Salad Dressing, It's ORANGE!"]


Remove the exclamations

In [286]:
searchReplacePattList(p_recipes, r"! Sangria!", "")
searchReplacePattList(p_recipes, r" To Die For!", "")
searchReplacePattList(p_recipes, r", It's ORANGE!", "")
searchReplacePattList(p_recipes, r", eh\?", "")
searchReplacePattList(p_recipes, r"Our Top ", "")

tagged_recipe_names = retag(p_recipes, "name")

Fullstops that remain are part of recipe names

In [287]:
punc_tokens = list_words_with_tag(tagged_recipe_names, ".")
punc_tokens

['.']

In [288]:
for punc in list(set(punc_tokens)):
  print(find_value_with_char(p_recipes, 'name', punc))

["P.F. Chang's Copycat Recipes", "Perfect St. Patrick's Day Cake"]


Some 'that' can be found

In [289]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

['That', 'That', 'That', 'That']

The 'that's are used to add details, but not actual recipe name

In [290]:
for wdt in list(set(wdt_tokens)):
  print(find_value_with_char(p_recipes, 'name', wdt))

['German Recipes That Are Comfort Food Favorites', 'Mexican-Inspired Ground Beef Casseroles That Deliver Big Flavor With Every Satisfying Bite', 'Tuscan Recipes That Reveal the Best of Italian Cooking', 'Easy Dinners That Start with Packaged Gnocchi', "That's-a Meatloaf", 'Favorite Recipes That Show Off Armenian Cuisine', 'Our Best Stir-Fry Recipes That Are Even Better Than Take-Out', 'Comforting Polish Cabbage Recipes That Are Family Favorites']


Remove

In [291]:
searchReplacePattList(p_recipes, r" That Are Comfort Food Favorites", "")
searchReplacePattList(p_recipes, r" That Deliver Big Flavor With Every Satisfying Bite", "")
searchReplacePattList(p_recipes, r" That Reveal the Best of Italian Cooking", "")
searchReplacePattList(p_recipes, r"That's-a ", "")
searchReplacePattList(p_recipes, r"Favorite Recipes That Show Off ", "")
searchReplacePattList(p_recipes, r" That Are Even Better Than Take-Out", "")
searchReplacePattList(p_recipes, r" That Are Family Favorites", "")

searchReplaceAddPattList(p_recipes, r" That Start with Packaged Gnocchi", "", "packaged gnocchi ", index=5)
tagged_recipe_names = retag(p_recipes, "name")

That removed

In [292]:
wdt_tokens = list_words_with_tag(tagged_recipe_names, "WDT")
wdt_tokens

[]

There's some 'how's

In [293]:
wrb_tokens = list_words_with_tag(tagged_recipe_names, "WRB")
wrb_tokens

['How', 'How', 'How', 'How', 'How', 'How', 'How']

In [294]:
for wrb in list(set(wrb_tokens)):
  print(find_value_with_char(p_recipes, 'name', wrb))

['How to Make Coquilles Saint-Jacques', 'How to Make Bolognese Sauce', 'How to Make Beef Satay', 'How to Make Peanut Dipping Sauce', 'How to Make Tres Leches Cake', 'How to Make Cassoulet', 'How to Make Turkey Manicotti']


Remove the 'how's and keep only the name

In [295]:
searchReplacePattList(p_recipes, r"How to Make ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [296]:
list_words_with_tag(tagged_recipe_names, "WRB")

[]

There's some personal pronouns (possessive)

In [297]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'Our',
 'My',
 'My',
 'My',
 'Our',
 'My',
 'My',
 'Your',
 'Our',
 'Our',
 'Our',
 'My',
 'its']

In [298]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Sweet Recipes to Complete Your Indian Dinner', 'Melt-in-Your-Mouth Beef Cacciatore', 'Polish Recipes to Make Your Grandmother Proud']
['My Own Famous Stuffed Grape Leaves', 'My Best Chicken Piccata', 'My Favorite Sesame Noodles', 'My Chicken Parmesan', "My Mom's Greek Lemon Rice", 'My Fly Stir-Fry', 'My Chicken Pho Recipe', 'My Tangy German Potato Salad', 'My Big Fat Greek Baked Beans', "My Grandmother's French Dressing"]
['Our Best Avgolemono Soup Recipes', 'Our Best Authentic Mexican Recipes', 'Our Best Empanada Recipes', 'Our Best Indian Recipes for Beginner Cooks', 'Our Best Stir-Fry Recipes', 'Our Favorite German Potato Recipes', 'Say Aloha to Our Best Hawaiian Recipes']
['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


Most can be removed

In [299]:
searchReplacePattList(p_recipes, r"Our ", "")
searchReplacePattList(p_recipes, r"Your ", "")
searchReplacePattList(p_recipes, r"Melt-in-Your-Mouth ", "")
searchReplacePattList(p_recipes, r"My Own ", "")
searchReplacePattList(p_recipes, r"My Best ", "")
searchReplacePattList(p_recipes, r"My Favorite ", "")
searchReplacePattList(p_recipes, r"My Mom's ", "")
searchReplacePattList(p_recipes, r"My Grandmother's ", "")
searchReplacePattList(p_recipes, r"My ", "")

tagged_recipe_names = retag(p_recipes, "name")

The remaining ones are misclassified tags by nltk

In [300]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP$")
prp_tokens

['its']

In [301]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['Anzac Biscuits I', "Sadie's Buttermilk Biscuits", 'Canadian Tea Biscuits', 'Empire Biscuits', 'Pastitsio IV', 'Crescent Butter Biscuits', 'Pastitsio', "Nanny's Newfoundland Tea Biscuits", 'Meat in its Juices']


There's some personal pronouns

In [302]:
prp_tokens = list_words_with_tag(tagged_recipe_names, "PRP")
prp_tokens

['I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'You',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'We',
 'I',
 'I',
 'I']

In [303]:
for prp in list(set(prp_tokens)):
  print(find_value_with_char(p_recipes, 'name', prp))

['German Apple Cake I', 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Mexican Rice II', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Hot German Potato Salad III', 'Stuffed Shells I', 'Baked Penne with Italian Sausage', 'German Pancakes II', "Grandma's Noodles II", 'Hot Italian Giardiniera', 'Russian Tea Cakes I', 'Fried Irish Cabbage with Bacon', 'Greek Salad I', 'Irish Boxty', 'Italian Sausage Stuffed Shells', 'Vegetarian Mexican Inspired Stuffed Peppers', 'Indian Saag', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Quiche Lorraine I', 'Italian Cream Cheese and Ricotta Cheesecake', 'Taco Seasoning II', "Ingrid's Rouladen", 'Baked Italian Chicken Thighs', 'Original Homemade Italian Beef', 'Pork Chops Italiano', 'Pizzelles III', 'Eclairs II', 'Real Italian Calzones', 'Old Italian Meat Sauce', "Chef John's Italian Sausage Chili", 'Italian Bread Using a Bread Machine', 'Incredibly Delicious Italian Cream Cake', 'Italian Wedding Cookies III', 'B

Not much to remove, since most are misclassified POS

In [304]:
searchReplacePattList(p_recipes, r" You Can Make at Home", "")

tagged_recipe_names = retag(p_recipes, "name")

Some base verbs can be removed

In [305]:
vb_tokens = list_words_with_tag(tagged_recipe_names, "VB")
vb_tokens

['Take',
 'Make',
 'Take',
 'Kedgeree',
 'Swordfish',
 'Serve',
 'Make',
 'Celebrate',
 'Chicken',
 'Pata',
 'aux',
 'Poulet',
 'Papa',
 'Tarte',
 'Pollo',
 'Pancake',
 'Dutch',
 'Kransekake',
 'Dish',
 'Pannekaken']

In [306]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

["Chef John's Patatas Bravas", 'Authentic Patatas Bravas', 'Paksiw na Pata', 'Tajine de Poulet aux Carottes et Patates Douces']
['Kedgeree']
['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Ench

Remove recipe names with instruction

In [307]:
searchReplacePattList(p_recipes, r" to Make at Home", "")
searchReplacePattList(p_recipes, r" to Make Grandmother Proud", "")
searchReplacePattList(p_recipes, r"Ways The World Makes Chicken And ", "")

searchReplaceAddPattList(p_recipes, r"Make Ahead ", "", "packaged gnocchi ")

tagged_recipe_names = retag(p_recipes, "name")

In [308]:
for vb in list(set(vb_tokens)):
  print(find_value_with_char(p_recipes, 'name', vb))

["Chef John's Patatas Bravas", 'Authentic Patatas Bravas', 'Paksiw na Pata', 'Tajine de Poulet aux Carottes et Patates Douces']
['Kedgeree']
['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Ench

Words like best and most can be removed

In [309]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

['Best', 'Most', 'Best']

In [310]:
for rbs in list(set(rbs_tokens)):
  print(find_value_with_char(p_recipes, 'name', rbs))

['Best Bobotie', 'Best Fried Walleye', 'Best Avgolemono Soup Recipes', "Chef John's Best German Recipes", 'The Best Thai Peanut Sauce', 'Best Ever Russian Beef Stroganoff', "Grandma's Best Ever Sour Cream Lasagna", 'Best Guacamole', 'Best Ever Slow Cooker Italian Beef Roast', 'The Best Pavlova', "Savannah's Best Marinated Portobello Mushrooms", 'Best Peanut Sauce', 'Best Ever Carne Asada Marinade', "Mom's Best Spaghetti Sauce", 'The Best Korean Chicken Recipes', 'Best Instant Pot Chicken Cacciatore', 'Best Ziti Ever', 'Best Authentic Mexican Recipes', 'Best Empanada Recipes', 'Best Ziti Ever with Sausage', 'Best Chicken Parmesan', 'Best Pernil Ever', 'The Best Ricotta Pancakes', 'Best Indian Recipes for Beginner Cooks', 'Best Hot Sauce', 'Best Ever Irish Soda Bread', 'Best Hummus', 'The Best Thai Tom Kha Soup Recipe', 'Best French Macarons', 'Best Falafel', "Gordo's Best of the Best Lasagna", 'The Best Classic Beef Stroganoff', 'Best Asian Slow Cooker Recipes', 'Best Cheesy Broccoli So

In [311]:
searchReplacePattList(p_recipes, r"Best Ever ", "")
searchReplacePattList(p_recipes, r"Best ", "")
searchReplacePattList(p_recipes, r" ever", "")
searchReplacePattList(p_recipes, r"The Most Iconic ", "")

tagged_recipe_names = retag(p_recipes, "name")

In [312]:
rbs_tokens = list_words_with_tag(tagged_recipe_names, "RBS")
rbs_tokens

[]

Adverbs with -ly can be removed, except for the misclassified ones mainly caused by foreign recipe names

In [313]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Absolutely',
 'Aebleskiver',
 'Incredibly',
 'Perfectly',
 'Absolutely',
 'Oven',
 'Perfectly',
 'Absolutely',
 'Heavenly',
 'Asiago',
 'Philly',
 'Family',
 'Deadly',
 'Yet',
 'Absolutely',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Here',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [314]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['No Tomato Paste Here']
['Soon Du Bu Jigae']
['Willard Family German Chocolate Cake', 'Mexican-Inspired Casseroles for Family-Pleasing Dinners', 'Chinese Happy Family', 'Family Sicilian Sauce and Meatballs', 'Greek Ground Beef Recipes Sure To Become Family Favorites']
['Incredibly Delicious Italian Cream Cake']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Philly Cheesesteak Quesadillas']
['Deadly Delicious Lasagna']
['French Canadian Tourtiere', 'Traditional French Canadian Tourtiere', 'Reveillon Tourtiere', 'Tourtiere Spices', 'Tourtiere', 'Tourtiere', 'Tourtiere', 'Tourtiere']
['Absolutely Fabulous Greek or House Dressing', 'Absolutely Amazing Ahi', 'Absolutely Delicious Stuffed Calamari', 'Absolutely Perfect Palak Paneer']
['Yet Turkey Chili']
['Aebleskiver', 'Dansk Aebleskiver']
['Ziti Ever', 'Ziti Ever

In [315]:
searchReplacePattList(p_recipes, r"Deadly Delicious ", "")
searchReplacePattList(p_recipes, r"Heavenly ", "")
searchReplacePattList(p_recipes, r"Perfectly ", "")
searchReplacePattList(p_recipes, r"Absolutely Fabulous ", "")
searchReplacePattList(p_recipes, r"Absolutely Amazing  ", "")
searchReplacePattList(p_recipes, r"Absolutely Delicious ", "")
searchReplacePattList(p_recipes, r"Absolutely Perfect ", "")

searchReplaceAddPattList(p_recipes, r"No Tomato Paste Here", "", "tomato paste")

tagged_recipe_names = retag(p_recipes, "name")

In [316]:
rb_tokens = list_words_with_tag(tagged_recipe_names, "RB")
rb_tokens

['Aebleskiver',
 'Incredibly',
 'Absolutely',
 'Oven',
 'Asiago',
 'Philly',
 'Family',
 'Yet',
 'Ever',
 'Tourtiere',
 'Tourtiere',
 'Soon',
 'Long',
 'Tourtiere',
 'Tourtiere']

In [317]:
for rb in list(set(rb_tokens)):
  print(find_value_with_char(p_recipes, 'name', rb))

['Soon Du Bu Jigae']
['Incredibly Delicious Italian Cream Cake']
['Chicken Long Rice Soup', 'Vietnamese Chicken and Long-Grain Rice Congee', 'Long Soup', 'Philippine Longanisa de Eugenio', 'Long Drink']
['Asiago Sun-Dried Tomato Pasta', 'Chicken and Bowtie Pasta with Asiago Cream Sauce']
['Philly Cheesesteak Quesadillas']
['French Canadian Tourtiere', 'Traditional French Canadian Tourtiere', 'Reveillon Tourtiere', 'Tourtiere Spices', 'Tourtiere', 'Tourtiere', 'Tourtiere', 'Tourtiere']
['Absolutely Amazing Ahi']
['Aebleskiver', 'Dansk Aebleskiver']
['Ziti Ever', 'Ziti Ever with Sausage', 'Pernil Ever', 'Date Squares Ever']
['Air Fryer Oven Taco Shells', 'Oven Kalua Pork', 'Oven-Roasted Chicken Thighs', 'Oven Baked Chicken Teriyaki', 'Oven-Baked Chicken Fajitas', 'Oven-Baked Teriyaki Chicken Thighs', 'Crispy Oven Beef-and-Bean Tostadas', "Oven-Roasted Za'atar Chicken Breasts", 'Chicken Chimi in the Oven']
['Willard Family German Chocolate Cake', 'Mexican-Inspired Casseroles for Family-Pl

In [318]:
all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 1},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 62},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 506},
 {'CD': 23},
 {'DT': 96},
 {'EX': 0},
 {'FW': 67},
 {'IN': 464},
 {'JJ': 1897},
 {'JJR': 2},
 {'JJS': 1},
 {'LS': 0},
 {'MD': 0},
 {'NN': 659},
 {'NNP': 12712},
 {'NNPS': 36},
 {'NNS': 389},
 {'PDT': 0},
 {'POS': 346},
 {'PRP': 69},
 {'PRP$': 1},
 {'RB': 15},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 2},
 {'SYM': 0},
 {'TO': 10},
 {'UH': 0},
 {'VB': 18},
 {'VBD': 39},
 {'VBG': 59},
 {'VBN': 139},
 {'VBP': 9},
 {'VBZ': 29},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Examining other POS in names

In [319]:
vbz_tokens = list_words_with_tag(tagged_recipe_names, "VBZ")
vbz_tokens

['Ties',
 'el',
 'Leaves',
 'al',
 'al',
 'Leaves',
 'au',
 'di',
 'Ways',
 'de',
 'al',
 'Breasts',
 'en',
 'e',
 'al',
 'Leaves',
 'Breasts',
 'Squares',
 'al',
 'di',
 'aux',
 'di',
 'Leaves',
 'au',
 'di',
 'di',
 'al',
 'en',
 'en']

In [320]:
vbp_tokens = list_words_with_tag(tagged_recipe_names, "VBP")
vbp_tokens

['Rellenos',
 'Greek',
 'Divine',
 'Wat',
 'Be',
 'en',
 'Mexicanos',
 'Rellenos',
 'en']

In [321]:
vbg_tokens = list_words_with_tag(tagged_recipe_names, "VBG")
vbg_tokens

['Seasoning',
 'Dressing',
 'Pudding',
 'Using',
 'Canning',
 'Pudding',
 'Velveting',
 'Pudding',
 'Pudding',
 'Pudding',
 'Seasoning',
 'Comforting',
 'Seasoning',
 'Pouding',
 'Pudding',
 'Amazing',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Dressing',
 'Comforting',
 'Pudding',
 'Making',
 'Comforting',
 'Pudding',
 'Dumpling',
 'Dipping',
 'Refreshing',
 'Pudding',
 'Seasoning',
 'Seasoning',
 'Filling',
 'Thanksgiving',
 'Stuffing',
 'Pudding',
 'Pudding',
 'Refreshing',
 'Pudding',
 'Sizzling',
 'Topping',
 'Amazing',
 'Refreshing',
 'Comforting',
 'Dressing',
 'Using',
 'Seasoning',
 'Refreshing',
 'Pudding',
 'Pudding',
 'Pudding',
 'Ping',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Pudding',
 'Dumpling',
 'Pudding']

In [322]:
vbd_tokens = list_words_with_tag(tagged_recipe_names, "VBD")
vbd_tokens

['Braised',
 'Corned',
 'Corned',
 'Pickled',
 'Shredded',
 'Braised',
 'Fashioned',
 'Filled',
 'Corned',
 'Fashioned',
 'Pickled',
 'Braised',
 'Breaded',
 'Fried',
 'Grilled',
 'Braised',
 'Pickled',
 'Braised',
 'Braised',
 'Planked',
 'Corned',
 'Corned',
 'Braised',
 'Infused',
 'Corned',
 'Obsessed',
 'Pickled',
 'Pulled',
 'Roasted',
 'Broiled',
 'Pickled',
 'Roasted',
 'di',
 'Braised',
 'Braised',
 'Pickled',
 'Mulled',
 'Pickled',
 'Boiled']

In [323]:
rp_tokens = set(list(list_words_with_tag(tagged_recipe_names, "RP")))
rp_tokens

{'Hanout', 'Over'}

In [324]:
comma_tokens = set(list(list_words_with_tag(tagged_recipe_names, ",")))
comma_tokens

{','}

In [325]:
for c in list(set(comma_tokens)):
  print(find_value_with_char(p_recipes, 'name', c))

['Bow Ties with Sausage, Tomatoes and Cream', 'Velveting Chicken Breast, Chinese Restaurant-Style', 'Chicken, Spinach, and Cheese Pasta Bake', 'Super-Simple, Super-Spicy Mongolian Beef', 'Creamy Potato, Carrot, and Leek Soup', 'Beef, Mushroom and Guinness Pie', 'Easy, Chewy Flourless Peanut Butter Cookies', 'Filipino Steamed Rice, Cebu Style', 'Orange, Honey and Soy Chicken', 'Chicken Francese, Italian-Style', 'Duck with Honey, Soy, and Ginger', 'Steak, Onion, and Pepper Fajitas', 'Indian Carrots, Peas and Potatoes', 'Simple, Baked Finnan Haddie', 'Indian-Style Rice with Cashews, Raisins and Turmeric', 'Serbian Ground Beef, Veggie, and Potato Bake', 'Fried Rice with Ginger, Hoisin, and Sesame', 'Chard Lentil Soup, Lebanese-Style', 'Easy, Cheesy Tortellini Bake', 'Curried Cashew, Pear, and Grape Salad', 'Pork, Sauerkraut and Dumplings', 'Spinach, Feta, and Pine Nut Ravioli Filling', 'Bell Pepper, Tomato, and Potato Indian Curry', 'Mascarpone Pasta with Chicken, Bacon and Spinach', 'Past

In [326]:
jjr_tokens = list_words_with_tag(tagged_recipe_names, "JJR")
jjr_tokens

['Healthier', 'Lighter']

In [327]:
for j in list(set(jjr_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Lighter Mexican Meatloaf']
['Healthier Bang Bang Chicken in the Air Fryer', 'Healthier Swedish Meatballs', 'Healthier Pan-Fried Honey-Sesame Chicken', 'Healthier Chicken Enchiladas I', 'Healthier Honey-Sesame Chicken']


In [328]:
jjs_tokens = list_words_with_tag(tagged_recipe_names, "JJS")
jjs_tokens

['Oktoberfest']

In [329]:
for j in list(set(jjs_tokens)):
  print(find_value_with_char(p_recipes, 'name', j))

['Oktoberfest Chicken and Red Cabbage', 'Oktoberfest Potato Salad', 'Oktoberfest Chili', 'The Recipes to Celebrate Oktoberfest']


In [330]:
dt_tokens = list_words_with_tag(tagged_recipe_names, "DT")
dt_tokens

['a',
 'The',
 'No',
 'The',
 'the',
 'a',
 'the',
 'The',
 'the',
 'the',
 'a',
 'the',
 'the',
 'A',
 'a',
 'The',
 'a',
 'the',
 'the',
 'a',
 'a',
 'A',
 'The',
 'A',
 'the',
 'a',
 'a',
 'The',
 'a',
 'a',
 'The',
 'the',
 'The',
 'This',
 'The',
 'a',
 'a',
 'the',
 'The',
 'a',
 'a',
 'The',
 'a',
 'A',
 'the',
 'the',
 'No',
 'the',
 'a',
 'a',
 'The',
 'The',
 'a',
 'The',
 'the',
 'the',
 'The',
 'the',
 'a',
 'a',
 'The',
 'a',
 'the',
 'a',
 'The',
 'All',
 'The',
 'a',
 'the',
 'the',
 'the',
 'The',
 'The',
 'A',
 'a',
 'the',
 'a',
 'the',
 'The',
 'the',
 'a',
 'a',
 'a',
 'the',
 'a',
 'a',
 'the',
 'a',
 'An',
 'the',
 'a',
 'a',
 'a',
 'No',
 'a',
 'No']

In [331]:
for dt in list(set(dt_tokens)):
  print(find_value_with_char(p_recipes, 'name', dt))

["Jenn's Out Of This World Spaghetti and Meatballs"]
['Authentic German Potato Salad', 'Easy Authentic Mexican Rice', "Authentic Russian Salad 'Olivye'", 'Authentic Mexican Tortillas', 'The Original Donair From the East Coast of Canada', 'Authentic Paella Valenciana', 'Authentic Pad Thai', 'Lumpia in the Air Fryer', 'Refried Beans Without the Refry', 'Beef Stifado in the Slow Cooker', 'Authentic Mexican Breakfast Tacos', 'Authentic Enchiladas Verdes', 'Toad in the Hole', 'Authentic Chicken Tikka Masala', 'Authentic Mexican Enchiladas', 'Authentic Miso Soup', 'Healthier Bang Bang Chicken in the Air Fryer', 'Authentic Mexican Picadillo', 'Margaritas on the Rocks', 'Authentic French Meringues', 'Authentic Hungarian Goulash', 'Eggplant Parmesan For the Slow Cooker', 'Authentic Greek Moussaka', 'Authentic Chicken Madras', 'Authentic Thai Coconut Soup', 'Mongolian Beef from the Slow Cooker', 'Authentic and Easy Shrimp Curry', 'Authentic Patatas Bravas', 'Cuban Black Bean Soup in the Slow Coo

In [332]:
to_tokens = list_words_with_tag(tagged_recipe_names, "TO")
to_tokens

['to', 'na', 'to', 'to', 'to', 'To', 'to', 'na', 'na', 'na']

In [333]:
for to in list(set(to_tokens)):
  print(find_value_with_char(p_recipes, 'name', to))

['Creamy Au Gratin Potatoes', 'Greek Lemon Chicken and Potato Bake', 'Authentic German Potato Salad', 'Wonton Wrappers', 'Spinach Tomato Tortellini', 'Hot German Potato Salad III', 'Hot German Potato Salad Casserole', 'Chicken Cacciatore in a Slow Cooker', 'Wonton Soup', 'Pesto Cream Sauce', 'Creamy Pesto Shrimp', 'Fabulous Wet Burritos', 'Sofrito', 'Russian Mushroom and Potato Soup', 'Bow Ties with Sausage, Tomatoes and Cream', 'Chicken Wonton Tacos', 'Tomato Basil Salmon', 'Spanish Octopus', 'Potato Scones', 'Lyonnaise Potatoes', 'Pesto', 'Chicken Pesto Pizza', 'Victoria Sponge Cake', 'Ratatouille', 'Addictive Sweet Potato Burritos', 'Hasselback Potatoes', 'Tembleque Puerto Rican Coconut Pudding', 'Pesto Grilled Cheese Sandwich', 'Delicious Black Bean Burritos', 'Italian Stewed Tomatoes', 'Thai Sweet Potato Soup', 'Antipasto Pasta Salad', 'Gelato', 'Rigatoni alla Genovese', 'Oktoberfest Chicken and Red Cabbage', 'Greek Style Potatoes', 'Puerto Rican Steamed Rice', 'Pesto Pizza', 'Roa

Chicken is considered dollar?

In [334]:
dol_tokens = list_words_with_tag(tagged_recipe_names, "$")
dol_tokens

['Chicken']

It's a tagging error, so this can be ignored

In [335]:
for dol in dol_tokens:
  print(find_value_with_char(p_recipes, 'name', dol))

['Spicy Korean Fried Chicken with Gochujang Sauce', 'Greek Lemon Chicken and Potato Bake', "Chef John's Chicken Kiev", 'Indian-Style Chicken and Onions', 'Tender Italian Baked Chicken', 'Chicken Katsu', 'Chicken Stir-Fry', 'Mexican-Style Chicken Taco Casserole', 'Curry Stand Chicken Tikka Masala Sauce', 'Chicken Enchiladas V', 'Jamaican Style Curry Chicken', 'Salsa Chicken', 'Grilled Asian Chicken', 'Chicken Tikka Masala', 'Sweet and Sour Chicken I', 'Chicken Cordon Bleu II', 'Turkish Chicken Kebabs', 'Chicken Souvlaki with Tzatziki Sauce', 'Greek Lemon Chicken Soup', 'Chicken Cacciatore in a Slow Cooker', 'Chicken and Broccoli Stir-Fry', 'Creamy Chicken Lasagna', 'Broccoli and Chicken Stir-Fry', 'Chicken Parmigiana', 'Shoyu Chicken', 'Skillet Chicken Bulgogi', 'Easy Slow Cooker Chicken Tetrazzini', 'Sheet Pan Chicken Fajitas', 'White Chicken Enchilada Slow-Cooker Casserole', 'Chicken Enchiladas II', 'Chinese Chicken Fried Rice II', 'Chicken Milanese', 'Chicken Massaman Curry', "Chef J

There are some quotation marks

In [336]:
quote_tokens = list_words_with_tag(tagged_recipe_names, "''")
quote_tokens

["''", "''", "'", "''", "''", "''", "''"]

Quotation marks are caused by possessive -'s

In [337]:
for quote in quote_tokens:
  print(find_value_with_char(p_recipes, 'name', quote))

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

 For now, leave the preprocessing of the recipe names first.

## Preprocessing of ingredients

In [338]:
p_ingredients = []

for recipe in p_recipes:
    p_ingredients = p_ingredients + recipe['ingredients']
    
p_ingredients = list(set(p_ingredients))
len(p_ingredients)

19342

In [339]:
p_ingredients[:10]

['1 cucumber, thinly sliced ',
 '1\u2009½ cups coconut milk ',
 '1 tablespoon crunchy peanut butter (Optional)',
 '¼ cup diced green onions ',
 '6 ounces sharp provolone cheese, cubed ',
 '1 tablespoon sumac powder ',
 '1\u2009½ cups chopped onion, divided ',
 '8\u2009½ cups all-purpose flour ',
 ' soy sauce to taste ',
 '1 teaspoon soy sauce, or to taste ']

In [340]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = p_ingredients[i].strip()

p_ingredients[:10]

['1 cucumber, thinly sliced',
 '1\u2009½ cups coconut milk',
 '1 tablespoon crunchy peanut butter (Optional)',
 '¼ cup diced green onions',
 '6 ounces sharp provolone cheese, cubed',
 '1 tablespoon sumac powder',
 '1\u2009½ cups chopped onion, divided',
 '8\u2009½ cups all-purpose flour',
 'soy sauce to taste',
 '1 teaspoon soy sauce, or to taste']

A reusable function that re-tags ingredients

In [341]:
def retag_ingredients():
    tagged_recipe_ingredients = []

    for ingredient in p_ingredients:
        tagged_recipe_ingredients.append(tag_pos(ingredient))
        
    return tagged_recipe_ingredients

tagged_recipe_ingredients = retag_ingredients()
tagged_recipe_ingredients[:10]

[[('1', 'CD'),
  ('cucumber', 'NN'),
  (',', ','),
  ('thinly', 'RB'),
  ('sliced', 'VBD')],
 [('1', 'CD'),
  ('½', 'JJ'),
  ('cups', 'NNS'),
  ('coconut', 'VBP'),
  ('milk', 'NN')],
 [('1', 'CD'),
  ('tablespoon', 'NN'),
  ('crunchy', 'NN'),
  ('peanut', 'NN'),
  ('butter', 'NN'),
  ('(', '('),
  ('Optional', 'NNP'),
  (')', ')')],
 [('¼', 'JJ'),
  ('cup', 'NN'),
  ('diced', 'VBD'),
  ('green', 'JJ'),
  ('onions', 'NNS')],
 [('6', 'CD'),
  ('ounces', 'NNS'),
  ('sharp', 'JJ'),
  ('provolone', 'NN'),
  ('cheese', 'NN'),
  (',', ','),
  ('cubed', 'VBD')],
 [('1', 'CD'), ('tablespoon', 'NN'), ('sumac', 'NN'), ('powder', 'NN')],
 [('1', 'CD'),
  ('½', 'JJ'),
  ('cups', 'NNS'),
  ('chopped', 'VBD'),
  ('onion', 'NN'),
  (',', ','),
  ('divided', 'VBD')],
 [('8', 'CD'),
  ('½', 'JJ'),
  ('cups', 'NNS'),
  ('all-purpose', 'JJ'),
  ('flour', 'NN')],
 [('soy', 'NN'), ('sauce', 'NN'), ('to', 'TO'), ('taste', 'VB')],
 [('1', 'CD'),
  ('teaspoon', 'NN'),
  ('soy', 'NN'),
  ('sauce', 'NN'),
  (','

Numbers need a placeholder

In [342]:
list_words_with_tag(tagged_recipe_ingredients, "CD")

['1',
 '1',
 '1',
 '6',
 '1',
 '1',
 '8',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '4',
 '4',
 '1',
 '6',
 '2',
 '2',
 '2',
 '10',
 '18',
 '3',
 '2',
 '1',
 '2',
 '1',
 '1',
 '2',
 '3',
 '3',
 '1',
 '3',
 '1',
 '2',
 '6',
 '1/4',
 '1',
 '12',
 '4',
 '1',
 '2',
 '2',
 '4',
 '2',
 '10',
 '1',
 '12',
 '1',
 '2',
 '1',
 '2',
 '14.5',
 '4',
 '8',
 '1',
 '2',
 '4',
 '3',
 '1',
 '1',
 '0.6',
 '24',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '5',
 '6',
 '1',
 '750',
 '1',
 '1',
 '15',
 '2',
 '1',
 '1',
 '1',
 '16',
 '1',
 '1',
 '1',
 '10',
 '4',
 '4',
 '2',
 '2',
 '1',
 '4',
 '2',
 '2',
 '1',
 '5',
 '1',
 '15',
 '110',
 '1',
 '1',
 '3',
 '6',
 '6',
 '2',
 '3',
 '1',
 '3',
 '2',
 '12',
 '2',
 '2',
 '11',
 '2',
 '2',
 '2',
 '1',
 '16',
 '8',
 '30',
 '6',
 '10',
 '5',
 '2',
 '1',
 '1',
 '4',
 '2',
 '6',
 '1',
 '1',
 '4',
 '1',
 '2',
 '1',
 '1',
 '1',
 '2',
 '1',
 '4',
 '1',
 '4',
 '24',
 '6',
 '6',
 '4',
 '1',
 '26',
 '5',
 '1',
 '1',
 '8',
 '1',
 '3',
 '2',
 '1',
 '3',
 '6',
 '2',
 '4',


NLTK assumes fractions as JJ (adjectives)

In [343]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['½',
 '¼',
 'green',
 'sharp',
 '½',
 '½',
 'all-purpose',
 '½',
 'cup',
 'oil-packed',
 'sun-dried',
 'small',
 'red',
 '.25',
 'instant',
 'green',
 '2-inch',
 '½',
 'walleye',
 'large',
 'chicken',
 '½',
 'saltine',
 'green',
 '⅔',
 'gingersnap',
 'garlic',
 'fresh',
 'flat-leaf',
 'whole',
 'black',
 'conch',
 '½',
 'black',
 'fresh',
 'garlic',
 '½',
 'large',
 '1/4-inch',
 'large',
 'red',
 '½',
 'phyllo',
 'frozen',
 '¼',
 'small',
 'shrimp',
 'cooked',
 '½',
 'green',
 'red',
 '½',
 'necessary',
 'frozen',
 '½',
 '1/4-inch',
 'medium',
 'Italian-style',
 '½',
 'sweet',
 'small',
 '2-inch',
 'jalapeno',
 'fresh',
 '¾',
 'soft',
 'frozen',
 'vegetable',
 'such',
 'frozen',
 'vegetable',
 'vegetable',
 '½',
 'large',
 'fresh',
 '⅛',
 'green',
 'pinch',
 'white',
 'garlic',
 'large',
 'Korean',
 'vegetable',
 '½',
 'red',
 'such',
 'large',
 'fresh',
 'yellow',
 'tablespoon',
 'coconut-flavored',
 'such',
 'jumbo',
 '⅓',
 'whole',
 'small',
 'red',
 'fresh',
 '¼',
 'soy',
 'red',


Create a function that converts any fraction in a text to integer

In [344]:
import unicodedata
from decimal import Decimal

def fraction_to_int(text):
  for i, char in enumerate(text):
    try:
      # unicode.numeric converts fractions such as ½ to decimal place, 0.25
      # remove trailing decimals, otherwise keep decimals
      text = text[:i] + str(Decimal(unicodedata.numeric(char)).normalize()) + text[i + 1:]
    except Exception as e:
      pass
  # Because number + fraction, such as 1 1/4 may be converted to 1 0, so use re.sub to remove
  text = re.sub("([0-9]+ [0])+", "4", text)
  return text

for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = fraction_to_int(p_ingredients[i])

tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['1 cucumber, thinly sliced',
 '4.5 cups coconut milk',
 '1 tablespoon crunchy peanut butter (Optional)',
 '0.25 cup diced green onions',
 '6 ounces sharp provolone cheese, cubed',
 '1 tablespoon sumac powder',
 '4.5 cups chopped onion, divided',
 '4.5 cups all-purpose flour',
 'soy sauce to taste',
 '1 teaspoon soy sauce, or to taste',
 '0.5 cup oil-packed sun-dried tomatoes, coarsely chopped',
 '1 small head red cabbage, cored and shredded',
 '1 (.25 ounce) package instant yeast',
 '1 teaspoon minced parsley',
 '2 green onions, cut in 2-inch lengths',
 '2 teaspoons Sesame seeds for garnish',
 '0.5 cup grated Parmesan cheese for topping',
 '4 (4 ounce) fillets walleye',
 '1 head Savoy cabbage, coarsely chopped',
 '6 large limes, juiced']

By converting fractions into integers, NLTK stops seeing them as adjectives (JJ) and instead, they are considered numbers (CD)

In [345]:
list_words_with_tag(tagged_recipe_ingredients, "JJ")

['green',
 'sharp',
 'all-purpose',
 'cup',
 'oil-packed',
 'sun-dried',
 'small',
 'red',
 '.25',
 'instant',
 'green',
 '2-inch',
 'walleye',
 'large',
 'chicken',
 'saltine',
 'green',
 'gingersnap',
 'garlic',
 'fresh',
 'flat-leaf',
 'whole',
 'black',
 'conch',
 'cup',
 'black',
 'fresh',
 'garlic',
 'large',
 '1/4-inch',
 'large',
 'red',
 'phyllo',
 'frozen',
 'small',
 'shrimp',
 'cooked',
 'fluid',
 'green',
 'red',
 'cup',
 'necessary',
 'frozen',
 'beef',
 'int1',
 '4/4-inch',
 'medium',
 'Italian-style',
 'sweet',
 'small',
 '2-inch',
 'jalapeno',
 'fresh',
 'soft',
 'frozen',
 'vegetable',
 'such',
 'frozen',
 'vegetable',
 'vegetable',
 'baby',
 'large',
 'fresh',
 'green',
 'pinch',
 'white',
 'garlic',
 'large',
 'Korean',
 'vegetable',
 'red',
 'such',
 'large',
 'fresh',
 'yellow',
 'tablespoon',
 'coconut-flavored',
 'such',
 'jumbo',
 'whole',
 'small',
 'red',
 'fresh',
 'soy',
 'red',
 'garlic',
 'cup',
 'cup',
 'heavy',
 'goat',
 'fresh',
 'warm',
 'green',
 'pi

Replace all the numbers with placeholder of 4

In [346]:
for i, ingre in enumerate(p_ingredients):
    p_ingredients[i] = searchReplacePatt(p_ingredients[i], NUMPATTERN, "4")
    
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4 cucumber, thinly sliced',
 '4.4 cups coconut milk',
 '4 tablespoon crunchy peanut butter (Optional)',
 '4.4 cup diced green onions',
 '4 ounces sharp provolone cheese, cubed',
 '4 tablespoon sumac powder',
 '4.4 cups chopped onion, divided',
 '4.4 cups all-purpose flour',
 'soy sauce to taste',
 '4 teaspoon soy sauce, or to taste',
 '4.4 cup oil-packed sun-dried tomatoes, coarsely chopped',
 '4 small head red cabbage, cored and shredded',
 '4 (.4 ounce) package instant yeast',
 '4 teaspoon minced parsley',
 '4 green onions, cut in 4-inch lengths',
 '4 teaspoons Sesame seeds for garnish',
 '4.4 cup grated Parmesan cheese for topping',
 '4 (4 ounce) fillets walleye',
 '4 head Savoy cabbage, coarsely chopped',
 '4 large limes, juiced']

In [347]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens.remove('4')
new_cd_tokens

['mostaccioli',
 'seven',
 '4p',
 'beef4',
 'xanthan',
 'one',
 '4up',
 'yum',
 'fontina',
 'ziti',
 'four',
 'mascarpone',
 'bleu',
 'marinara',
 'kalamata',
 'millet',
 "za'atar",
 '4/4x4/4',
 '4.4.4',
 'mozzarella',
 'provolone',
 'yellow',
 'zapallo',
 '4/4',
 'zucchini',
 '4.4']

In [348]:
def find_ingre_with_substring(sub):
    ingres = []
    for ingre in p_ingredients:
        matches = searchWordsPatt(ingre, sub)
        if len(matches)  > 0:
            ingres.append(ingre)
    return ingres

find_ingre_with_substring('4/4')

['4 pounds large eggplants, sliced lengthwise into 4/4-inch slices',
 '4 (4 ounce) (4/4 inch thick) slices of beef round',
 '4.4 pounds beef skirt steak, cut across the grain int4 4/4-inch strips',
 '4.4 cup warm wate4 degrees4/4 degrees C)',
 '4 eggplant, peeled and cut into 4/4-inch slices',
 '4.4 pound fresh un4e4led ginger root, cut into 4/4-inch chunks',
 '4 cups raw taro root, peeled and cut into 4/4-inch cubes',
 '4 (4/4 inch) piece fresh ginger root, peeled and chopped',
 '4 small eggplants, sliced 4/4-inch thick',
 '4 pounds onions, cut into 4/4-inch slices',
 '4 pounds boneless beef sirloin or beef top round steaks (4/4" thick)',
 '4 eggplant, peeled and cut into 4/4-inch slices',
 '4 small stalk celery, cut into 4/4-inch dice',
 '4 (4 4/4 pound) pork tenderloins, trimmed',
 '4 hot red pepper, seeded and sliced diagonally into 4/4 inch pieces',
 '4 medium Italian eggplants, peeled and cut into 4/4-inch slices',
 '4.4 c4p4/4-inch long vermicelli',
 '4 pound salt cod steaks, sk

In [349]:
def search_edit_ingredient(regex, new_val):
    for i, ingre in enumerate(p_ingredients):
        p_ingredients[i] = searchReplacePatt(p_ingredients[i], regex, new_val)
        
search_edit_ingredient(r"4/4", "4.4")

find_ingre_with_substring('4/4')

[]

In [350]:
search_edit_ingredient(r"®", "")

find_ingre_with_substring('®')

[]

In [351]:
find_ingre_with_substring('4p')

['4.4 c4p4.4-inch long vermicelli',
 '4.4 4p warm milk (4 degrees F/4 degrees C)']

In [352]:
search_edit_ingredient(r"c4p", "")
search_edit_ingredient(r"4p", "")

find_ingre_with_substring('4p')

[]

In [353]:
find_ingre_with_substring('4up')

['4.4 4up 4% milk']

In [354]:
search_edit_ingredient(r"4up", "7up")

find_ingre_with_substring('7up')

['4.4 7up 4% milk']

In [355]:
def split_ingre_to_two(target, search, retain_target=False):
    for i, ingre in enumerate(p_ingredients):
        if p_ingredients[i] == target:
            splits = re.split(search, p_ingredients[i])
            new_ingre1 = splits[0].strip()
            new_ingre2 = splits[1].strip()
            if retain_target:
                new_ingre2 = search.strip()
            del p_ingredients[i]
            p_ingredients.append(new_ingre1)
            p_ingredients.append(new_ingre2)

split_ingre_to_two('4.4 7up 4% milk', " 4% milk", retain_target=True)

find_ingre_with_substring('7up')

['4.4 7up']

In [356]:
tagged_recipe_ingredients = retag_ingredients()
p_ingredients[:20]

['4 cucumber, thinly sliced',
 '4.4 cups coconut milk',
 '4 tablespoon crunchy peanut butter (Optional)',
 '4.4 cup diced green onions',
 '4 ounces sharp provolone cheese, cubed',
 '4 tablespoon sumac powder',
 '4.4 cups chopped onion, divided',
 '4.4 cups all-purpose flour',
 'soy sauce to taste',
 '4 teaspoon soy sauce, or to taste',
 '4.4 cup oil-packed sun-dried tomatoes, coarsely chopped',
 '4 small head red cabbage, cored and shredded',
 '4 (.4 ounce) package instant yeast',
 '4 teaspoon minced parsley',
 '4 green onions, cut in 4-inch lengths',
 '4 teaspoons Sesame seeds for garnish',
 '4.4 cup grated Parmesan cheese for topping',
 '4 (4 ounce) fillets walleye',
 '4 head Savoy cabbage, coarsely chopped',
 '4 large limes, juiced']

In [357]:
new_cd_tokens = list(set(list_words_with_tag(tagged_recipe_ingredients, "CD")))
new_cd_tokens

['mostaccioli',
 '4',
 'seven',
 '4.4x4.4',
 'beef4',
 'xanthan',
 'one',
 'yum',
 'fontina',
 'ziti',
 'four',
 'mascarpone',
 'bleu',
 'marinara',
 'kalamata',
 'millet',
 "za'atar",
 '4.4.4',
 'mozzarella',
 'provolone',
 'yellow',
 'zapallo',
 'zucchini',
 '7up',
 '4.4']

In [358]:
tagged_recipe_ingredients = retag_ingredients()

all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3744},
 {')': 3828},
 {',': 8512},
 {'--': 0},
 {'.': 23},
 {':': 304},
 {'CC': 3074},
 {'CD': 21788},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13401},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32987},
 {'NNP': 2411},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 2},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1725},
 {'VBD': 8949},
 {'VBG': 354},
 {'VBN': 3434},
 {'VBP': 646},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

In [359]:
colon_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ":")))
colon_tags

['-', ':', '--', ';']

In [360]:
for c in colon_tags:
    print(find_ingre_with_substring(c))

['4.4 cups all-purpose flour', '4.4 cup oil-packed sun-dried tomatoes, coarsely chopped', '4 green onions, cut in 4-inch lengths', '4 teaspoons chopped fresh flat-leaf parsley', '4 pounds large eggplants, sliced lengthwise into 4.4-inch slices', '4.4 pound small shrimp - peeled and deveined', '4.4 pounds beef skirt steak, cut across the grain int4 4.4-inch strips', '4 (4.4 ounce) cans Italian-style diced tomatoes', '4 small Golden Delicious apples - peeled, cored, and chopped', '4 pound beef sirloin, cut into 4-inch strips', '4 tablespoon coconut-flavored rum (such as Bacardi Coconut™)', '4 tablespoon brandy-based orange liqueur (such as Grand Marnier)', '4 cups all-purpose baking mix (such as Bisquick)', '4 eggplant, peeled and cut into 4.4-inch slices', '4 apples - peeled, cored, and sliced', '4 fluid ounce half-and-half cream', '4.4 pounds skinless, boneless chicken breast meat - cut into strips', '4.4 cup stir-fry sauce', '4 whole mackerel - gutted, cleaned, and cut into 4-inch pie

In [361]:
find_ingre_with_substring("--")

['4 large skinless, boneless chicken breast halves -- trimmed and cut into 4-inch pieces']

In [362]:
search_edit_ingredient(r"--", ",")

find_ingre_with_substring('--')

[]

In [363]:
find_ingre_with_substring(":")

['Meatballs:',
 'Gravy:',
 'Chipotle Mayonnaise:',
 'Dipping Sauce:',
 'Fillings:',
 'Caramel:',
 'Spice Blend:']

In [364]:
search_edit_ingredient(r":", "")

find_ingre_with_substring(':')

[]

In [365]:
find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick',
 '4 (4 ounce) can black beans; drain and reserve liquid']

In [366]:
find_ingre_with_substring(', 4 g')

['4 raw chop with refuse, 4 g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick']

In [367]:
search_edit_ingredient(r", 4 g; \(blank\)", ", 4g")

find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 (4 ounce) can black beans; drain and reserve liquid']

In [368]:
split_ingre_to_two('4 raw chop with refuse, 4g; (blank) 4.4 ounces boneless pork chops, pounded to 4.4 inch thick', "; ")

find_ingre_with_substring(";")

['4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)',
 '4 (4 ounce) can black beans; drain and reserve liquid']

In [369]:
split_ingre_to_two("4 cups assorted mushrooms, sliced (I like white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)", r"\(I like ")

find_ingre_with_substring(";")

['4 (4 ounce) can black beans; drain and reserve liquid',
 'white buttons, oyster, shiitake, portobello and crimini; if using shiitake, discard stems)']

In [370]:
find_ingre_with_substring("/")

['4 cups warm water (4 degrees F/4 degrees C)',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4 tablespoons warm milk (4 degrees F/4 degrees C)',
 '4 tablespoons warm water (4 degrees F/4 degrees C)',
 '4 cup shredded Cheddar/Monterey Jack cheese blend',
 '4 cup warm water (4 degrees F/4 degrees C)',
 '4.4  warm milk (4 degrees F/4 degrees C)',
 '4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4 tablespoons warm water (4 degrees F/4 degrees C)',
 '4 (4 ounce) package round gyoza/potsticker wrappers',
 '4.4 tablespoon Guacamole, salsa, and/or sour cream',
 '4 cups warm water (4 degrees F/4 degrees C)',
 '4 (4.4 ounce) package corn bread/muffin mix',
 '4.4 c4 warm water (4 degrees F/4 degrees C)',
 '4.4 cu4 warm water (4 degrees F/4 degrees C)',
 '4.4 cups warm wat4(4 degree4F/4 degrees C)',
 '4 cup warm milk (4 degrees F/4 degrees C)']

In [371]:
search_edit_ingredient(r"\/", " or ")
find_ingre_with_substring("/")

[]

In [372]:
tagged_recipe_ingredients = retag_ingredients()

tagged_recipe_ingredients[:20]

[[('4', 'CD'),
  ('cucumber', 'NN'),
  (',', ','),
  ('thinly', 'RB'),
  ('sliced', 'VBD')],
 [('4.4', 'CD'), ('cups', 'NNS'), ('coconut', 'VBP'), ('milk', 'NN')],
 [('4', 'CD'),
  ('tablespoon', 'NN'),
  ('crunchy', 'NN'),
  ('peanut', 'NN'),
  ('butter', 'NN'),
  ('(', '('),
  ('Optional', 'NNP'),
  (')', ')')],
 [('4.4', 'CD'),
  ('cup', 'NN'),
  ('diced', 'VBD'),
  ('green', 'JJ'),
  ('onions', 'NNS')],
 [('4', 'CD'),
  ('ounces', 'NNS'),
  ('sharp', 'JJ'),
  ('provolone', 'NN'),
  ('cheese', 'NN'),
  (',', ','),
  ('cubed', 'VBD')],
 [('4', 'CD'), ('tablespoon', 'NN'), ('sumac', 'NN'), ('powder', 'NN')],
 [('4.4', 'CD'),
  ('cups', 'NNS'),
  ('chopped', 'VBD'),
  ('onion', 'NN'),
  (',', ','),
  ('divided', 'VBD')],
 [('4.4', 'CD'), ('cups', 'NNS'), ('all-purpose', 'JJ'), ('flour', 'NN')],
 [('soy', 'NN'), ('sauce', 'NN'), ('to', 'TO'), ('taste', 'VB')],
 [('4', 'CD'),
  ('teaspoon', 'NN'),
  ('soy', 'NN'),
  ('sauce', 'NN'),
  (',', ','),
  ('or', 'CC'),
  ('to', 'TO'),
  ('taste

## Examining other POS in ingredients

In [373]:
fw_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "FW")))
fw_tags

['gallo',
 'kalamansi',
 'vanilla',
 'bilbao',
 'pico',
 's',
 'arbol',
 'mirin',
 'herbes',
 'di',
 'de',
 'skin',
 'kielbasa',
 'paprika',
 'kalonji',
 'miso']

In [374]:
rp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RP")))
rp_tags

['aside', 'tomato', 'up', 'off', 'out', 'dashi']

In [375]:
for rp in rp_tags:
    print(find_ingre_with_substring(" " + rp))

['4 cup chopped Chinese roast duck meat, skin and fat separated and set aside']
['4.4 cup oil-packed sun-dried tomatoes, coarsely chopped', '4 (4.4 ounce) cans Italian-style diced tomatoes', '4 seeded, chopped roma (plum) tomatoes', '4 cups seeded, chopped plum tomatoes', '4 tomatoes, cut into wedges', '4.4 ounce) can tomato puree', '4 cup finely chopped tomatoes, drained', '4.4 cup canned peeled and diced tomatoes', '4 ripe tomato, chopped', '4 (4 ounce) can diced tomatoes with juice', '4 cups cherry tomatoes (such as Sun Gold)', '4 (4.4 ounce) can diced tomatoes with green chile peppers (such as RO*TEL), undrained', '4.4 tomato', '4 (4 ounce) jar sun-dried tomatoes, drained and cut into quarters', '4 (4 ounce) can diced tomatoes with green chile peppers', '4 (4 fluid ounce) cans tomato juice, or more to taste', '4 (4 ounce) can Italian-flavored tomato paste', '4 (4 ounce) cans Italian plum tomatoes, chopped', '4 ounces chunky tomato sauce (such as Prego)', '4 (4 ounce) can stewed tom

In [376]:
rbr_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "RBR")))
rbr_tags

['lobster', 'lamb', 'leeks']

In [377]:
wdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "WDT")))
wdt_tags

['whole']

In [378]:
pdt_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PDT")))
pdt_tags

['half']

In [379]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP")))
prp_tags

['you']

In [380]:
find_ingre_with_substring("you ")

['4 (4 ounce) packages garlic and herb couscous mix (or any flavor you prefer)']

In [381]:
prp_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "PRP$")))
prp_tags

['your']

In [382]:
find_ingre_with_substring("your")

['4 (4 ounce) package pasta, your choice of shape']

In [383]:
punc_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, ".")))
punc_tags

['!', '.']

In [384]:
find_ingre_with_substring("!")

['4.4 cup Greek salad dressing, such as Yazzo!']

In [385]:
quote_tags = list(set(list_words_with_tag(tagged_recipe_ingredients, "''")))
quote_tags

["''"]

In [386]:
for q in quote_tags:
    print(find_ingre_with_substring("'"))

["4 frozen meatless vegetable meatballs (such as IKEA's frozen vegetable balls)", "4.4 cup za'atar, divided", "4 tablespoon confectioners' sugar for dusting, or as needed", "4.4 teaspoon bakers' ammonia", "4.4 cup sifted confectioners' sugar", "4.4 cups confectioners' sugar, sifted, or more as needed", "4 ounces salted pig's tail (Optional)", "4 tablespoon Cajun seasoning blend (such as Tony Chachere's), or to taste", "4 bird's eye chile, minced", "confectioners' sugar for dusting", "4 chile padi (bird's eye chiles)", "4 teaspoon confectioners' sugar (Optional)", "4 cups sifted confectioners' sugar", "4.4 cups confectioners' sugar", "4.4 cup confectioners' sugar", "4.4 cups durum wheat semolina flour (such as Bob's Red Mill)", "4 cups parboiled rice (such as Uncle Ben's)", "4.4 cup confectioners' sugar for rolling", "4 teaspoons garlic pepper seasoning (such as SuzyQ's Santa Maria Valley Style Seasoning), or to taste", "4 pound calf's liver, skinned, deveined, and sliced", "4 tablespoo

In [387]:
list(set(list_words_with_tag(tagged_recipe_ingredients, "SYM")))

['avocados',
 'basil',
 'mangos',
 'mangoes',
 'avocado',
 'lemon',
 'kale',
 'lettuce',
 'cucumber',
 'shrimp',
 'beaten',
 'mango',
 'sauerkraut',
 'choy',
 'squash',
 'tomato',
 'spinach',
 'thighs',
 'breast',
 'leeks']

## Casing of recipe names

In [388]:
all_recipe_names = []

for recipe in p_recipes:
    try:
        all_recipe_names.append(recipe['name'])
    except Exception as e:
        pass
    
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

In [389]:
all_recipe_names_corpus = ("\n").join(all_recipe_names)

all_recipe_names_corpus

'Pan-Fried Asparagus\nCreamy Au Gratin Potatoes\nSuper-Delicious Zuppa Toscana\nSimple Teriyaki Sauce\nSpicy Korean Fried Chicken with Gochujang Sauce\nSpaghetti Aglio e Olio\nEasy Garam Masala\nEasy Chorizo Street Tacos\nRussian Cabbage Rolls with Gravy\nShrimp Scampi with Pasta\nGreek Lemon Chicken and Potato Bake\nEasy Mexican Casserole\nGerman Apple Cake I\nSpanish Flan\nGerman Pork Chops and Sauerkraut\nSpaghetti Cacio e Pepe\nChef John\'s Chicken Kiev\nIndian-Style Chicken and Onions\nFajita Seasoning\nPerfect Sushi Rice\nTender Italian Baked Chicken\nAuthentic German Potato Salad\nMiso Soup\nMexican Rice II\nSpongy Japanese Cheesecake\nChicken Katsu\nChicken Stir-Fry\nQuick Beef Stir-Fry\nEasy Authentic Mexican Rice\nHerbs de Provence\nGreek or House Dressing\nFrench Bread\nFocaccia Bread\nJamaican Fried Dumplings\nGluehwein\nCoquilles Saint-Jacques\nMexican-Style Chicken Taco Casserole\nRosemary Braised Lamb Shanks\nMake-Ahead Vegetarian Moroccan Stew\nCurry Stand Chicken Tikka

In [390]:
import nltk

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))
recipe_tokens[:10]

['Mojitos',
 'Morada',
 'Milk-Braised',
 'Pulpo',
 'Labneh',
 'Embutido',
 'Awesome',
 'Cucumber-Yogurt',
 'Koljivo',
 'Udon']

In [391]:
len(recipe_tokens)

3271

In [392]:
ingredients_corpus = ("\n").join(p_ingredients)

ingredients_corpus

'4 cucumber, thinly sliced\n4.4 cups coconut milk\n4 tablespoon crunchy peanut butter (Optional)\n4.4 cup diced green onions\n4 ounces sharp provolone cheese, cubed\n4 tablespoon sumac powder\n4.4 cups chopped onion, divided\n4.4 cups all-purpose flour\nsoy sauce to taste\n4 teaspoon soy sauce, or to taste\n4.4 cup oil-packed sun-dried tomatoes, coarsely chopped\n4 small head red cabbage, cored and shredded\n4 (.4 ounce) package instant yeast\n4 teaspoon minced parsley\n4 green onions, cut in 4-inch lengths\n4 teaspoons Sesame seeds for garnish\n4.4 cup grated Parmesan cheese for topping\n4 (4 ounce) fillets walleye\n4 head Savoy cabbage, coarsely chopped\n4 large limes, juiced\n4 cups cubed, cooked chicken meat\n4.4 cups rotini pasta\n4 cups crushed saltine crackers\n4 green onions, chopped\n4.4 cup crushed gingersnap cookies\n4 taro leaves\n4 tablespoons chopped garlic\n4 teaspoons chopped fresh flat-leaf parsley\n4 cup coconut milk\n4 tablespoons whole black peppercorns, freshly cru

In [393]:
ingre_tokens = list(set(nltk.word_tokenize(ingredients_corpus)))
ingre_tokens[:10]

['scallion',
 'rump',
 'blood',
 'gelatin',
 'color',
 'nopales',
 'about',
 'hoisin',
 'Udon',
 'refrigerated']

In [394]:
len(ingre_tokens)

2815

Most words in recipe tokens are capitalized

In [395]:
lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['on',
 'aux',
 'e',
 "l'Oignon",
 'con',
 'laziale',
 'le',
 'a',
 'z',
 'des',
 'by',
 'en',
 'na',
 'de',
 'nach',
 'of',
 'bil',
 'version',
 'aka',
 'y',
 "all'Amatriciana",
 'al',
 'for',
 'over',
 'in',
 'the',
 'with',
 'sa',
 'or',
 'chili',
 'to',
 'without',
 'its',
 'di',
 'su',
 'and',
 'au',
 'la',
 'powder',
 'alla',
 'from',
 'et',
 'el']

Number of words that are not capitalized increased significantly crosschecking with lowercase words in ingredient tokens

In [396]:
for i, name in enumerate(recipe_tokens):
    for ingre in ingre_tokens:
        if recipe_tokens[i].lower() == ingre:
            recipe_tokens[i] = recipe_tokens[i].lower()

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
len(lower_recipe_tokens)

923

In [397]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

2314

In [398]:
upper_recipe_tokens[:20]

['Mojitos',
 'Morada',
 'Milk-Braised',
 'Pulpo',
 'Labneh',
 'Embutido',
 'Awesome',
 'Cucumber-Yogurt',
 'Koljivo',
 'Garlic-Herb',
 'Oh-So-Savory',
 'Pesce',
 'Souvlaki',
 'Shu',
 'Bulgarian',
 'Grillhaxe',
 'Icing',
 'Miraculous',
 'Christmas',
 'Flesh-Keek-Luh']

In [399]:
!pipenv install country_list

Installing country_list...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
[  ==] Installing country_list...
[ ===] Installing country_list...
[====] Installing country_list...
[=== ] Installing country_list...
[==  ] Installing country_list...
[=   ] Installing country_list...
[    ] Installing country_list...
[=   ] Installing country_list...
[==  ] Installing country_list...
[=== ] Installing country_list...
[ ===] Installing country_list...
[  ==] Installing country_list...
[   =] Installing country_list...
[    ] Installing country_list...
[   =] Installing country_list...
Adding country_list to Pipfile's [packages]...
Installation Succeeded 


In [459]:
from country_list import countries_for_language

countries = dict(countries_for_language('en'))
countries = list(countries.values())

countries

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua & Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia & Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean Netherlands',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo - Brazzaville',
 'Congo - Kinshasa',
 'Cook Islands',
 'Costa Rica',
 'Côte d’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egyp

In [460]:
countries = ' '.join([elem for elem in countries])
countries = countries.replace('&', '')
countries = countries.split(" ")
countries = [i.strip() for i in countries]
countries = [string for string in countries if string != ""]
countries = [string for string in countries if string != "-"]

countries = countries + ["Filipino", "Malay", "Spanish", "Danish", "Welsh", "Polish", "Schwabisch", "Rochester", "Asia",
                         "Aussie", "Greek", "German", "Mexica", "Hawaii", "Irish", "Mediterranean", "Middle", "East",
                        "Norwegian", "Persian", "Pollo", "Thai", "West"]

countries

['Afghanistan',
 'Åland',
 'Islands',
 'Albania',
 'Algeria',
 'American',
 'Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua',
 'Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia',
 'Herzegovina',
 'Botswana',
 'Bouvet',
 'Island',
 'Brazil',
 'British',
 'Indian',
 'Ocean',
 'Territory',
 'British',
 'Virgin',
 'Islands',
 'Brunei',
 'Bulgaria',
 'Burkina',
 'Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape',
 'Verde',
 'Caribbean',
 'Netherlands',
 'Cayman',
 'Islands',
 'Central',
 'African',
 'Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas',
 'Island',
 'Cocos',
 '(Keeling)',
 'Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Brazzaville',
 'Congo',
 'Kinshasa',
 'Cook',
 'Islands',
 'Costa',
 'Rica',
 'Côte',
 'd’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'C

In [461]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter = PorterStemmer()
lancaster=LancasterStemmer()

porter_c = []
lancester_c = []

for c in countries:
    port = porter.stem(c.split(' ').pop(0))
    if len(port) < 5:
        port = c[:4]
    porter_c.append(port.capitalize())
    lan = lancaster.stem(c.split(' ').pop(0))
    if len(lan) < 5:
        lan = c[:4]
    lancester_c.append(lan.capitalize())

print(porter_c[:10])
print(lancester_c[:10])

['Afghanistan', 'Åland', 'Island', 'Albania', 'Algeria', 'American', 'Samoa', 'Andorra', 'Angola', 'Anguilla']
['Afgh', 'Åland', 'Island', 'Alban', 'Alger', 'Amer', 'Samo', 'Andorr', 'Angol', 'Anguill']


In [462]:
lancester_c.append("Victoria")
lancester_c

['Afgh',
 'Åland',
 'Island',
 'Alban',
 'Alger',
 'Amer',
 'Samo',
 'Andorr',
 'Angol',
 'Anguill',
 'Antarctic',
 'Antigu',
 'Barbud',
 'Argentin',
 'Armen',
 'Arub',
 'Austral',
 'Austr',
 'Azerbaid',
 'Bahama',
 'Bahrain',
 'Bangladesh',
 'Barbado',
 'Belar',
 'Belg',
 'Beli',
 'Benin',
 'Bermud',
 'Bhut',
 'Boliv',
 'Bosn',
 'Herzegovin',
 'Botswan',
 'Bouvet',
 'Island',
 'Brazil',
 'Brit',
 'Indi',
 'Ocea',
 'Territ',
 'Brit',
 'Virgin',
 'Island',
 'Brune',
 'Bulgar',
 'Burkin',
 'Faso',
 'Burund',
 'Cambod',
 'Cameroon',
 'Canad',
 'Cape',
 'Verd',
 'Carib',
 'Netherland',
 'Caym',
 'Island',
 'Cent',
 'Afri',
 'Republ',
 'Chad',
 'Chil',
 'Chin',
 'Christmas',
 'Island',
 'Coco',
 '(keeling)',
 'Island',
 'Colomb',
 'Comoro',
 'Congo',
 'Brazzavil',
 'Congo',
 'Kinshas',
 'Cook',
 'Island',
 'Cost',
 'Rica',
 'Côte',
 'D’ivoire',
 'Croat',
 'Cuba',
 'Curaçao',
 'Cypr',
 'Czech',
 'Denmark',
 'Djibout',
 'Dominic',
 'Domin',
 'Republ',
 'Ecuad',
 'Egypt',
 'El',
 'Salvad',
 'E

In [463]:
token_with_country_prefix = []
for rt in recipe_tokens:
    for lan in lancester_c:
        if lan in rt:
            token_with_country_prefix.append(rt)

token_with_country_prefix = sorted(list(set(token_with_country_prefix)))
token_with_country_prefix.remove("No-Cook")
token_with_country_prefix.remove("Man")
token_with_country_prefix.remove("Slow-Cooked")
token_with_country_prefix.remove("Slow-Cooker")
token_with_country_prefix.remove("Garlic-Anchovy-Sardine")
token_with_country_prefix

ValueError: list.remove(x): x not in list

In [464]:
token_with_country_prefix

["'Chinese",
 'Afghan',
 'Afghani',
 'African',
 'African-Style',
 'Afritada',
 'Algerian',
 'Almond-Ricotta',
 'American',
 'Americano',
 'Arabic',
 'Argentine',
 'Argentinean',
 'Armenian',
 'Asiago',
 'Asian',
 'Asian-Inspired',
 'Asian-Style',
 'Asian-Themed',
 'Australian',
 'Bangladeshi',
 'Belgi',
 'Belgian',
 'Belizean',
 'Bermuda',
 'Bhutanese',
 'Bolivian',
 'Brazilian',
 'Brazilian-Style',
 'British',
 'Bulgarian',
 'Cambodian',
 'Canada',
 'Canadian',
 'Cape',
 'Capezzoli',
 'Caribbean',
 'Caribbean-Spiced',
 'Chad',
 'Chilaquiles',
 'Chile',
 'Chilean',
 'Chilean-Style',
 'Chili',
 'Chinese',
 'Chinese-Style',
 'Christmas',
 'Coco',
 'Coconut',
 'Coconut-Lentil',
 'Coconut-Lime',
 'Cocotte',
 'Colombian',
 'Cooker',
 'Cookies',
 'Cooks',
 'Cookup',
 'Costa',
 'Croatian',
 'Cuban',
 'Cuban-Inspired',
 'Cuban-Style',
 'Cubanos',
 'Curry-Coconut',
 'Czech',
 'Czechoslovakian',
 'Danielle',
 'Danish',
 'Dominican',
 'Dominican-Style',
 'East',
 'Easter',
 'Eastern',
 'Eastern-

In [465]:
possesive_tokens = list_words_with_tag(tagged_recipe_names, "''")
possesive_tokens

["''", "''", "'", "''", "''", "''", "''"]

In [466]:
possessive_names = []
for ps in possesive_tokens:
    print(find_value_with_char(p_recipes, 'name', ps))
    possessive_names = possessive_names + find_value_with_char(p_recipes, 'name', ps)
    
possessive_names = sorted(possessive_names)

[]
[]
["Chef John's Chicken Kiev", "Angela's Awesome Enchiladas", "Randy's Slow Cooker Ravioli Lasagna", "'Chinese Buffet' Green Beans", "Chef John's Beef Rouladen", "Corned Beef and Cabbage Shepherd's Pie", "Gramma's Date Squares", "Authentic Russian Salad 'Olivye'", "Chef John's Meatless Meatballs", "Chef John's Beef Goulash", "Grandma's Noodles II", "Chef John's Clotted Cream", "Newfoundland Jigg's Dinner", "Chef John's Coq Au Vin", "Chef John's Loco Moco", "Dash's Donair", "Turkey Shepherd's Pie", "Papa Drexler's Bavarian Pretzels", "Bob's Stuffed Banana Peppers", "Chef John's Swedish Meatballs", "Chef John's German Recipes", "Chef John's Chicken Tikka Masala", "Maria's Mexican Rice", "Mom's Buttermilk Pancakes", "Geneva's Ultimate Hungarian Mushroom Soup", "Charley's Slow Cooker Mexican Style Meat", "Ingrid's Rouladen", "Chef John's Lasagna", "Lola's Horchata", "Chef John's Italian Sausage Chili", "Kid's Favorite Pizza Casserole", "Traci's Adobo Seasoning", "Frank's Favorite Slow-

In [467]:
possessive_names

["'Calabacitas Guisada'",
 "'Chinese Buffet' Green Beans",
 "'Otai",
 "A Firefighter's Meatloaf",
 "A Scotsman's Shepherd Pie",
 "Abuela's Picadillo",
 "Adriel's Chinese Curry Chicken",
 "Al's Baked Swiss Steak",
 "Al's Burmese Chicken Curry",
 "Ali's Amazing Bruschetta",
 "Alicia's Aloo Gobi",
 "Allie's Mushroom Pizza",
 "Alysia's Basic Meat Lasagna",
 "Amanda's Stuffed Peppers",
 "Andy's Spicy Green Chile Pork",
 "Angela's Asian-Inspired Chicken Noodle Soup",
 "Angela's Awesome Enchiladas",
 "Anne's Chicken Chilaquiles Rojas",
 "Aunt Anne's Sesame Cookies",
 "Aunt Bert's Fruitcake Cookies",
 "Aunt Mary's Eggplant Balls",
 "Authentic Russian Salad 'Olivye'",
 "Bailey's Irish Cream Brownies",
 "Baked Za'atar Chicken Thighs",
 "Barbarella's German Pancakes",
 "Ben's Bearnaise Sauce",
 "Biddy's Butternut Squash and Turkey Lasagna",
 "Big Ben's Beef Machaca",
 "Bill's Peruvian Chicken and Rice",
 "Bob's Awesome Lasagna",
 "Bob's Slow Cooker Braciole",
 "Bob's Stuffed Banana Peppers",
 "Bo

In [468]:
all_recipe_names[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

In [469]:
all_recipe_names2 = all_recipe_names.copy()
all_recipe_names2[:10]

['Pan-Fried Asparagus',
 'Creamy Au Gratin Potatoes',
 'Super-Delicious Zuppa Toscana',
 'Simple Teriyaki Sauce',
 'Spicy Korean Fried Chicken with Gochujang Sauce',
 'Spaghetti Aglio e Olio',
 'Easy Garam Masala',
 'Easy Chorizo Street Tacos',
 'Russian Cabbage Rolls with Gravy',
 'Shrimp Scampi with Pasta']

In [470]:
print(len(all_recipe_names))
  
all_recipe_names2 = [ele for ele in all_recipe_names2 if ele not in possessive_names] 
print(len(all_recipe_names2))

5249
4890


In [471]:
# https://stackoverflow.com/questions/40291443/python-convert-a-string-to-lowercase-except-some-special-strings/40291577
lowerAllExcept = lambda x: " ".join( a if a in token_with_country_prefix else a.lower()
                                    for a in x.split() )

for i, recipe in enumerate(all_recipe_names2):
    for t in token_with_country_prefix:
        all_recipe_names2[i] = lowerAllExcept(all_recipe_names2[i])

In [472]:
all_recipe_names2 = all_recipe_names2 +  possessive_names
print(len(all_recipe_names2))
all_recipe_names2 = list(set(all_recipe_names2))
print(len(all_recipe_names2))

5249
5104


In [473]:
all_recipe_names_corpus = ("\n").join(all_recipe_names2)

recipe_tokens = list(set(nltk.word_tokenize(all_recipe_names_corpus)))

lower_recipe_tokens = []
for token in recipe_tokens:
    if token[0].islower():
        lower_recipe_tokens.append(token)
        
lower_recipe_tokens

['scallion',
 'company',
 'upside-down',
 'chick',
 'frikadeller',
 'gelatin',
 'huancaina',
 'margherita',
 'nutty',
 'hoisin',
 'pudim',
 'mushrooms',
 'brisket',
 'mayan',
 'on',
 'crazy',
 'pastry',
 'normandy',
 'maast-o',
 'paella',
 'chipotle',
 'pressure',
 'booyah',
 'yellow',
 'cacio',
 'omelet',
 'queen',
 'cioccolata',
 'twists',
 'thighs',
 'fantastic',
 'salpicao',
 'raspberry',
 'bircher',
 'mince',
 'kransekake',
 'mash',
 'apple',
 'michigan',
 'turnips',
 'nasi',
 'yorkies',
 'saltibarsciai',
 'ladoo',
 'omelette',
 'pickerel',
 'rustic',
 'not-so-corny',
 'siopao',
 'wrappers',
 'savillum',
 'pintos',
 'bokkeumbap',
 'hi-rise',
 'champorado',
 'fun',
 'spiedies',
 'toasted',
 'green',
 'nori',
 'gurkensalat',
 'bleu',
 'less',
 'injera',
 'breasts',
 'ladolemono',
 'tokneneng',
 'menudo',
 'fryer',
 'blue',
 'squares',
 'radish',
 'bistek',
 'taqueria',
 'mushy',
 'autenticos',
 'crumpets',
 'gourmet',
 'poffertjes',
 'three-meat',
 'fruity',
 'ube-macapuno',
 'rappi

In [474]:
len(lower_recipe_tokens)

2713

In [475]:
upper_recipe_tokens = list(filter(str.istitle, recipe_tokens))
len(upper_recipe_tokens)

850

In [476]:
upper_recipe_tokens

['Awesome',
 'Oh-So-Savory',
 'Roast',
 'Bulgarian',
 'Date',
 'Icing',
 'Christmas',
 'Au',
 'East',
 'Southwestern-Flavored',
 'Belizean',
 'Maul',
 'Biscuits',
 'Jamaican',
 'Iraqi',
 "L'Ossa",
 'Pizzaiola',
 'Overnight',
 'India',
 "D'Amaretti",
 'Peruano',
 'Cream',
 'Colombian',
 "Dell'Impruneta",
 'Rustica',
 'Puffs',
 'Incredible',
 'Dana',
 'Seared',
 'Green',
 'Baby',
 'Eastern',
 'Sauerbraten',
 'This',
 'Swordfish',
 'Duarte',
 'Patti',
 'Oriold',
 'Algerian',
 'Savannah',
 'Harissa',
 'Sando',
 'Troy',
 'Steve',
 'Barbarella',
 'Marinade',
 'Grammy',
 'Schwabischer',
 'Bavarian',
 'Isolde',
 'Ninabell',
 'Jamey',
 'Tea',
 'Nina',
 'Samosas',
 'Argentinean',
 'Temptation',
 'Jalapeno',
 'Malian',
 'Pig',
 'Yorkshire',
 'Samosadilla',
 'Scarlett',
 'Slow-Cooker',
 'Cranberry',
 'Recipe',
 'Stir-Fry',
 'Janssons',
 'Meatloaf',
 'Indian-Spiced',
 'No-Knead',
 'Indian-Style',
 'Newfoundland',
 'Leek',
 'Tarts',
 'Mozambique',
 'Jenn',
 'Newfoundland-Style',
 'Recipes',
 'Jansso

## Updating POS tags in names after changing casing

In [477]:
final_tagged_names = []

for recipe in all_recipe_names2:
    final_tagged_names.append(tag_pos(recipe))

all_name_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(final_tagged_names, POS)}
  all_name_tags.append(new_dic)

get_tag_number(all_name_tags)

[{'$': 0},
 {"''": 7},
 {'(': 0},
 {')': 0},
 {',': 61},
 {'--': 0},
 {'.': 1},
 {':': 1},
 {'CC': 491},
 {'CD': 27},
 {'DT': 97},
 {'EX': 3},
 {'FW': 46},
 {'IN': 510},
 {'JJ': 2988},
 {'JJR': 10},
 {'JJS': 5},
 {'LS': 0},
 {'MD': 2},
 {'NN': 8442},
 {'NNP': 1967},
 {'NNPS': 8},
 {'NNS': 1333},
 {'PDT': 0},
 {'POS': 342},
 {'PRP': 4},
 {'PRP$': 1},
 {'RB': 58},
 {'RBR': 0},
 {'RBS': 0},
 {'RP': 4},
 {'SYM': 1},
 {'TO': 10},
 {'UH': 0},
 {'VB': 52},
 {'VBD': 258},
 {'VBG': 94},
 {'VBN': 223},
 {'VBP': 201},
 {'VBZ': 25},
 {'WDT': 0},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 6}]

## Chunking (recipe names)

If the number of words in recipes are more than 2 (since bigram can deal with 2-word names), then it can be treated as a recipe name chunk

In [478]:
def sort_unique_list(old_list):
    return sorted(list(set(old_list)))

In [479]:
recipe_name_chunk = []

for recipe in all_recipe_names2:
    if len(recipe.split()) > 2:
        recipe_name_chunk.append(recipe)

recipe_name_chunk = sort_unique_list(names_with_nns)

for n in recipe_name_chunk:
    print(n)

"pantry raid" chicken enchilada casserole
"skinny" chicken tacos
'Chinese Buffet' Green Beans
3-ingredient lemon scones
5-ingredient Mexican casserole
A Firefighter's Meatloaf
Afghan beef raviolis
Afghani kabli pulao
African chicken stew
African sweet potato and peanut soup
African sweet potato stew
Amanda's Stuffed Peppers
Angela's Awesome Enchiladas
Anne's Chicken Chilaquiles Rojas
Argentine chimichurri bread
Argentine meat empanadas
Argentinean cheese bread
Armenian shish kabob
Asiago sun-dried tomato pasta
Asian beef with snow peas
Asian chicken salad
Asian cucumber and peanut salad
Asian dipping sauce
Asian garlic beef cubes
Asian ginger dressing
Asian ginger grill marinade
Asian grilled chicken
Asian ground beef noodle bowls
Asian marinated pork chops
Asian pork tenderloin
Asian roasted brussels sprouts
Asian sesame seared or grilled tuna
Asian shrimp rice bowl
Asian slow Cooker recipes
Asian style meatloaf
Asian style paper wrapped chicken
Asian tuna patties
Asian tuna salad
Asi

thai chicken curry
thai chicken curry in coconut milk
thai chicken noodle soup
thai chicken satay
thai chicken spring rolls
thai chicken stock
thai chicken thigh bake
thai chicken thighs
thai chicken wings
thai chicken with basil stir fry
thai dipping sauce for spring rolls
thai fried rice with pineapple and chicken
thai ginger chicken
thai green curry chicken
thai green curry with chicken
thai ground chicken basil
thai orange chicken
thai peanut butter sauce
thai peanut chicken
thai peanut chicken lo mein
thai peanut dressing
thai peanut stir fry sauce
thai pineapple chicken curry
thai pork with peanut sauce
thai quivering tenderloins
thai red chicken curry
thai rice noodle salad
thai spicy basil chicken fried rice
thai steamed mussels
thai sweet potato soup
thai yellow chicken curry
the Irish potatoes
the Korean chicken recipes
the amazing Swedish meatball
the original camarones a la diabla
the perfect Egyptian rice with vermicelli
the perfect blended margarita
the perfect simplified

In [482]:
in_tokens = sort_unique_list(get_values_from_dict_list(all_name_tags, 'IN')[0])

in_tokens

['Of',
 'Under',
 'arroz',
 'bayrischer',
 'before',
 'beyond',
 'brown',
 'by',
 'de',
 'dough',
 'en',
 'for',
 'from',
 'in',
 'of',
 'on',
 'over',
 'pina',
 'so',
 'trout',
 'under',
 'with',
 'without']

In [488]:
# recipe_name_chunk = []

for recipe in all_recipe_names2:
    for i in in_tokens:
        if i in recipe:
            tagged = tag_pos(recipe)
            for tag in tagged:
                if tag[1] == "IN":
                    print(tag_pos(recipe))
#             recipe_name_chunk.append(recipe)

# recipe_name_chunk = sort_unique_list(names_with_nns)

# for n in recipe_name_chunk:
#     print(n)

[('Italian', 'JJ'), ('beef', 'NN'), ('for', 'IN'), ('Sandwiches', 'NNP')]
[('chicken', 'VBN'), ('marsala', 'NN'), ('with', 'IN'), ('portobello', 'NN'), ('mushrooms', 'NNS')]
[('chicken', 'VBN'), ('marsala', 'NN'), ('with', 'IN'), ('portobello', 'NN'), ('mushrooms', 'NNS')]
[('Nigerian', 'JJ'), ('jollof', 'NN'), ('rice', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('fried', 'VBD'), ('plantains', 'NNS')]
[('Nigerian', 'JJ'), ('jollof', 'NN'), ('rice', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('fried', 'VBD'), ('plantains', 'NNS')]
[('Nigerian', 'JJ'), ('jollof', 'NN'), ('rice', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('fried', 'VBD'), ('plantains', 'NNS')]
[('Nigerian', 'JJ'), ('jollof', 'NN'), ('rice', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('fried', 'VBD'), ('plantains', 'NNS')]
[('Japanese', 'JJ'), ('soup', 'NN'), ('with', 'IN'), ('tofu', 'NN'), ('and', 'CC'), ('mushrooms', 'NNS')]
[('Japanese', 'JJ'), ('soup', 'NN'), ('w

[('penne', 'NN'), ('with', 'IN'), ('sausage', 'NN'), ('and', 'CC'), ('broccoli', 'NN'), ('rabe', 'NN')]
[('prosciutto-wrapped', 'JJ'), ('chicken', 'NN'), ('breasts', 'NNS'), ('with', 'IN'), ('herbed', 'JJ'), ('goat', 'NN'), ('cheese', 'NN')]
[('prosciutto-wrapped', 'JJ'), ('chicken', 'NN'), ('breasts', 'NNS'), ('with', 'IN'), ('herbed', 'JJ'), ('goat', 'NN'), ('cheese', 'NN')]
[('calamari', 'NN'), ('in', 'IN'), ('a', 'DT'), ('creamy', 'JJ'), ('white', 'JJ'), ('wine', 'NN'), ('sauce', 'NN')]
[('pasta', 'NN'), ('with', 'IN'), ('cottage', 'NN'), ('cheese', 'NN')]
[('homemade', 'NN'), ('pasta', 'NN'), ('without', 'IN'), ('a', 'DT'), ('pasta', 'NN'), ('machine', 'NN')]
[('homemade', 'NN'), ('pasta', 'NN'), ('without', 'IN'), ('a', 'DT'), ('pasta', 'NN'), ('machine', 'NN')]
[('homemade', 'NN'), ('pasta', 'NN'), ('without', 'IN'), ('a', 'DT'), ('pasta', 'NN'), ('machine', 'NN')]
[('homemade', 'NN'), ('pasta', 'NN'), ('without', 'IN'), ('a', 'DT'), ('pasta', 'NN'), ('machine', 'NN')]
[('Dee', 

[('easy', 'JJ'), ('spinach', 'NN'), ('lasagna', 'NN'), ('with', 'IN'), ('white', 'JJ'), ('sauce', 'NN')]
[('easy', 'JJ'), ('spinach', 'NN'), ('lasagna', 'NN'), ('with', 'IN'), ('white', 'JJ'), ('sauce', 'NN')]
[('Indian', 'JJ'), ('chicken', 'NN'), ('korma', 'NN'), ('in', 'IN'), ('the', 'DT'), ('slow', 'JJ'), ('Cooker', 'NN')]
[('Indian', 'JJ'), ('chicken', 'NN'), ('korma', 'NN'), ('in', 'IN'), ('the', 'DT'), ('slow', 'JJ'), ('Cooker', 'NN')]
[('turmeric', 'JJ'), ('golden', 'JJ'), ('milk', 'NN'), ('with', 'IN'), ('turmeric', 'JJ'), ('paste', 'NN')]
[('turmeric', 'JJ'), ('golden', 'JJ'), ('milk', 'NN'), ('with', 'IN'), ('turmeric', 'JJ'), ('paste', 'NN')]
[('turmeric', 'JJ'), ('golden', 'JJ'), ('milk', 'NN'), ('with', 'IN'), ('turmeric', 'JJ'), ('paste', 'NN')]
[('Indian-Style', 'JJ'), ('rice', 'NN'), ('with', 'IN'), ('cashews', 'NNS'), (',', ','), ('raisins', 'NNS'), ('and', 'CC'), ('turmeric', 'NN')]
[('Indian-Style', 'JJ'), ('rice', 'NN'), ('with', 'IN'), ('cashews', 'NNS'), (',', ','

[('gratin', 'NN'), ('dauphinois', 'NN'), ('de', 'IN'), ('solange', 'NN')]
[('gratin', 'NN'), ('dauphinois', 'NN'), ('de', 'IN'), ('solange', 'NN')]
[('gnocchi', 'NNS'), ('with', 'IN'), ('sage-butter', 'JJ'), ('sauce', 'NN')]
[('spinach', 'NN'), ('manicotti', 'NN'), ('with', 'IN'), ('Italian', 'JJ'), ('sausage', 'NN')]
[('spinach', 'NN'), ('manicotti', 'NN'), ('with', 'IN'), ('Italian', 'JJ'), ('sausage', 'NN')]
[('spinach', 'NN'), ('manicotti', 'NN'), ('with', 'IN'), ('Italian', 'JJ'), ('sausage', 'NN')]
[('margarita', 'NN'), ('on', 'IN'), ('the', 'DT'), ('rocks', 'NNS')]
[('baked', 'VBN'), ('chicken', 'NN'), ('in', 'IN'), ('a', 'DT'), ('sweet', 'JJ'), ('bbq', 'NN'), ('sauce', 'NN')]
[('baked', 'VBN'), ('chicken', 'NN'), ('in', 'IN'), ('a', 'DT'), ('sweet', 'JJ'), ('bbq', 'NN'), ('sauce', 'NN')]
[('West', 'NNP'), ('African-Style', 'JJ'), ('peanut', 'NN'), ('stew', 'NN'), ('with', 'IN'), ('chicken', 'NN')]
[('West', 'NNP'), ('African-Style', 'JJ'), ('peanut', 'NN'), ('stew', 'NN'), ('wi

[('Chinese-Style', 'JJ'), ('baby', 'NN'), ('bok', 'NNS'), ('choy', 'VBP'), ('with', 'IN'), ('mushroom', 'NN'), ('sauce', 'NN')]
[('Chinese-Style', 'JJ'), ('baby', 'NN'), ('bok', 'NNS'), ('choy', 'VBP'), ('with', 'IN'), ('mushroom', 'NN'), ('sauce', 'NN')]
[('Chinese-Style', 'JJ'), ('baby', 'NN'), ('bok', 'NNS'), ('choy', 'VBP'), ('with', 'IN'), ('mushroom', 'NN'), ('sauce', 'NN')]
[('vegan', 'JJ'), ('chickpea', 'NN'), ('curry', 'NN'), ('without', 'IN'), ('Coconut', 'NNP'), ('milk', 'NN')]
[('vegan', 'JJ'), ('chickpea', 'NN'), ('curry', 'NN'), ('without', 'IN'), ('Coconut', 'NNP'), ('milk', 'NN')]
[('vegan', 'JJ'), ('chickpea', 'NN'), ('curry', 'NN'), ('without', 'IN'), ('Coconut', 'NNP'), ('milk', 'NN')]
[('Greek', 'JJ'), ('pizza', 'NN'), ('with', 'IN'), ('spinach', 'NN'), (',', ','), ('feta', 'NN'), ('and', 'CC'), ('olives', 'NNS')]
[('Greek', 'JJ'), ('pizza', 'NN'), ('with', 'IN'), ('spinach', 'NN'), (',', ','), ('feta', 'NN'), ('and', 'CC'), ('olives', 'NNS')]
[('Greek', 'JJ'), ('pi

[('sauerkraut', 'NN'), ('filling', 'VBG'), ('for', 'IN'), ('Pierogi', 'NNP')]
[('pork', 'NN'), ('loin', 'VBD'), ('roast', 'NN'), ('with', 'IN'), ('baby', 'NN'), ('bellas', 'NNS')]
[('pork', 'NN'), ('loin', 'VBD'), ('roast', 'NN'), ('with', 'IN'), ('baby', 'NN'), ('bellas', 'NNS')]
[('pork', 'NN'), ('loin', 'VBD'), ('roast', 'NN'), ('with', 'IN'), ('baby', 'NN'), ('bellas', 'NNS')]
[('chicken', 'NN'), ('with', 'IN'), ('chanterelle', 'NN'), ('mushrooms', 'NNS'), ('and', 'CC'), ('marsala', 'NNS'), ('wine', 'VBP')]
[('chicken', 'NN'), ('with', 'IN'), ('chanterelle', 'NN'), ('mushrooms', 'NNS'), ('and', 'CC'), ('marsala', 'NNS'), ('wine', 'VBP')]
[('chicken', 'NN'), ('with', 'IN'), ('chanterelle', 'NN'), ('mushrooms', 'NNS'), ('and', 'CC'), ('marsala', 'NNS'), ('wine', 'VBP')]
[('homemade', 'NN'), ('mince', 'NN'), ('pie', 'NN'), ('with', 'IN'), ('crumbly', 'NN'), ('topping', 'VBG')]
[('homemade', 'NN'), ('mince', 'NN'), ('pie', 'NN'), ('with', 'IN'), ('crumbly', 'NN'), ('topping', 'VBG')]
[

[('stir-fried', 'JJ'), ('mushrooms', 'NNS'), ('with', 'IN'), ('baby', 'NN'), ('corn', 'NN')]
[('Chinese', 'JJ'), ('steamed', 'VBD'), ('buns', 'NNS'), ('with', 'IN'), ('bbq', 'NN'), ('pork', 'NN'), ('filling', 'NN')]
[('Chinese', 'JJ'), ('steamed', 'VBD'), ('buns', 'NNS'), ('with', 'IN'), ('bbq', 'NN'), ('pork', 'NN'), ('filling', 'NN')]
[('drowned', 'VBN'), ('beef', 'NN'), ('Sandwich', 'NNP'), ('with', 'IN'), ('chipotle', 'JJ'), ('sauce', 'NN')]
[('chicken', 'NN'), ('pad', 'NN'), ('Thai', 'NNP'), ('with', 'IN'), ('peanut', 'NN'), ('sauce', 'NN')]
[('chicken', 'NN'), ('pad', 'NN'), ('Thai', 'NNP'), ('with', 'IN'), ('peanut', 'NN'), ('sauce', 'NN')]
[('roasted', 'VBN'), ('chicken', 'NN'), ('with', 'IN'), ('risotto', 'NN'), ('and', 'CC'), ('caramelized', 'JJ'), ('onions', 'NNS')]
[('roasted', 'VBN'), ('chicken', 'NN'), ('with', 'IN'), ('risotto', 'NN'), ('and', 'CC'), ('caramelized', 'JJ'), ('onions', 'NNS')]
[('roasted', 'VBN'), ('chicken', 'NN'), ('with', 'IN'), ('risotto', 'NN'), ('and

[('chicken', 'NN'), ('lo', 'NN'), ('mein', 'NN'), ('with', 'IN'), ('broccoli', 'NN')]
[('orzo', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('artichokes', 'NNS')]
[('orzo', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('artichokes', 'NNS')]
[('stir', 'NN'), ('fried', 'VBD'), ('sesame', 'JJ'), ('vegetables', 'NNS'), ('with', 'IN'), ('rice', 'NN')]
[('grilled', 'VBN'), ('chicken', 'NN'), ('shawarma', 'NN'), ('wraps', 'NNS'), ('with', 'IN'), ('raita', 'NN')]
[('grilled', 'VBN'), ('chicken', 'NN'), ('shawarma', 'NN'), ('wraps', 'NNS'), ('with', 'IN'), ('raita', 'NN')]
[('lumpia', 'NN'), ('in', 'IN'), ('the', 'DT'), ('air', 'NN'), ('fryer', 'NN')]
[('flan', 'NN'), ('de', 'IN'), ('Coco', 'NNP')]
[('potato', 'NN'), ('and', 'CC'), ('cheese', 'JJ'), ('filling', 'NN'), ('for', 'IN'), ('Pierogi', 'NNP')]
[('potato', 'NN'), ('and', 'CC'), ('cheese', 'JJ'), ('filling', 'NN'), ('for', 'IN'), ('Pierogi', 'NNP')]
[('Czech', 'JJ'), ('dumpling', 'VBG'), ('with', 'IN'), ('sauerkr

[('instant', 'JJ'), ('pot', 'NN'), ('Puerto', 'NNP'), ('Rican', 'NNP'), ('arroz', 'IN'), ('con', 'NN'), ('Pollo', 'NNP')]
[('instant', 'JJ'), ('pot', 'NN'), ('Puerto', 'NNP'), ('Rican', 'NNP'), ('arroz', 'IN'), ('con', 'NN'), ('Pollo', 'NNP')]
[('sweet', 'NN'), ('and', 'CC'), ('spicy', 'NN'), ('stir', 'NN'), ('fry', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('broccoli', 'NN')]
[('sweet', 'NN'), ('and', 'CC'), ('spicy', 'NN'), ('stir', 'NN'), ('fry', 'NN'), ('with', 'IN'), ('chicken', 'NN'), ('and', 'CC'), ('broccoli', 'NN')]
[('pain', 'NN'), ('de', 'IN'), ('campagne', 'NN')]
[('pain', 'NN'), ('de', 'IN'), ('campagne', 'NN')]
[('apple', 'NN'), ('coffee', 'NN'), ('cake', 'NN'), ('with', 'IN'), ('brown', 'JJ'), ('sugar', 'NN'), ('sauce', 'NN')]
[('apple', 'NN'), ('coffee', 'NN'), ('cake', 'NN'), ('with', 'IN'), ('brown', 'JJ'), ('sugar', 'NN'), ('sauce', 'NN')]
[('apple', 'NN'), ('coffee', 'NN'), ('cake', 'NN'), ('with', 'IN'), ('brown', 'JJ'), ('sugar', 'NN'), ('sauce', 'N

[('grilled', 'JJ'), ('salmon', 'NN'), ('with', 'IN'), ('avocado', 'NN'), ('dip', 'NN')]
[('dutch', 'NN'), ('apple', 'NN'), ('pie', 'NN'), ('with', 'IN'), ('oatmeal', 'JJ'), ('streusel', 'NN')]
[('bengali', 'NN'), ('chicken', 'NN'), ('curry', 'NN'), ('with', 'IN'), ('potatoes', 'NNS')]
[('bengali', 'NN'), ('chicken', 'NN'), ('curry', 'NN'), ('with', 'IN'), ('potatoes', 'NNS')]
[('Korean', 'JJ'), ('braised', 'VBD'), ('mackerel', 'NN'), ('with', 'IN'), ('radish', 'JJ')]
[('braised', 'VBN'), ('chuck', 'NN'), ('roast', 'NN'), ('in', 'IN'), ('red', 'JJ'), ('wine', 'NN')]
[('without', 'IN'), ('the', 'DT'), ('chili', 'NN'), ('powder', 'NN')]
[('without', 'IN'), ('the', 'DT'), ('chili', 'NN'), ('powder', 'NN')]
[('without', 'IN'), ('the', 'DT'), ('chili', 'NN'), ('powder', 'NN')]
[('easy', 'JJ'), ('French', 'JJ'), ('onion', 'NN'), ('soup', 'NN'), ('for', 'IN'), ('guests', 'NNS')]
[('easy', 'JJ'), ('French', 'JJ'), ('onion', 'NN'), ('soup', 'NN'), ('for', 'IN'), ('guests', 'NNS')]
[('easy', 'JJ'

## Chunking (ingredients)

In [490]:
all_ingre_tags = []

for POS in ALL_POS:
  new_dic = {POS: list_words_with_tag(tagged_recipe_ingredients, POS)}
  all_ingre_tags.append(new_dic)

get_tag_number(all_ingre_tags)

[{'$': 0},
 {"''": 14},
 {'(': 3742},
 {')': 3827},
 {',': 8513},
 {'--': 0},
 {'.': 23},
 {':': 295},
 {'CC': 3094},
 {'CD': 21802},
 {'DT': 99},
 {'EX': 0},
 {'FW': 52},
 {'IN': 2849},
 {'JJ': 13400},
 {'JJR': 523},
 {'JJS': 6},
 {'LS': 0},
 {'MD': 612},
 {'NN': 32984},
 {'NNP': 2416},
 {'NNPS': 2},
 {'NNS': 13598},
 {'PDT': 1},
 {'POS': 126},
 {'PRP': 1},
 {'PRP$': 1},
 {'RB': 1452},
 {'RBR': 5},
 {'RBS': 0},
 {'RP': 13},
 {'SYM': 53},
 {'TO': 1039},
 {'UH': 0},
 {'VB': 1724},
 {'VBD': 8947},
 {'VBG': 354},
 {'VBN': 3436},
 {'VBP': 645},
 {'VBZ': 588},
 {'WDT': 1},
 {'WP': 0},
 {'WP$': 0},
 {'WRB': 0},
 {'``': 0}]

In [494]:
sort_unique_list(get_values_from_dict_list(all_ingre_tags, 'VBG')[0])

['Cooking',
 'Darjeeling',
 'Dipping',
 'Peking',
 'Riesling',
 'Seasoning',
 'Shaoxing',
 'baking',
 'boiling',
 'browning',
 'brushing',
 'canning',
 'casing',
 'coating',
 'coloring',
 'cooking',
 'curing',
 'decorating',
 'dipping',
 'dredging',
 'dressing',
 'drizzling',
 'dumpling',
 'dusting',
 'filling',
 'finishing',
 'flavoring',
 'flying',
 'frying',
 'greasing',
 'including',
 'pickling',
 'pudding',
 'rimming',
 'ring',
 'roasting',
 'rolling',
 'seasoning',
 'serving',
 'shortening',
 'sparkling',
 'sprinkling',
 'string',
 'stuffing',
 'topping',
 'using',
 'whipping',
 'wing']

## Data merging and creating bigram

In [1299]:
all_recipe_names2[:10]

['pan-fried asparagus',
 'creamy au gratin potatoes',
 'super-delicious zuppa toscana',
 'simple teriyaki sauce',
 'spicy Korean fried chicken with gochujang sauce',
 'spaghetti aglio e olio',
 'easy garam masala',
 'easy chorizo street tacos',
 'Russian cabbage rolls with gravy',
 'shrimp scampi with pasta']

In [1300]:
p_ingredients[:10]

['4 tablespoons grated orange peel',
 '4 green bell pepper, cut into matchstick-size pieces',
 '4 cups dry bread crumbs',
 '4 teaspoon New Mexico chili powder',
 '4 teaspoon mayonnaise, or to taste',
 '4.4 tablespoons cornstarch',
 '4 ounces dry miniature ravioli',
 '4 cups peeled and cubed potatoes',
 '4 bunch fresh basil, divided',
 '4 sugar snap peas']

In [1309]:
def generate_bigram_from_entry(entry):
    bigrams = nltk.bigrams(entry.split(' '))
    frequence = nltk.FreqDist(bigrams)
    return dict(sorted(frequence.items(), key=lambda item: item[0]))

generate_bigram_from_entry("4 tablespoons grated orange peel")

{('4', 'tablespoons'): 1,
 ('grated', 'orange'): 1,
 ('orange', 'peel'): 1,
 ('tablespoons', 'grated'): 1}

In [1311]:
from collections import Counter

name_bigrams = {}

for name in all_recipe_names2:
    name_bigrams = dict(Counter(name_bigrams)+Counter(generate_bigram_from_entry(name)))
    
name_bigrams

{('pan-fried', 'asparagus'): 1,
 ('au', 'gratin'): 3,
 ('creamy', 'au'): 1,
 ('gratin', 'potatoes'): 1,
 ('super-delicious', 'zuppa'): 1,
 ('zuppa', 'toscana'): 3,
 ('simple', 'teriyaki'): 1,
 ('teriyaki', 'sauce'): 9,
 ('Korean', 'fried'): 5,
 ('chicken', 'with'): 24,
 ('fried', 'chicken'): 10,
 ('gochujang', 'sauce'): 3,
 ('spicy', 'Korean'): 4,
 ('with', 'gochujang'): 2,
 ('aglio', 'e'): 1,
 ('e', 'olio'): 1,
 ('spaghetti', 'aglio'): 1,
 ('easy', 'garam'): 1,
 ('garam', 'masala'): 3,
 ('chorizo', 'street'): 1,
 ('easy', 'chorizo'): 1,
 ('street', 'tacos'): 3,
 ('Russian', 'cabbage'): 5,
 ('cabbage', 'rolls'): 11,
 ('rolls', 'with'): 6,
 ('with', 'gravy'): 2,
 ('scampi', 'with'): 1,
 ('shrimp', 'scampi'): 7,
 ('with', 'pasta'): 2,
 ('and', 'potato'): 17,
 ('chicken', 'and'): 51,
 ('greek', 'lemon'): 6,
 ('lemon', 'chicken'): 4,
 ('potato', 'bake'): 3,
 ('easy', 'mexican'): 6,
 ('mexican', 'casserole'): 5,
 ('apple', 'cake'): 8,
 ('cake', 'i'): 3,
 ('german', 'apple'): 7,
 ('Spanish',

In [1312]:
ingre_bigrams = {}

for ingre in p_ingredients:
    ingre_bigrams = dict(Counter(ingre_bigrams)+Counter(generate_bigram_from_entry(ingre)))
    
ingre_bigrams

{('4', 'tablespoons'): 1348,
 ('grated', 'orange'): 9,
 ('orange', 'peel'): 8,
 ('tablespoons', 'grated'): 29,
 ('4', 'green'): 170,
 ('bell', 'pepper,'): 182,
 ('cut', 'into'): 990,
 ('green', 'bell'): 125,
 ('into', 'matchstick-size'): 6,
 ('matchstick-size', 'pieces'): 6,
 ('pepper,', 'cut'): 64,
 ('4', 'cups'): 1004,
 ('bread', 'crumbs'): 78,
 ('cups', 'dry'): 21,
 ('dry', 'bread'): 25,
 ('4', 'teaspoon'): 590,
 ('Mexico', 'chili'): 1,
 ('New', 'Mexico'): 9,
 ('chili', 'powder'): 41,
 ('teaspoon', 'New'): 2,
 ('mayonnaise,', 'or'): 6,
 ('or', 'to'): 534,
 ('teaspoon', 'mayonnaise,'): 1,
 ('to', 'taste'): 956,
 ('4.4', 'tablespoons'): 159,
 ('tablespoons', 'cornstarch'): 9,
 ('4', 'ounces'): 501,
 ('dry', 'miniature'): 1,
 ('miniature', 'ravioli'): 1,
 ('ounces', 'dry'): 15,
 ('and', 'cubed'): 88,
 ('cubed', 'potatoes'): 6,
 ('cups', 'peeled'): 12,
 ('peeled', 'and'): 494,
 ('4', 'bunch'): 105,
 ('basil,', 'divided'): 3,
 ('bunch', 'fresh'): 54,
 ('fresh', 'basil,'): 19,
 ('4', 'sug

In [1313]:
all_bigrams = {}

all_bigrams = dict(Counter(name_bigrams)+Counter(ingre_bigrams))

all_bigrams

{('pan-fried', 'asparagus'): 1,
 ('au', 'gratin'): 3,
 ('creamy', 'au'): 1,
 ('gratin', 'potatoes'): 1,
 ('super-delicious', 'zuppa'): 1,
 ('zuppa', 'toscana'): 3,
 ('simple', 'teriyaki'): 1,
 ('teriyaki', 'sauce'): 20,
 ('Korean', 'fried'): 5,
 ('chicken', 'with'): 25,
 ('fried', 'chicken'): 10,
 ('gochujang', 'sauce'): 3,
 ('spicy', 'Korean'): 4,
 ('with', 'gochujang'): 2,
 ('aglio', 'e'): 1,
 ('e', 'olio'): 1,
 ('spaghetti', 'aglio'): 1,
 ('easy', 'garam'): 1,
 ('garam', 'masala'): 17,
 ('chorizo', 'street'): 1,
 ('easy', 'chorizo'): 1,
 ('street', 'tacos'): 3,
 ('Russian', 'cabbage'): 5,
 ('cabbage', 'rolls'): 11,
 ('rolls', 'with'): 6,
 ('with', 'gravy'): 2,
 ('scampi', 'with'): 1,
 ('shrimp', 'scampi'): 7,
 ('with', 'pasta'): 2,
 ('and', 'potato'): 17,
 ('chicken', 'and'): 51,
 ('greek', 'lemon'): 6,
 ('lemon', 'chicken'): 4,
 ('potato', 'bake'): 3,
 ('easy', 'mexican'): 6,
 ('mexican', 'casserole'): 5,
 ('apple', 'cake'): 8,
 ('cake', 'i'): 3,
 ('german', 'apple'): 7,
 ('Spanish

In [1314]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: all_bigrams[x] for x in all_bigrams.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('spicy')

{'token': 'spicy',
 'bigrams': [{'Korean': 4},
  {'stir': 1},
  {'chicken': 4},
  {'szechuan': 1},
  {'basil': 2},
  {'vegan': 1},
  {'red': 2},
  {'Indian': 4},
  {'Vietnamese': 2},
  {'bok': 1},
  {'thai': 4},
  {'tuna': 2},
  {'stir-fry': 1},
  {'orange': 4},
  {'beef': 3},
  {'dipping': 1},
  {'pesto': 1},
  {'cabbage': 1},
  {'asian': 1},
  {'crispy': 1},
  {'eggplant': 4},
  {'green': 1},
  {'pork': 5},
  {'shrimp': 3},
  {'noodles': 1},
  {'Chinese': 2},
  {'Southwest': 1},
  {'asian-style': 1},
  {'sushi': 1},
  {'Mexican-American': 1},
  {'African': 1},
  {'yellowtail': 1},
  {'avocado': 1},
  {'Peruvian': 1},
  {'peach': 1},
  {'mango': 1},
  {'rice': 2},
  {'salmon': 1},
  {'yogurt': 1},
  {'Sinterklass': 1},
  {'Italian': 4},
  {'feta': 1},
  {'and': 1},
  {'banana': 1},
  {'penyet': 1},
  {'calabrian': 1},
  {'marinated': 1},
  {'tomato': 1},
  {'himalayan': 1},
  {'fried': 1},
  {'curry': 1},
  {'cilantro': 1},
  {'sesame': 1},
  {'brown': 2},
  {'Portuguese': 1},
  {'sea

In [1318]:
all_tokens = recipe_tokens + ingre_tokens

len(all_tokens)

6064

In [1319]:
bigram_in_list = []
for value in all_tokens:
    bigram_in_list.append(find_dict_tuple_key(value))
    
bigram_in_list

[{'token': 'chunks',
  'bigrams': [{'Dominican': 1},
   {'in': 1},
   {'(Optional)': 3},
   {'with': 2},
   {'fresh': 1}]},
 {'token': 'pizzelle', 'bigrams': []},
 {'token': 'vinagrete', 'bigrams': []},
 {'token': 'podge', 'bigrams': []},
 {'token': 'torrone', 'bigrams': []},
 {'token': 'whit', 'bigrams': []},
 {'token': 'pasty', 'bigrams': []},
 {'token': 'farmer', 'bigrams': []},
 {'token': 'Caribbean-Spiced', 'bigrams': [{'roast': 1}]},
 {'token': 'lebkuchen', 'bigrams': [{'men': 1}]},
 {'token': 'appetizers', 'bigrams': [{'for': 1}]},
 {'token': 'delight', 'bigrams': []},
 {'token': 'atsara', 'bigrams': []},
 {'token': 'Christmas',
  'bigrams': [{'cookies': 3},
   {'brunch': 1},
   {'glogg': 1},
   {'plum': 1},
   {'red': 1},
   {'cabbage': 1},
   {'coconut': 1},
   {'stollen': 1},
   {'lefse': 1},
   {'dinner': 1},
   {'baklava': 1},
   {'cake': 1},
   {'eve': 1},
   {'cakes': 2},
   {'kringle': 1}]},
 {'token': 'lubia', 'bigrams': [{'polo': 1}]},
 {'token': 'potato-lentil', 'bigr

## Add Phonetics

In [1320]:
!pipenv install eng-to-ipa 

Installing eng-to-ipa...

Installing dependencies from Pipfile.lock (577ce1)...
Ignoring argcomplete: markers 'python_full_version < "3.8.0"' don't match your environment
Ignoring importlib-metadata: markers 'python_version == "3.7" and python_full_version < "3.8.0" and python_full_version < "3.8.0" and python_full_version < "3.8.0"' don't match your environment
Ignoring typing-extensions: markers 'python_full_version < "3.8.0"' don't match your environment




[    ] Installing..
[=   ] Installing eng-to-ipa..
[==  ] Installing eng-to-ipa..
[=== ] Installing eng-to-ipa..
[ ===] Installing eng-to-ipa..
[  ==] Installing eng-to-ipa..
[   =] Installing eng-to-ipa..
[    ] Installing eng-to-ipa..
[   =] Installing eng-to-ipa..
[  ==] Installing eng-to-ipa..
[ ===] Installing eng-to-ipa..
[====] Installing eng-to-ipa..
[=== ] Installing eng-to-ipa..
[==  ] Installing eng-to-ipa..
[=   ] Installing eng-to-ipa..
[    ] Installing eng-to-ipa..
[=   ] Installing eng-to-ipa..
[==  ] Installing eng-to-ipa..
[=== ] Installing eng-to-ipa..
[ ===] Installing eng-to-ipa..
[  ==] Installing eng-to-ipa..
[   =] Installing eng-to-ipa..
[    ] Installing eng-to-ipa..
[   =] Installing eng-to-ipa..
[  ==] Installing eng-to-ipa..
[ ===] Installing eng-to-ipa..
[====] Installing eng-to-ipa..
[=== ] Installing eng-to-ipa..
[==  ] Installing eng-to-ipa..
[=   ] Installing eng-to-ipa..
[    ] Installing eng-to-ipa..
[=   ] Installing eng-to-ipa..
[==  ] Installing

In [1323]:
import eng_to_ipa as eng_to_ipa

eng_to_ipa.convert("hey!")

'heɪ!'

In [1324]:
for bigram in bigram_in_list:
    try:
        bigram["ipa"] = eng_to_ipa.convert(eng_to_ipa.convert(bigram["token"]))
    except Exception as e:
        pass

bigram_in_list

[{'token': 'chunks',
  'bigrams': [{'Dominican': 1},
   {'in': 1},
   {'(Optional)': 3},
   {'with': 2},
   {'fresh': 1}],
  'ipa': 'ʧəŋʧəŋks*'},
 {'token': 'pizzelle', 'bigrams': [], 'ipa': 'pizzelle**'},
 {'token': 'vinagrete', 'bigrams': [], 'ipa': 'vinagrete**'},
 {'token': 'podge', 'bigrams': [], 'ipa': 'pɑʤ*ɑʤ'},
 {'token': 'torrone', 'bigrams': [], 'ipa': 'torrone**'},
 {'token': 'whit', 'bigrams': [], 'ipa': 'wɪt*'},
 {'token': 'pasty', 'bigrams': [], 'ipa': 'pasty**'},
 {'token': 'farmer', 'bigrams': [], 'ipa': 'ˈˈfɑrmər*'},
 {'token': 'Caribbean-Spiced',
  'bigrams': [{'roast': 1}],
  'ipa': 'caribbean-spiced**'},
 {'token': 'lebkuchen', 'bigrams': [{'men': 1}], 'ipa': 'lebkuchen**'},
 {'token': 'appetizers', 'bigrams': [{'for': 1}], 'ipa': 'ˈæˈæpəˌtaɪzərz*'},
 {'token': 'delight', 'bigrams': [], 'ipa': 'dɪˈlaɪt*'},
 {'token': 'atsara', 'bigrams': [], 'ipa': 'atsara**'},
 {'token': 'Christmas',
  'bigrams': [{'cookies': 3},
   {'brunch': 1},
   {'glogg': 1},
   {'plum': 1},
 

## Old code (need to be changed / replaced later on)

Flatten each data entry into a string

In [None]:
corpus_list = []
for item in recipes:
    item['ingredients']=','.join(item['ingredients'])
    try:
        item['text'] = item['name'] + " " + item["ingredients"]
    except Exception as e:
        item['name'] = ""
        item['text'] = item['name'] + " " + item["ingredients"]
        print(e)
    corpus_list.append(item['text'])
    
corpus_list[:3]

Convert entire flattened list into a string

In [None]:
corpus = ','.join(corpus_list)

Compute bigram

In [None]:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(corpus)
bigrams = nltk.bigrams(tokens)
frequence = nltk.FreqDist(bigrams)
for key,value in frequence.items():
    print(key,value)

In [None]:
len(tokens)

Convert bigrams into dictionaries, with bigram as key, frequency as value

In [None]:
result = dict(sorted(frequence.items(), key=lambda item: item[0]))
result

Get unique tokens and sort them in an ascending order

In [None]:
unique_tokens = sorted(list(set(tokens)))
unique_tokens

Combine bigrams of the same first word into a dictionary

In [None]:
def find_dict_tuple_key(search):
    entry = {
        "token": search,
        "bigrams": []
    }
    bigrams = {x: result[x] for x in result.keys() if x[0] == search}
    for key, value in bigrams.items():
        newDict = {key[1]: value}
        entry["bigrams"].append(newDict)
    return entry

find_dict_tuple_key('Garlic')

Do the same to all the tokens to create a list of dictionaries

In [None]:
bigram_list = []
for value in unique_tokens:
    bigram_list.append(find_dict_tuple_key(value))
    
bigram_list

In [None]:
len(bigram_list)

In [None]:
len(unique_tokens)

## Numbers and placeholder

## POS tagging

# Create edit distance

# Create bigram

#Chunking/Phrases
