In [44]:
import pandas as pd
import os
import ast

## Read dataset and select Tag

In [45]:
# Load the previously selected recipes from the parquet file under data/.
selected_path = os.path.join('data', 'selected_recipes.parquet')
choc_recipes = pd.read_parquet(selected_path)
# Show (rows, columns) as a quick sanity check of the load.
choc_recipes.shape

(21377, 13)

In [46]:
# Preview the first five rows to check which columns were loaded.
choc_recipes.head(5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,is_included
8,bananas 4 ice cream pie,70971,180,102353,2003-09-10,"['weeknight', 'time-to-make', 'course', 'main-...","[4270.8, 254.0, 1306.0, 111.0, 127.0, 431.0, 2...",8,"['crumble cookies into a 9-inch pie plate , or...",,"['chocolate sandwich style cookies', 'chocolat...",6,True
11,better than sex strawberries,42198,1460,41531,2002-10-03,"['weeknight', 'time-to-make', 'course', 'main-...","[734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]",8,['crush vanilla wafers into fine crumbs and li...,simple but sexy. this was in my local newspape...,"['vanilla wafers', 'butter', 'powdered sugar',...",7,True
23,deep fried dessert thingys,107699,20,158966,2005-01-05,"['30-minutes-or-less', 'time-to-make', 'course...","[1663.3, 221.0, 168.0, 66.0, 19.0, 158.0, 29.0]",20,"['in a large bowl , mix flour , granulated sug...",my mother used to make this for us as a specia...,"['all-purpose flour', 'granulated sugar', 'bak...",13,True
47,jeanne s style birthday cake,83025,230,116315,2004-02-04,"['time-to-make', 'course', 'preparation', 'occ...","[5467.4, 516.0, 1196.0, 135.0, 110.0, 615.0, 1...",25,"['to prepare base , cut shortening into dry in...",a bakery in winnipeg is famous for this specia...,"['shortening', 'icing sugar', 'vanilla', 'all-...",10,True
48,jiffy extra moist carrot cake,52804,50,49304,2003-01-29,"['60-minutes-or-less', 'time-to-make', 'course...","[612.1, 49.0, 170.0, 25.0, 15.0, 39.0, 25.0]",8,"['preheat oven to 350 degrees', 'mix together ...","this is a very tasty, moist, carrot cake. a ni...","['yellow cake mix', 'vanilla instant pudding m...",11,True


In [47]:
import ast
# The list-valued columns arrive as strings holding Python list literals;
# parse them into real lists so later cells can iterate over the items.
for list_column in ('steps', 'ingredients'):
    choc_recipes[list_column] = choc_recipes[list_column].apply(ast.literal_eval)

In [48]:
# Inspect the parsed steps of the first recipe (a list of step strings).
choc_recipes['steps'].head(1).to_list()


[['crumble cookies into a 9-inch pie plate , or cake pan',
  'pat down to form an even layer',
  'drizzle 1 cup of chocolate topping evenly over the cookies with a small spoon',
  'scoop the vanilla ice cream on top of the chocolate and smooth down',
  'cover with half of the sliced bananas',
  'top with strawberry ice cream',
  'cover and freeze until firm',
  'before serving , top with 1 / 4 cup chocolate topping , whipped cream , and sliced bananas']]

## Ingredient cleaning

In [49]:
# Disabled exploration: would collect the set of distinct ingredient names
# across all recipes. Nothing in this cell is executed.
#ingredient_set = set()
#all_ingredients = choc_recipes['ingredients'].to_list()
#for ingredients in all_ingredients:
#    ingredient_set.update(ingredients)

In [50]:
# Keep recipes that use more than 4 and at most 20 ingredients.
choc_recipes = choc_recipes.loc[
    choc_recipes['n_ingredients'].gt(4) & choc_recipes['n_ingredients'].le(20)
]

## Number of Steps cleaning

In [51]:
# Keep recipes with more than 3 steps. An earlier variant of this filter also
# capped the step count at 25; that upper bound is intentionally not applied.
choc_recipes = choc_recipes.loc[choc_recipes['n_steps'].gt(3)]

In [52]:
# Row/column count after the ingredient-count and step-count filters.
choc_recipes.shape

(18286, 13)

## Cooking Techniques cleaning

In [53]:
# Read the verb vocabulary (one technique per line, no header row) and turn it
# directly into a plain Python list of technique strings.
vocab_path = os.path.join('data', 'verb.vocab')
cooking_techniques = (
    pd.read_csv(vocab_path, header=None, names=["cooking_techniques"])
    ['cooking_techniques']
    .to_list()
)
cooking_techniques

['lace',
 'perch',
 'pry',
 'soften',
 'snip',
 'skim',
 'skin',
 'follow',
 'strew',
 'whack',
 'simmer',
 'brown',
 'string',
 'ladle',
 'rise',
 'dampen',
 'spoon',
 'cook',
 'slather',
 'wipe',
 'cool',
 'stiffen',
 'whisk',
 'level',
 'tear',
 'pinch',
 'try',
 'sand',
 'adjust',
 'gut',
 'skewer',
 'dip',
 'round',
 'shave',
 'force',
 'fold',
 'barbecue',
 'bake',
 'poke',
 'peel',
 'melt',
 'crush',
 'devein',
 'punch',
 'water',
 'dry',
 'shove',
 'scramble',
 'leaven',
 'narrow',
 'divide',
 'lengthen',
 'replace',
 'plop',
 'zip',
 'paste',
 'dangle',
 'splash',
 'strike',
 'cram',
 'sharpen',
 'garnish',
 'tenderize',
 'warp',
 'warm',
 'stick',
 'grill',
 'join',
 'squish',
 'strain',
 'hardboil',
 'sweeten',
 'cap',
 'pour',
 'thin',
 'drill',
 'pickle',
 'scatter',
 'wedge',
 'debone',
 'encircle',
 'slit',
 'bend',
 'slip',
 'dress',
 'sit',
 'tilt',
 'enlarge',
 'stir',
 'stand',
 'moisten',
 'blacken',
 'lay',
 'drape',
 'bind',
 'smack',
 'scoop',
 'crumple',
 'wind'

In [54]:
# Flatten the per-recipe step lists into one list holding every step of
# every recipe, in recipe order.
all_step_list = [step for recipe in choc_recipes["steps"] for step in recipe]

len(all_step_list)

214278

In [55]:
# Collect every step in which no vocabulary entry occurs as a substring.
steps_no_action = [
    step
    for step in all_step_list
    if not any(tech in step for tech in cooking_techniques)
]
# Every remaining step mentions at least one technique.
valid_steps_counter = len(all_step_list) - len(steps_no_action)

steps_no_action

['f',
 'for 10-12 minutes',
 'f for 25 - 40 minutes',
 'this will take at least 15 minutes',
 'enjoy !',
 'awesome !',
 'should be able to ice one 8x8 cake',
 'for the crust:',
 'for the cheesecake:',
 ':)',
 '4',
 '18',
 'enjoy !',
 'then use a knife to loosen the sides of the cake from the pan',
 "don't worry",
 'this recipe is pretty versatile',
 '-you can substitute the milk with different liquids',
 'really good plain too !',
 'if you wish , the recipe may be doubled',
 'sauce pan',
 'the process will take about 15-20 minutes',
 'wait 10-15 minutes for this to occur',
 'this is the most moist , delicious , yummy cake in the world ! perfect for an anniversary or holiday !',
 'enjoy !',
 '5 minutes',
 'sometimes , the pudding needs another 1',
 'i have also used a food processor and a dough blade on pulse for each step',
 'i love the sweet and spicy',
 'consume',
 'yum !',
 'for icing',
 'fit pie crust into a 9 inch pie plate',
 'keep in refrigerator',
 'dived in 2 portions',
 'enjo

In [56]:
def clean_recipe(recipe_string, techniques=None):
    """Return the steps of one recipe that mention at least one cooking technique.

    A step is kept when any technique occurs in it as a substring, so short
    vocabulary entries also match inside longer words (e.g. 'lay' in 'layer').

    Parameters
    ----------
    recipe_string : list of str
        The steps of a single recipe, already parsed into a list (the name is
        historical; no string parsing happens here).
    techniques : iterable of str, optional
        Technique vocabulary to match against. Defaults to the notebook-level
        ``cooking_techniques`` list.

    Returns
    -------
    list of str
        A new list with the retained steps in their original order. The input
        list is left unmodified.
    """
    if techniques is None:
        techniques = cooking_techniques
    # Materialise once so a one-shot iterable is not exhausted by the first step.
    techniques = tuple(techniques)

    # Build a new list instead of calling .remove() while iterating: removing
    # an element during iteration shifts the remaining items left, so the step
    # right after each removed one was never checked and could slip through.
    return [
        step
        for step in recipe_string
        if any(tech in step for tech in techniques)
    ]

In [57]:
# Run the technique filter over every recipe's list of steps.
choc_recipes['steps'] = choc_recipes['steps'].apply(clean_recipe)

In [58]:
# Display the steps column after the technique filter has been applied.
choc_recipes['steps']

8         [crumble cookies into a 9-inch pie plate , or ...
11        [crush vanilla wafers into fine crumbs and lin...
23        [in a large bowl , mix flour , granulated suga...
47        [to prepare base , cut shortening into dry ing...
48        [preheat oven to 350 degrees, mix together the...
                                ...                        
231617    [in a small bowl , combine the yeast and half ...
231618    [prepare pastry: cut margarine into flour , su...
231621    [stir together 1 / 2 teaspoon of sugar , the y...
231635    [place melted butter in a large mixing bowl an...
231636    [whip sugar and shortening in a large bowl , a...
Name: steps, Length: 18286, dtype: object

## Final cleaning

In [59]:
# Shape before rows with missing values are dropped (compare with the next cell).
choc_recipes.shape

(18286, 13)

In [60]:
# Drop every row that has a missing value in any column.
# NOTE(review): this also discards recipes whose only missing field is one
# that is never written to the output file (e.g. description) - confirm that
# is intended, or restrict the check with `subset=`.
choc_recipes = choc_recipes.dropna(axis=0, how='any')
choc_recipes.shape

(17840, 13)

## Save cleaned recipes to text file

In [63]:
def replace_spaces(ingredient_list):
    """Return a new list in which every space in each ingredient name is an underscore."""
    underscored = []
    for name in ingredient_list:
        # Splitting on a single space and re-joining with '_' swaps each
        # space character one-for-one.
        underscored.append('_'.join(name.split(' ')))
    return underscored

In [64]:
# Replace spaces with underscores in every ingredient name, so multi-word
# ingredients no longer contain spaces.
choc_recipes['ingredients'] = choc_recipes['ingredients'].apply(replace_spaces)

In [65]:
# Collapse each recipe's step list into one string, marking the boundary
# between consecutive steps with the ' <STEP> ' token.
choc_recipes['steps'] = choc_recipes['steps'].apply(
    lambda step_list: ' <STEP> '.join(step_list)
)

# Collapse each ingredient list into one comma-separated string.
choc_recipes['ingredients'] = choc_recipes['ingredients'].apply(
    lambda names: ', '.join(names)
)


In [70]:
# Spot-check one joined ingredient string (the fourth of the first five rows).
choc_recipes.head(5)['ingredients'].to_list()[3]

'yellow_cake_mix, vanilla_instant_pudding_mix, nutmeg, cinnamon, eggs, oil, water, crushed_pineapple, carrot, pecans, coconut'

In [71]:
# Assemble the output text: for each recipe, its name, ingredients and steps
# separated by tabs and terminated by a newline.
file_str = ''.join(
    f"{name}\t{ingredients}\t{steps}\n"
    for name, ingredients, steps in zip(
        choc_recipes['name'], choc_recipes['ingredients'], choc_recipes['steps']
    )
)

In [72]:
# Load the ingredient mapping table and preview its first two rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
ingr_map_path = os.path.join('archive', 'ingr_map.pkl')
ingr_map = pd.read_pickle(ingr_map_path)
ingr_map.head(2)

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308


In [73]:
# Substitute each raw ingredient phrase in the output text with its
# simplified name from the mapping table.
# str.replace returns a new string (strings are immutable), so the result has
# to be assigned back to file_str; without the assignment the loop does nothing.
for raw_text, simplified in zip(ingr_map['raw_ingr'], ingr_map['replaced']):
    # Skip missing or empty search terms: replacing '' would insert the
    # replacement between every character of the text.
    if isinstance(raw_text, str) and raw_text and isinstance(simplified, str):
        file_str = file_str.replace(raw_text, simplified)

In [74]:
# Substitute each processed ingredient phrase in the output text with its
# simplified name from the mapping table.
# str.replace returns a new string (strings are immutable), so the result has
# to be assigned back to file_str; without the assignment the loop does nothing.
for processed_text, simplified in zip(ingr_map['processed'], ingr_map['replaced']):
    # Skip missing or empty search terms: replacing '' would insert the
    # replacement between every character of the text.
    if isinstance(processed_text, str) and processed_text and isinstance(simplified, str):
        file_str = file_str.replace(processed_text, simplified)

In [75]:
# Write the assembled tab-separated recipe text to data/recipe_dataset.txt.
# The encoding is passed explicitly so the output does not depend on the
# platform's default locale encoding, which may fail on non-ASCII recipe text.
text_path = os.path.join('data', 'recipe_dataset.txt')
with open(text_path, 'w', encoding='utf-8') as the_file:
    the_file.write(file_str)