In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned keyword table and derive the working columns in one chain:
#   old_Name - the name exactly as stored in the file
#   Name     - lower-cased name, used for substring searches
#   Keywords - alias of KeywordsClean
keyword_df = pd.read_parquet('../data/clean_columns/keywords_clean.parquet').assign(
    old_Name=lambda frame: frame['Name'],
    Name=lambda frame: frame['Name'].str.lower(),
    Keywords=lambda frame: frame['KeywordsClean'],
)

In [3]:
# Show the keyword table to check the Name / old_Name / Keywords columns.
keyword_df

Unnamed: 0_level_0,Name,KeywordsClean,old_Name,Keywords
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,low-fat berry blue frozen dessert,"[dessert, summer, freezer, healthy, low protei...",Low-Fat Berry Blue Frozen Dessert,"[dessert, summer, freezer, healthy, low protei..."
39,biryani,"[poultry, asian, meat, chicken, indian, sauce,...",Biryani,"[poultry, asian, meat, chicken, indian, sauce,..."
40,best lemonade,"[summer, shake, healthy, low protein, lemon, l...",Best Lemonade,"[summer, shake, healthy, low protein, lemon, l..."
41,carina's tofu-vegetable kebabs,"[corn, shake, vegetable, oven, broil/grill, ba...",Carina's Tofu-Vegetable Kebabs,"[corn, shake, vegetable, oven, broil/grill, ba..."
42,cabbage soup,"[healthy, low protein, soup, vegan, winter, lo...",Cabbage Soup,"[healthy, low protein, soup, vegan, winter, lo..."
...,...,...,...,...
541379,meg's fresh ginger gingerbread,"[< 4 hours, oven, butter, egg, cake]",Meg's Fresh Ginger Gingerbread,"[< 4 hours, oven, butter, egg, cake]"
541380,roast prime rib au poivre with mixed peppercorns,"[meat, < 4 hours, high in..., roast, butter, o...",Roast Prime Rib au Poivre with Mixed Peppercorns,"[meat, < 4 hours, high in..., roast, butter, o..."
541381,kirshwasser ice cream,"[dessert, < 4 hours]",Kirshwasser Ice Cream,"[dessert, < 4 hours]"
541382,quick & easy asian cucumber salmon rolls,"[asian, < 15 mins, cucumber, easy]",Quick & Easy Asian Cucumber Salmon Rolls,"[asian, < 15 mins, cucumber, easy]"


In [4]:
# Flatten every recipe's keyword collection into one long list (duplicates kept).
keyword_list_orig = [kw for recipe_kws in keyword_df.Keywords for kw in recipe_kws]

# Same keywords with duplicates removed (set order, i.e. arbitrary).
keyword_list_orig_nr = list(set(keyword_list_orig))

In [5]:
# Vocabulary of appliance / utensil names matched against recipe keywords.
# One entry per line group, every entry followed by a comma: adjacent string
# literals without a comma are silently concatenated by Python, which previously
# produced the bogus entries 'dutch ovendeep fryer' and
# 'meat thermometermicroplane'.  The duplicated 'stove' entry was also removed
# so that each appliance appears exactly once.
appliances = [
    'microwave', 'blender', 'air fryer', 'oven', 'stove', 'toaster', 'mixer', 'pot',
    'pan', 'kettle', 'rice cooker', 'cutting board', 'processor', 'peeler',
    'spatula', 'tongs', 'colander', 'baking sheet', 'grater', 'cup', 'meat grinder', 'juicer',
    'bowl', 'freezer', 'whisk', 'electric', 'grill', 'bread machine', 'convection oven', 'dutch oven',
    'deep fryer', 'crepe maker', 'roaster', 'steamer', 'griddle', 'hot plate',
    'instant pot', 'pressure cooker', 'crock pot', 'flattop grill', 'slow cooker',
    'sous-vide cooker', 'waffle iron', 'baster', 'blow torch', 'can opener',
    'shredder', 'cheesecloth', 'clay pot', 'corkscrew', 'poacher', 'funnel', 'mill',
    'sifter', 'garlic press', 'strainer', 'ladle', 'mandoline', 'tenderiser', 'meat thermometer',
    'microplane', 'mortar', 'pestle', 'frother', 'pastry brush', 'masher', 'shears', 'rolling pin',
    'scale', 'sieve', 'spider', 'thermometer', 'twine', 'zester',
]

In [6]:
# Number of recipes whose (lower-cased) name contains the literal text 'pan'.
# Vectorised instead of one scalar lookup per row; regex=False keeps it a plain
# substring test and na=False treats a missing name as "no match" rather than
# raising.  This is a substring match, so names such as "pancake" count too.
int(keyword_df['Name'].str.contains('pan', regex=False, na=False).sum())

9722

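This seems small for such a common word in recipes. Note that the count above comes from recipe names rather than keywords; "pan" is probably listed in a name or keyword far less often than it is actually mentioned in the recipe itself.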

In [7]:
# Number of recipes whose (lower-cased) name contains the literal text 'oven'.
# Vectorised substring test; na=False treats a missing name as "no match".
int(keyword_df['Name'].str.contains('oven', regex=False, na=False).sum())

2786

In [8]:
# Number of recipes whose (lower-cased) name contains the literal text 'dutch oven'.
# Vectorised substring test; na=False treats a missing name as "no match".
int(keyword_df['Name'].str.contains('dutch oven', regex=False, na=False).sum())

71

In [9]:
# Number of recipes whose (lower-cased) name contains the literal text 'crock pot'.
# Vectorised substring test; na=False treats a missing name as "no match".
int(keyword_df['Name'].str.contains('crock pot', regex=False, na=False).sum())

4767

Most of the appliance keywords indicate basic items such as oven and pan. I want to find a way to make sure the appliance list includes every appliance actually used in the recipe, not just these basic items.

In [10]:
# Build the lookup set once; it is the same for every recipe, so there is no
# need to rebuild it inside the loop.
appliance_set = set(appliances)
# For each recipe keep the keywords that are exactly equal to an appliance name
# (set intersection, so partial matches such as 'broil/grill' are not found).
keyword_df['ApplianceList'] = [appliance_set.intersection(kws) for kws in keyword_df.Keywords]

In [11]:
# Recipes with at least one matched appliance, reduced to the name columns and
# the matched set.  Copied so later edits do not touch keyword_df.
has_appliance = keyword_df['ApplianceList'].map(len) > 0
appliances_df = keyword_df.loc[has_appliance, ['old_Name', 'Name', 'ApplianceList']].copy()

In [12]:
# Show the recipes that have at least one matched appliance keyword.
appliances_df

Unnamed: 0_level_0,old_Name,Name,ApplianceList
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
38,Low-Fat Berry Blue Frozen Dessert,low-fat berry blue frozen dessert,{freezer}
41,Carina's Tofu-Vegetable Kebabs,carina's tofu-vegetable kebabs,{oven}
45,Buttermilk Pie With Gingersnap Crumb Crust,buttermilk pie with gingersnap crumb crust,{oven}
47,Butter Pecan Cookies,butter pecan cookies,{oven}
48,Boston Cream Pie,boston cream pie,{oven}
...,...,...,...
541370,Minty Whipped Shortbread Cookies,minty whipped shortbread cookies,{oven}
541372,Chiles Rellenos Casserole,chiles rellenos casserole,{oven}
541373,Fran's Maple Blondies,fran's maple blondies,{oven}
541379,Meg's Fresh Ginger Gingerbread,meg's fresh ginger gingerbread,{oven}


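I am not sure why this is finding just one appliance for each recipe. It is possible that very few appliance names appear in the keyword file. Note that the set intersection only matches keywords that are exactly equal to an entry in `appliances`, so compound keywords such as `broil/grill` are not counted.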

In [13]:
# Build the lookup set once; it is the same for every recipe.
appliance_set = set(appliances)
# NOTE(review): this is the same exact-match intersection used for
# 'ApplianceList', so the two columns hold equal values.
keyword_df['ApplianceExt'] = [appliance_set.intersection(kws) for kws in keyword_df.Keywords]

In [14]:
# Inspect the per-recipe appliance sets stored in 'ApplianceExt'.
keyword_df['ApplianceExt']

RecipeId
38        {freezer}
39               {}
40               {}
41           {oven}
42               {}
            ...    
541379       {oven}
541380       {oven}
541381           {}
541382           {}
541383           {}
Name: ApplianceExt, Length: 522196, dtype: object

In [15]:
# Inspect the per-recipe appliance sets stored in 'ApplianceList'.
keyword_df['ApplianceList']

RecipeId
38        {freezer}
39               {}
40               {}
41           {oven}
42               {}
            ...    
541379       {oven}
541380       {oven}
541381           {}
541382           {}
541383           {}
Name: ApplianceList, Length: 522196, dtype: object

In [16]:
# Recipes for which no keyword matched an appliance name (empty set).
no_appliance = keyword_df['ApplianceList'].map(len) == 0
unknown_appliances = keyword_df.loc[no_appliance, ['Name', 'ApplianceList']].copy()

In [17]:
# Show the recipes whose appliance set is empty.
unknown_appliances

Unnamed: 0_level_0,Name,ApplianceList
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1
39,biryani,{}
40,best lemonade,{}
42,cabbage soup,{}
43,best blackbottom pie,{}
44,warm chicken a la king,{}
...,...,...
541377,slow-cooker classic coffee cake,{}
541378,meg's pumpkin spice bread,{}
541381,kirshwasser ice cream,{}
541382,quick & easy asian cucumber salmon rolls,{}


In [18]:
# Store each recipe's matched appliances as a list (element order follows the set).
appliances_df['ApplianceKeywords'] = [
    list(set(matched)) for matched in appliances_df['ApplianceList']
]

In [19]:
def appliance_vec(rkwl:list, vocabulary=None)->list:
    """Encode a recipe's keywords as a 0/1 vector over an appliance vocabulary.

    Parameters
    ----------
    rkwl : list
        Keywords of one recipe. Any container supporting ``in`` works.
    vocabulary : sequence of str, optional
        Appliance names defining the vector positions. When omitted, the
        notebook-level ``appliances`` list is used, as before.

    Returns
    -------
    list
        One entry per vocabulary item, in vocabulary order: 1 if that item is
        contained in ``rkwl``, otherwise 0.
    """
    if vocabulary is None:
        # Resolved at call time so the current notebook-level list is used.
        vocabulary = appliances
    return [1 if name in rkwl else 0 for name in vocabulary]


In [20]:
# Display the rows whose 'ApplianceExt' set contains at least one appliance.
keyword_df[keyword_df['ApplianceExt'].map(len).gt(0)]

Unnamed: 0_level_0,Name,KeywordsClean,old_Name,Keywords,ApplianceList,ApplianceExt
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
38,low-fat berry blue frozen dessert,"[dessert, summer, freezer, healthy, low protei...",Low-Fat Berry Blue Frozen Dessert,"[dessert, summer, freezer, healthy, low protei...",{freezer},{freezer}
41,carina's tofu-vegetable kebabs,"[corn, shake, vegetable, oven, broil/grill, ba...",Carina's Tofu-Vegetable Kebabs,"[corn, shake, vegetable, oven, broil/grill, ba...",{oven},{oven}
45,buttermilk pie with gingersnap crumb crust,"[dessert, < 4 hours, healthy, oven, egg, weekn...",Buttermilk Pie With Gingersnap Crumb Crust,"[dessert, < 4 hours, healthy, oven, egg, weekn...",{oven},{oven}
47,butter pecan cookies,"[cookie & brownie, < 4 hours, oven, butter, eg...",Butter Pecan Cookies,"[cookie & brownie, < 4 hours, oven, butter, eg...",{oven},{oven}
48,boston cream pie,"[dessert, < 4 hours, oven, butter, egg, weekni...",Boston Cream Pie,"[dessert, < 4 hours, oven, butter, egg, weekni...",{oven},{oven}
...,...,...,...,...,...,...
541370,minty whipped shortbread cookies,"[dessert, cookie & brownie, oven, butter, < 30...",Minty Whipped Shortbread Cookies,"[dessert, cookie & brownie, oven, butter, < 30...",{oven},{oven}
541372,chiles rellenos casserole,"[casserole, < 4 hours, oven, butter, cheese, c...",Chiles Rellenos Casserole,"[casserole, < 4 hours, oven, butter, cheese, c...",{oven},{oven}
541373,fran's maple blondies,"[< 60 mins, easy, oven]",Fran's Maple Blondies,"[< 60 mins, easy, oven]",{oven},{oven}
541379,meg's fresh ginger gingerbread,"[< 4 hours, oven, butter, egg, cake]",Meg's Fresh Ginger Gingerbread,"[< 4 hours, oven, butter, egg, cake]",{oven},{oven}


In [21]:
# Expose the matched appliance sets under the column name used for export.
keyword_df['ApplianceKeywords'] = keyword_df['ApplianceList']

# Recipes with a non-empty appliance set, restricted to the export columns.
export_columns = ['old_Name', 'Name', 'ApplianceKeywords']
non_empty = keyword_df['ApplianceKeywords'].map(len) > 0
appliance_df = keyword_df.loc[non_empty, export_columns].copy()

In [22]:
# Put the original (non-lower-cased) recipe name back into 'Name', then display.
appliance_df = appliance_df.assign(Name=appliance_df['old_Name'])
appliance_df

Unnamed: 0_level_0,old_Name,Name,ApplianceKeywords
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
38,Low-Fat Berry Blue Frozen Dessert,Low-Fat Berry Blue Frozen Dessert,{freezer}
41,Carina's Tofu-Vegetable Kebabs,Carina's Tofu-Vegetable Kebabs,{oven}
45,Buttermilk Pie With Gingersnap Crumb Crust,Buttermilk Pie With Gingersnap Crumb Crust,{oven}
47,Butter Pecan Cookies,Butter Pecan Cookies,{oven}
48,Boston Cream Pie,Boston Cream Pie,{oven}
...,...,...,...
541370,Minty Whipped Shortbread Cookies,Minty Whipped Shortbread Cookies,{oven}
541372,Chiles Rellenos Casserole,Chiles Rellenos Casserole,{oven}
541373,Fran's Maple Blondies,Fran's Maple Blondies,{oven}
541379,Meg's Fresh Ginger Gingerbread,Meg's Fresh Ginger Gingerbread,{oven}


In [23]:
# Persist the recipe name and its appliance set as a pickle file.
appliance_df.loc[:, ['Name', 'ApplianceKeywords']].to_pickle(
    '../data/clean_columns/appliance_clean.pk'
)