# Imports

In [832]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from collections import Counter
import re
from ast import literal_eval
import shutil

# Exploratory Data Analysis

## Loading dataset

In [751]:
csv_path = '../../FeedMe/raw_data/Recipes/Food Ingredients and Recipe Dataset with Image Name Mapping.csv' 

In [752]:
df = pd.read_csv(csv_path)

In [753]:
df.shape

(13501, 6)

In [754]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


## Inital Cleaning and Renaming

In [755]:
df = df.dropna()

In [756]:
df.drop(columns=['Ingredients', 'Unnamed: 0'], inplace=True)

In [757]:
df.rename(columns={'Cleaned_Ingredients': 'Ingredients'}, inplace=True)

In [758]:
df['Ingredients'] = df['Ingredients'].apply(literal_eval)

In [759]:
df.shape

(13493, 4)

In [760]:
df.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa..."
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo..."
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts..."
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int..."
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w..."


## Data Cleaning

In [761]:
stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer()
remove_words = ['storebought', 'garnish', 'homemade', 'fresh',
              'coarsely', 'grated', 'evaporated', 'pound',
              'new', 'inch', 'diameter', 'torn',
              'sturdy', 'loaf', 'ground', 'flake',
              'piece', 'gala', 'cored','melted',
              'unsalted','salted','whole', 'divided',
              'kosher','cup', 'tsp', 'tbsp',
              'small', 'medium', 'large', 'lb',
              'finely', 'thinly', 'chopped', 'freshly',
              'sliced', 'cut', 'crushed', 'teaspoon',
              'plus', 'room', 'temperature', 'dry',
              'lady', 'oz', 'total', 'goodquality',
              'tablespoon', 'g', 'ounce', 'peeled']

In [762]:
def cleaning(ingredients):
    cleaned_ingredients = []
    ingredients = list(map(lambda x: x.lower(), ingredients))
    ingredients = list(map(lambda x: ''.join(word for word in x if not word.isdigit()), ingredients))
    ingredients = list(map(lambda x: ''.join(word for word in x if not word in string.punctuation), ingredients))
    for i in ingredients:
        word_tokens = word_tokenize(i)
        word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]
        text = [w for w in word_tokens if not w in stop_words if not w in remove_words if w.isalpha()]
        cleaned_ingredients.append(" ".join(text))
    return cleaned_ingredients

In [763]:
df['Cleaned_Ingredients'] = df['Ingredients'].apply(lambda x: cleaning(x))

In [764]:
df.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","[chicken, salt, acorn squash, sage, rosemary, ..."
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...","[egg white, potato, salt, black pepper, rosema..."
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...","[milk, milk, garlic powder, onion powder, smok..."
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int...","[round italian cube, olive oil, sweet italian ..."
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...","[dark brown sugar, hot water, bourbon, lemon j..."


## Transposing labels

In [765]:
ingredients = ['apple', 'banana', 'beef', 'blueberries',
               'bread', 'butter', 'carrot', 'cheese',
               'chicken', 'chocolate','corn', 'eggs',
               'flour','beans', 'ham',
               'cream', 'lime', 'milk', 'mushrooms',
               'onion', 'potato', 'shrimp', 'spinach',
               'strawberries', 'sugar', 'tomato']
len(ingredients)

26

In [801]:
ingredients_lem = [lemmatizer.lemmatize(w) for w in ingredients]
ingredients_lem

['apple',
 'banana',
 'beef',
 'blueberry',
 'bread',
 'butter',
 'carrot',
 'cheese',
 'chicken',
 'chocolate',
 'corn',
 'egg',
 'flour',
 'bean',
 'ham',
 'cream',
 'lime',
 'milk',
 'mushroom',
 'onion',
 'potato',
 'shrimp',
 'spinach',
 'strawberry',
 'sugar',
 'tomato']

In [767]:
siders = ['oil', 'cider', 'broth', 'juice', 'brisket', 'ravioli', 'sauce']

In [768]:
def relabelling(ingredients):
    relabeled_ingredients = []
    for i in ingredients:
        word_tokens = word_tokenize(i)
        label = i
        for ingredient in ingredients_lem:
            if ingredient in word_tokens:
                if not any(word in siders for word in i.split(' ')):
                    label = ingredient
        relabeled_ingredients.append(label)
    return relabeled_ingredients

In [769]:
df['Relabelled_Ingredients'] = df['Cleaned_Ingredients'].apply(lambda x: relabelling(x))

In [770]:
df.head()
df.columns

Index(['Title', 'Instructions', 'Image_Name', 'Ingredients',
       'Cleaned_Ingredients', 'Relabelled_Ingredients'],
      dtype='object')

## One-Hot_Encoder

In [771]:
def one_hot_encoder(row):
    for ingredient in ingredients_lem:
        if ingredient in row['Relabelled_Ingredients']:
            row[ingredient] = 1
        else:
            row[ingredient] = 0
    return row

In [772]:
df = df.apply(one_hot_encoder, axis=1)

In [773]:
df.columns

Index(['Title', 'Instructions', 'Image_Name', 'Ingredients',
       'Cleaned_Ingredients', 'Relabelled_Ingredients', 'apple', 'banana',
       'beef', 'blueberry', 'bread', 'butter', 'carrot', 'cheese', 'chicken',
       'chocolate', 'corn', 'egg', 'flour', 'bean', 'ham', 'cream', 'lime',
       'milk', 'mushroom', 'onion', 'potato', 'shrimp', 'spinach',
       'strawberry', 'sugar', 'tomato'],
      dtype='object')

In [774]:
df.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,apple,banana,beef,blueberry,...,lime,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ...",1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...","[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...",0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...","[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...",0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int...","[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ...",0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...","[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte...",0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Finishing touches

In [775]:
def tokenize(list_x):
    tokenized = []
    for i in list_x:
        word_tokens = word_tokenize(i)
        for x in word_tokens:
            tokenized.append(x)
    return tokenized
    

In [776]:
df['Tokenized_Ingredients'] = df['Relabelled_Ingredients'].apply(lambda x: tokenize(x))

In [777]:
df = df[['Title', 'Instructions','Image_Name',
                     'Ingredients', 'Cleaned_Ingredients', 'Relabelled_Ingredients', 'Tokenized_Ingredients',
                     'apple','banana','beef','blueberry','bread','butter','carrot','cheese','chicken','chocolate','corn',
                     'egg','flour','bean','ham','cream','lime','milk','mushroom','onion','potato','shrimp',
                     'spinach','strawberry','sugar','tomato']]

In [778]:
df.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,Tokenized_Ingredients,apple,banana,beef,...,lime,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn, squash, sage, rosemary,...",1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...","[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...","[egg, potato, salt, black, pepper, rosemary, t...",0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...","[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...","[milk, milk, garlic, powder, onion, smoked, pa...",0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int...","[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ...","[round, italian, cube, olive, oil, sweet, ital...",0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...","[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte...","[sugar, hot, water, bourbon, lemon, juice, but...",0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Filtering

In [779]:
filtering_ingredients = {'fish': ['salmon', 'fish', 'crab', 'scallop', 'tuna', 'cod',
                                  'anchovy', 'halibut', 'sea', 'bass', 'mussel', 'oyster', 
                                  'trout', 'squid', 'seafood'],
                         'meat': ['pork','lamb', 'turkey', 'bacon','sausage', 'ham',
                                  'prosciutto', 'chorizo', 'veal'],
                         'vegetable': ['squash', 'pea', 'pumpkin', 'beet', 'cucumber', 'kale', 
                                       'zucchini', 'asparagus', 'eggplant', 'cabbage', 'broccoli', 
                                       'celery', 'leek', 'artichoke', 'polenta', 'radish', 'lentil', 
                                       'brussels', 'horseradish', 'tofu', 'radicchio', 'lettuce', 
                                       'truffle', 'escarole', 'kimchi', 'collard', 'turnip', 'endive'],
                         'fruits': ['orange', 'coconut', 'cherry', 'cranberry', 'pear', 'peach', 'raspberry',
                                    'avocado', 'pineapple', 'apricot', 'fig', 'mango', 'pomegranate', 
                                    'blackberry', 'rhubarb', 'grape', 'raisin', 'grapefruit', 
                                    'watermelon', 'lobster', 'melon'],
                         'other': ['pizza', 'couscous', 'margarita', 'champagne']}

In [780]:
def filter(ingredients):
    list_filter = []
    for value in filtering_ingredients.values():
        for ingredient in value:
            list_filter.append(ingredient)
    for ingredient in list_filter:
        if any(word for word in list_filter if word in ingredients):
            return 1
        else:
            return 0

In [781]:
df['Remove'] = df['Tokenized_Ingredients'].apply(lambda x: filter(x))

In [782]:
df_filtered = df[df['Remove']==0]


In [783]:
df_filtered.shape

(4805, 34)

In [784]:
df_filtered.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,Tokenized_Ingredients,apple,banana,beef,...,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato,Remove
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...","[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...","[egg, potato, salt, black, pepper, rosemary, t...",0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...","[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...","[milk, milk, garlic, powder, onion, smoked, pa...",0,0,0,...,1,0,1,0,0,0,0,0,0,0
5,Warm Comfort,Place 2 chamomile tea bags in a heatsafe vesse...,warm-comfort-tequila-chamomile-toddy,"[2 chamomile tea bags, 1½ oz. reposado tequila...","[chamomile tea bag, reposado tequila, lemon ju...","[chamomile tea bag, reposado tequila, lemon ju...","[chamomile, tea, bag, reposado, tequila, lemon...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Apples and Oranges,"Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...",apples-and-oranges-spiked-cider,"[3 oz. Grand Marnier, 1 oz. Amaro Averna, Smal...","[grand marnier, amaro averna, pat butter, hot ...","[grand marnier, amaro averna, butter, hot appl...","[grand, marnier, amaro, averna, butter, hot, a...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Turmeric Hot Toddy,"For the turmeric syrup, combine ½ cup hot wate...",turmeric-hot-toddy-claire-sprouse,"[¼ cup granulated sugar, ¾ tsp. ground turmeri...","[granulated sugar, turmeric, amontillado sherr...","[sugar, turmeric, amontillado sherry, bourbon ...","[sugar, turmeric, amontillado, sherry, bourbon...",0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Feature Engineering: Prep Time

In [785]:
def prep_time(instructions):
    prep_time = 0
    default_time = 2
    pattern_minutes = '(\d+) minutes'
    pattern_hour = '(\d) hour'
    sentences = instructions.split(".")
    prep_time += default_time * len(sentences)
    for sentence in sentences:
        search_minutes = re.findall(pattern_minutes, sentence)
        search_hours = re.findall(pattern_hour, sentence)
        for m in search_minutes:
            prep_time += int(m)
        for h in search_hours:
            prep_time += (int(h) * 60)
    return prep_time

In [786]:
df_filtered['Prep Time'] = df_filtered['Instructions'].apply(lambda x: prep_time(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Prep Time'] = df_filtered['Instructions'].apply(lambda x: prep_time(x))


In [787]:
df_filtered.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,Tokenized_Ingredients,apple,banana,beef,...,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato,Remove,Prep Time
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...","[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...","[egg, potato, salt, black, pepper, rosemary, t...",0,0,0,...,0,0,1,0,0,0,0,0,0,34
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...","[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...","[milk, milk, garlic, powder, onion, smoked, pa...",0,0,0,...,0,1,0,0,0,0,0,0,0,44
5,Warm Comfort,Place 2 chamomile tea bags in a heatsafe vesse...,warm-comfort-tequila-chamomile-toddy,"[2 chamomile tea bags, 1½ oz. reposado tequila...","[chamomile tea bag, reposado tequila, lemon ju...","[chamomile tea bag, reposado tequila, lemon ju...","[chamomile, tea, bag, reposado, tequila, lemon...",0,0,0,...,0,0,0,0,0,0,0,0,0,21
6,Apples and Oranges,"Add 3 oz. Grand Marnier, 1 oz. Amaro Averna, a...",apples-and-oranges-spiked-cider,"[3 oz. Grand Marnier, 1 oz. Amaro Averna, Smal...","[grand marnier, amaro averna, pat butter, hot ...","[grand marnier, amaro averna, butter, hot appl...","[grand, marnier, amaro, averna, butter, hot, a...",0,0,0,...,0,0,0,0,0,0,0,0,0,18
7,Turmeric Hot Toddy,"For the turmeric syrup, combine ½ cup hot wate...",turmeric-hot-toddy-claire-sprouse,"[¼ cup granulated sugar, ¾ tsp. ground turmeri...","[granulated sugar, turmeric, amontillado sherr...","[sugar, turmeric, amontillado sherry, bourbon ...","[sugar, turmeric, amontillado, sherry, bourbon...",0,0,0,...,0,0,0,0,0,0,1,0,0,30


In [788]:
def prep_time_range(prep_time_min):
    prep_time_h = round(prep_time_min / 60)
    return f'{prep_time_h} - {prep_time_h + 1} hours'

In [789]:
df_filtered['Prep Time Range'] = df_filtered['Prep Time'].apply(lambda x: prep_time_range(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Prep Time Range'] = df_filtered['Prep Time'].apply(lambda x: prep_time_range(x))


In [805]:
df_filtered.iloc[0, 2]

'crispy-salt-and-pepper-potatoes-dan-kluger'

## Suggestion Scoring

In [791]:
df_filtered[ingredients_lem]

Unnamed: 0,apple,banana,beef,blueberry,bread,butter,carrot,cheese,chicken,chocolate,...,lime,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13492,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
13494,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13496,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
13498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [792]:
def score(row, vector):
    #computes dot product
    A = row.to_numpy()
    row['score'] = np.matmul(A, vector)
    return row

In [793]:
V = np.ones(26)

In [794]:
len(ingredients_lem)

26

In [795]:
df_filtered.columns

Index(['Title', 'Instructions', 'Image_Name', 'Ingredients',
       'Cleaned_Ingredients', 'Relabelled_Ingredients',
       'Tokenized_Ingredients', 'apple', 'banana', 'beef', 'blueberry',
       'bread', 'butter', 'carrot', 'cheese', 'chicken', 'chocolate', 'corn',
       'egg', 'flour', 'bean', 'ham', 'cream', 'lime', 'milk', 'mushroom',
       'onion', 'potato', 'shrimp', 'spinach', 'strawberry', 'sugar', 'tomato',
       'Remove', 'Prep Time', 'Prep Time Range'],
      dtype='object')

In [796]:
df_filtered[ingredients_lem].apply(lambda x: score(x, V), axis=1).sort_values(by='score', ascending=False)

Unnamed: 0,apple,banana,beef,blueberry,bread,butter,carrot,cheese,chicken,chocolate,...,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato,score
9684,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0
5774,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0
13056,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,9.0
8881,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0
402,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [799]:
df_filtered.to_csv('/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/26052022_dataset.csv')

## Filtering for bad images 

In [834]:
df_filtered.loc[df['Image_Name'] == '#NAME?'].head(3)

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,Tokenized_Ingredients,apple,banana,beef,...,onion,potato,shrimp,spinach,strawberry,sugar,tomato,Remove,Prep Time,Prep Time Range
3441,"""Burnt"" Carrots and Parsnips","Preheat oven to 450°F. Toss carrots, parsnips,...",#NAME?,"[1 1/2 pounds carrots, peeled, halved lengthwi...","[carrot halved lengthwise, parsnip, olive oil,...","[carrot, parsnip, olive oil, salt pepper, onio...","[carrot, parsnip, olive, oil, salt, pepper, on...",0,0,0,...,1,0,0,0,0,1,0,0,49,1 - 2 hours
4139,Hazelnut Butter and Coffee Meringues,Preheat oven to 350°. Toast hazelnuts on a rim...,#NAME?,"[1 cup skin-on hazelnuts, 1/4 teaspoon kosher ...","[skinon hazelnut, salt, egg white, pinch cream...","[skinon hazelnut, salt, egg, cream, sugar, sug...","[skinon, hazelnut, salt, egg, cream, sugar, su...",0,0,0,...,0,0,0,0,0,1,0,0,183,3 - 4 hours
4331,Pumpkin Gruyère Gratin with Thyme,Preheat oven to 425°F. In a medium skillet ove...,#NAME?,"[1 teaspoon extra-virgin olive oil, 1 teaspoon...","[extravirgin olive oil, butter, onion halved, ...","[extravirgin olive oil, butter, onion, sugar, ...","[extravirgin, olive, oil, butter, onion, sugar...",0,0,0,...,1,0,0,0,0,1,0,0,123,2 - 3 hours


In [828]:
df_filtered_images = df_filtered[df_filtered['Image_Name']!='#NAME?']

In [829]:
df_filtered.shape

(4805, 36)

In [830]:
df_filtered_images.shape

(4792, 36)

In [833]:
def copy_images(image_name):
    source = f'/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/Food Images/{image_name}.jpg'
    destination = f'/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/Food Images Filtered/{image_name}.jpg'
    shutil.copyfile(source, destination)  

In [835]:
df_filtered_images['Image_Name'].apply(lambda x: copy_images(x))

1        None
2        None
5        None
6        None
7        None
         ... 
13492    None
13494    None
13496    None
13498    None
13499    None
Name: Image_Name, Length: 4792, dtype: object

### Removing bad images

list_bad_images = ['applesauce-with-butter-pecan-crumbs-231204.jpg', ]

In [None]:
list_bad_images = ['applesauce-with-butter-pecan-crumbs-231204.jpg',
                   'aromatic-braised-chicken-with-fried-onions-242308.jpg',
                   'arugula-salad-234128.jpg',
                   'baked-egg-custard-with-gruyere-and-chives-350969.jpg',
                   'baked-garden-tomatoes-with-cheese-235353.jpg',
                   'baked-yams-with-ginger-molasses-butter-230942.jpg',
                   'balsamic-bean-dip-with-fresh-veggies-241331.jpg',
                   'balsamic-roasted-tomato-and-goat-cheese-crisps-363383.jpg',
                   'banana-upside-down-cake-358131.jpg',
                   'banoffee-pie-231392.jpg',
                   'barley-soup-with-duck-confit-and-root-vegetables-232784.jpg',
                   'basic-yogurt-pancakes-358271.jpg',
                   'basil-vodka-gimlets-238926.jpg',
                   'bazooka-bubblegum-cocktail-350146.jpg',
                   'beef-short-ribs-in-chipotle-and-green-chili-sauce-107644.jpg',
                   'beef-stroganoff-233247.jpg',
                   'beef-tenderloin-with-smoked-paprika-mayonnaise-242869.jpg',
                   'beef-yakitori-360290.jpg',
                   'beer-batter-fried-sardines-and-lime-101775.jpg',
                   'beignets-356810.jpg',
                   'belgian-buttermilk-waffles-with-glazed-bananas-232170.jpg',
                   'bengali-style-fish-in-yogurt-curry-367030.jpg',
                   'black-currant-diablo-232183.jpg',
                   'blisters-on-my-sisters-355904.jpg',
                   'blue-cheese-dressing-364989.jpg',
                   'blue-cheese-gougeres-351019.jpg',
                   'blueberriest-muffins-363353.jpg',
                   'blueberry-pie-with-cornmeal-crust-and-lemon-cream-242725.jpg',
                   'blueberry-smash-395929.jpg',
                   'boiled-yuca-em-yuca-hervida-em-51203220.jpg',
                   'braised-chicken-with-tomatoes-and-olives-em-poulet-provencal-em-241766.jpg',
                   'braised-duck-legs-with-shallots-and-parsnips-109109.jpg',
                   'brioche-363478.jpg',
                   'broiled-arctic-char-with-basil-sauce-and-tomato-230448.jpg',
                   'brown-butter-and-peanut-brittle-ice-cream-241123.jpg',
                   'brown-butter-ginger-and-sour-cream-coffee-cake-368250.jpg',
                   'brown-sugar-ginger-cream-cake-363475.jpg',
                   'brown-sugar-shortbread-361649.jpg',
                   'browned-onion-kugels-231507.jpg',
                   'brownies-363442.jpg',
                   ]