# Imports

In [61]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from collections import Counter

# Exploratory Data Analysis

## Loading dataset

In [62]:
df = pd.read_csv("/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/Food Ingredients and Recipe Dataset with Image Name Mapping.csv")

## Overview

In [63]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [64]:
df.shape

(13501, 6)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           13501 non-null  int64 
 1   Title                13496 non-null  object
 2   Ingredients          13501 non-null  object
 3   Instructions         13493 non-null  object
 4   Image_Name           13501 non-null  object
 5   Cleaned_Ingredients  13501 non-null  object
dtypes: int64(1), object(5)
memory usage: 633.0+ KB


In [66]:
df.rename(columns={'Ingredients': 'Uncleaned_Ingredients'}, inplace=True)

In [67]:
df.rename(columns={'Cleaned_Ingredients': 'Ingredients'}, inplace=True)

In [68]:
def text_to_list(text):
    text = text.replace("['", "")
    text = text.replace("']", "")
    my_list = text.split("', '")
    return my_list

In [69]:
df['Ingredients'] = df['Ingredients'].apply(lambda x: text_to_list(x))

## Data Cleaning

In [70]:
stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer()
list_words = ['storebought', 'garnish', 'homemade', 'fresh',
              'coarsely', 'grated', 'evaporated', 'pound',
              'new', 'inch', 'diameter', 'torn',
              'sturdy', 'loaf', 'ground', 'flake',
              'piece', 'gala', 'cored','melted',
              'unsalted','salted','whole', 'divided',
              'kosher','cup', 'tsp', 'tbsp',
              'small', 'medium', 'large', 'lb',
              'finely', 'thinly', 'chopped', 'freshly',
              'sliced', 'cut', 'crushed', 'teaspoon',
              'plus', 'room', 'temperature', 'dry',
              'lady', 'oz', 'total', 'goodquality',
              'tablespoon', 'g', 'ounce', 'peeled']

In [71]:
def cleaning(list_x):
    cleaned_list = []
    list_x = list(map(lambda x: x.lower(), list_x))
    list_x = list(map(lambda x: ''.join(word for word in x if not word.isdigit()), list_x))
    list_x = list(map(lambda x: ''.join(word for word in x if not word in string.punctuation), list_x))
    for i in list_x:
        word_tokens = word_tokenize(i)
        word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]
        text = [w for w in word_tokens if not w in stop_words if not w in list_words if w.isalpha()]
        cleaned_list.append(" ".join(text))
    return cleaned_list

In [72]:
df['Ingredients'] = df['Ingredients'].apply(lambda x: cleaning(x))

In [73]:
df.isnull().sum()

Unnamed: 0               0
Title                    5
Uncleaned_Ingredients    0
Instructions             8
Image_Name               0
Ingredients              0
dtype: int64

In [74]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Uncleaned_Ingredients,Instructions,Image_Name,Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j..."


In [75]:
all_ingredients = []
def add_all_ingredients(list_x):
    for i in list_x:
        all_ingredients.append(i)

In [76]:
df['Ingredients'].apply(lambda x: add_all_ingredients(x))

0        None
1        None
2        None
3        None
4        None
         ... 
13496    None
13497    None
13498    None
13499    None
13500    None
Name: Ingredients, Length: 13501, dtype: object

### Most common features

In [77]:
all_words = []
def add_all_words(list_x):
    for i in list_x:
        split = i.split()
        for word in split:
            all_words.append(word)

In [78]:
df['Ingredients'].apply(lambda x: add_all_words(x))

0        None
1        None
2        None
3        None
4        None
         ... 
13496    None
13497    None
13498    None
13499    None
13500    None
Name: Ingredients, Length: 13501, dtype: object

In [79]:
len(all_words)

389511

In [80]:
counts = Counter(all_words)
len(counts)

7025

In [81]:
sorted_counts = sorted(counts.items(), key=lambda x:x[1])
sorted_dict = dict(sorted_counts)

In [82]:
len(sorted_dict)

7025

In [83]:
list_ingredients = []
list_frequencies = []
for key in sorted_dict.keys():
    list_ingredients.append(key)

for value in sorted_dict.values():
    list_frequencies.append(value)

In [84]:
data_freq = {'ingredient': list_ingredients, 'frequency': list_frequencies}

In [85]:
df_freq = pd.DataFrame(data=data_freq)

In [86]:
df_freq.tail(20)

Unnamed: 0,ingredient,frequency
7005,stick,2672
7006,water,2696
7007,cream,3032
7008,flour,3412
7009,onion,3577
7010,white,3622
7011,red,3632
7012,leaf,3743
7013,egg,4119
7014,clove,4136


### Transposing labels

In [87]:
default_ingredients = ['apple', 'banana', 'beef', 'blueberries',
                       'bread', 'butter', 'carrot', 'cheese',
                       'chicken', 'chocolate',
                       'corn', 'eggs', 'flour', 'cheese',
                       'beans', 'ham', 'cream',
                       'lime', 'milk', 'mushrooms','onion',
                       'potato', 'shrimp', 'spinach', 'strawberries',
                       'sugar', 'tomato']

In [120]:
lemmatized = [lemmatizer.lemmatize(w) for w in default_ingredients]
lemmatized_default = lemmatized
lemmatized_default

['apple',
 'banana',
 'beef',
 'blueberry',
 'bread',
 'butter',
 'carrot',
 'cheese',
 'chicken',
 'chocolate',
 'corn',
 'egg',
 'flour',
 'cheese',
 'bean',
 'ham',
 'cream',
 'lime',
 'milk',
 'mushroom',
 'onion',
 'potato',
 'shrimp',
 'spinach',
 'strawberry',
 'sugar',
 'tomato']

In [89]:
siders = ['oil', 'cider', 'broth', 'juice', 'brisket', 'cream', 'ravioli', 'sauce']

In [90]:
def re_labelling_exp_2(list_x):
    relabeled_list = []
    for i in list_x:
        word_tokens = word_tokenize(i)
        label = i
        for ingredient in lemmatized_default:
            if ingredient in word_tokens:
                if not any(word in siders for word in i.split(' ')):
                    label = ingredient
        relabeled_list.append(label)
    return relabeled_list

In [91]:
df['labelled ingredients'] = df['Ingredients'].apply(lambda x: re_labelling_exp_2(x))

In [92]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Uncleaned_Ingredients,Instructions,Image_Name,Ingredients,labelled ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte..."


### One-Hot_Encoder

In [93]:
# def one_hot_encoder(dataframe):
#     for ingredient in lemmatized_default:
#         dataframe[ingredient] = 0
#     for i in range(dataframe.shape[0]):
#         for ingredient in lemmatized_default:
#             if ingredient in dataframe['labelled ingredients'][i]:
#                 dataframe[ingredient][i] = 1
#             else:
#                 dataframe[ingredient][i] = 0
#     return dataframe

In [94]:
# def create_matrix_row(dataframe):
#     for ingredient in lemmatized_default:
#         dataframe[ingredient] = 0


# def one_hot_encoder_row(row):
#     for ingredient in lemmatized_default:
#         if ingredient in row['labelled ingredients']:
#             row[ingredient][i] = 1
#         else:
#             row[ingredient][i] = 0
#     return row

In [95]:
def one_hot_encoder(row):
    for ingredient in lemmatized_default:
        if ingredient in row['labelled ingredients']:
            row[ingredient] = 1
        else:
            row[ingredient] = 0
    return row

In [96]:
df_final = df.apply(one_hot_encoder, axis=1)

In [97]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,Title,Uncleaned_Ingredients,Instructions,Image_Name,Ingredients,labelled ingredients,apple,banana,beef,...,lime,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ...",1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...",0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...",0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ...",0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte...",0,0,0,...,0,0,0,0,0,0,0,0,1,0


drop unmatched
Rename appropritately 
Reorder appropritately
Add vectorized ingredients

### Finishing touches

In [99]:
df_final.drop(columns='Unnamed: 0', inplace=True)

In [102]:
df_final.rename(columns={'Uncleaned_Ingredients': 'Ingredients', 'Ingredients': 'Cleaned_Ingredients', 'labelled ingredients': 'Relabelled_Ingredients'}, inplace=True)

In [112]:
def tokenize(list_x):
    tokenized = []
    for i in list_x:
        word_tokens = word_tokenize(i)
        for x in word_tokens:
            tokenized.append(x)
    return tokenized
    

In [118]:
df_final['Tokenized_Ingredients'] = df_final['Relabelled_Ingredients'].apply(lambda x: tokenize(x))

In [119]:
df_final.head()

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients,Relabelled_Ingredients,apple,banana,beef,blueberry,...,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato,Tokenized_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ...",1,0,0,0,...,0,0,1,0,0,0,0,0,0,"[chicken, salt, acorn, squash, sage, rosemary,..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...",0,0,0,0,...,0,0,0,1,0,0,0,0,0,"[egg, potato, salt, black, pepper, rosemary, t..."
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...",0,0,0,0,...,1,0,1,0,0,0,0,0,0,"[milk, milk, garlic, powder, onion, smoked, pa..."
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ...",0,0,0,0,...,0,0,1,0,0,0,0,0,0,"[round, italian, cube, olive, oil, sweet, ital..."
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte...",0,0,0,0,...,0,0,0,0,0,0,0,1,0,"[sugar, hot, water, bourbon, lemon, juice, but..."


In [121]:
df_final = df_final[['Title', 'Instructions','Image_Name',
                     'Ingredients', 'Cleaned_Ingredients', 'Relabelled_Ingredients', 'Tokenized_Ingredients',
                     'apple','banana','beef','blueberry','bread','butter','carrot','cheese','chicken','chocolate','corn',
                     'egg','flour','cheese','bean','ham','cream','lime','milk','mushroom','onion','potato','shrimp',
                     'spinach','strawberry','sugar','tomato']]

In [122]:
df_final.head()

Unnamed: 0,Title,Instructions,Image_Name,Ingredients,Cleaned_Ingredients,Relabelled_Ingredients,Tokenized_Ingredients,apple,banana,beef,...,lime,milk,mushroom,onion,potato,shrimp,spinach,strawberry,sugar,tomato
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn, squash, sage, rosemary,...",1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...","[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th...","[egg, potato, salt, black, pepper, rosemary, t...",0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...","[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr...","[milk, milk, garlic, powder, onion, smoked, pa...",0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in...","[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ...","[round, italian, cube, olive, oil, sweet, ital...",0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...","[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte...","[sugar, hot, water, bourbon, lemon, juice, but...",0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [130]:
df_final.to_csv('/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/Cleaned_Recepies.csv')

In [132]:
df_final.iloc[1,1]

'Preheat oven to 400°F and line a rimmed baking sheet with parchment. In a large bowl, whisk the egg whites until foamy (there shouldn’t be any liquid whites in the bowl). Add the potatoes and toss until they’re well coated with the egg whites, then transfer to a strainer or colander and let the excess whites drain. Season the potatoes with the salt, pepper, and herbs. Scatter the potatoes on the baking sheet (make sure they’re not touching) and roast until the potatoes are very crispy and tender when poked with a knife, 15 to 20 minutes (depending on the size of the potatoes).\nTransfer to a bowl and serve.'

### Feature Engineering: Prep Time

In [136]:
# def prep_time(sentence):
#     default_time = 2
#     prep_time = 0
#     # seperate string into a list of sentences seperated by "."
#     split_string = sentence.split('.')
#     for sentence in split_string:
#         word_tokenize = tokenize(sentence)
#         if any(word in word_tokenize in 'minutes'))
#         for word in word_tokenize:
#             if word.isdigit():
#                 prep_time += int(word)
#             else:
#                 prep_time += default_time
                
#     return prep_time

In [137]:
# example = df_final.iloc[1,1]

In [138]:
# result = prep_time(example)

In [139]:
# result

994

Idea about prep_time
- loop over each sentence (if no time intervall is indicated) --> add default value
                          (if time intervals are given, take the latter one)
- differentiate between minutes, hours and F'

/d --> matches any digits
/d/d to consequtive digits 
/d{2} --> exactly do digits 