# Imports

In [727]:
import ast
import os
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Exploratory Data Analysis

## Loading dataset

In [728]:
# Location of the raw recipe CSV. The default is the original author's local
# path; set the FEEDME_RECIPES_CSV environment variable to point at the file
# on another machine instead of editing this cell.
RECIPES_CSV = os.environ.get(
    "FEEDME_RECIPES_CSV",
    "/Users/chrissibierich/code/christopherbierich/FeedMe/raw_data/Recipes/Food Ingredients and Recipe Dataset with Image Name Mapping.csv",
)
df = pd.read_csv(RECIPES_CSV)

## Overview

In [729]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [730]:
df.shape

(13501, 6)

In [731]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           13501 non-null  int64 
 1   Title                13496 non-null  object
 2   Ingredients          13501 non-null  object
 3   Instructions         13493 non-null  object
 4   Image_Name           13501 non-null  object
 5   Cleaned_Ingredients  13501 non-null  object
dtypes: int64(1), object(5)
memory usage: 633.0+ KB


In [732]:
# Move the raw ingredient strings to 'Uncleaned_Ingredients' so the name
# 'Ingredients' is free for the cleaned column.
# Guarded on 'Cleaned_Ingredients' still being present: once that column has
# been renamed to 'Ingredients', re-running this cell would otherwise rename
# the cleaned column as well and leave two 'Uncleaned_Ingredients' columns.
if 'Cleaned_Ingredients' in df.columns:
    df.rename(columns={'Ingredients': 'Uncleaned_Ingredients'}, inplace=True)

In [733]:
# The dataset's pre-cleaned column becomes the working 'Ingredients' column.
df = df.rename(columns={'Cleaned_Ingredients': 'Ingredients'})

In [734]:
def text_to_list(text):
    """Turn the string form of a Python list (e.g. "['a', 'b']") into a real list.

    The text is parsed as a Python literal, so entries that are quoted with
    double quotes because they contain an apostrophe are split correctly, and
    "[]" yields an empty list.

    Parameters
    ----------
    text : str or list
        String representation of a list of ingredient strings. A value that
        is already a list/tuple is returned as a new list, so re-applying the
        function to an already converted column is harmless.

    Returns
    -------
    list of str
        One entry per ingredient. Text that is not a valid list literal falls
        back to the original bracket-stripping / "', '" splitting.
    """
    if isinstance(text, (list, tuple)):
        return list(text)

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid Python literal; handled by the legacy splitting below.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Legacy behaviour, kept for malformed input.
    stripped = text.replace("['", "").replace("']", "")
    return stripped.split("', '")

In [735]:
# Convert each stringified list in 'Ingredients' into an actual Python list.
df['Ingredients'] = df['Ingredients'].apply(text_to_list)

## Data Cleaning

1. Lowercase everything 
2. Remove numbers
3. Remove punctuation
4. Remove stopwords
5. Remove scale words ['inches', 'teaspoon', 'medium', 'large', 'small', 'lb', 'inch', '1/2', '3/4']

In [736]:
# English stop words from NLTK, stored as a set; cleaning() drops these tokens.
stop_words = set(stopwords.words('english')) 
# Shared WordNet lemmatizer, used by cleaning() and for the default ingredient list.
lemmatizer = WordNetLemmatizer()
# Extra tokens that cleaning() drops in addition to the stop words: units,
# sizes, preparation words and similar descriptors.
# Entries are compared against tokens AFTER punctuation removal and
# lemmatization, which is why hyphenated words appear fused ('storebought',
# 'goodquality') and entries are written in singular form.
list_words = ['storebought', 'garnish', 'homemade', 'fresh',
              'coarsely', 'grated', 'evaporated', 'pound',
              'new', 'inch', 'diameter', 'torn',
              'sturdy', 'loaf', 'ground', 'flake',
              'piece', 'gala', 'cored','melted',
              'unsalted','salted','whole', 'divided',
              'kosher','cup', 'tsp', 'tbsp',
              'small', 'medium', 'large', 'lb',
              'finely', 'thinly', 'chopped', 'freshly',
              'sliced', 'cut', 'crushed', 'teaspoon',
              'plus', 'room', 'temperature', 'dry',
              'lady', 'oz', 'total', 'goodquality',
              'tablespoon', 'g', 'ounce', 'peeled']

In [737]:
def cleaning(list_x):
    """Reduce raw ingredient strings to their bare, lemmatized ingredient words.

    Each string is lowercased, stripped of digit characters and ASCII
    punctuation (characters are removed, not replaced by spaces), tokenized
    and lemmatized. Tokens that are stop words, appear in ``list_words`` or
    are not purely alphabetic are discarded, and the rest are re-joined with
    single spaces.

    Parameters
    ----------
    list_x : iterable of str
        Raw ingredient descriptions of one recipe.

    Returns
    -------
    list of str
        One cleaned string per input string, in the same order. An entry is
        the empty string when every token of the input was discarded.
    """
    cleaned_list = []
    for raw in list_x:
        # Character-level filtering on the lowercased text.
        stripped = ''.join(
            ch for ch in raw.lower()
            if not ch.isdigit() and ch not in string.punctuation
        )
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(stripped))
        kept = [
            lemma for lemma in lemmas
            if lemma not in stop_words
            and lemma not in list_words
            and lemma.isalpha()
        ]
        cleaned_list.append(" ".join(kept))
    return cleaned_list

In [738]:
# Clean every recipe's ingredient list in place of the raw list.
df['Ingredients'] = df['Ingredients'].apply(cleaning)

In [739]:
df.iloc[0,4]

'miso-butter-roast-chicken-acorn-squash-panzanella'

In [740]:
df.isnull().sum()

Unnamed: 0               0
Title                    5
Uncleaned_Ingredients    0
Instructions             8
Image_Name               0
Ingredients              0
dtype: int64

In [741]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Uncleaned_Ingredients,Instructions,Image_Name,Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j..."


In [742]:
# Flat accumulator of every cleaned ingredient string across all recipes.
all_ingredients = []


def add_all_ingredients(list_x):
    """Append every entry of ``list_x`` to the module-level ``all_ingredients``.

    Works purely by side effect and returns ``None``; calling it again with
    the same list adds the entries a second time.
    """
    all_ingredients.extend(list_x)

In [743]:
# Run for its side effect only: fills `all_ingredients`. The displayed Series
# is all None because add_all_ingredients returns nothing.
df['Ingredients'].apply(add_all_ingredients)

0        None
1        None
2        None
3        None
4        None
         ... 
13496    None
13497    None
13498    None
13499    None
13500    None
Name: Ingredients, Length: 13501, dtype: object

In [744]:
# Single-column frame with one row per collected ingredient string.
all_ingredients_df = pd.DataFrame(all_ingredients)

In [745]:
# Destination of the flat ingredient export. The default is the original
# author's local path; set the FEEDME_ALL_INGREDIENTS_CSV environment variable
# to write somewhere else without editing this cell.
ALL_INGREDIENTS_CSV = os.environ.get(
    "FEEDME_ALL_INGREDIENTS_CSV",
    "/Users/chrissibierich/code/christopherbierich/all_ingredients.csv",
)
all_ingredients_df.to_csv(ALL_INGREDIENTS_CSV)

Check whether a recipe includes the 30 items from the fridge:
- Map the most common features
- One-hot encoder
- Disregard elements considered condiments

### Most common features

In [746]:
# Flat accumulator of every individual word from the cleaned ingredient strings.
all_words = []


def add_all_words(list_x):
    """Split each string of ``list_x`` on whitespace and collect the words.

    The words are appended to the module-level ``all_words`` list; the
    function itself returns ``None``.
    """
    for entry in list_x:
        all_words.extend(entry.split())

In [747]:
# Run for its side effect only: fills `all_words`. The displayed Series is all
# None because add_all_words returns nothing.
df['Ingredients'].apply(add_all_words)

0        None
1        None
2        None
3        None
4        None
         ... 
13496    None
13497    None
13498    None
13499    None
13500    None
Name: Ingredients, Length: 13501, dtype: object

In [748]:
len(all_words)

389511

In [749]:
# Occurrence count per distinct word; the displayed length is the vocabulary size.
counts = Counter(all_words)
len(counts)

7025

In [750]:
# (word, count) pairs ordered from rarest to most frequent; the sort is stable,
# so words with equal counts keep their order from `counts`.
sorted_counts = sorted(counts.items(), key=lambda pair: pair[1])
# Same ordering as a dict (insertion order is preserved).
sorted_dict = {word: freq for word, freq in sorted_counts}

In [751]:
len(sorted_dict)

7025

In [752]:
# Parallel lists of words and their counts, in ascending-frequency order.
list_ingredients = list(sorted_dict.keys())
list_frequencies = list(sorted_dict.values())

In [753]:
# Column-oriented mapping used to build the frequency table.
data_freq = {'ingredient': list_ingredients, 'frequency': list_frequencies}

In [754]:
# Word-frequency table; rows follow the ascending-frequency order of the input lists.
df_freq = pd.DataFrame(data=data_freq)

In [755]:
df_freq.tail(20)

Unnamed: 0,ingredient,frequency
7005,stick,2672
7006,water,2696
7007,cream,3032
7008,flour,3412
7009,onion,3577
7010,white,3622
7011,red,3632
7012,leaf,3743
7013,egg,4119
7014,clove,4136


For each word in each ingredient string:
  if the word equals any of the 30 default ingredients:
   label the string with that word from the default ingredients
   (from the model or is considered a document)
  if it is accompanied by words such as oil, broth, etc., keep the original string as the label
    

### Transposing labels

Create a list of condiments and ingredients from the model.

In [756]:
# Ingredient labels that recipe ingredients are mapped onto.
# NOTE(review): 'cheese' is listed twice, so the list has 27 entries (26
# distinct) while the notes in this notebook mention 30 ingredients - confirm
# which labels are intended. Do not simply drop one occurrence without
# checking: re_labelling_exp_2 keeps the LAST matching entry of the lemmatized
# list, so the order of this list affects the resulting labels.
default_ingredients = ['apple', 'banana', 'beef', 'blueberries',
                       'bread', 'butter', 'carrot', 'cheese',
                       'chicken', 'chocolate',
                       'corn', 'eggs', 'flour', 'cheese',
                       'beans', 'ham', 'cream',
                       'lime', 'milk', 'mushrooms','onion',
                       'potato', 'shrimp', 'spinach', 'strawberries',
                       'sugar', 'tomato']

In [757]:
# Lemmatize the default labels so they are comparable with the lemmatized
# tokens produced by cleaning() (e.g. 'eggs' -> 'egg').
lemmatized = list(map(lemmatizer.lemmatize, default_ingredients))
# Second name for the same list object.
lemmatized_default = lemmatized
lemmatized_default

['apple',
 'banana',
 'beef',
 'blueberry',
 'bread',
 'butter',
 'carrot',
 'cheese',
 'chicken',
 'chocolate',
 'corn',
 'egg',
 'flour',
 'cheese',
 'bean',
 'ham',
 'cream',
 'lime',
 'milk',
 'mushroom',
 'onion',
 'potato',
 'shrimp',
 'spinach',
 'strawberry',
 'sugar',
 'tomato']

In [758]:
# Default labels that actually occur as a word somewhere in the recipe
# vocabulary (order and duplicates of `lemmatized_default` are kept).
occurences_words = [word for word in lemmatized_default if word in list_ingredients]

occurences_words

['apple',
 'banana',
 'beef',
 'blueberry',
 'bread',
 'butter',
 'carrot',
 'cheese',
 'chicken',
 'chocolate',
 'corn',
 'egg',
 'flour',
 'cheese',
 'bean',
 'ham',
 'cream',
 'lime',
 'milk',
 'mushroom',
 'onion',
 'potato',
 'shrimp',
 'spinach',
 'strawberry',
 'sugar',
 'tomato']

In [759]:
# Words that block relabelling: an ingredient string containing any of these
# keeps its original text in re_labelling_exp_2 (e.g. 'olive oil', 'lemon juice').
# NOTE(review): 'cream' is also in default_ingredients, so a string containing
# 'cream' is never relabelled to the 'cream' label - confirm this is intended.
siders = ['oil', 'cider', 'broth', 'juice', 'brisket', 'cream', 'ravioli', 'sauce']

In [761]:
def re_labelling_exp_2(list_x):
    """Map cleaned ingredient strings onto the default ingredient labels.

    A string is replaced by a default label when one of its tokens equals an
    entry of ``lemmatized_default`` and none of its space-separated words is
    in ``siders``. When several default labels match, the one that comes last
    in ``lemmatized_default`` is used. All other strings are kept unchanged.

    Parameters
    ----------
    list_x : iterable of str
        Cleaned ingredient strings of one recipe.

    Returns
    -------
    list of str
        One label per input string, in the same order.
    """
    relabeled_list = []
    for entry in list_x:
        tokens = word_tokenize(entry)
        matches = [candidate for candidate in lemmatized_default if candidate in tokens]
        # The sider check only matters when at least one default label matched.
        if matches and not any(part in siders for part in entry.split(' ')):
            relabeled_list.append(matches[-1])
        else:
            relabeled_list.append(entry)
    return relabeled_list

In [763]:
# Store the relabelled ingredient list of every recipe in a new column.
df['labelled ingredients'] = df['Ingredients'].apply(re_labelling_exp_2)

In [764]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Uncleaned_Ingredients,Instructions,Image_Name,Ingredients,labelled ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[chicken, salt, acorn squash, sage, rosemary, ...","[chicken, salt, acorn squash, sage, rosemary, ..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[egg white, potato, salt, black pepper, rosema...","[egg, potato, salt, black pepper, rosemary, th..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[milk, milk, garlic powder, onion powder, smok...","[milk, milk, garlic powder, onion, smoked papr..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[round italian cube, olive oil, sweet italian ...","[round italian cube, olive oil, sweet italian ..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[dark brown sugar, hot water, bourbon, lemon j...","[sugar, hot water, bourbon, lemon juice, butte..."


What to do tomorrow:
- One-hot encoder
- Implement condiments (oil, salt, water, ...)