## Intermediate Cleaning - Simply Recipes

In [1]:
# Imports for cleaning the raw Simply Recipes data
import ast
import os
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')

# src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
# sys.path.append(src_dir)

# from d01_data.utils import read_multiple_csv_and_concat

In [2]:
# Read the three raw Simply Recipes files, one DataFrame per file.
raw_sr_paths = (
    '../../data/01_raw/simply_recipes/simply_recipes_1.csv',
    '../../data/01_raw/simply_recipes/simply_recipes_2.csv',
    '../../data/01_raw/simply_recipes/simply_recipes_3.csv',
)
recipes_sr1, recipes_sr2, recipes_sr3 = map(pd.read_csv, raw_sr_paths)

## Drop Useless Columns & Concat

Looks like all three datasets have an `Unnamed: 0` column. Let's drop it.

In [3]:
# Show the column labels of the first raw file.
recipes_sr1.columns

Index(['Unnamed: 0', 'title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')

In [4]:
# Show the column labels of the second raw file.
recipes_sr2.columns

Index(['Unnamed: 0', 'title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')

In [5]:
# Show the column labels of the third raw file.
recipes_sr3.columns

Index(['Unnamed: 0', 'title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')

In [6]:
# Remove the 'Unnamed: 0' column from each raw frame, in place.
# errors='ignore' makes the cell safe to re-run: without it, a second run
# raises KeyError because the column is already gone.
for raw_frame in (recipes_sr1, recipes_sr2, recipes_sr3):
    raw_frame.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [7]:
# Print the remaining column labels of each raw frame, in file order.
for raw_frame in (recipes_sr1, recipes_sr2, recipes_sr3):
    print(raw_frame.columns)

Index(['title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')
Index(['title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')
Index(['title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'byline', 'link_food'],
      dtype='object')


### Concat

In [8]:
# Stack the three frames row-wise; ignore_index renumbers the rows 0..n-1
# in the same call instead of a separate reset_index step.
recipes_sr_full = pd.concat(
    [recipes_sr1, recipes_sr2, recipes_sr3],
    axis=0,
    ignore_index=True,
)

In [9]:
# Preview the first rows of the combined frame.
recipes_sr_full.head()

Unnamed: 0,title,prep_time,cook_time,recipe_yield,tags,ingredients,entire_card,byline,link_food
0,['Grilled Cheese BLT'],"['Prep time:', ' ', '10 minutes']","['Cook time:', ' ', '10 minutes']","['Yield:', ' ', '4 sandwiches']","['Filed under:', ' ', 'Dinner', 'Lunch', 'Sand...","['\n ', 'Ingredients', ...","['\n\n ', '\n ...","['by ', ' ', 'Aaron Hutcherson', 'August 2...","['<link rel=""canonical"" href=""https://www.simp..."
1,['Pulled Pork Sandwich'],"['Prep time:', ' ', '10 minutes']","['Cook time:', ' ', '2 hours, 45 minutes']","['Yield:', ' ', 'Serves 6 to 8']","['Filed under:', ' ', 'Dinner', 'Sandwich', 'B...","['\n ', 'Ingredients', ...","['\n\n ', '\n ...","['by ', ' ', 'Elise Bauer', 'Updated Augus...","['<link rel=""canonical"" href=""https://www.simp..."
2,['How to Make Bacon in the Oven'],"['Prep time:', ' ', '5 minutes']","['Cook time:', ' ', '20 minutes']","['Yield:', ' ', '12 strips']","['Filed under:', ' ', 'Tips', 'Breakfast and B...","['\n ', 'Ingredients', ...","['\n\n ', '\n ...","['by ', ' ', 'Nick Evans', 'August 25, 2019']","['<link rel=""canonical"" href=""https://www.simp..."
3,['Sausage Stuffed Zucchini'],"['Prep time:', ' ', '15 minutes']","['Cook time:', ' ', '1 hour']","['Yield:', ' ', 'Serves 4']","['Filed under:', ' ', 'Dinner', 'Favorite Summ...","['\n ', 'Ingredients', ...","['\n\n ', '\n ...","['by ', ' ', 'Elise Bauer', 'Updated Augus...","['<link rel=""canonical"" href=""https://www.simp..."
4,['The Best Dry Rub for Ribs'],"['Prep time:', ' ', '5 minutes']","['Yield:', ' ', '1 1/2 cups']",[],"['Filed under:', ' ', 'Favorite Fall', 'Favori...","['\n ', 'Ingredients', ...","['\n\n ', '\n ...","['by ', ' ', 'Irvin Lin', 'July 28, 2019']","['<link rel=""canonical"" href=""https://www.simp..."


## EDA - Let's see what we have

**TO-DO LIST**
1. title - take title out of brackets
1. prep_time - reduce the prep time entry to just the time
1. cook_time - split into 2 columns (one with number and other with measure word) 
1. tags - make tags just a simple list
1. ingredients - make simple list


In [10]:
# Keep only the first occurrence of each fully identical row.
recipes_sr_full = recipes_sr_full.drop_duplicates()

In [11]:
# (rows, columns) of the frame after de-duplication.
recipes_sr_full.shape

(1748, 9)

In [12]:
# Per-column non-null counts and dtypes.
recipes_sr_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1748 entries, 0 to 1751
Data columns (total 9 columns):
title           1748 non-null object
prep_time       1748 non-null object
cook_time       1748 non-null object
recipe_yield    1748 non-null object
tags            1748 non-null object
ingredients     1748 non-null object
entire_card     1748 non-null object
byline          1748 non-null object
link_food       1748 non-null object
dtypes: object(9)
memory usage: 136.6+ KB


### Title

In [13]:
# Raw title column (each value is the text of a list, e.g. "['Grilled Cheese BLT']").
titles = recipes_sr_full['title']

In [14]:
def _parse_title(raw_title):
    """Return the title text held in a raw cell such as "['Grilled Cheese BLT']".

    The cell is parsed as a Python literal, so titles whose repr is
    double-quoted because they contain an apostrophe (["Shepherd's Pie"])
    are recovered; the previous quote-matching regex found no pair of single
    quotes in those and silently produced NaN.

    Returns NaN when no title can be recovered (empty list, missing value).
    """
    try:
        parsed = ast.literal_eval(raw_title)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid literal: fall back to the original quote-matching regex
        # so any row that used to yield a title still does.
        match = re.search(r"'(.*)'", raw_title) if isinstance(raw_title, str) else None
        return match.group(1) if match else np.nan
    if isinstance(parsed, str):
        return parsed
    if isinstance(parsed, (list, tuple)) and parsed:
        # A one-element list gives that element unchanged; several fragments
        # are concatenated back into one title.
        return ''.join(str(part) for part in parsed)
    return np.nan


title_new = [_parse_title(raw_title) for raw_title in titles]

### Prep Time

In [15]:
# Raw prep-time column.
prep_time = recipes_sr_full['prep_time']

In [16]:
# Extract the duration text from cells shaped like
# "['Prep time:', ' ', '10 minutes']"; anything else becomes NaN.
# The pattern is now a raw string: the previous non-raw literal depended on the
# invalid escape "\ ", which Python reports as a Deprecation/SyntaxWarning.
prep_time_pattern = re.compile(r"'Prep time:', ' ', '(.*)'")

prep_time_new = []
for raw_prep_time in prep_time:
    # Non-string cells (e.g. NaN) cannot be searched; treat them as missing
    # explicitly instead of relying on a bare except.
    match = prep_time_pattern.search(raw_prep_time) if isinstance(raw_prep_time, str) else None
    prep_time_new.append(match.group(1) if match else np.nan)

### cook_time

In [17]:
# Raw cook-time column.
cook_times = recipes_sr_full['cook_time']

In [18]:
# Extract the duration text from cells shaped like
# "['Cook time:', ' ', '20 minutes']"; anything else becomes NaN.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escape "\ ", which Python reports as a Deprecation/SyntaxWarning.
# NOTE(review): in the preview above, row 4 has its yield stored in cook_time;
# such rows have no cook time and correctly end up as NaN here.
cook_time_pattern = re.compile(r"'Cook time:', ' ', '(.*)'")

cook_time_new = []
for raw_cook_time in cook_times:
    # Non-string cells (e.g. NaN) are treated as missing explicitly instead of
    # relying on a bare except.
    match = cook_time_pattern.search(raw_cook_time) if isinstance(raw_cook_time, str) else None
    cook_time_new.append(match.group(1) if match else np.nan)

### recipe_yield

In [19]:
recipe_yield = recipes_sr_full.recipe_yield

# Raw yields look like "['Yield:', ' ', '4 sandwiches']".
yield_pattern = re.compile(r"'Yield:', ' ', '(.*)']")


def _extract_yield(*raw_cells):
    """Return the yield text from the first cell that contains one, else NaN.

    Non-string cells (e.g. NaN) are skipped rather than raising.
    """
    for raw_cell in raw_cells:
        if isinstance(raw_cell, str):
            match = yield_pattern.search(raw_cell)
            if match:
                return match.group(1)
    return np.nan


# clean the recipe yield
# Row 4 of the preview above has no cook time: its yield sits in the cook_time
# column and recipe_yield is "[]". Looking only at recipe_yield lost that value,
# so the raw cook_time cell is checked as a fallback.
recipe_yield_new = [
    _extract_yield(raw_yield, raw_cook_time)
    for raw_yield, raw_cook_time in zip(recipe_yield, recipes_sr_full['cook_time'])
]

### Tags

In [20]:
# Raw tags column.
tags = recipes_sr_full['tags']

In [21]:
# Capture everything after the "'Filed under:', ' ', " prefix up to the closing
# bracket. Raw-string pattern: the previous non-raw literal depended on the
# invalid escapes "\ " and "\]", which Python reports as a warning.
tags_pattern = re.compile(r"'Filed under:', ' ', (.*)\]")

tags_new = []
for raw_tags in tags:
    # Non-string cells (e.g. NaN) are treated as missing explicitly instead of
    # relying on a bare except.
    match = tags_pattern.search(raw_tags) if isinstance(raw_tags, str) else None
    # Output shape is unchanged: each row is a one-element list holding either
    # the captured text (still quoted and comma-separated, e.g. "'Dinner', 'Lunch'")
    # or NaN.
    # NOTE(review): the TO-DO asks for a simple list of tags; splitting the
    # captured text would change the saved format, so confirm with downstream
    # readers before doing it.
    tags_new.append([match.group(1) if match else np.nan])

### Ingredients

In [22]:
# Raw ingredients column.
ingredient_lists = recipes_sr_full['ingredients']

In [23]:
# Split each raw ingredients cell into the pieces of text found between pairs of
# single quotes (non-greedy, so every quoted token is returned separately).
quoted_token_pattern = re.compile(r"'(.*?)'")

ingredients_new = []
for raw_ingredients in ingredient_lists:
    try:
        ingredients_new.append(quoted_token_pattern.findall(raw_ingredients))
    except TypeError:
        # Only non-string cells (e.g. NaN) fail here; a bare except would also
        # have hidden unrelated errors and interrupts.
        ingredients_new.append(np.nan)

In [24]:
# Remove every token that contains a literal backslash-n (e.g. '\n      ').
# The previous version popped items from each list while enumerating it, which
# skips the element right after every removal; that is why the cell had to be
# repeated. Building the kept items in one pass gives the fully filtered result
# that the repeated passes converge to.
for sub_list in ingredients_new:
    # Slice assignment updates the existing list objects inside ingredients_new.
    sub_list[:] = [element for element in sub_list if "\\n" not in element]

In [25]:
# Render each token list as text and cut that text wherever the three-character
# sequence backslash-backslash-n still appears.
l = [str(token_list).split('\\\\n') for token_list in ingredients_new]
    

In [26]:
# Copy every inner list of `l` into a fresh list (same items, same order).
new_list = [list(pieces) for pieces in l]

In [27]:
# Trim the characters . ] [ \ and , from both ends of every piece.
# The backslash is written as "\\" now: the previous literal used "\,", an
# invalid escape sequence that Python reports as a warning. The set of stripped
# characters is unchanged.
nl = [[piece.strip('.][\\,') for piece in pieces] for pieces in new_list]

In [28]:
# Fragments that are scrape residue rather than ingredients (exact matches).
junk_fragments = {
    "                        ', 'Ingredients', '",
    "'",
    "",
    "                        ', '",
    "                        ', 'For the sauce:', '",
    "                                                  ', ' Special equipment:', '",
    "                                              '",
}

# Drop the junk fragments and anything containing an escaped tab.
# The previous version popped items while enumerating each list. That skips the
# element after every removal (hence the repeated passes), and because the empty
# string was tested in two separate `if`s, one empty element triggered two pops
# and could remove the neighbouring real fragment; the resulting IndexError was
# hidden by `except: pass`. A single filtering pass removes exactly the junk.
for sub_list in nl:
    # Slice assignment updates the existing list objects inside nl.
    sub_list[:] = [
        element
        for element in sub_list
        if element not in junk_fragments and '\\\\t' not in element
    ]

In [29]:
# Trim quote, parenthesis, period and comma characters from both ends of every piece.
data_sr = [[piece.strip("').,('") for piece in pieces] for pieces in nl]

In [30]:
# For each piece, keep the text between the first " '" and the last "', ";
# pieces without that shape are kept unchanged.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escape "\ ", which Python reports as a warning.
ingredient_pattern = re.compile(r" '(.*)', ")

data_sr_fin = []
for recipe in data_sr:
    cleaned_recipe = []
    for ingredient in recipe:
        # Explicit "no match" handling replaces the bare except.
        match = ingredient_pattern.search(ingredient) if isinstance(ingredient, str) else None
        cleaned_recipe.append(match.group(1) if match else ingredient)
    data_sr_fin.append(cleaned_recipe)

In [31]:
# Spot-check the cleaned ingredients of the first recipe.
# NOTE(review): the recorded output is a single string in which the ingredients
# are still joined by "', '", not one list item per ingredient.
data_sr_fin[0]

["8 slices sourdough bread', '4 tablespoon unsalted butter, at room temperature', '8 ounces (2 cups) shredded cheddar cheese', '2 slicing tomatoes (such as beefsteak, Brandywine, or Cherokee purple), sliced 1/4-inch thick', '8 to 12 slices ', 'cooked bacon', '12 leaves butterhead or other crispy lettuce"]

### byline

In [32]:
# Raw byline cells as a plain Python list.
bylines = list(recipes_sr_full['byline'])

In [33]:
# Number of raw bylines.
len(bylines)

1748

In [34]:
# First pass: bylines shaped like "['by   ', '   ', '<text>', '<more>']".
# Keep the text between the third opening quote and the last "', '";
# cells that do not match are passed through unchanged for the later passes.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escape "\[", which Python reports as a warning.
byline_with_tail_pattern = re.compile(r"\['by   ', '   ', '(.*)', '")

byline_new = []
for raw_byline in bylines:
    # Explicit "no match" handling replaces the bare except.
    match = byline_with_tail_pattern.search(raw_byline) if isinstance(raw_byline, str) else None
    byline_new.append(match.group(1) if match else raw_byline)

In [35]:
# Length check after the first byline pass.
len(byline_new)

1748

In [36]:
# Second pass: bylines shaped like "['by   ', '   ', '<text>']" (nothing after
# the third element). Values that do not match are kept as they are.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escape "\[", which Python reports as a warning.
byline_only_pattern = re.compile(r"\['by   ', '   ', '(.*)'\]")

byline_new_2 = []
for partial_byline in byline_new:
    # Explicit "no match" handling replaces the bare except.
    match = byline_only_pattern.search(partial_byline) if isinstance(partial_byline, str) else None
    byline_new_2.append(match.group(1) if match else partial_byline)

In [37]:
# Length check after the second byline pass.
len(byline_new_2)

1748

In [38]:
# Third pass: bylines whose first element is blank, "['   ', '   ', '<text>']".
# The previous pattern started its capture group before the opening quote of
# the third element, so every match began with a stray "'". The optional quote
# is now consumed outside the group.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escape "\[", which Python reports as a warning.
byline_blank_prefix_pattern = re.compile(r"\['   ', '   ', '?(.*)'\]")

byline_new_3 = []
for partial_byline in byline_new_2:
    # Explicit "no match" handling replaces the bare except.
    match = (
        byline_blank_prefix_pattern.search(partial_byline)
        if isinstance(partial_byline, str)
        else None
    )
    byline_new_3.append(match.group(1) if match else partial_byline)

In [39]:
# Length check after the third byline pass.
len(byline_new_3)

1748

### link_food

In [40]:
# Raw canonical-link cells as a plain Python list.
link_food = list(recipes_sr_full['link_food'])

In [41]:
# Pull the href value out of cells ending in
# '<link rel="canonical" href="...">'] ; other cells are kept unchanged.
# Raw-string pattern: the previous non-raw literal depended on the invalid
# escapes "\>" and "\]", which Python reports as a warning.
canonical_link_pattern = re.compile(r"""'<link rel="canonical" href="(.*)">'\]""")

link_food_new = []
for raw_link in link_food:
    # Explicit "no match" handling replaces the bare except.
    match = canonical_link_pattern.search(raw_link) if isinstance(raw_link, str) else None
    link_food_new.append(match.group(1) if match else raw_link)

In [42]:
# Report how many cleaned values each column has; all should equal the row count.
cleaned_value_counts = (
    ('Title Count: ', title_new),
    ('Prep Time Count: ', prep_time_new),
    ('Cook Time Count: ', cook_time_new),
    ('Tag Count', tags_new),
    ('Ingredients Count: ', data_sr_fin),
    ('Recipe Yield Count: ', recipe_yield_new),
    ('Byline Count: ', byline_new_3),
    ('Link Count: ', link_food_new),
)
for label, cleaned_values in cleaned_value_counts:
    print(label, len(cleaned_values))

Title Count:  1748
Prep Time Count:  1748
Cook Time Count:  1748
Tag Count 1748
Ingredients Count:  1748
Recipe Yield Count:  1748
Byline Count:  1748
Link Count:  1748


In [43]:
# Overwrite each raw column with its cleaned values (assigned by position).
cleaned_columns = {
    'title': title_new,
    'prep_time': prep_time_new,
    'cook_time': cook_time_new,
    'tags': tags_new,
    'ingredients': data_sr_fin,
    'recipe_yield': recipe_yield_new,
    'byline': byline_new_3,
    'link_food': link_food_new,
}
for column_name, cleaned_values in cleaned_columns.items():
    recipes_sr_full[column_name] = cleaned_values

In [44]:
# Drop the raw 'entire_card' column, in place.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises
# KeyError because the column is already gone.
recipes_sr_full.drop(columns=['entire_card'], inplace=True, errors='ignore')

In [45]:
# Save the cleaned frame as CSV in the intermediate data folder.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, which presumably comes back as 'Unnamed: 0' when the
# file is read again (like the raw files above) - confirm what readers expect.
# NOTE(review): the file name is spelled 'recipies'; any reader must use the
# same spelling.
recipes_sr_full.to_csv('../../data/02_intermediate/recipies_sr_inter.csv')

In [46]:
# Also save the cleaned frame as a pickle next to the CSV.
# Only load this file from a trusted location: unpickling can execute code.
recipes_sr_full.to_pickle("../../data/02_intermediate/recipies_sr_inter.pkl")