# Damn Delicious

In [24]:
# Import statements
import os
import ast
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.request import Request, urlopen

In [2]:
# Get the Web content
def get_web_content(url):
    try:
        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webContent = urlopen(request).read().decode('UTF-8')
        return webContent
    except Exception as e:
        print("Error in requesting web page: ", e)
        return False

### Constants

In [3]:
# Get Current Directory Path
PATH = os.getcwd()
# URL to the website
INDEX_URL = "https://damndelicious.net/recipe-index/"
# Directory for recipes csv
DIR_PATH = PATH + "/datasets/"
# Create path if doesn't exists
if not os.path.exists(DIR_PATH):
        os.makedirs(DIR_PATH)

### Columns

In [4]:
# Columns for the recipes dataset
recipe_columns = [
    'recipe_id',
    'title',
    'date_posted',
    'yields',
    'prep_time',
    'cook_time',
    'total_time',
    'ratings',
    'ratings_count',
    'n_ingredients',
    'ingredients',
    'n_steps',
    'directions',
    'nutrition',
    'tags'
]
# Columns for reviews dataset
review_columns = [
    "recipe_id",
    "username",
    "review_date",
    "rating",
    "review_text"
]

### Scraping the Web Page

In [5]:
def read_recipe(url, recipe_id):
    # Empty DataFrame
    empty_df = pd.DataFrame([], columns=recipe_columns)
    # Empty Reviews DataFrame
    empty_reviews_df = pd.DataFrame([], columns=review_columns)
    # Tokens from url
    tokens = url.split('/')
    try:
        # Get the html page
        webContent = get_web_content(url)
        # Parse the html page
        soup = BeautifulSoup(webContent, 'html.parser')
        # Check if page contains a recipe
        recipe_content = soup.find('div', class_='recipe-content')
        if recipe_content == None:
            # Return empty as none found
            print("No Recipe Found in ", tokens[-2])
            return empty_df, False
        # Data for dataframe
        data = []
        data.append(recipe_id)
        review_data = []
        # Title
        title = ""
        title_h1 = soup.find('h1', class_="post-title")
        if title_h1 != None:
            title = title_h1.text
        data.append(title)
        # Date Posted
        date_posted = ""
        if len(tokens) == 8:
            date_posted = tokens[3]+"/"+tokens[4]+"/"+tokens[5]
            date_posted = datetime.strptime(date_posted, '%Y/%m/%d').isoformat()
        data.append(date_posted)
        # Yields and Time
        yields = ""
        prep_time = ""
        cook_time = ""
        total_time = ""
        # Get the Post Meta Time Section
        post_meta_time = soup.find('div', {"class":["time"]})
        if post_meta_time != None:
            time_p_tags = post_meta_time.find_all('p')
            for time_p in time_p_tags:
                if time_p != None:
                    strong = time_p.find('strong')
                    if strong != None:
                        keyword = strong.text[:-1]
                        if strong.text[:-1] == "Yield" :
                            yields = time_p.span.text
                        if strong.text[:-1] == "prep time" :
                            prep_time = time_p.span.text
                        if strong.text[:-1] == "cook time" :
                            cook_time = time_p.span.text
                        if strong.text[:-1] == "total time" :
                            total_time = time_p.span.text
        # Yields
        data.append(yields)
        # Prep Time
        data.append(prep_time)
        # Cook Time
        data.append(cook_time)
        # Total Time
        data.append(total_time)
        # Rating
        rating_average = ""
        ratings_average_span = soup.find('span', class_='wprm-recipe-rating-average')
        if ratings_average_span != None:
            rating_average = ratings_average_span.text.strip()
        data.append(rating_average)
        # Ratings Count
        ratings_count = ""
        ratings_count_span = soup.find('span', class_='wprm-recipe-rating-count')
        if ratings_count_span != None:
            ratings_count = ratings_count_span.text.strip()
        data.append(ratings_count)
        # Ingredients
        ingredients = []
        ingredients_li = soup.find_all('li', itemprop="ingredients")
        for ingredient in ingredients_li:
            ingredients.append(ingredient.text.strip())
        data.append(len(ingredients))
        data.append(ingredients)
        # Directions
        directions = []
        directions_div = soup.find('div', class_="instructions")
        if directions_div != None:
            directions_li = directions_div.find_all('li')
            for direction in directions_li:
                if direction != None:
                    directions.append(direction.text)
        data.append(len(directions))
        data.append(directions)
        # Reviews 
        comments_li = soup.find_all('li', {'class':['depth-1']})
        if comments_li != None:
            for comment_li in comments_li:
                review = []
                username = ""
                rating = ""
                review_date = ""
                comments = ""
                comment_meta = comment_li.div.find('div', class_="comment-meta")
                if comment_meta != None:
                    username = comment_meta.find('strong').text
                if comment_meta.a.time != None:
                    review_date = comment_meta.a.time['datetime']
                comment_cont = comment_li.div.find('div', class_="comment-content")
                if comment_cont != None:
                    comment_rating = comment_cont.find('div', class_="wpsso-rar")
                    if comment_rating != None:
                        comment_rating = comment_rating.div
                        if comment_rating != None:
                            rating = comment_rating.text
                    comment_ps = comment_cont.find_all('p')
                    for p in comment_ps:
                        if p != None:
                            comment = p.text.strip()
                            if len(comment) > 0:
                                comments = comments + " " + comment
                    review.append(recipe_id)
                    review.append(username)
                    review.append(review_date)
                    review.append(rating)
                    review.append(comments.strip())
                    review_data.append(review)
        # Nutrition
        nutrition = []
        nutrition_div = soup.find('div', class_="wp-nutrition-label")
        if nutrition_div != None:
            span_left = nutrition_div.find_all('span', class_='f-left')
            for span in span_left:
                if span != None:
                    pairs = span.text.strip()
                    pairs = pairs.rsplit(" ", 1)
                    nutrition.append([pairs[0], pairs[1].strip()])
        data.append(nutrition)
        # Categories
        categories = []
        categories_a = soup.find_all('a', rel="category")
        for category in categories_a:
            if category != None:
                categories.append(category.text)
        data.append(categories)
        # Create a dataframe
        recipe_df = pd.DataFrame(data=[data], columns=recipe_columns)
        review_df = pd.DataFrame(data=review_data, columns=review_columns)
        #Processing reviews
        review_df['rating'] = review_df['rating'].str.split(" ").str[1]
        review_df['rating'] = review_df['rating'].fillna("")
        # Success Message
        print("Recipe Loaded successfully: ", tokens[-2])
        # Return statement
        return recipe_df, review_df, True
    except Exception as e:
        print("Error in loading Recipe: ", tokens[-2])
        print("Error in loading Web page: ", e)
        return empty_df, empty_reviews_df, False  

### Testing Web scraping of one recipe

In [6]:
# Test Case
recipe_url = "https://damndelicious.net/2015/03/20/lemon-chicken-with-asparagus-and-potatoes/"
result, review, success = read_recipe(recipe_url, 1)
print(result, review, success)

Recipe Loaded successfully:  lemon-chicken-with-asparagus-and-potatoes
   recipe_id                                      title          date_posted  \
0          1  Lemon Chicken with Asparagus and Potatoes  2015-03-20T00:00:00   

       yields   prep_time   cook_time         total_time ratings  \
0  8 servings  15 minutes  50 minutes  1 hour, 5 minutes    5.00   

  ratings_count  n_ingredients  \
0             3             10   

                                         ingredients  n_steps  \
0  [2 tablespoons olive oil, 2 tablespoons Dijon ...        6   

                                          directions  \
0  [Preheat oven to 400 degrees F., In a small bo...   

                                           nutrition               tags  
0  [[Calories, 263.0], [Total Fat, 18.6g], [Satur...  [chicken, entree]       recipe_id                         username                review_date  \
0           1                          Claudia  2020-05-29T09:17:25-07:00   
1           1   

### Getting Recipes for Each Ingredient

In [7]:
def get_recipes_by_ingredient(url, title_list, count):
    # Ingredient's recipes DataFrame 
    recipes_by_ingredient_df = pd.DataFrame([], columns=recipe_columns)
    reviews_by_ingredient_df = pd.DataFrame([], columns=review_columns)
    try:    
        # Get the html page
        webContent = get_web_content(url)
        # Parse the html page
        soup = BeautifulSoup(webContent, 'html.parser')
        # Get all the recipes for the ingredient
        archives = soup.find('div', class_='archives')
        if archives != None:
            archive_posts = archives.find_all('div', 'archive-post')
            for post in archive_posts:
                if post != None:
                    post_a = post.find('a')
                    if post_a != None:
                        title = post_a['title']
                        # Check for duplication of recipes
                        if title not in title_list:
                            recipe_url = post_a['href']
                            recipe_ingredient_df, review_ingredient_df, success = read_recipe(recipe_url, count)
                            count += 1
                            if success:
                                title_list.append(title)
                                recipes_by_ingredient_df = pd.concat\
                                ([recipes_by_ingredient_df, recipe_ingredient_df], ignore_index=True)
                                reviews_by_ingredient_df = pd.concat\
                                ([reviews_by_ingredient_df, review_ingredient_df], ignore_index=True)
    except Exception as e:
        print("Error in loading recipes from page: ", url)
        print("Error: ", e)
    finally:
        return recipes_by_ingredient_df, reviews_by_ingredient_df, count

### Testing for one ingredient

In [8]:
# Test for recipes by ingredients
tag_url = "https://damndelicious.net/tag/asparagus/"
result = get_recipes_by_ingredient(tag_url, [])
# print(result)

Recipe Loaded successfully:  bacon-wrapped-asparagus
Recipe Loaded successfully:  prosciutto-wrapped-asparagus
Recipe Loaded successfully:  asparagus-tart-with-balsamic-reduction
Recipe Loaded successfully:  veggie-flatbread-pizza
No Recipe Found in  creamy-meyer-lemon-pasta
Error in loading recipes from page:  https://damndelicious.net/tag/asparagus/
Error:  not enough values to unpack (expected 3, got 2)


### Get Ingredients List

In [9]:
def get_ingredients_list(url):
    # List of Lead Ingredients Title and URL
    ingredients_list = []
    try:
        # Get the html page
        webContent = get_web_content(url)
        # Parse the html page
        soup = BeautifulSoup(webContent, 'html.parser')
        # <Li> List
        ingredients_li = []
        # Get all the archives <UL> Tags
        archivelist_ul = soup.find_all('ul', 'archiveslist')
        for archive in archivelist_ul:
            if archive != None:
                archive_li = archive.find_all('li')
                ingredients_li = ingredients_li + archive_li
        # Get the Ingredients URL and Title
        for ingredient in ingredients_li:
            if ingredient != None:
                ingredient_a = ingredient.find('a')
                if ingredient_a != None:
                    ingredients_list.append([ingredient_a['title'], ingredient_a['href']])
    except Exception as e:
        print("Error in Loading the index page: ", e)
    finally:
        return ingredients_list

### Merging All Ingredients Recipes into One

In [10]:
def scrape_dd():
    # The Index of all recipes
    index_url = "https://damndelicious.net/recipe-index/"
    try:
        ingredients_list = get_ingredients_list(index_url)
    except:
        print("Error in getting ingredients list! Terminating..")
        exit()
    # Titles of all recipes we find
    title_list = []
    # Final Dataframe
    damn_delicious_recipes_df = pd.DataFrame([], columns=recipe_columns)
    damn_delicious_reviews_df = pd.DataFrame([], columns=review_columns)
    print("Start..")
    count = 0
    # Get Recipes for each ingredient
    for ingredient in ingredients_list:
        # Ingredient Details
        ingredient_title = ingredient[0]
        ingredient_url = ingredient[1]
        print("##################################################")
        print("Reading recipes for Ingredient: ", ingredient_title)
        print("##################################################")
        try:
            ingredient_recipe_df, ingredient_review_df, count = get_recipes_by_ingredient(ingredient_url, title_list, count)
        except Exception as e: 
            print("##################################################")
            print("Error in Reading recipes for Ingredient: ", ingredient_title)
            print("Error = ", e)
            print("##################################################")
        print("##################################################")
        print("Completed reading recipes for Ingredient: ", ingredient_title)
        print("##################################################")
        try:
            damn_delicious_recipes_df = pd.concat([damn_delicious_recipes_df, ingredient_recipe_df], ignore_index=True)
            damn_delicious_reviews_df = pd.concat([damn_delicious_reviews_df, ingredient_review_df], ignore_index=True)
        except Exception as e:
            print("Error in concating dataframe for ingredient: ", ingredient_title)
            print("Error = ", e)
    # Save the final dataframe as csv file
    damn_delicious_recipes_df.to_csv(DIR_PATH+'damn_delicious_recipes.csv', index=False)
    damn_delicious_reviews_df.to_csv(DIR_PATH+'damn_delicious_reviews.csv', index=False)
    print("Finished!")

In [11]:
def main():
    scrape_dd()

In [12]:
main()

Start..
##################################################
Reading recipes for Ingredient:  apple
##################################################
Recipe Loaded successfully:  vanilla-glazed-apple-cinnamon-muffins
No Recipe Found in  cinnamon-apple-dippers
Error in loading recipes from page:  https://damndelicious.net/tag/apple/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  apple
##################################################
##################################################
Reading recipes for Ingredient:  asparagus
##################################################
Recipe Loaded successfully:  bacon-wrapped-asparagus
Recipe Loaded successfully:  prosciutto-wrapped-asparagus
Recipe Loaded successfully:  asparagus-tart-with-balsamic-reduction
Recipe Loaded successfully:  veggie-flatbread-pizza
No Recipe Found in  creamy-meyer-lemon-pasta
Error in loading recipes from page:  ht

Recipe Loaded successfully:  best-ever-beef-stew
##################################################
Completed reading recipes for Ingredient:  beef stew
##################################################
##################################################
Reading recipes for Ingredient:  beef stew recipe
##################################################
##################################################
Completed reading recipes for Ingredient:  beef stew recipe
##################################################
##################################################
Reading recipes for Ingredient:  beef stroganoff
##################################################
Recipe Loaded successfully:  one-pot-beef-stroganoff
##################################################
Completed reading recipes for Ingredient:  beef stroganoff
##################################################
##################################################
Reading recipes for Ingredient:  beef stroganoff recipe
##########

Recipe Loaded successfully:  instant-pot-butternut-squash-soup
##################################################
Completed reading recipes for Ingredient:  butternut squash recipe
##################################################
##################################################
Reading recipes for Ingredient:  cake
##################################################
Recipe Loaded successfully:  pumpkin-bundt-cake-chocolate-rum-glaze
Recipe Loaded successfully:  banana-pecan-and-nutella-swirled-snack-cake
Recipe Loaded successfully:  chocolate-sour-cream-bundt-cake
Recipe Loaded successfully:  coffee-cake-with-crumble-topping-and-brown-sugar-glaze
Recipe Loaded successfully:  pumpkin-bundt-cake-with-pumpkin-glaze-bundtamonth-and
Recipe Loaded successfully:  nectarine-upside-down-cake
No Recipe Found in  muffin-monday-strawberry-shortcake-muffins-with
Error in loading recipes from page:  https://damndelicious.net/tag/cake/
Error:  not enough values to unpack (expected 3, got 2)
######

Recipe Loaded successfully:  quick-chicken-ramen-noodle-stir-fry
##################################################
Completed reading recipes for Ingredient:  chicken noodle recipe
##################################################
##################################################
Reading recipes for Ingredient:  chicken noodle soup
##################################################
##################################################
Completed reading recipes for Ingredient:  chicken noodle soup
##################################################
##################################################
Reading recipes for Ingredient:  chicken ramen noodle recipe
##################################################
##################################################
Completed reading recipes for Ingredient:  chicken ramen noodle recipe
##################################################
##################################################
Reading recipes for Ingredient:  chicken ramen noodles
######

##################################################
Completed reading recipes for Ingredient:  creamy tortellini soup
##################################################
##################################################
Reading recipes for Ingredient:  cupcakes
##################################################
Recipe Loaded successfully:  pumpkin-cheesecake-cupcakes
Recipe Loaded successfully:  candy-corn-cupcakes
Recipe Loaded successfully:  pumpkin-cupcakes-cinnamon-cream-cheese-frosting
Recipe Loaded successfully:  fruity-pebble-cupcakes
Recipe Loaded successfully:  strawberry-shortcake-cupcakes
Recipe Loaded successfully:  mini-vanilla-bean-cupcakes-with-vanilla-buttercream
No Recipe Found in  green-tea-cupcakes-with-matcha-cream-cheese-frosting
Error in loading recipes from page:  https://damndelicious.net/tag/cupcakes/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  cupcakes
####

Recipe Loaded successfully:  easy-homemade-ramen
##################################################
Completed reading recipes for Ingredient:  Homemade Ramen
##################################################
##################################################
Reading recipes for Ingredient:  hot dog
##################################################
Recipe Loaded successfully:  classic-pigs-blanket
Recipe Loaded successfully:  bahn-mi-hot-dogs
Recipe Loaded successfully:  hawaiian-hot-dogs-with-mango-salsa
No Recipe Found in  bacon-wrapped-chili-cheese-dogs-sundaysupper
Error in loading recipes from page:  https://damndelicious.net/tag/hot-dog/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  hot dog
##################################################
##################################################
Reading recipes for Ingredient:  ice cream
############################################

Recipe Loaded successfully:  herb-chicken-with-lemon-cream-sauce
Recipe Loaded successfully:  sparkling-cherry-lemonade
Recipe Loaded successfully:  lemon-chicken-orzo-soup
Recipe Loaded successfully:  sparkling-strawberry-lemonade
Recipe Loaded successfully:  shrimp-scampi
Recipe Loaded successfully:  sweet-lemon-shrimp
Recipe Loaded successfully:  strawberry-basil-meyer-lemonade
No Recipe Found in  lemon-belgian-waffles-with-blueberry-syrup
Error in loading recipes from page:  https://damndelicious.net/tag/lemon/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  lemon
##################################################
##################################################
Reading recipes for Ingredient:  lightened up
##################################################
Recipe Loaded successfully:  blueberry-oatmeal-muffins-with-granola-crumb-topping
Recipe Loaded successfully:  greek-yogurt-

Recipe Loaded successfully:  no-knead-rosemary-bread
##################################################
Completed reading recipes for Ingredient:  no knead recipes
##################################################
##################################################
Reading recipes for Ingredient:  no knead rosemary bread recipe
##################################################
##################################################
Completed reading recipes for Ingredient:  no knead rosemary bread recipe
##################################################
##################################################
Reading recipes for Ingredient:  Nutella
##################################################
No Recipe Found in  banana-nutella-granola
Error in loading recipes from page:  https://damndelicious.net/tag/nutella/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  Nutella
#######################

##################################################
Completed reading recipes for Ingredient:  potato gnocchi recipe
##################################################
##################################################
Reading recipes for Ingredient:  potatoes
##################################################
Recipe Loaded successfully:  easy-potato-pancakes
Recipe Loaded successfully:  lemon-rosemary-roasted-potatoes
Recipe Loaded successfully:  baked-ranch-chicken-tenders-and-veggies
Recipe Loaded successfully:  one-pan-greek-chicken
Recipe Loaded successfully:  one-pan-garlic-ranch-chicken-and-veggies
Recipe Loaded successfully:  easy-potatoes-in-foil
Recipe Loaded successfully:  crispy-dijon-smashed-potatoes
Recipe Loaded successfully:  slow-cooker-garlic-mashed-potatoes
Recipe Loaded successfully:  baked-garlic-sweet-potato-fries
Recipe Loaded successfully:  garlic-parmesan-fries
Recipe Loaded successfully:  parmesan-crusted-scalloped-potatoes
Recipe Loaded successfully:  sweet-po

Recipe Loaded successfully:  creamy-red-pepper-shells
##################################################
Completed reading recipes for Ingredient:  red pepper pasta
##################################################
##################################################
Reading recipes for Ingredient:  red pepper pasta recipe
##################################################
##################################################
Completed reading recipes for Ingredient:  red pepper pasta recipe
##################################################
##################################################
Reading recipes for Ingredient:  red pepper pasta shell recipe
##################################################
##################################################
Completed reading recipes for Ingredient:  red pepper pasta shell recipe
##################################################
##################################################
Reading recipes for Ingredient:  rosemary bread
#################

##################################################
Completed reading recipes for Ingredient:  the best beef stew
##################################################
##################################################
Reading recipes for Ingredient:  tomato
##################################################
Recipe Loaded successfully:  caprese-mac-and-cheese
Recipe Loaded successfully:  caprese-cheese-kabobs-with-balsamic-reduction-and-a
Recipe Loaded successfully:  zucchini-ribbon-pasta
No Recipe Found in  creamy-meyer-lemon-pasta
Error in loading recipes from page:  https://damndelicious.net/tag/tomato/
Error:  not enough values to unpack (expected 3, got 2)
##################################################
Completed reading recipes for Ingredient:  tomato
##################################################
##################################################
Reading recipes for Ingredient:  Top instant pot recipes
##################################################
No Recipe Found in  10-

## ETL

In [111]:
dd_rc_df = pd.read_csv("datasets/damn_delicious_recipes.csv")
dd_rc_df.head(5)

Unnamed: 0,recipe_id,title,date_posted,yields,prep_time,cook_time,total_time,ratings,ratings_count,n_ingredients,ingredients,n_steps,directions,nutrition,tags
0,0,Vanilla Glazed Apple Cinnamon Muffins,2013-09-20T00:00:00,4 muffins,15 minutes,15 minutes,30 minutes,,,13,"['1/2 cup all-purpose flour', '1 teaspoon baki...",7,['Preheat oven to 350 degrees F. Coat a standa...,[],['breakfast']
1,1,Bacon Wrapped Asparagus,2016-05-21T00:00:00,6 servings,10 minutes,20 minutes,30 minutes,4.5,2.0,10,"['1/2 cup unsalted butter, melted', '1/4 cup b...",7,['Preheat oven to 400 degrees F. Lightly oil a...,"[['Calories', '318.9'], ['Total Fat', '28.2g']...",['side dish']
2,2,Prosciutto Wrapped Asparagus,2013-12-27T00:00:00,4 servings,10 minutes,5 minutes,15 minutes,5.0,1.0,3,"['6 ounces sliced prosciutto, halved horizonta...",3,"['Working one at a time, wrap halved prosciutt...",[],['appetizer']
3,3,Asparagus Tart with Balsamic Reduction,2013-05-13T00:00:00,4 servings,10 minutes,25 minutes,35 minutes,,,7,"['1 sheet frozen puff pastry, thawed, cut into...",6,['Preheat oven to 400 degrees F. Line a baking...,[],"['appetizer', 'vegetarian']"
4,4,Whole Wheat Veggie Flatbread Pizza,2013-03-22T00:00:00,6 servings,10 minutes,10 minutes,20 minutes,,,6,"['6 whole wheat Middle Eastern flatbreads', '1...",4,['Preheat oven to 375 degrees F. Line a baking...,[],"['entree', 'healthy', 'quick & easy', 'vegetar..."


In [112]:
# Characters to be deleted
delete_char = ['(',')','[',']','/','-','⁄','%','.','\n']

# Extra keywords to be omitted
extras = [
    'bag',
    'box',
    'bunch',
    'can',
    'cup',
    'cups',
    'dash',
    'dozen',
    'drop',
    'envelope',
    'fluid',
    'g',
    'gallon',
    'head',
    'inch',
    'jar',
    'kg',
    'kgs',
    'large',
    'lb',
    'lbs',
    'leaf',
    'liter',
    'loaf',
    'medium',
    'mg',
    'ml',
    'ounce',
    'ounces',
    'package',
    'packages',
    'packet',
    'packets',
    'pinch',
    'pint',
    'pints',
    'quart',
    'quarts',
    'scoop',
    'scoops',
    'sheet',
    'sheets',
    'slice',
    'slices',
    'small',
    'sprig',
    'sprigs',
    'stalk',
    'stalks',
    'tablespoon',
    'tablespoons',
    'teaspoon',
    'teaspoons',
    'whole'
]

# Remove the extra tokens
def remove_extras(string):
    
    string = str(string)
    for char in delete_char:
        string = string.replace(str(char), " ")
    
    tokens = string.split(" ")
    
    if tokens[0].strip().lower() == 'for':
        return ""
    
    string_list=[]
    for token in tokens:
        token = token.strip()
        if not token.isnumeric():
            if token not in extras:
                string_list.append(token)
    
    return (' '.join([elem for elem in string_list])).strip()


In [140]:
dd_rc = dd_rc_df.copy()
dd_rc = dd_rc.drop(['yields', 'prep_time', 'cook_time'], axis=1)
dd_rc['ratings'] = dd_rc['ratings'].fillna(0)
dd_rc['ratings_count'] = dd_rc['ratings_count'].fillna(0)
dd_rc['total_time'] = dd_rc['total_time'].fillna("0 minutes")
dd_rc['total_time'] = dd_rc['total_time'].apply(lambda x: x.split(" ")[0])

In [141]:
# Process the dataframe to transform ingredients
def process_ingredients(input_df):
    ingredients = input_df.copy()
    ingredients = ingredients[['ingredients','recipe_id']]
#     ingredients['ingredients'] = ingredients['ingredients'].astype(str)
#     ingredients['ingredients'] = ingredients['ingredients'].apply(lambda x: x.replace("empty", "0"))
    ingredients['ingredients'] = ingredients['ingredients'].apply(ast.literal_eval)
    ingredients = ingredients.explode('ingredients')
    ingredients['ingredients'] = ingredients['ingredients'].str.lower()
    ingredients['ingredients'] = ingredients['ingredients'].apply(remove_extras)
    ingredients = ingredients[ingredients["ingredients"] != ""]
#     ingredients = ingredients.explode('recipe_id')
    ingredients = ingredients.groupby(['recipe_id']).agg({'ingredients': lambda x: x.tolist()})
    ingredients = ingredients[ingredients['ingredients'] != ""]
    ingredients['ingredients'] = ingredients['ingredients'].astype(str)
    ingredients.reset_index(inplace=True)
    return ingredients

In [142]:
dd_rc_ig = dd_rc.copy()
dd_rc_ig = process_ingredients(dd_rc_ig)
dd_rc_ig.columns = ['id', 'ing']
dd_rc_fi = pd.merge(dd_rc, dd_rc_ig, how="left", left_on=["recipe_id"], right_on=["id"])
dd_rc_fi['ingredients'] = dd_rc_fi['ing']
dd_rc_fi = dd_rc_fi.drop(['id', 'ing'], axis=1)
dd_rc_fi.head(5)

Unnamed: 0,recipe_id,title,date_posted,total_time,ratings,ratings_count,n_ingredients,ingredients,n_steps,directions,nutrition,tags
0,0,Vanilla Glazed Apple Cinnamon Muffins,2013-09-20T00:00:00,30,0.0,0.0,13,"['all purpose flour', 'baking powder', 'cinnam...",7,['Preheat oven to 350 degrees F. Coat a standa...,[],['breakfast']
1,1,Bacon Wrapped Asparagus,2016-05-21T00:00:00,30,4.5,2.0,10,"['unsalted butter, melted', 'brown sugar, pack...",7,['Preheat oven to 400 degrees F. Lightly oil a...,"[['Calories', '318.9'], ['Total Fat', '28.2g']...",['side dish']
2,2,Prosciutto Wrapped Asparagus,2013-12-27T00:00:00,15,5.0,1.0,3,"['sliced prosciutto, halved horizontally', 'po...",3,"['Working one at a time, wrap halved prosciutt...",[],['appetizer']
3,3,Asparagus Tart with Balsamic Reduction,2013-05-13T00:00:00,35,0.0,0.0,7,"['frozen puff pastry, thawed, cut into squares...",6,['Preheat oven to 400 degrees F. Line a baking...,[],"['appetizer', 'vegetarian']"
4,4,Whole Wheat Veggie Flatbread Pizza,2013-03-22T00:00:00,20,0.0,0.0,6,"['wheat middle eastern flatbreads', 'marinara ...",4,['Preheat oven to 375 degrees F. Line a baking...,[],"['entree', 'healthy', 'quick & easy', 'vegetar..."


In [180]:
def get_num(item):
    item = item.replace("g", "")    
    item = item.replace("mg", "")
    item = item.replace("m", "")
    return item

In [181]:
# Process the dataframe to transform ingredients
def process_nutrition(input_df):
    nutrition = input_df.copy()
    nutrition = nutrition[['nutrition','recipe_id']]
    nutrition['nutrition'] = nutrition['nutrition'].astype(str)
    nutrition = nutrition[nutrition['nutrition'] != "[]"]
    nutrition['nutrition'] = nutrition['nutrition'].apply(ast.literal_eval)
    nutrition = nutrition.explode('nutrition')
    nutrition['nutrition'] = nutrition['nutrition'].apply(lambda x: x[1] if len(x) > 1 else [])
    nutrition['nutrition'] = nutrition['nutrition'].astype(str).apply(get_num)
    nutrition = nutrition.groupby(['recipe_id']).agg({'nutrition': lambda x: x.tolist()})
    nutrition['nutrition'] = nutrition['nutrition'].astype(str)
    nutrition.reset_index(inplace=True)
    return nutrition

In [182]:
dd_rc_nu = dd_rc_fi.copy()
dd_rc_nu = process_nutrition(dd_rc_nu)
dd_rc_nu.columns = ['id', 'nut']
dd_rc_nut = pd.merge(dd_rc_fi, dd_rc_nu, how='left', left_on=["recipe_id"], right_on=["id"])
dd_rc_nut['nutrition'] = dd_rc_nut['nut']
dd_rc_nut = dd_rc_nut.drop(['id', 'nut'], axis=1)
dd_rc_nut['nutrition'] = dd_rc_nut['nutrition'].fillna("[]")
dd_rc_nut.head(5)

Unnamed: 0,recipe_id,title,date_posted,total_time,ratings,ratings_count,n_ingredients,ingredients,n_steps,directions,nutrition,tags
0,0,Vanilla Glazed Apple Cinnamon Muffins,2013-09-20T00:00:00,30,0.0,0.0,13,"['all purpose flour', 'baking powder', 'cinnam...",7,['Preheat oven to 350 degrees F. Coat a standa...,[],['breakfast']
1,1,Bacon Wrapped Asparagus,2016-05-21T00:00:00,30,4.5,2.0,10,"['unsalted butter, melted', 'brown sugar, pack...",7,['Preheat oven to 400 degrees F. Lightly oil a...,"['318.9', '28.2', '13.0', '48.4', '69.4', '15....",['side dish']
2,2,Prosciutto Wrapped Asparagus,2013-12-27T00:00:00,15,5.0,1.0,3,"['sliced prosciutto, halved horizontally', 'po...",3,"['Working one at a time, wrap halved prosciutt...",[],['appetizer']
3,3,Asparagus Tart with Balsamic Reduction,2013-05-13T00:00:00,35,0.0,0.0,7,"['frozen puff pastry, thawed, cut into squares...",6,['Preheat oven to 400 degrees F. Line a baking...,[],"['appetizer', 'vegetarian']"
4,4,Whole Wheat Veggie Flatbread Pizza,2013-03-22T00:00:00,20,0.0,0.0,6,"['wheat middle eastern flatbreads', 'marinara ...",4,['Preheat oven to 375 degrees F. Line a baking...,[],"['entree', 'healthy', 'quick & easy', 'vegetar..."


In [183]:
dd_rc_copy = dd_rc_nut.copy()
dd_rc_copy.columns = [
    'id',
    'name', 
    'submitted', 
    'minutes', 
    'ratings', 
    'ratings_count',
    'n_ingredients', 
    'ingredients', 
    'n_steps', 
    'steps', 
    'nutrition', 
    'tags'
]
display(dd_rc_copy)

Unnamed: 0,id,name,submitted,minutes,ratings,ratings_count,n_ingredients,ingredients,n_steps,steps,nutrition,tags
0,0,Vanilla Glazed Apple Cinnamon Muffins,2013-09-20T00:00:00,30,0.00,0.0,13,"['all purpose flour', 'baking powder', 'cinnam...",7,['Preheat oven to 350 degrees F. Coat a standa...,[],['breakfast']
1,1,Bacon Wrapped Asparagus,2016-05-21T00:00:00,30,4.50,2.0,10,"['unsalted butter, melted', 'brown sugar, pack...",7,['Preheat oven to 400 degrees F. Lightly oil a...,"['318.9', '28.2', '13.0', '48.4', '69.4', '15....",['side dish']
2,2,Prosciutto Wrapped Asparagus,2013-12-27T00:00:00,15,5.00,1.0,3,"['sliced prosciutto, halved horizontally', 'po...",3,"['Working one at a time, wrap halved prosciutt...",[],['appetizer']
3,3,Asparagus Tart with Balsamic Reduction,2013-05-13T00:00:00,35,0.00,0.0,7,"['frozen puff pastry, thawed, cut into squares...",6,['Preheat oven to 400 degrees F. Line a baking...,[],"['appetizer', 'vegetarian']"
4,4,Whole Wheat Veggie Flatbread Pizza,2013-03-22T00:00:00,20,0.00,0.0,6,"['wheat middle eastern flatbreads', 'marinara ...",4,['Preheat oven to 375 degrees F. Line a baking...,[],"['entree', 'healthy', 'quick & easy', 'vegetar..."
...,...,...,...,...,...,...,...,...,...,...,...,...
377,377,Leftover Thanksgiving Turkey Pesto Panini,2012-11-21T00:00:00,15,0.00,0.0,7,"['olive oil', 'of french bread, cut into equal...",3,['Heat olive oil in a grill pan over medium-hi...,[],"['entree', 'quick & easy', 'thanksgiving']"
378,378,Sheet Pan Zucchini Parmesan,2017-04-25T00:00:00,40,5.00,5.0,9,"['panko*', 'freshly grated parmesan cheese', '...",7,['Preheat oven to 400 degrees F. Lightly oil a...,[],"['appetizer', 'one pot']"
379,379,Zucchini Parmesan Foil Packets,2016-07-19T00:00:00,25,5.00,5.0,8,"['unsalted butter, melted', 'freshly grated pa...",5,['PREHEAT a gas or charcoal grill over high he...,"['163.6', '13.9', '8.5', '35.0', '124.5', '6.9...",['side dish']
380,380,Zucchini Alfredo,2016-01-17T00:00:00,30,4.33,3.0,11,"['unsalted butter, divided', 'pound sized zu...",5,['Melt 1 tablespoon butter in a saucepan over ...,"['203.6', '14.2', '8.3', '39.5', '168.5', '12....","['entree', 'healthy', 'quick & easy']"


In [186]:
dd_rv_df = pd.read_csv("datasets/damn_delicious_reviews.csv")
dd_rv_df.columns = ['recipe_id', 'user_id', 'date', 'rating', 'review']
dd_rv_df = dd_rv_df[['user_id', 'recipe_id', 'date', 'rating', 'review']]
dd_rv_df['rating'] = dd_rv_df['rating'].fillna(0)
dd_rv_df['date'] = dd_rv_df['date'].fillna("2022-01-01T00:00:00+00:00")
dd_rv_df['review'] = dd_rv_df['review'].fillna(" ")
dd_rv_df.head(5)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,Krystle,0,2016-11-02T13:04:59-07:00,0.0,Totally bummed to make this and find that I on...
1,Jess,0,2022-01-01T00:00:00+00:00,0.0,I absolutely love this recipe! So much so I ad...
2,Rachel,0,2022-01-01T00:00:00+00:00,0.0,Muffins. I. love. Muffins. Maybe it’s the carb...
3,Nicola,0,2014-10-26T14:17:08-07:00,0.0,Did you really mean only half a cup of flour? ...
4,Maria,0,2014-07-12T18:31:09-07:00,0.0,According to the proportions in the recipe how...


In [187]:
if not os.path.exists("temp/"):
        os.makedirs("temp/")
dd_rc_copy.to_csv('temp/damn_delicious_recipes.csv', index=False)
dd_rv_df.to_csv('temp/damn_delicious_reviews.csv', index=False)

# FIN