In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

### In the first step, the URLs of the main recipe pages are stored in a list:

In [2]:
# Storing the urls of main recipe pages, for 80 pages: 

def get_urls(page_nums):
    """Build the listing-page URL for every page number in ``page_nums``.

    Each number is substituted into the ``?page={}`` query of the dinner
    category URL. The URLs are returned as a list, in the same order as
    ``page_nums``.
    """
    template = 'https://www.allrecipes.com/recipes/17562/dinner/?page={}'
    return [template.format(page) for page in page_nums]

# range(2, 82) yields the page numbers 2 through 81, i.e. 80 listing pages.
page_nums = range(2,82)
url_list = get_urls(page_nums)

### Next, a list of soup objects will be produced for all 80 pages:

In [3]:
# Making a list of soup objects for main pages:

def get_soups(url_list, timeout=30):
    """Download every URL in ``url_list`` and parse it into a soup object.

    Parameters
    ----------
    url_list : iterable of str
        URLs to fetch; one GET request is issued per URL.
    timeout : float or tuple, optional
        Forwarded to ``requests`` for every request. Without a timeout a
        single stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    list
        One ``BeautifulSoup`` object (``html5lib`` parser) per URL, in the
        same order as ``url_list``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests`` when a request fails or times out.
    """
    soups = []
    # A single Session lets consecutive requests to the same host reuse the
    # underlying connection instead of reconnecting for every page.
    with requests.Session() as session:
        for url in url_list:
            response = session.get(url, timeout=timeout)
            soups.append(BeautifulSoup(response.text, "html5lib"))
    return soups

# Issues one HTTP GET per entry of url_list, so this cell needs network access.
soup_list = get_soups(url_list)    

In [4]:
len(soup_list)

80

### Every main page contains 22 recipes. In this step, the URL of each individual recipe is extracted from the soup list above, and all of these URLs are added to a list:

In [5]:
# Extracting the individual recipe's url:

def get_recipes_url(soup_list):
    """Collect the absolute URL of each recipe linked from the listing pages.

    Every element with class ``tout__titleLink`` in every soup is inspected.
    Those whose ``href`` contains the substring ``'recipe'`` are kept, with
    the site root prepended to the ``href``. The result preserves page order
    and, within a page, document order.
    """
    site_root = 'https://www.allrecipes.com'
    return [
        site_root + link['href']
        for page in soup_list
        for link in page.find_all(class_='tout__titleLink')
        if 'recipe' in link['href']
    ]

# Pure parsing step: only the already-downloaded listing pages are searched.
recipes_url_list = get_recipes_url(soup_list)

In [6]:
len(recipes_url_list)

1680

In [7]:
recipes_url_list[0:5]

['https://www.allrecipes.com/recipe/229658/oven-roasted-turkey-breast/',
 'https://www.allrecipes.com/recipe/166160/juicy-thanksgiving-turkey/',
 'https://www.allrecipes.com/recipe/276505/grandmas-hash-brown-casserole/',
 'https://www.allrecipes.com/recipe/222000/spaghetti-aglio-e-olio/',
 'https://www.allrecipes.com/recipe/221958/chef-johns-perfect-prime-rib/']

### With the individual URLs in hand, a soup object can now be made for each recipe. All of these soup objects are added to a list:

In [8]:
# Extracting and saving the soup object for each recipe:

def get_recipes_soup(recipes_url_list, timeout=30):
    """Download every recipe page and parse it into a soup object.

    Parameters
    ----------
    recipes_url_list : iterable of str
        Recipe URLs to fetch; one GET request is issued per URL.
    timeout : float or tuple, optional
        Forwarded to ``requests`` for every request. Without a timeout a
        single stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    list
        One ``BeautifulSoup`` object (``html5lib`` parser) per URL, in the
        same order as ``recipes_url_list``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests`` when a request fails or times out.
    """
    soups = []
    # A single Session lets consecutive requests to the same host reuse the
    # underlying connection instead of reconnecting for every recipe.
    with requests.Session() as session:
        for url in recipes_url_list:
            response = session.get(url, timeout=timeout)
            soups.append(BeautifulSoup(response.text, "html5lib"))
    return soups

# Issues one HTTP GET per recipe URL, so this is the network-bound cell.
recipes_soup = get_recipes_soup(recipes_url_list)

In [9]:
len(recipes_soup)

1680

### Now, using the list of recipe soups, the features and targets are extracted into a dictionary by the function below. Because of anomalies on the website, some recipe pages may have a structure that differs from the site norm. The function handles these with an exception case:

In [10]:
# Extracting the targets and features from individual recipe's soups:

def recipe_dict(recipes_soup):
    """Extract the targets and nutrition features of every recipe page.

    For each soup object the function reads, in this order:

    * the recipe title (text of ``<title>`` before ``'Recipe |'``, stripped),
      which becomes the dictionary key;
    * ``data-ratings-average``, ``data-ratings-count`` and
      ``data-reviews-count`` of the recipe-reviews component, as floats;
    * the second-to-last space-separated token of the
      ``nutrition-top light-underline`` element (the calories figure), kept
      as the raw text token;
    * the text of every ``nutrient-value`` element, with ASCII letters
      removed, as floats.

    A page that does not follow this structure is skipped as a whole, so the
    result never contains a partially parsed entry.

    Parameters
    ----------
    recipes_soup : iterable
        Parsed recipe pages (``BeautifulSoup`` objects).

    Returns
    -------
    dict
        ``{title: [ratings_average, ratings_count, reviews_count, calories,
        *nutrient_values]}``. Pages sharing a title overwrite each other.
    """
    reviews_class = ('component recipe-reviews container-full-width '
                     'template-two-col with-sidebar-right hidden')
    recipes = {}  # not named ``dict``: that would shadow the builtin
    for item in recipes_soup:
        try:
            title = item.find('title').text.split('Recipe |')[0].strip()
            # All three rating fields are attributes of the same element, so
            # it is looked up once.
            reviews = item.find(class_=reviews_class)
            values_list = [
                float(reviews['data-ratings-average']),
                float(reviews['data-ratings-count']),
                float(reviews['data-reviews-count']),
                item.find(class_='nutrition-top light-underline').text.split(' ')[-2],
            ]
            for value in item.find_all(class_='nutrient-value'):
                values_list.append(float(re.sub('[a-zA-Z]', '', value.text).strip()))
        except (TypeError, AttributeError, KeyError, IndexError, ValueError):
            # Anomalous page: an expected element or attribute is missing
            # (find() returned None, attribute absent), the calories text is
            # too short, or a value is not numeric. Skip it instead of
            # aborting the whole extraction.
            continue
        # Store only after the page parsed completely.
        recipes[title] = values_list
    return recipes

# Entries are keyed by recipe title, so the dict can hold fewer entries than recipes_soup.
values_dict = recipe_dict(recipes_soup)

In [11]:
len(values_dict)

1670

### A subset of the values in the above dictionary have a length other than 23 (the number of features and targets we are selecting). This is most probably due to anomalies in the structure of those recipes' webpages. When building the dataframe from the recipes dict, we remove them:

In [12]:
# Making the final dataframe from the values dictionary:

# Column labels of the final dataframe: the recipe title first, then one
# positional label per entry of a recipe's value list.
columns = ['recipe', 'rating', 'rating_num', 'review_num',
           'calories', 'protein', 'carbs',
           'diet_fiber', 'sugars', 'fat',
           'sat_fat', 'cholesterol', 'vit_A',
           'niacin_eq', 'vit_B6', 'vit_C',
           'folate', 'calcium', 'iron',
           'magnesium', 'potassium', 'sodium',
           'thiamin', 'calories_fat']

def recipes_df_maker(values_dict):
    """Turn the ``{title: values}`` dictionary into the final dataframe.

    Only entries whose value list has exactly one value per non-title column
    (``len(columns) - 1``, i.e. 23) are kept; every other entry is dropped.

    Parameters
    ----------
    values_dict : dict
        Maps a recipe title to its list of target and feature values.

    Returns
    -------
    pandas.DataFrame
        One row per kept recipe, labelled with ``columns``. If no entry has
        the expected length, an empty dataframe with those columns.
    """
    # Derived from the column list so the filter cannot drift out of sync.
    expected_len = len(columns) - 1
    final_dict = {
        title: values
        for title, values in values_dict.items()
        if len(values) == expected_len
    }
    if not final_dict:
        # Nothing to transpose; labelling the frame below would fail on a
        # column-count mismatch.
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(final_dict).T.reset_index()
    # Plain assignment instead of set_axis(..., inplace=True): the inplace
    # keyword was removed from set_axis in pandas 2.0.
    df.columns = columns
    return df
    

In [13]:
# Keeps only the recipes whose value list has the expected 23 entries.
recipes_df = recipes_df_maker(values_dict)

In [14]:
recipes_df.head()

Unnamed: 0,recipe,rating,rating_num,review_num,calories,protein,carbs,diet_fiber,sugars,fat,...,vit_B6,vit_C,folate,calcium,iron,magnesium,potassium,sodium,thiamin,calories_fat
0,Oven-Roasted Turkey Breast,4.710801,287.0,285.0,384.6,60.3,4.3,0.4,0.3,11.8,...,1.1,0.7,20.9,40.1,3.5,61.4,615.2,313.8,0.1,106.6
1,Juicy Thanksgiving Turkey,4.836009,872.0,894.0,555.6,69.3,4.3,1.0,1.9,24.0,...,1.1,6.3,27.2,93.5,5.4,71.1,789.4,680.2,0.2,216.2
2,Spaghetti Aglio e Olio | Allrecipes,4.691974,461.0,459.0,754.6,22.9,87.4,3.9,3.3,34.5,...,0.2,6.5,277.0,259.5,4.4,71.0,321.0,354.8,1.2,310.7
3,Marie's Easy Slow Cooker Pot Roast,4.587838,2368.0,2388.0,540.2,45.7,18.2,2.8,2.4,30.6,...,1.2,18.3,35.1,47.1,6.0,69.2,1172.7,271.9,0.3,275.1
4,Best Tuna Casserole,4.321311,2745.0,2805.0,595.0,32.1,58.1,3.8,4.6,26.1,...,0.4,7.4,164.5,319.2,5.0,75.7,602.1,1061.1,0.7,235.0


### We finally save our dataframe into a CSV file for further analysis:

In [15]:
# NOTE: to_csv also writes the dataframe's integer index as an unnamed first
# column by default; pass index=False if that column is not wanted.
recipes_df.to_csv('recipes_df.csv')