In [10]:
import os
import re
import requests
import validators
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import Counter
from nltk.stem.wordnet import WordNetLemmatizer

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [11]:
allrecipe_base_url = 'https://www.allrecipes.com'

def scrape_page(URL):
    '''
    Function to extract all the information on the webpage using BeautifulSoup.
    '''
    page = requests.get(URL, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    
    return soup

def lemmatize(string):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    words = string.split(' ')
    for word in words :
        lem_word = lemmatizer.lemmatize(word)
        lemmatized.append(lem_word)
        
    return ' '.join(lemmatized)

def get_allergen_diet(fourth_soup):
    
    vegan = False
    vegetarian = False
    egg_free = True
    dairy_free = True
    nut_free = True
    shellfish_free = True
    
    dairy = ['butter', 'cheese', 'milk', 'cream']
    nuts = ['almond', 'brazil nut', 'walnut', 'hazelnut', 'cashew', 'pine nut', 'macadamia nut', 'pistachio']
    shellfish = ['shrimp', 'crab', 'lobster', 'clam', 'oyster', 'mussel', 'prawn']
    
    recipe_ingredients = [x.find('input').get('data-ingredient') for x in fourth_soup.find_all('li', class_='ingredients-item')]
                
    for ing in recipe_ingredients:
        ing = lemmatize(ing)

        if 'egg' in ing:
            egg_free = False
        if any(x in ing for x in dairy):
            dairy_free = False
        if any(x in ing for x in nuts):
            nut_free = False
        if any(x in ing for x in shellfish):
            shellfish_free = False

    if 'vegan' in recipe_title:
        vegan = True
    if 'vegetarian' in recipe_title:
        vegetarian = True
    
    return vegan, vegetarian, egg_free, dairy_free, nut_free, shellfish_free

def get_rating(fourth_soup):
    rating_review = fourth_soup.find('div', {'aria-label': 'Ratings and Reviews'})
    recipe_rating = rating_review.get('data-ratings-average')
    
    if recipe_rating == '':
        recipe_rating = 'None'
        recipe_rating_count = '0'
        recipe_review_count = '0'
    else:
        recipe_rating_count = rating_review.get('data-ratings-count')
        recipe_review_count = rating_review.get('data-reviews-count')
        if recipe_rating_count == '':
            recipe_rating_count = '0'
            
        if recipe_review_count == '':
            recipe_rating_count = '0'
    
    return recipe_rating, recipe_rating_count, recipe_review_count

In [None]:
soup = scrape_page('https://dish.allrecipes.com/faq-sitemap/')
main_ingredients_links = [x.get('href') for x in soup.find_all('p')[1].find_all('a')]
main_ingredients = [x.text.replace('Recipes', '').lstrip().rstrip() for x in soup.find_all('p')[1].find_all('a')]

if os.path.exists('allrecipe-data.csv'):
    scraped_df = pd.read_csv('data.csv', index_col=[0])
    
    main_cat_list = scraped_df['Main Category'].tolist()
    sub_cat_list = scraped_df['Subcategory'].tolist()
    title_list = scraped_df['Title'].tolist()
    rating_list = scraped_df['Rating'].tolist()
    rating_count_list = scraped_df['Rating Count'].tolist()
    review_count_list = scraped_df['Review Count'].tolist()
    image_url_list = scraped_df['Image URL'].tolist()
    recipe_url_list = scraped_df['Recipe URL'].tolist()
    nutrition_list = scraped_df['Nutrition'].tolist()
    vegan_list = scraped_df['Vegan'].tolist()
    vegetarian_list = scraped_df['Vegetarian'].tolist()
    egg_free_list = scraped_df['Egg Free'].tolist()
    dairy_free_list = scraped_df['Dairy Free'].tolist()
    nut_free_list = scraped_df['Nut Free'].tolist()
    shellfish_free_list = scraped_df['Shellfish Free'].tolist()
else:
    main_cat_list = []
    sub_cat_list = []
    title_list = []
    rating_list = []
    rating_count_list = []
    review_count_list = []
    image_url_list = []
    recipe_url_list = []
    nutrition_list = []
    vegan_list = []
    vegetarian_list = []
    egg_free_list = []
    dairy_free_list = []
    nut_free_list = []
    shellfish_free_list = []

for link, ingredient in zip(main_ingredients_links, main_ingredients):
    print('Scraping {} Recipes ...'.format(ingredient))
    
    second_soup = scrape_page(link)
    sub_cat_results = second_soup.find_all('li', class_='carouselNav__listItem recipeCarousel__listItem')
    
    for sub_cat_result in sub_cat_results:
        sub_cat_title = sub_cat_result.find('a').get('data-tracking-content-headline').lower()
        if os.path.exists('data.csv'):
            if sub_cat_title in scraped_df['Subcategory'].unique():
                continue
            
        sub_cat_link = sub_cat_result.find('a').get('href')
        base_link = sub_cat_link
        
        recipe_count = 0
        page_num = 1
        print('Scraping {} Recipes ...'.format(sub_cat_title))
        while recipe_count < 150:
            third_soup = scrape_page(sub_cat_link)
            if page_num == 1:
                results = third_soup.find_all('div', class_='component card card__category')
            else:
                results = third_soup.find_all('div', class_='component tout')
            
            if (page_num == 1 and len(results) < 36) or (page_num > 2 and len(results) < 24 and recipe_count + len(results) < 150):
                break
    
            for result in tqdm(results):
                # Extract recipe title
                recipe_title = result.a['title'].lower()
                
                # Extract recipe image URL
                recipe_image = result.find('noscript').find('img').get('src')
                
                # Extract recipe URL
                if page_num == 1:
                    recipe_url = result.a['href']
                else:
                    recipe_url = allrecipe_base_url + result.a['href']
                    
                # Check if URL is valid
                if not validators.url(recipe_url):
                    continue
                
                # Scrape recipe page
                fourth_soup = scrape_page(recipe_url)
                
                # Extract recipe rating and number of ratings
                # All recipes should have a rating (unrated is a rating)
                # If no rating is found on page, this means that the page does not contain a specific recipe
                try:
                    recipe_rating, recipe_rating_count, recipe_review_count = get_rating(fourth_soup)
                except AttributeError:
                    continue
                
                # Extract nutritional info
                try:
                    recipe_nutrition = fourth_soup.find('div', class_='partial recipe-nutrition-section').find('div', class_='section-body').text.replace('Full Nutrition', '').lstrip().rstrip()
                except AttributeError:
                    recipe_nutrition = 'None'
                
                # Extract possible allergens/dietary restrictions
                vegan, vegetarian, egg_free, dairy_free, nut_free, shellfish_free = get_allergen_diet(fourth_soup)

                main_cat_list.append(ingredient.lower())
                sub_cat_list.append(sub_cat_title)
                title_list.append(recipe_title)
                rating_list.append(recipe_rating)
                rating_count_list.append(recipe_rating_count)
                review_count_list.append(recipe_review_count)
                image_url_list.append(recipe_image)
                recipe_url_list.append(recipe_url)
                nutrition_list.append(recipe_nutrition)
                egg_free_list.append(egg_free)
                dairy_free_list.append(dairy_free)
                nut_free_list.append(nut_free)
                shellfish_free_list.append(shellfish_free)
                vegan_list.append(vegan)
                vegetarian_list.append(vegetarian)
                recipe_count += 1

            page_num += 1
            sub_cat_link = base_link + '?page={}'.format(page_num)
            
        df = pd.DataFrame(list(zip(main_cat_list, sub_cat_list, recipe_url_list, title_list, image_url_list, rating_list, rating_count_list, review_count_list, nutrition_list, vegan_list, vegetarian_list, egg_free_list, dairy_free_list, nut_free_list, shellfish_free_list)), 
         columns=['Main Category', 'Subcategory', 'Recipe URL', 'Title', 'Image URL', 'Rating', 'Rating Count', 'Review Count', 'Nutrition', 'Vegan', 'Vegetarian', 'Egg Free', 'Dairy Free', 'Nut Free', 'Shellfish Free'])
        
        df.to_csv('allrecipe-data.csv')

# Main cats and sub cats

In [53]:
import pandas as pd
from PIL import Image
import io
import requests
import json
import numpy as np
import os

In [54]:
scraped_csv = pd.read_csv("allrecipe-data.csv")

In [55]:
scraped_csv.drop("Unnamed: 0", axis = 1, inplace = True)

In [56]:
scraped_csv = scraped_csv.sample(frac=0.2, random_state=42)

In [57]:
scraped_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2458 entries, 4464 to 10612
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Main Category   2458 non-null   object
 1   Subcategory     2458 non-null   object
 2   Recipe URL      2458 non-null   object
 3   Title           2458 non-null   object
 4   Image URL       2458 non-null   object
 5   Rating          2458 non-null   object
 6   Rating Count    2458 non-null   int64 
 7   Review Count    2458 non-null   int64 
 8   Nutrition       2458 non-null   object
 9   Vegan           2458 non-null   bool  
 10  Vegetarian      2458 non-null   bool  
 11  Egg Free        2458 non-null   bool  
 12  Dairy Free      2458 non-null   bool  
 13  Nut Free        2458 non-null   bool  
 14  Shellfish Free  2458 non-null   bool  
dtypes: bool(6), int64(2), object(7)
memory usage: 206.4+ KB


In [58]:
# import json
import pickle
maincats = scraped_csv["Main Category"].unique()
maincat2id = {}
id2subcat2id = {}
for i, k in enumerate(maincats):
    maincat2id[k]=i
    print(k, len(subcats))
    subcats = scraped_csv[scraped_csv["Main Category"] ==k]["Subcategory"].unique()
    for k,v in scraped_csv[scraped_csv["Main Category"] ==k]["Subcategory"].value_counts().items():
        if v<10:
            print(f"\t{k} has only {v} images")
    id2subcat2id[i] = {k2: j for j, k2 in enumerate(subcats)}

print(maincat2id, id2subcat2id)
with open("maincat2id.pkl", "wb") as f:
    pickle.dump(maincat2id, f)
with open("id2subcat2id.pkl", "wb") as f:
    pickle.dump(id2subcat2id, f)

chicken 15
	chicken sausage has only 7 images
beef 23
	prime rib has only 4 images
turkey 20
	turkey salad has only 9 images
	turkey brine has only 6 images
	turkey meatloaf has only 6 images
	turkey appetizers has only 3 images
vegetable 12
seafood and fish 19
pork 8
	pork sandwiches has only 8 images
{'chicken': 0, 'beef': 1, 'turkey': 2, 'vegetable': 3, 'seafood and fish': 4, 'pork': 5} {0: {'chicken appetizers': 0, 'baked and roasted chicken': 1, 'chicken main dishes': 2, 'healthy chicken main dishes': 3, 'chicken wings': 4, 'chicken soup': 5, 'chicken stew': 6, 'chicken salad': 7, 'chicken stir-fry': 8, 'chicken pie': 9, 'chicken breasts': 10, 'slow cooker chicken main dishes': 11, 'gourmet chicken main dishes': 12, 'whole chicken': 13, 'chicken thighs': 14, 'bbq & grilled chicken': 15, 'chicken tenders': 16, 'fried chicken': 17, 'chicken sausage': 18, 'chicken legs': 19, 'chicken sandwiches': 20, 'chicken chili': 21, 'ground chicken': 22}, 1: {'beef soup': 0, 'beef chili': 1, 'be