# Food Network

### www.foodnetwork.com - a food recipe website with over 14,000 recipes.

This notebook scrapes the Food Network website in two steps:

- It collects the master directory URLs, i.e. the paginated index pages that list the recipes.
- Using the master directory, it then scrapes the title and webpage URL of every recipe.


In [None]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
# Root of the recipe index. The trailing slash matters: urljoin() appends the
# index name to this path instead of replacing its last segment.
baseURL = (
    'https://www.foodnetwork.com'
    '/recipes/food-network-kitchen/'
)

#### Get the directories of the Food Network website - where all the recipes are stored

In [None]:
# Build the master directory: every paginated index page that actually exists.
# Pages are addressed as <baseURL><index>/p/<n>; each index section is probed
# page by page until the site answers 404.
start_time = time.time()

recipeIndexes = ['123', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'xyz']
foodNetwork_masterRecipe_indexLinks = []

# requests waits forever by default; bound each request so a stalled
# connection raises requests.exceptions.Timeout instead of hanging the crawl.
request_timeout_seconds = 30

# A single session reuses the underlying connection across the page probes.
with requests.Session() as session:
    for index in recipeIndexes:
        next_page = urljoin(baseURL, index)
        # At most 19 pages are probed per index section.
        for sub_page in range(1, 20):
            final_page = urljoin(f'{next_page}/p/', str(sub_page))
            response = session.get(final_page, timeout=request_timeout_seconds)
            # Compare the numeric status code rather than the repr() text of
            # the response object.
            if response.status_code == 200:
                foodNetwork_masterRecipe_indexLinks.append(final_page)
            elif response.status_code == 404:
                # First missing page marks the end of this index section.
                break
            # Any other status: the page is not recorded and probing continues.

end_time = time.time()

total_time = end_time - start_time
print(f'Total Time Elapsed: {total_time} seconds')

In [None]:
# Wrap the collected directory URLs in a single-column table and save it to
# disk without the row index.
foodNetwork_masterDirectory_links_df = pd.DataFrame(
    data=foodNetwork_masterRecipe_indexLinks,
    columns=['Recipe Directories'],
)
foodNetwork_masterDirectory_links_df.to_csv(
    path_or_buf='./foodNetworkMasterDirectoryLinks.csv',
    index=None,
)

In [None]:
# Preview the first five directory URLs as a quick sanity check.
foodNetwork_masterDirectory_links_df.head(n=5)

#### Get all the recipes on the Food Network website and their corresponding webpage URLs

In [None]:
# Visit every directory page and collect a (title, absolute URL) pair for each
# recipe listed in its first two list columns.
start_time = time.time()

recipe_title_link = []

# requests waits forever by default; bound each request so a stalled
# connection raises requests.exceptions.Timeout instead of hanging the scrape.
request_timeout_seconds = 30

for recipeDirectory_link in foodNetwork_masterDirectory_links_df['Recipe Directories']:

    url = recipeDirectory_link

    response = requests.get(url, timeout=request_timeout_seconds)
    if response.status_code != 200:
        # Report the page instead of silently parsing an error document.
        print(f'Skipping {url}: HTTP status {response.status_code}')
        continue

    html_content = response.text
    doc = BeautifulSoup(html_content, 'html.parser')

    # Search the page once and keep at most the first two list columns; a page
    # with fewer than two columns simply yields fewer recipes instead of
    # raising IndexError.
    recipe_columns = doc.find_all("ul", class_="m-PromoList o-Capsule__m-PromoList")[:2]

    for recipe_column in recipe_columns:
        recipes = recipe_column.find_all("li", class_='m-PromoList__a-ListItem')

        for recipe in recipes:
            anchor = recipe.a
            # A list item without a usable link cannot produce a recipe URL.
            if anchor is None or not anchor.get('href'):
                continue

            recipeTitle = recipe.text.strip()
            # urljoin resolves protocol-relative ('//host/path'), relative and
            # already-absolute hrefs against the directory page URL.
            recipe_url = urljoin(url, anchor['href'])
            recipe_title_link.append((recipeTitle, recipe_url))


end_time = time.time()
total_time_inSeconds = end_time - start_time
print(f'Total time taken to get the links of all the recipes on Food Network: {total_time_inSeconds} seconds')

In [None]:
# Tabulate the scraped (title, link) pairs, number the rows from 1 for
# in-notebook display, and save the table to disk without the row index.
recipe_df = pd.DataFrame(
    data=recipe_title_link,
    columns=['Recipe Title', 'Recipe Link'],
)
recipe_df.index = pd.RangeIndex(start=1, stop=len(recipe_df) + 1)
recipe_df.to_csv(
    path_or_buf='./foodNetworkRecipeLinks.csv',
    index=None,
)