In [1]:
#Import the required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping 100 recipes

The section below illustrates how to extract 100 recipes from the food section of the ndtv.com website. We extract 52 recipes from the vegetarian recipes section and about 48 recipes from the healthy recipes section of the ndtv.com website.

In [None]:
# Extract vegetarian recipes: download and parse the listing page.
url = 'https://food.ndtv.com/recipes/vegetarian-recipes'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_doc = response.content
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the source: the listing container and the text of its <h1> heading.
recipe_container = soup.find("div", {"class": "recp-det-cont"})
if recipe_container is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError('No "recp-det-cont" container found at ' + url)
source1 = recipe_container.find('h1').get_text().strip()
print('Recipe Type:',source1)

Recipe Type: VEGETARIAN Recipes


In [None]:
# Extract the individual components (name, url and ingredients) for each recipe.
# Every anchor inside the "recipeListing" div supplies one name and one link,
# so the anchors are collected once and reused for both lists.
recipe_listing = recipe_container.find('div', {"id": "recipeListing"})
listing_anchors = recipe_listing.find_all('a')
# name
veg_recipe_names = [anchor.get_text().strip() for anchor in listing_anchors]
# links
veg_recipe_links = [anchor.get('href') for anchor in listing_anchors]

# ingredients
veg_recipe_ingredients = []
for link in veg_recipe_links:
    # Loop-local names are used on purpose: recipe_container must keep pointing
    # at the listing page so that this cell can be re-run on its own.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    recipe_page = BeautifulSoup(response.content, 'html.parser')
    page_container = recipe_page.find("div", {"class": "recp-det-cont"})
    ingredients_div = None
    if page_container is not None:
        ingredients_div = page_container.find('div', {"class": "ingredients"})
    if ingredients_div is None:
        raise ValueError('No ingredients block found on recipe page: ' + str(link))

    # Remove extra subtitles from the ingredients: bold text inside the
    # ingredients block is treated as a subtitle. The set is rebuilt for every
    # recipe so that one recipe's subtitles cannot filter another recipe's items.
    subtitles = {bold.get_text().strip() for bold in ingredients_div.find_all('b')}

    item_texts = [item.get_text().strip() for item in ingredients_div.find_all('li')]
    veg_recipe_ingredients.append([
        text for text in item_texts
        if text not in subtitles
        and not text.startswith(('For', 'Ingredients'))
    ])

In [None]:
# Extract the second batch of recipes: download and parse the listing page.
# NOTE(review): the URL below targets the "meat-recipes" listing, while the
# variable names and the notebook prose call this batch "healthy" - confirm
# which listing is intended.
url = 'https://food.ndtv.com/recipes/meat-recipes'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_doc = response.content
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the source: the listing container and the text of its <h1> heading.
recipe_container = soup.find("div", {"class": "recp-det-cont"})
if recipe_container is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError('No "recp-det-cont" container found at ' + url)
source2 = recipe_container.find('h1').get_text().strip()
print('Recipe Type:',source2)

In [None]:
# Extract the individual components (name, url and ingredients) for each recipe.
# Every anchor inside the "recipeListing" div supplies one name and one link,
# so the anchors are collected once and reused for both lists.
recipe_listing = recipe_container.find('div', {"id": "recipeListing"})
listing_anchors = recipe_listing.find_all('a')
# name
healthy_recipe_names = [anchor.get_text().strip() for anchor in listing_anchors]

# links
healthy_recipe_links = [anchor.get('href') for anchor in listing_anchors]

# ingredients
healthy_recipe_ingredients = []
for link in healthy_recipe_links:
    # Loop-local names are used on purpose: recipe_container must keep pointing
    # at the listing page so that this cell can be re-run on its own.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    recipe_page = BeautifulSoup(response.content, 'html.parser')
    page_container = recipe_page.find("div", {"class": "recp-det-cont"})
    ingredients_div = None
    if page_container is not None:
        ingredients_div = page_container.find('div', {"class": "ingredients"})
    if ingredients_div is None:
        raise ValueError('No ingredients block found on recipe page: ' + str(link))

    # Remove extra subtitles from the ingredients: bold text inside the
    # ingredients block is treated as a subtitle. The set is rebuilt for every
    # recipe so that one recipe's subtitles cannot filter another recipe's items.
    subtitles = {bold.get_text().strip() for bold in ingredients_div.find_all('b')}

    item_texts = [item.get_text().strip() for item in ingredients_div.find_all('li')]
    healthy_recipe_ingredients.append([
        text for text in item_texts
        if text not in subtitles
        and not text.startswith(('For', 'Ingredients'))
    ])

In [None]:
# Combine the vegetarian recipes with as many recipes from the second batch as
# are needed to reach 100 in total. New lists are built instead of appending to
# the veg_* lists, so re-running this cell always gives the same result.
max_recipes = 100
n_extra = max(0, max_recipes - len(veg_recipe_names))
all_recipe_names = veg_recipe_names + healthy_recipe_names[:n_extra]
all_recipe_links = veg_recipe_links + healthy_recipe_links[:n_extra]
all_recipe_ingredients = veg_recipe_ingredients + healthy_recipe_ingredients[:n_extra]

# Creating a data dictionary for all the veg and healthy recipes
data = {'url':all_recipe_links,'name':all_recipe_names,'ingredient':all_recipe_ingredients}

raw_data = pd.DataFrame(data=data)
# One row per (url, name, ingredient). explode() turns an empty ingredient list
# into a missing value, so those rows are dropped, as stack() did before.
raw_data_final = (
    raw_data
    .explode('ingredient')
    .dropna(subset=['ingredient'])
    .reset_index(drop=True)
)
raw_data_final.head()

In [None]:
# Persist the raw (url, name, ingredient) rows; the row index is not written.
raw_data_final.to_csv(
    path_or_buf='./a2_rawData.csv',
    index=False,
)

### Cleaning the Data

In this section, we illustrate how to clean the ingredients data from the ndtv.com website. We look for additional white space, remove measurement units, etc.

In [None]:
# Import required libraries
import re
import pattern
from pattern.text.en import singularize

# Copy the dataframe into a new dataframe. An explicit .copy() is required:
# plain assignment would only give raw_data_final a second name, so columns
# added during cleaning would also appear on raw_data_final.
clean_data_intermediate = raw_data_final.copy()
clean_data_intermediate.head()

In [None]:
# Cleaning the data
# Every digit and every run of non-letter characters is replaced during cleaning.
# NOTE: from here on the name `pattern` refers to this string and no longer to
# the imported `pattern` package.
pattern = '[0-9]|[^A-Za-z]+' # patterns to be removed
# Extra words to be removed (units, preparation notes, filler words).
# Every entry needs a trailing comma: two adjacent string literals are silently
# joined into one word ('grilled' 'substitute' used to become 'grilledsubstitute').
remove_words = ['gm','gms','gram','grams','kg','tbsp','tsp','ml','cm','cups','pcs','cup','inch','height','drops','litre','piece','chotti','adding','whisk','well','packed',
                'pinch','per','taste','peeled','little','cook','optional','garnishing','finely','chopped','unsweetened','rajasthani','elaichi','compound','bruised',
                'freshly','uncooked','toasted','powdered','required','cube','cut','pieces','sliced','minced','shredded','glasses','pair','squeezed','lavang', 'fairly',
                'shell','melted','few','stranded','grated','strands','liquid','cubes','refined','dessicated','sultanas','cleaned','washed','ground','jaiphal','approx','each',
                'fresh','pinches','frying','clarified','few','purpose','tulsi','crushed','fried','pre','soaked','softened','shahi','zeera','pasanda','thin','slice',
                'christmas','fry','segments','smashed','roasted','large','nos','granulated','ripened','diced','crumbled','five','marinade','pounded','javitri','crush',
                'whisked','cubed','mashed','glass','refined','coarse','grind','enough','regular','extra','fat','skimmed','spicy','desi','ground','and','lukewarm','dalia',
                'small','squares','seedless','balls','srushed','skim','atta','frozen','dried','full','shakes','preferably','it','like','style','deseeded','approximately',
                'slices','cooled','medium','roll','to','before','soaking','as','sendha','namak','sweetening','agent','healthier','you','curry','slit','slightly','grilled',
                'substitute','halved','khaand','low','boiled','a','holy','purchased','homemade','dusting','fine','work','seedless','if','cleaned','kaalimirch','seasoning',
                'defrosted','young','season','deep','quartered','chargrilled','whole','spices','himalayan','best','rolling','cooked','or','coarsely','ready','add',
                'grated','drained','kashmiri','garnish','handful','according','wash','very','half','day','one','bowl','mixed','spicy','dipping','coarsely']
ingredient_new = []
# Lower-case stop words, matched as whole words only. The alternatives share a
# single pair of \b anchors; the previous expression ended in an escaped pipe
# (\|), which matched a literal "|" and therefore never removed "or".
rx = re.compile(r'\b(?:an|the|and|for|a|in|all|as|from|of|to|into|or)\b')

# Build the intermediate list in one pass: blank out digits and non-letter
# characters first, then blank out the stop words matched by rx.
ingredient_new = [
    rx.sub(' ', re.sub(pattern, ' ', raw_text))
    for raw_text in clean_data_intermediate['ingredient'].values
]

In [None]:
# Lower-case every token and drop the tokens listed in remove_words.
# Each ingredient string is processed exactly once; an ingredient with no
# remaining tokens becomes '' instead of repeating the previous row's value.
# A set gives constant-time membership tests for the word lookup.
words_to_drop = set(remove_words)
ingredient_cleaned = [
    ' '.join(
        token.lower()
        for token in ingredient.split()
        if token.lower() not in words_to_drop
    )
    for ingredient in ingredient_new
]

In [None]:
# Attach the cleaned ingredient strings to the frame as the
# 'ingredient_cleaned' column. ingredient_cleaned is a plain list, so it is
# matched to the rows by position and must hold one entry per row.
clean_data_intermediate['ingredient_cleaned'] = ingredient_cleaned

In [None]:
# Cleaning long sentences: map each long cleaned phrase (exact match) to a
# short ingredient name.
phrase_replacements = {
    'red chilli powder cayenne pepper': 'red chilli powder',
    'red rice vermicelli broken sooji semolina vermicelli': 'vermicelli',
    'olive oil not virgin': 'olive oil',
    'mixed nuts almonds cashews walnuts pistachios peanuts': 'mixed nuts',
    'raisins cranberries citrus peel nuts': 'nuts',
    'mix black pepper cinnamon cardamom clove bay leaf': 'spices',
    'red yellow green bell peppers': 'bell pepper',
    'urad dal flour dry roast dal powder': 'urad dal powder',
    'coconut oil vegetable oil': 'coconut oil',
    'herbs basil parsley thyme cilantro': 'herbs',
    'parmesan mozzarella cheese': 'parmesan cheese',
    'mutton clean male goat thigh': 'mutton',
    'mash ki dal urad dal': 'urad dal',
}
# The replacements are applied one after another, in the order written above.
# A single .loc[row_mask, column] assignment is used because the chained form
# df[column].loc[row_mask] = value is not guaranteed to write into the frame
# (and does nothing when pandas Copy-on-Write is enabled).
for phrase, replacement in phrase_replacements.items():
    clean_data_intermediate.loc[
        clean_data_intermediate['ingredient_cleaned'] == phrase,
        'ingredient_cleaned',
    ] = replacement

In [None]:
# Cleaning long sentences: map each long cleaned phrase (exact match) to a
# short ingredient name.
phrase_replacements = {
    'meat chicken mutton turkey lamb': 'meat',
    'spice mix turmeric red chili powder coriander powder cumin powder salt garam masala made paste with water': 'spice',
    'duck vertically': 'duck',
    'processed cheese parmesan cheese': 'parmesan cheese',
    'cloves green cardamoms pepper corns tiny cinnamon together': 'condiment',
    'tamarind sauce with dash chilli sauce vinegar': 'sauce',
    'sized prawns shelled de veined': 'prawn',
    'stalk spring onion both white green portion': 'spring onion',
    'coconut poqder': 'coconut powder',
    'tomatoes skewers oil brush chaat masala': 'tomatoes',
    'saltand pepper': 'salt and pepper',
    'tomato de skinned de seeded': 'tomato',
}
# The replacements are applied one after another, in the order written above.
# A single .loc[row_mask, column] assignment is used because the chained form
# df[column].loc[row_mask] = value is not guaranteed to write into the frame
# (and does nothing when pandas Copy-on-Write is enabled).
for phrase, replacement in phrase_replacements.items():
    clean_data_intermediate.loc[
        clean_data_intermediate['ingredient_cleaned'] == phrase,
        'ingredient_cleaned',
    ] = replacement

In [None]:
# Replacing similar words with the same word: each entry is
# (spellings to match exactly, canonical name to store).
synonym_groups = [
    (['coriander','green coriander','coriander green','coriander leaves','coriander leaf'], 'coriander'),
    (['flour maida'], 'maida'),
    (['green chillies','chili green'], 'green chilli'),
    (['black pepper','pepper','ground black pepper','black pepper powder','peppers'], 'black pepper'),
    (['basil','basil leaves'], 'basil'),
    (['potatoes','potato'], 'potato'),
    (['onions','onion','onions size'], 'onion'),
    (['apples','apple'], 'apple'),
    (['almonds','almond'], 'almond'),
    (['cloves','clove'], 'clove'),
    (['bread crumbs','bread crumb','breadcrumbs'], 'bread crumbs'),
    (['cashews','cashew nuts','cashewnuts','cashew nut','blend with water make paste cashew'], 'cashews'),
    (['chilly flakes','chilli flakes','chili flakes'], 'chilli flakes'),
    (['garlic cloves','garlic clove','cloves garlic','garlic','garlic pods'], 'garlic'),
    (['veg stock','vegetable stock'], 'vegetable stock'),
    (['straberries','strawberry'], 'strawberry'),
    (['straberries pureed','strawberry puree'], 'strawberry puree'),
    (['lemon juiced','lemon juice'], 'lemon juice'),
    (['jaggery gur','jaggery organic','jaggery'], 'jaggery'),
    (['sprig curry leaves','sprigs curry leaves'], 'sprig curry leaves'),
    (['raspeberry','raspberries'], 'raspberry'),
]
# The groups are applied one after another, in the order written above.
# A single .loc[row_mask, column] assignment is used because the chained form
# df[column].loc[row_mask] = value is not guaranteed to write into the frame
# (and does nothing when pandas Copy-on-Write is enabled).
for variants, canonical in synonym_groups:
    clean_data_intermediate.loc[
        clean_data_intermediate['ingredient_cleaned'].isin(variants),
        'ingredient_cleaned',
    ] = canonical

In [None]:
# Replacing similar words with the same word: each entry is
# (spellings to match exactly, canonical name to store).
synonym_groups = [
    (['gulaabjal','rose water gulaabjal'], 'rose water'),
    (['bayleaf','bay leaf','bay leaves'], 'bayleaf'),
    (['soak tablespoon water saffron','saffron diluted water'], 'saffron water'),
    (['powder garam masala','garam masala'], 'garam masala powder'),
    (['black peppercorns','peppercorn','black pepper corn','black peppercorn','black pepper corns'], 'black pepper corn'),
    (['cinnamon sticks','stick cinnamon','cinnamon'], 'cinnamon stick'),
    (['chilli powder','chili powder'], 'chilli powder'),
    (['egg','eggs'], 'egg'),
    (['red chili powder','red chilli powder','red chillli powder'], 'red chilli powder'),
    (['brown onion','brown onions'], 'brown onion'),
    (['bell pepper','bell peppers'], 'bell pepper'),
    (['black cardamom','black cardamoms','cardamom','cardamoms','cardamoms pods'], 'cardamom'),
    (['carrot','carrots'], 'carrot'),
    (['mint leaf','mint leaves'], 'mint leaves'),
    # NOTE(review): this folds 'lemon juice' into 'lemon' - confirm that juice
    # and whole lemons are meant to be counted as the same ingredient.
    (['lemon juice','lemon juice lemons'], 'lemon'),
    (['mustard oil','mustard oil oil'], 'mustard oil'),
    (['malted vinegar','malt vinegar'], 'malt vinegar'),
    (['green cardamoms','green cardamom'], 'green cardamom'),
    (['cornflour','corn flour'], 'corn flour'),
    (['cashew nut paste','cashew paste'], 'cashew paste'),
    (['yogurt beaten','yogurt dahi','yogurt plain','yoghurt','yogurt'], 'yogurt'),
    # NOTE(review): the canonical form here is the plural 'green chillies',
    # which is not itself in the list of spellings - confirm this is intended.
    (['green chili','green chilli','green chily','green chilies'], 'green chillies'),
]
# The groups are applied one after another, in the order written above.
# A single .loc[row_mask, column] assignment is used because the chained form
# df[column].loc[row_mask] = value is not guaranteed to write into the frame
# (and does nothing when pandas Copy-on-Write is enabled).
for variants, canonical in synonym_groups:
    clean_data_intermediate.loc[
        clean_data_intermediate['ingredient_cleaned'].isin(variants),
        'ingredient_cleaned',
    ] = canonical

In [None]:
# Check the shape (rows, columns) of the cleaned frame.
# Only the final expression of a cell is rendered, so the shape is the single
# expression here; an extra head() call before it would never be displayed.
clean_data_intermediate.shape

In [None]:
# Drop the rows whose cleaned text appears in remove_non_ingredients (the list
# ends with the empty string), keep the url / name / cleaned-text columns, and
# expose the cleaned text under the column name 'ingredient'.
remove_non_ingredients = ['oven proof width','food thermometer','plastic bag','paste water','japanese','lasagna sheets','serve','banana leaf covering leg lamb','']
clean_data_final = (
    clean_data_intermediate
    .loc[
        ~clean_data_intermediate['ingredient_cleaned'].isin(remove_non_ingredients),
        ['url', 'name', 'ingredient_cleaned'],
    ]
    .rename(columns={'ingredient_cleaned': 'ingredient'})
)

In [None]:
# Remove exact duplicate rows, then write the cleaned data to CSV without the
# row index.
clean_data_final = clean_data_final.drop_duplicates()
clean_data_final.to_csv(
    path_or_buf='./a2_cleanData.csv',
    index=False,
)

### III. Result Analysis

This section illustrates how to calculate the count and proportion of each individual ingredient.

In [None]:
# Copy the cleaned data so the analysis works on its own DataFrame
# (plain assignment would only create a second name for clean_data_final).
recipe_data_analysis = clean_data_final.copy()

In [None]:
# Number of rows (non-null 'name' values) per ingredient, largest count first.
recipe_data = (
    recipe_data_analysis
    .groupby('ingredient')['name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Collect the result columns: the ingredient, its count, and the count divided
# by the number of distinct recipe names in the cleaned data.
recipe_data_analysis = dict(
    ingredient=recipe_data.index,
    count=recipe_data.values,
    proportion=recipe_data.values / len(set(clean_data_final['name'])),
)

In [None]:
# Keep the first 10 rows (recipe_data is already sorted by count, descending)
# and display them.
recipe_data_analysis = pd.DataFrame(recipe_data_analysis).head(10)
recipe_data_analysis

In [None]:
# Write the top-ingredient table to CSV without the row index.
recipe_data_analysis.to_csv(
    path_or_buf='./a2_results.csv',
    index=False,
)