In [1]:
# Import requests package to request webpage information
import requests
# Import BeautifulSoup to parse HTML
from bs4 import BeautifulSoup
import pandas as pd
from wordcloud import WordCloud, STOPWORDS 
import re
from collections import Counter


Scraping data:
Following code, pulls recipes from a website which lists Indian breakfast recipes(182 recipes). I have just pulled the data, and have not made any modification to the ingredient list

In [2]:
r = requests.get('https://www.padhuskitchen.com/2010/09/breakfast-recipes-indian-breakfast.html')
soup = BeautifulSoup(r.text, 'html.parser')

recipe_links=[]
recipe_name=[]
s_recipe=soup.find("select",{"name" : "Breakfast"})
s_recipe2=s_recipe.find_all("option")

for link in s_recipe2:
    if link.has_attr('value'):
        recipe_links.append(link['value'])
recipe_links1=[x for x in recipe_links if x not in [""]]

raw_ing=pd.DataFrame(columns=["url","name","ingredient"])

def recipe_pull(link,raw_ing):
    #print(link)
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    ing_list = soup.find_all("span", {"itemprop" : "ingredients"})
    
    recipe_name = soup.find('h3',{"class" : "post-title entry-title"})
    
    for i in ing_list:
        lst_dict = []
        lst_dict.append({'url':link, 'name':recipe_name.get_text("",strip=True), 'ingredient': i.get_text("",strip=True)})
        raw_ing=raw_ing.append(lst_dict, ignore_index=True)
    return(raw_ing)
    

for i in range(len(recipe_links1)-1):
    raw_ing=recipe_pull(recipe_links1[i],raw_ing)

raw_ing.to_csv("a2_rawData.csv",encoding='utf-8', index=False)

Cleaning Data:
This website has ingredients with quantity. To ensure only ingredients are captured, I have tried to remove irrelevant words, numbers, blanks & special characters.

In [3]:
clean_ing=pd.DataFrame()
clean_ing=raw_ing.copy()


for i in range(len(raw_ing)):
    
    comment_words = []
    stopwords = list(STOPWORDS) 
    stopwords1=stopwords+["","i","½","¼","(",")","1","2","3","4","5","6","–","-","cup","tbsp","tsp","needed","chopped","finely","used","medium",
             "large","small","size","deseeded"]
    stopwords2 = ["1/2","1/4","3/4","big","pinch","","cup","cups","tbsp","tsp","needed","chopped","finely","used","–","-","½","¼","medium","(",")","large","small","1","2","3","4","5","6","–","-",
      "size","deseeded","inch","(without)","piece","seeds","pinch","to","taste","(optional)","optional","whole","flat","for","dusting","or","chopped",
                 "finely","kitchen","fistful","without","crushed","slightly","required","king","making","leaves","a","as","12","11","10","7","8","9","green","powder","grated","clean"]
    
    val=raw_ing.iloc[i,2]
    tokens = val.lower()
    tokens=tokens.replace("-"," ")
    tokens=tokens.replace("/"," ")
    tokens=tokens.replace("("," ")
    tokens=tokens.replace(")"," ")
    tokens=tokens.strip()
    tokens1 = tokens.split()
    resultwords  = [word for word in tokens1 if word.lower() not in stopwords2]
    tokens2 = ' '.join(resultwords)
    clean_ing["ingredient"][i]=tokens2.strip()
    
clean_ing.to_csv("a2_cleanData.csv",encoding='utf-8', index=False)


Calculating Popular Ingredients:
Code below calculates the Top 10 ingredients, # times it has occured in all the recipes & proportion of recipes which have the ingredient.  

Important thing to note is that, for some recipes, it has sub modules like: Curry powder preparation part, Main item preparation part etc. In those cases, ingredient might be listed twice. (eg) In https://www.padhuskitchen.com/2011/01/ven-pongal-ven-pongal-with-sambar.html , salt is listed twice. So, count of salt will be 2. But for proportion calculation, it would be counted only as 1 recipe in which salt has occured.

In [5]:
ing_words=[]
clean_ing2=pd.DataFrame()
clean_ing2=clean_ing.copy()
for i in range(len(clean_ing2)):
    val=clean_ing2.iloc[i,2].split()
    for j in val:
        j.lower()
        ing_words.append(j.lower())

ing_grp=dict(Counter(ing_words))

ing_dt=pd.DataFrame.from_dict(ing_grp,orient="index")
ing_dt1=ing_dt.reset_index()
ing_dt1.columns=['word','count']
ing_dt1.sort_values(by=['count'],inplace=True,ascending=False)
ing_top10=ing_dt1.head(10)
ingtop10 = ing_top10.reset_index(drop=True)

for i in range(10):
    x=ingtop10.iloc[i,0]
    clean_ing2[x]=0
for i in range(len(clean_ing2)):
    val=clean_ing2.iloc[i,2].split()
    for j in val:
        for k in range(10):
            l=k+3
            if j.strip() == ingtop10.iloc[k,0].strip():
                x=ingtop10.iloc[k,0]
                clean_ing2[x][i]=1
        
for k in range(10):
    ing_c=0
    x=pd.DataFrame(clean_ing2.groupby(['url']).sum()[ingtop10.iloc[k,0]])
    for j in range(len(x)):
        if x.iloc[j,0] > 0:
            ing_c=ing_c+1
    ingtop10.loc[k,2]=ing_c/len(x)
ingtop10.columns=['word','count','proportion']
ingtop10.to_csv("a2_results.csv",encoding='utf-8', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
