# Scraping data to create the dataset

## Imports

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm

## Create Product List

In [68]:
# Read the locally saved "All Products" listing page and parse it once;
# `soup` is the parsed tree the following cells search.
listing_path = '../data/All Products at Whole Foods Market_final.htm'
with open(listing_path, mode='r', encoding='utf-8') as listing_file:
    html_content = listing_file.read()

soup = BeautifulSoup(html_content, features='html.parser')

# Number of product tiles found on the page (shown as the cell output).
product_links = soup.find_all(class_='w-pie--product-tile__link')
len(product_links)

In [148]:
def _tile_text(node):
    """Return the text of a parsed node, or None when the lookup found nothing."""
    return None if node is None else node.get_text()


# One [name, brand, price, href] row per product tile on the listing page.
# Name, brand and price are None when the tile lacks the corresponding element,
# so a single incomplete tile does not abort the whole pass.
products = []
for tile in soup.find_all(class_='w-pie--product-tile__link'):
    product = _tile_text(tile.find(attrs={"data-testid": "product-tile-name"}))
    brand = _tile_text(tile.find(attrs={"data-testid": "product-tile-brand"}))

    # Tiles whose text mentions "Prime" carry the price in a differently
    # styled element, so the class to look for depends on the tile.
    if "Prime" in tile.get_text():
        price_class = "bds--heading-5 mr-1 inline px-1 !text-base"
    else:
        price_class = "text-left bds--heading-5"
    price = _tile_text(tile.find(class_=price_class))

    href = tile["href"]

    products.append([product, brand, price, href])

In [150]:
# Persist the scraped product list so later steps can start from the CSV.
product_columns = ['product_name', 'product_brand', 'price', 'href']
df = pd.DataFrame(data=products, columns=product_columns)
df.to_csv('../data/product_list.csv', index=False)

## Get Nutritional data

In [155]:
# One shared HTTP session; get_nutritional_data() below issues every
# product-page request through it.
session = requests.Session()

In [528]:
def _amount_after_bold_label(row, unit_len):
    """Return the text right after the row's bold label span, minus its last ``unit_len`` characters."""
    label = row.find('span', class_='text-bold')
    # NavigableString subclasses str; anything else (None, a nested tag) has no amount text.
    if label is None or not isinstance(label.next_sibling, str):
        return None
    return label.next_sibling.strip()[:-unit_len]


def _last_token_amount(node, unit_len):
    """Return the last whitespace-separated token of the node's text, minus its last ``unit_len`` characters."""
    if node is None:
        return None
    tokens = node.get_text(strip=True).split()
    if not tokens:
        return None
    return tokens[-1][:-unit_len]


def get_nutritional_data(href, serving_size_pattern):
    """Download a product page and extract the values of its nutrition table.

    Args:
        href: URL of the product page, fetched through the module-level ``session``.
        serving_size_pattern: compiled regex with one capture group; the first
            whitespace-separated token of that group becomes the serving size.

    Returns:
        None when the page has no element with class ``nutrition-row``.
        Otherwise a 16-item list of strings (None for anything not found), in
        this order: serving_size, calories, total_fat_amount, saturated_fat,
        trans_fat, cholesterol, sodium, carbs, fiber, sugars, added_sugar,
        protein, vit_d, potassium, iron, calcium.

    Values are the raw page text with a fixed number of trailing characters
    removed (1, 2 or 3 -- presumably the "g", "mg" and "mcg" unit suffixes).
    """
    response = session.get(href)
    soup = BeautifulSoup(response.text, 'html.parser')
    nutr_rows = soup.find_all(class_='nutrition-row')

    if not nutr_rows:
        return None

    row_texts = [row.get_text().lower() for row in nutr_rows]

    def find_row(keyword):
        # First row whose lower-cased text contains the keyword, else None.
        # Returning the row itself (not its index) keeps a match at position 0
        # from being mistaken for "not found".
        return next(
            (row for row, text in zip(nutr_rows, row_texts) if keyword in text),
            None,
        )

    def bold_amount(keyword, unit_len):
        row = find_row(keyword)
        return None if row is None else _amount_after_bold_label(row, unit_len)

    def indented_amount(keyword):
        row = find_row(keyword)
        if row is None:
            return None
        return _last_token_amount(row.find('span', class_='text-indent'), 1)

    def column_amount(keyword, unit_len):
        row = find_row(keyword)
        if row is None:
            return None
        return _last_token_amount(row.find(class_='nutrition-column'), unit_len)

    # Serving size: first token inside the text captured by the pattern
    # (can be grams or mL).
    serving_size = None
    serving_row = find_row('serving')
    if serving_row is not None:
        matched = serving_size_pattern.search(serving_row.get_text())
        if matched:
            serving_tokens = matched.group(1).split()
            if serving_tokens:
                serving_size = serving_tokens[0]

    # Calories live in their own dedicated cell and carry no unit suffix.
    calories = None
    calories_row = find_row('calories')
    if calories_row is not None:
        calories_cell = calories_row.find(
            class_='nutrition-column calories-row amount text-right align-bottom')
        if calories_cell is not None:
            calories = calories_cell.get_text()

    # The first row mentioning "fiber" is ignored when it is a "Soluble" fiber row.
    fiber = None
    fiber_row = find_row('fiber')
    if fiber_row is not None and "Soluble" not in fiber_row.get_text():
        fiber = _last_token_amount(fiber_row.find('span', class_='text-indent'), 1)

    return [
        serving_size,
        calories,
        bold_amount('total fat', 1),
        indented_amount('sat fat'),
        indented_amount('trans fat'),
        bold_amount('cholesterol', 2),
        bold_amount('sodium', 2),
        bold_amount('carbohydrates', 1),
        fiber,
        indented_amount('sugars'),
        indented_amount('added sugar'),
        bold_amount('protein', 1),
        column_amount('vitamin d', 3),
        column_amount('potassium', 2),
        column_amount('iron', 2),
        column_amount('calcium', 2),
    ]

In [530]:
# Captures the text between the first pair of parentheses in the serving row.
serving_size_pattern = re.compile(r'\((.*?)\)')
prod_with_nutr = []

# Fetch each product page exactly once and keep only the products whose page
# has a nutrition table (get_nutritional_data returns None otherwise).
for product in tqdm(products):
    nutr_values = get_nutritional_data(product[3], serving_size_pattern)
    if nutr_values:
        prod_with_nutr.append(product + nutr_values)

100%|██████████████████████████████████████████████████████████████████████████████| 7031/7031 [16:15<00:00,  7.21it/s]


In [533]:
# Column order: the four product-list fields, followed by the sixteen
# nutrition values in the order get_nutritional_data() returns them.
cols = [
    'product_name',
    'product_brand',
    'price',
    'href',
    'serving_size',
    'calories',
    'total_fat_amount',
    'saturated_fat',
    'trans_fat',
    'cholesterol',
    'sodium',
    'carbs',
    'fiber',
    'sugars',
    'added_sugar',
    'protein',
    'vit_d',
    'potassium',
    'iron',
    'calcium',
]

df = pd.DataFrame(data=prod_with_nutr, columns=cols)
df.to_csv('../data/df_with_nutr.csv', index=False)

In [537]:
# Spot-check a single scraped row (position 1000) to eyeball the parsed values.
df.iloc[1000]

product_name        Organic Hamburger Buns, Classic White (8 Buns)...
product_brand                               365 by Whole Foods Market
price                                                           $4.39
href                https://www.wholefoodsmarket.com/product/365-b...
serving_size                                                       57
calories                                                          150
total_fat_amount                                                    2
saturated_fat                                                       0
trans_fat                                                           0
cholesterol                                                         0
sodium                                                            260
carbs                                                              26
fiber                                                               1
sugars                                                              2
added_sugar         

## Serving Size

In [49]:
# Reload the nutrition dataset written by the previous section.
df = pd.read_csv("../data/df_with_nutr.csv")

In [50]:
# Fresh HTTP session for the second scraping pass.
session = requests.Session()

# Blank out the serving_size column loaded from the CSV; get_serving_size()
# fills it in again for every page where a value is found.
df["serving_size"] = ""

# Registers progress_apply on pandas objects so the pass below shows a progress bar.
tqdm.pandas()

In [51]:
def get_serving_size(row):
    """Fill ``row["serving_size"]`` from the product page at ``row.href``.

    The page is fetched through the module-level ``session``. The value stored
    is the first whitespace-separated token of the first element with class
    ``servings``. The row is returned unchanged when the page has no such
    element or the element has no text.

    Args:
        row: a DataFrame row exposing ``href`` and accepting item assignment
            on ``"serving_size"``.

    Returns:
        The same row, so the function can be used with ``apply(axis=1)``.
    """
    response = session.get(row.href)
    soup = BeautifulSoup(response.text, 'html.parser')

    serving = soup.find(class_='servings')
    if serving is None:
        return row

    # An element without text yields no tokens; leave the row untouched then.
    tokens = serving.get_text().split()
    if tokens:
        row["serving_size"] = tokens[0]
    return row

In [52]:
# Row-wise pass: get_serving_size() makes one HTTP request per row.
df = df.progress_apply(get_serving_size, axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 5493/5493 [11:29<00:00,  7.97it/s]


In [64]:
# Save the dataset with the re-scraped serving_size column.
df.to_csv('../data/df_with_nutr_serv_size.csv', index=False)

## Categories
possible dataset expansion

In [37]:
#soup_cookies.get_text()#.find("$")#find_all(attrs={"data-testid": "pdp-pricing-block"})
# Text of every diet-badge label found in the currently parsed page.
categories = [
    label.get_text()
    for label in soup.find_all(class_='w-diet-badge__label')
]

categories

['Dairy-Free',
 'Paleo-Friendly',
 'Sugar-Conscious',
 'Vegetarian',
 'Whole Foods Diet',
 'Keto-Friendly',
 'Organic',
 'Local']