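* Scrape the McDonald's Singapore nutrition calculator and record the nutritional values of each menu item
* Download each item's image, then prefix the image filenames with a sequential item number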

In [208]:
import urllib.request
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import sys

In [52]:
# Base URL of the nutrition calculator; the item number is appended to it for each request.
link = 'https://www.mcdonalds.com.sg/nutrition-calculator/#'
# Folder the item images are downloaded into (absolute, machine-specific path).
imagefolder = r'/Users/x/Desktop/notebooks/mcdonalds'
# Highest item number to request; the scraper visits items 1..total inclusive.
total = 124
# Path to the chromedriver executable handed to selenium's Chrome driver.
chromedriver = r'/Users/x/x/Scripts/MyPythonScripts/chromedriver_mac235'

In [210]:
# Visit every item page (1..total), save the item's image into `imagefolder`
# and collect its nutritional facts. Rows are gathered in a plain list and the
# DataFrame is built once at the end: growing a frame row by row is quadratic,
# and DataFrame.append no longer exists in pandas 2.x.
records = []

for i in range(1, total + 1):
    # Reset per item so the cleanup in `finally` never touches a missing or
    # stale driver when Chrome itself fails to start.
    driver = None
    try:
        url = link + str(i)
        driver = webdriver.Chrome(chromedriver)
        driver.get(url)

        # refresh to remove popup window ---------
        time.sleep(1)
        driver.refresh()
        time.sleep(1)
        html = driver.page_source

        # cook in soup ---------
        soup = BeautifulSoup(html, 'html.parser')

        # download image ---------
        # The image's alt text is used both as the item name and as the file name.
        img_tag = soup.select(".foodfact-conf-item-thumb > img")[0]
        name = img_tag.get('alt').strip()
        img = img_tag.get('src')
        urllib.request.urlretrieve(img, "{}/{}.png".format(imagefolder, name))

        # record name, nutritional facts ---------
        # The first eight '.fact-value' elements are read in this fixed order;
        # a page with fewer than eight raises and is reported as "nothing found!".
        nutrition = soup.select('.fact-value')
        (energy, protein, totalfat, saturatedfat,
         cholesterol, carbo, fibers, sodium) = [tag.getText() for tag in nutrition[:8]]

        # 'protein' used to be scraped and printed but never stored; it is kept
        # here so the value is not lost (later cells select the columns they need).
        records.append({'item': name, 'energy': energy, 'protein': protein,
                        'total_fat': totalfat, 'saturated_fat': saturatedfat,
                        'cholesterol': cholesterol, 'carbohydrates': carbo,
                        'dietary_fibers': fibers, 'sodium': sodium})

        print(i, name, energy, protein, totalfat, saturatedfat, cholesterol, carbo, fibers, sodium)
    except Exception:
        # Best effort: some item numbers have no page. `Exception` (rather than
        # a bare except) lets Ctrl-C / kernel interrupt stop the long loop.
        print(i, 'nothing found!', 'line:', sys.exc_info()[2].tb_lineno)
    finally:
        # quit() ends the whole browser session, including the chromedriver
        # process, so one is not left running per item.
        if driver is not None:
            driver.quit()

# Explicit columns keep the frame well-formed even when no item was scraped.
df = pd.DataFrame(records, columns=['item', 'energy', 'protein', 'total_fat', 'saturated_fat',
                                    'cholesterol', 'carbohydrates', 'dietary_fibers', 'sodium'])

1 Big Breakfast® 525kcal 27g 25g 7g 445mg 49g 5g 970mg
2 Breakfast Deluxe 826kcal 31g 32g 10.6g 486mg 104g 6g 1120mg
3 Wholegrain Chicken Muffin 401kcal 17g 20g 5.7g 151mg 38g 3g 860mg
4 Wholegrain Egg McMuffin® 288kcal 18g 11g 5g 335mg 29g 3g 600mg
5 Hotcakes 557kcal 9g 18g 5.8g 20mg 91g 3g 680mg
6 Hotcakes with Sausage 640kcal 16g 23g 7.4g 51mg 92g 4g 970mg
7 nothing found! line: 19
8 Wholegrain Sausage McMuffin™ 277kcal 16g 11g 5g 141mg 29g 3g 670mg
9 Wholegrain Sausage McMuffin™ with Egg 356kcal 23g 16g 6.6g 362mg 30g 3g 750mg
10 nothing found! line: 19
11 nothing found! line: 19
12 Filet-O-Fish® 332kcal 15g 13g 3.5g 90mg 38g 2g 570mg
13 Double Filet-O-Fish® 474kcal 24g 21g 5.6g 170mg 48g 3g 790mg
14 Hashbrown 149kcal 1g 8g 1.3g 0mg 17g 1g 350mg
15 Apple Slices 30kcal 0g 0g 0g 1mg 7g 1g 20mg
16 Corn Cup - 3oz 66kcal 2g 1g 0.2g 0mg 12g 2g 70mg
17 Corn Cup - 4.5oz 100kcal 4g 1g 0.3g 0mg 18g 3g 100mg
18 100% Pure Orange Juice - Small 145kcal 2g 0g 0g 0mg 34g 0g 0mg
19 100% Pure Orange

In [211]:
# Put the scraped columns in a fixed order, then continue on an independent
# copy (df2) so the raw scraped frame stays untouched by later processing.
ordered_columns = [
    'item',
    'energy',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'carbohydrates',
    'dietary_fibers',
    'sodium',
]
df = df[ordered_columns]
df2 = df.copy()

### Process Data

In [212]:
# clear whitespace & change values to float
df2['cholesterol_mg'] = df2['cholesterol'].apply(lambda x: x.replace('mg','').strip())
df2['sodium_mg'] = df2['sodium'].apply(lambda x: x.replace('mg','').strip())
df2['energy_kcal'] = df2['energy'].apply(lambda x: x.replace('kcal','').strip())
df2['total_fat_g'] = df2['total_fat'].apply(lambda x: x.replace('g','').strip())
df2['saturated_fat_g'] = df2['saturated_fat'].apply(lambda x: x.replace('g','').strip())
df2['carbohydrates_g'] = df2['carbohydrates'].apply(lambda x: x.replace('g','').strip())
df2['dietary_fibers_g'] = df2['dietary_fibers'].apply(lambda x: x.replace('g','').strip())

In [213]:
# sort by item so that can sort/match custom logos properly in Tableau
df2=df2.sort_values('item')
df2=df2.reset_index(drop=True).reset_index()
df2['index'] = df2['index'] + 1
df2.head(2)

Unnamed: 0,index,item,energy,total_fat,saturated_fat,cholesterol,carbohydrates,dietary_fibers,sodium,cholesterol_mg,sodium_mg,energy_kcal,total_fat_g,saturated_fat_g,carbohydrates_g,dietary_fibers_g
0,1,100% Pure Orange Juice - Medium,184kcal,0g,0g,0mg,43g,0g,0mg,0,0,184,0,0,43,0
1,2,100% Pure Orange Juice - Small,145kcal,0g,0g,0mg,34g,0g,0mg,0,0,145,0,0,34,0


In [214]:
# set all columns to float type
df3 = df2[['index', 'item', 'energy_kcal', 'total_fat_g', 'saturated_fat_g', \
           'cholesterol_mg', 'carbohydrates_g', 'dietary_fibers_g', 'sodium_mg']]

for col in df3.columns[2:]:
    df3[col] = df3[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [215]:
# Display the dtype of each df3 column to check the result of the float conversion.
df3.dtypes

index                 int64
item                 object
energy_kcal         float64
total_fat_g         float64
saturated_fat_g     float64
cholesterol_mg      float64
carbohydrates_g     float64
dietary_fibers_g    float64
sodium_mg           float64
dtype: object

In [216]:
# Write the cleaned table to 'mac.xlsx' (relative to the working directory),
# leaving out the DataFrame's own row labels.
df3.to_excel('mac.xlsx',index=False)

### Number Each Item Image Sequentially

In [243]:
# Prefix every item image in the Tableau shapes folder with the item's
# zero-padded row number from df3 (e.g. "007_<item>.png").
tableau_folder = '/Users/x/Documents/My Tableau Repository/Shapes/Mac'

# Build the item -> number lookup once instead of filtering df3 for every file.
# setdefault keeps the first number seen for a duplicated item name.
index_by_item = {}
for item_name, item_index in zip(df3['item'], df3['index']):
    index_by_item.setdefault(item_name, item_index)

renamed = []
skipped = []
for filename in os.listdir(tableau_folder):
    if not filename.endswith('.png'):
        continue

    # Strip only the trailing extension; a '.png' inside the name is kept.
    item = filename[:-len('.png')]
    if item not in index_by_item:
        # No such item in df3 (this includes files that already carry a prefix).
        skipped.append(filename)
        continue

    prefix = str(index_by_item[item]).zfill(3)
    source_path = os.path.join(tableau_folder, filename)
    target_path = os.path.join(tableau_folder, '{}_{}'.format(prefix, filename))
    try:
        os.rename(source_path, target_path)
    except OSError as error:
        # Still best effort, but the failure is reported instead of hidden.
        print('could not rename', filename, '-', error)
        skipped.append(filename)
    else:
        renamed.append(filename)

if skipped:
    print('{} images renamed, {} skipped: {}'.format(len(renamed), len(skipped), sorted(skipped)))
else:
    print('all images renamed')

all images renamed
