In [525]:
import re
import itertools
import pickle
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

In [None]:
# TODO: scrape the dietary restriction lists
# https://www.traderjoes.com/dietary-lists/vegan
# https://www.traderjoes.com/dietary-lists/kosher
# https://www.traderjoes.com/dietary-lists/gluten-free

In [61]:
# As of 1/10/2020 Trader Joe's has articles numbered ~4600 to ~5270.
# NOTE: range() excludes its stop value, so article 5270 itself is not included.
flyer_numbers = range(4600, 5270)

### Helpers

#### Functions for pulling and pre-processing data

In [758]:
def get_raw_name(soup):
    """ Looks up the product-name heading (an h1 with class "lead") in the page.

    Returns whatever ``soup.find`` yields for that query, unprocessed.
    """
    heading_attrs = {'class': 'lead'}
    return soup.find("h1", heading_attrs)

In [759]:
def get_raw_price(soup):
    """ Looks up the <strong> element whose text looks like a price.

    A price is either dollars-and-cents ("$3.99") or a whole number followed
    by the cent sign.  Returns whatever ``soup.find`` yields for that query,
    unprocessed.
    """
    # Raw string: "\$" and "\d" are invalid escape sequences in a plain literal.
    # The cent sign is appended as a unicode escape so the pattern is a text
    # (not byte) pattern on both Python 2 and Python 3.
    price_pattern = re.compile(r"(\$\d+\.\d+|\d+" + u"\u00a2)")
    # `string=` is the current spelling of the old `text=` argument and is
    # what get_timeline already uses.
    return soup.find("strong", string=price_pattern)

In [760]:
def get_timeline(soup):
    """ Finds the "pad-timeline" div(s) wrapping the INGREDIENTS text.

    Returns the result of ``find_parents`` on the first string matching
    INGREDIENTS, or None when the page has no such string.
    """
    marker = soup.find(string=re.compile('INGREDIENTS.*'))
    if not marker:
        return None
    return marker.find_parents('div', {'class': 'pad-timeline'})

In [1060]:
def get_ingredients_and_nutrition(timeline):
    """ Extracts the ingredients and nutrition strings from a timeline.

    Args:
        timeline: the value returned by get_timeline -- a sequence of
            "pad-timeline" elements, or None when the page has none.

    Returns:
        An ``(ingredients, nutrition)`` tuple.  Each item is the cleaned string
        that directly follows the "INGREDIENTS" / "NUTRITION" label, or None
        when the label is missing or has nothing after it.
    """
    # get_timeline returns None when the page has no INGREDIENTS text, and
    # find_parents can come back empty; both mean "nothing to extract".
    if not timeline:
        return None, None

    raw_strings = [get_base_content(c) for c in timeline[0].contents]
    clean_strings = get_clean_content(raw_strings)
    ingredients = _string_after_label(clean_strings, "INGREDIENTS")
    nutrition = _string_after_label(clean_strings, "NUTRITION")
    return ingredients, nutrition


def _string_after_label(strings, label):
    """ Returns the entry following the first one that contains `label`, else None. """
    label_ix = index_containing_substring(strings, label)
    value_ix = label_ix + 1
    # -1 means "label not found"; without this check -1 + 1 would silently
    # select the first string, and a trailing label would raise IndexError.
    if label_ix < 0 or value_ix >= len(strings):
        return None
    return strings[value_ix]

In [1061]:
def index_containing_substring(list_, substr):
    """ Returns the position of the first item in `list_` containing `substr`, or -1 if none does. """
    hits = (position for position, item in enumerate(list_) if substr in item)
    return next(hits, -1)

In [1062]:
def get_base_content(contents):
    """ Returns the text of a node.

    Uses the node's own `.string` when it is truthy; otherwise joins the
    non-None `.string` of each direct child with single spaces.
    """
    own_text = contents.string
    if own_text:
        return own_text
    child_texts = (child.string for child in contents.contents if child.string is not None)
    return " ".join(child_texts)

In [1063]:
def get_clean_content(raw_strings):
    """ Normalises raw node strings into a flat list of non-empty, stripped strings.

    Strings containing newlines are split into one entry per line; bare
    newline, single-space and empty strings are dropped.
    """
    pieces = []
    for raw in raw_strings:
        if '\n' in raw:
            # A lone newline carries no content; anything else is split by line.
            if raw != '\n':
                pieces.extend(part.strip() for part in raw.split('\n'))
        elif raw not in ('\n', ' ', u''):
            pieces.append(raw.strip())
    return [piece for piece in pieces if piece not in ('', '\n')]

In [1071]:
def make_soup(url, timeout=30):
    """ Takes a url and returns a soup object for that url

    Args:
        url: address of the page to download.
        timeout: seconds passed to ``requests.get`` as its timeout.  Without
            one, a single stalled connection would block the scrape forever.

    Returns:
        The response body parsed with BeautifulSoup's 'html.parser'.  The HTTP
        status is not checked, so error pages are parsed like any other page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    html = requests.get(url, timeout=timeout).text
    return BeautifulSoup(html, 'html.parser')

In [1072]:
def make_data(number, soup=None):
    """ Builds the slightly processed record for one Flyer article.

    `number` is the article id used in the page url; `soup` is an optional
    pre-fetched page (it is downloaded with make_soup when not supplied).
    'name' and 'price' hold the raw values from get_raw_name / get_raw_price;
    'ingredients' and 'nutrition' come from the page's timeline.
    """
    url = "https://www.traderjoes.com/fearless-flyer/article/{}".format(number)
    soup = soup or make_soup(url)

    record = {
        'id': number,
        'url': url,
        'name': get_raw_name(soup),
        'price': get_raw_price(soup),
    }
    record['ingredients'], record['nutrition'] = get_ingredients_and_nutrition(get_timeline(soup))
    return record

### Functions for extracting final values from the pre-processed data

In [1069]:
def get_servings(servings_string):
    """ Takes a servings description and returns the first number in it

    Args:
        servings_string: text such as "about 2.5"; may be empty or None.

    Returns:
        The first integer or decimal number found, as a string, or None when
        the input is empty, contains no number, or is not text.
    """
    if not servings_string:
        return None
    try:
        numbers = re.findall(r'[0-9]*\.?[0-9]+', servings_string)
    except TypeError:
        # Non-text input: show the offending value and carry on without one.
        print(servings_string)
        return None
    return numbers[0] if numbers else None

In [1067]:
def get_price(price):
    """ Takes a raw price and returns the price per package in dollars

    Args:
        price: a price string, or the raw element stored under 'price' by
            make_data (anything exposing ``get_text()``), or None.

    Returns:
        The first number in the text, as a string.  A number directly followed
        by the cent sign is converted to dollars ("99" cents -> "0.99").
        None when there is no price, no number, or the input is not text.
    """
    if price is None:
        return None
    # make_data stores the element found by get_raw_price, not its text.
    text = price.get_text() if hasattr(price, 'get_text') else price
    try:
        # The cent sign is a unicode escape so the pattern is text on Py2 and Py3.
        match = re.search(r'([0-9]*\.?[0-9]+)\s*' + u'(\u00a2)?', text)
    except TypeError:
        # Not text (e.g. a float NaN placeholder): there is no price to read.
        return None
    if match is None:
        return None
    amount, cent_sign = match.groups()
    if cent_sign:
        # get_raw_price also accepts prices quoted in cents; the rest of the
        # pipeline treats the value as dollars.
        return '{:.2f}'.format(float(amount) / 100)
    return amount

In [1070]:
def get_name(name):
    """ Takes a soup Tag and returns a cleaned string of the product name

    Args:
        name: the heading element, or None / a falsy value when absent.

    Returns:
        The element's `.string` (or its first child's `.string`) with newlines
        and carriage returns removed and surrounding whitespace stripped;
        "" when no text is available.
    """
    if not name:
        return ""

    if name.string:
        text = name.string
    elif name.contents:
        # The first child's .string can be None too; fall back to "" so the
        # clean-up below does not fail on None.
        text = name.contents[0].string or ""
    else:
        text = ""
    return text.replace('\n', '').replace('\r', '').strip()

In [1059]:
def get_nf(nf, key):
    """
    Looks up the quantity for one nutrition fact.

    `nf` is a list of nutrition-fact strings and `key` the label to look for.
    Only the first entry containing `key` is examined.  Returns the numeric
    part of the matched quantity as a string, or None when `nf` is empty, no
    entry contains `key`, or the entry does not match the expected
    "<key> <amount>" / "Includes <amount>g <key>" shapes.

    TODO: include a case in the regex for values including "X less than 1g"
    see: https://www.traderjoes.com/fearless-flyer/article/4688
    """
    if not nf:
        return None
    candidates = [entry for entry in nf if key in entry]
    if not candidates:
        return None

    phrase_pattern = re.compile(
        r'({} ([0-9]*[.,]?[0-9]+g|[0-9]*[.,]?[0-9]+mg|[0-9]*[.,]?[0-9]+\w|[0-9]*[.,]?[0-9]+mcg\d+% DV)|Includes [0-9]*[.,]?[0-9]+g {})'.format(key, key)
    )
    hits = phrase_pattern.findall(candidates[0])
    # Each hit is a (whole phrase, amount) tuple; the whole phrase is used so
    # the "Includes ..." form is handled the same way.
    phrase = hits[0][0] if hits else None
    if not phrase:
        return phrase
    return re.compile('[0-9]*[.,]?[0-9]+').findall(phrase)[0]

In [983]:
# Fallback nutrition record returned when a page has no usable nutrition text.
# Its keys mirror the record built by process_nutrition (including 'sugars',
# 'added_sugars' and 'vit_d') so every row exposes the same columns.
base_nutrition = {
    'servings_per_container': None,
    'serving_size': None,
    'calories': None,
    'total_fat': None,
    'saturated_fat': None,
    'trans_fat': None,
    'cholesterol': None,
    'sodium': None,
    'total_carbs': None,
    'fiber': None,
    'sugars': None,
    'total_sugars': None,
    'added_sugars': None,
    'protein': None,
    'vit_a': None,
    'vit_c': None,
    'vit_d': None,
    'iron': None,
    'calcium': None,
    'potassium': None
}

In [1014]:
def process_nutrition(nutrition_string):
    """ Takes string of nutrition information and returns a dict

    The string is expected to have up to three '|'-separated parts:
    servings per container, serving size, and a comma-separated list of
    nutrition facts.

    Args:
        nutrition_string: the raw nutrition text, or None / "" when missing.

    Returns:
        A dict with 'servings_per_container', 'serving_size' and one entry per
        nutrient (values are strings or None).  A fresh copy of base_nutrition
        is returned when the input is empty or cannot be processed.
    """
    if not nutrition_string:
        return dict(base_nutrition)

    # (output key, label as printed in the nutrition facts)
    nutrient_labels = (
        ('calories', 'Calories'),
        ('total_fat', 'Total Fat'),
        ('saturated_fat', 'Saturated Fat'),
        ('trans_fat', 'Trans Fat'),
        ('cholesterol', 'Cholesterol'),
        ('sodium', 'Sodium'),
        ('total_carbs', 'Total Carbohydrate'),
        ('fiber', 'Dietary Fiber'),
        ('sugars', 'Sugars'),
        ('total_sugars', 'Total Sugars'),
        ('added_sugars', 'Added Sugars'),
        ('protein', 'Protein'),
        ('vit_a', 'Vitamin A'),
        ('vit_c', 'Vitamin C'),
        ('vit_d', 'Vitamin D'),
        ('iron', 'Iron'),
        ('calcium', 'Calcium'),
        ('potassium', 'Potassium'),
    )

    try:
        parts = nutrition_string.split('|')
        head = parts[0]
        servings_container = head.split(':')[1].strip() if ":" in head else head
        serving_size = parts[1] if len(parts) > 1 else 'Varies by region'
        # The facts are the THIRD part, so at least three parts are required;
        # with exactly two, the servings fields are kept and every nutrient
        # is None.
        # NOTE(review): splitting on ',' also splits amounts written with a
        # thousands separator such as "1,200mg" -- confirm against real pages.
        nf = parts[2].split(',') if len(parts) > 2 else None

        facts = {
            'servings_per_container': servings_container,
            'serving_size': serving_size,
        }
        for column, label in nutrient_labels:
            facts[column] = get_nf(nf, label)
        return facts
    except Exception:
        # Deliberate best effort: one malformed page must not abort the run.
        # A copy is returned so callers never share the module-level dict.
        return dict(base_nutrition)

### Fetching, Processing, and Exporting the Data

In [1015]:
# Generate a list of urls of Flyer pages, one per article number in flyer_numbers.
# The url template is the same one make_data uses for its 'url' field.
urls = ["https://www.traderjoes.com/fearless-flyer/article/{}".format(number) for number in flyer_numbers]

In [1016]:
# Fetch the html and create a list of soup objects for each page.
# make_soup issues one HTTP request per url, so this cell performs
# len(urls) sequential downloads and keeps every parsed page in the list.
soups = [make_soup(url) for url in urls]

In [1017]:
# Preprocess the data to extract the relevant chunks of html.
# The already-fetched soup is passed in, so make_data does not download again.
data = [make_data(number, soup) for number, soup in zip(flyer_numbers, soups)]

In [1018]:
# Convert the preprocessed data into a dataframe: one row per article, one
# column per key of the dicts built by make_data.
ffdf = pd.DataFrame(data)

In [1026]:
# Process the nutrition information in the preprocessed data.
# process_nutrition always returns a non-empty dict, so the `if d` filter keeps
# every row; the list therefore stays in the same order and length as ffdf.
nutrition_data = [d for d in ffdf.nutrition.apply(process_nutrition).tolist() if d]

In [1027]:
# Build the nutrition frame and blank out missing servings text.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column accessor acts on a temporary object and is not guaranteed to
# update the frame (it does not under pandas Copy-on-Write).
nutrition_df = pd.DataFrame(nutrition_data)
for text_column in ('serving_size', 'servings_per_container'):
    nutrition_df[text_column] = nutrition_df[text_column].fillna('')

In [1041]:
# Add the nutrition information back to the original dataframe.
# axis=1 places the nutrition columns beside the ffdf columns, pairing rows by
# index label -- presumably both frames carry the default 0..n-1 index here.
df = pd.concat([ffdf, nutrition_df], axis=1)

In [1043]:
# Columns written to the export, in output order.  'servings' and
# 'cost_per_serving' are derived in the final-processing cell below; fields not
# listed here (e.g. 'ingredients', 'saturated_fat', 'cholesterol') are left out.
columns =[
    'id', 'url', 'name', 'price', 'servings', 
    'serving_size', 'cost_per_serving', 
    'calories', 'protein', 'total_fat', 
    'total_carbs', 'sugars', 'total_sugars', 'added_sugars',
    'sodium', 'fiber', 'iron', 'potassium',
    'vit_a', 'vit_c', 'calcium', 'trans_fat' 
]

In [1044]:
# Run some final processing on the data.
# NOTE: 'name' and 'price' are overwritten in place, so this cell is not
# re-runnable: a second run would hand get_name the already-cleaned strings
# instead of the raw values stored by make_data.
df['name'] = df.name.apply(get_name)
df['price'] = df.price.apply(get_price).astype(float)
# get_servings pulls the first number out of the servings_per_container text.
df['servings'] = df.servings_per_container.apply(get_servings).astype(float)
# Package price divided by the number of servings per package.
df['cost_per_serving'] = df.price / df.servings

In [1055]:
# Export.
# Rows whose cleaned name is "" (get_name found no product heading) are dropped;
# only the fields listed in `columns` are written, without the index.
df[df.name != ""][columns].to_csv('./clean_flyer_data2.csv', index=False, encoding='utf-8')

### Old code (archived scratch work)

In [91]:
# Superseded by the pre-processing cell above; kept commented out like the rest
# of this archive section.  As written it could not run: the closing bracket
# was missing and zip() was given only flyer_numbers, so `number, soup` could
# not be unpacked.
# data = [make_data(number, soup) for number, soup in zip(flyer_numbers, soups)]

In [37]:
# ffdf = pd.DataFrame(
#     zipped_ff, 
#     columns=['id', 'url', 'name', 'price', 'timelines'])

In [220]:
# Extract the nutrition information
# ffdf['nutrition'] = (ffdf
#     .timelines
#     .apply(lambda s: s.text.split('NUTRITION FACTS')[1].replace('\n', '').strip() if s and len(s.text.split('NUTRITION FACTS')) > 1 else ''))

In [3]:
# from selenium import webdriver 
# from selenium.webdriver.common.by import By 
# from selenium.webdriver.support.ui import WebDriverWait 
# from selenium.webdriver.support import expected_conditions as EC 
# from selenium.common.exceptions import TimeoutException

In [4]:
# option = webdriver.ChromeOptions()
# option.add_argument(" — incognito")

In [5]:
# browser = webdriver.Chrome(executable_path='/Users/pdarche/Downloads/chromedriver 2', chrome_options=option)

In [None]:
# def get_base_content(contents, found=set()):
#     string = None
#     if contents.string:
#         found.add(contents.string)
#     else:
#         for content in contents.contents:
#             get_base_content(content, found)
#     return found

In [1019]:
# haunted = ffdf[ffdf.id == 4985].nutrition.values[0]

In [1020]:
# haunted_nf = haunted.split('|')[2].split(',')

In [1058]:
# restr = r'({} ([0-9]*[.,]?[0-9]+g|[0-9]*[.,]?[0-9]+mg|[0-9]*[.,]?[0-9]+\w|[0-9]*[.,]?[0-9]+mcg\d+% DV)|Includes [0-9]*[.,]?[0-9]+g {})'.format('Total Sugars', 'Added Sugars')
# print(restr)
# qregex = re.compile(restr)
# quantity = qregex.findall(haunted_nf[7])
# print(quantity)
# get_nf(haunted_nf, 'Total Fat')
# haunted_nf
# process_nutrition(haunted)