In [1]:
import re
import itertools
import pickle
import numpy as np
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# As of 1/10/2020, Trader Joe's Fearless Flyer articles are numbered ~4600 to ~5270.
# range() excludes its stop value, so this covers article numbers 4600 through 5269.
flyer_numbers = range(4600, 5270)

### Helpers

#### Functions for pulling and pre-processing data

In [4]:
def get_raw_name(soup):
    """Look up the product-name heading: the first <h1> with class "lead".

    Returns whatever `soup.find` returns for that query.
    """
    lead_heading_attrs = {'class': 'lead'}
    return soup.find("h1", lead_heading_attrs)

In [5]:
def get_raw_price(soup):
    """Find the first <strong> element whose text looks like a price.

    A price is either a dollar amount with decimals ("$3.99") or a whole
    number of cents ("99¢"). Returns whatever `soup.find` returns for that
    query.
    """
    # Raw string: "\$", "\d" and "\." are regex escapes, not Python string escapes.
    price_pattern = re.compile(r"(\$\d+\.\d+|\d+¢)")
    # `string=` is the current name of the text filter (same keyword get_timeline uses).
    return soup.find("strong", string=price_pattern)

In [6]:
def get_timeline(soup):
    """Locate the 'pad-timeline' <div> ancestors of the INGREDIENTS text.

    Searches `soup` for a string matching 'INGREDIENTS.*'. Returns the result
    of `find_parents` on that string, or None when no such string is found.
    """
    ingredients_marker = soup.find(string=re.compile('INGREDIENTS.*'))
    if not ingredients_marker:
        return None
    return ingredients_marker.find_parents('div', {'class': 'pad-timeline'})

In [7]:
def get_ingredients_and_nutrition(timeline):
    """Extract the ingredients text and the nutrition text from a timeline.

    Parameters
    ----------
    timeline : sequence or None
        Result of `get_timeline`; only its first element is inspected.

    Returns
    -------
    tuple
        `(ingredients, nutrition)`. Each value is the cleaned line that
        immediately follows the line containing "INGREDIENTS" / "NUTRITION",
        or None when that header is missing or is the last line.
    """
    if not timeline:
        return None, None

    raw_strings = [get_base_content(c) for c in timeline[0].contents]
    clean_strings = get_clean_content(raw_strings)

    def _line_after(header):
        """Return the line following the one containing `header`, or None."""
        header_ix = index_containing_substring(clean_strings, header)
        value_ix = header_ix + 1
        # -1 means "header not found"; without this check -1 + 1 would
        # silently select the first line of the timeline instead.
        if header_ix < 0 or value_ix >= len(clean_strings):
            return None
        return clean_strings[value_ix]

    return _line_after("INGREDIENTS"), _line_after("NUTRITION")

In [8]:
def index_containing_substring(list_, substr):
    """Return the index of the first item in `list_` that contains `substr`, or -1 if none does."""
    matching_positions = (position for position, item in enumerate(list_) if substr in item)
    return next(matching_positions, -1)

In [9]:
def get_base_content(contents):
    """Return the text of a parsed node.

    If the node exposes a single string (`.string` is not None), that string
    is returned. Otherwise the non-None `.string` values of its direct
    children are joined with single spaces.
    """
    direct_text = contents.string
    # Compare against None rather than truthiness: an empty text node has
    # `.string == ''` and must not fall through to the `.contents` branch
    # (presumably only tag nodes have `.contents` -- confirm against bs4).
    if direct_text is not None:
        return direct_text
    return " ".join(c.string for c in contents.contents if c.string is not None)

In [10]:
def get_clean_content(raw_strings):
    """Flatten raw strings into a list of non-empty, stripped lines.

    Each input string is split on newlines; every piece is stripped of
    surrounding whitespace and pieces that end up empty are dropped. The
    original order is preserved.
    """
    cleaned = []
    for raw in raw_strings:
        for piece in raw.split('\n'):
            piece = piece.strip()
            if piece:
                cleaned.append(piece)
    return cleaned

In [11]:
def make_soup(url, timeout=30):
    """Fetch `url` and return a BeautifulSoup object for the response body.

    Parameters
    ----------
    url : str
        Page to download.
    timeout : float, optional
        Seconds passed to `requests.get` as its `timeout`, so a stalled
        connection raises instead of blocking the whole scrape indefinitely.

    The HTTP status is deliberately not checked: the response text is parsed
    whatever the status code is.
    """
    html = requests.get(url, timeout=timeout).text
    return BeautifulSoup(html, 'html.parser')

In [82]:
def make_tag_data(number, soup=None):
    """Collect the raw tags for one Flyer article.

    Builds the article URL from `number`; when no (truthy) `soup` is supplied
    the page is fetched with `make_soup`. Returns the tuple
    `(number, url, name_tag, price_tag, timeline)`.
    """
    url = "https://www.traderjoes.com/fearless-flyer/article/{}".format(number)
    page = soup if soup else make_soup(url)

    name_tag = get_raw_name(page)
    price_tag = get_raw_price(page)
    timeline = get_timeline(page)

    return (number, url, name_tag, price_tag, timeline)

### Functions for extracting string values from the tag data

In [129]:
def make_string_data(tag_data):
    """Convert one tuple of raw tags into plain string values.

    `tag_data` is a 5-tuple `(id, url, name_tag, price_tag, timeline_tag)`,
    the shape returned by `make_tag_data`. Returns
    `(id, url, name, price, ingredients, nutrition)`.
    """
    flyer_id, url, name_tag, price_tag, timeline_tag = tag_data

    name = get_name_string(name_tag)
    price = get_price_string(price_tag)
    ingredients, nutrition = get_ingredients_and_nutrition(timeline_tag)

    return (flyer_id, url, name, price, ingredients, nutrition)

In [143]:
def get_price_string(price):
    """Return the price text held by `price`, always as a string.

    `price` may be None/empty, a plain string, or a parsed tag. For a tag the
    tag's own `.string` is used, falling back to the `.string` of its first
    child. An empty string is returned whenever no text is available, so the
    result is always safe to pass to `get_price`.
    """
    if not price:
        return ""
    if isinstance(price, str):
        return price
    if price.string:
        return price.string
    if price.contents:
        # The first child may itself have no single string (`.string` is None).
        return price.contents[0].string or ""
    return ""

In [144]:
def get_name_string(name):
    """Return the product name held by `name` as a cleaned string.

    Uses the tag's own `.string`, falling back to the `.string` of its first
    child. Newlines and carriage returns are removed and surrounding
    whitespace is stripped. Returns "" when `name` is falsy or no text is
    available.
    """
    if not name:
        return ""

    text = name.string
    if not text and name.contents:
        text = name.contents[0].string
    # `text` can still be None here (e.g. the first child has no single
    # string); treat that as "no name" rather than crashing on `.replace`.
    return (text or "").replace('\n', '').replace('\r', '').strip()

### Functions for extracting the final structured data from the raw string data

In [145]:
def get_price(price_string):
    """Extract the price, in dollars, from a price string.

    The first number found in `price_string` is used. When the string
    contains "¢" the number is read as cents and converted to dollars
    (so "5¢" is 0.05 and "99¢" is 0.99).

    Returns a float, or None when the string is empty/None or contains no
    number.
    """
    if not price_string:
        return None

    amounts = re.findall(r'[0-9]*\.?[0-9]+', price_string)
    if not amounts:
        return None

    amount = float(amounts[0])
    if "¢" in price_string:
        # Divide rather than prefix the digits with ".": prefixing turns
        # "5¢" into 0.5 instead of 0.05.
        return amount / 100
    return amount

In [146]:
def get_servings(servings_string):
    """Return the first number found in `servings_string` as a float.

    Returns None when the input is empty/None, contains no number, or is not
    a string. In the last case the offending value is printed so it can be
    inspected.
    """
    if not servings_string:
        return None

    try:
        numbers = re.findall(r'[0-9]*\.?[0-9]+', servings_string)
    except TypeError:
        # Non-string input: keep the best-effort behaviour (report and skip)
        # without swallowing unrelated errors such as KeyboardInterrupt.
        print(servings_string)
        return None

    return float(numbers[0]) if numbers else None

In [147]:
def get_nf(nf, key):
    """
    Look up the quantity listed for `key` in a list of nutrition-fact strings.

    Only the first entry of `nf` that contains `key` is examined. The entry
    must match either "<key> <number><unit>" or "Includes <number>g <key>";
    the first number inside the matched text is returned as a string
    (not converted to float). Returns None when `nf` is empty/None, no entry
    mentions `key`, or the entry does not match.

    TODO: include a case in the regex for values including "X less than 1g"
    see: https://www.traderjoes.com/fearless-flyer/article/4688
    """
    if not nf:
        return None

    entries = [entry for entry in nf if key in entry]
    if not entries:
        return None

    labelled_pattern = re.compile(
        r'({} ([0-9]*[.,]?[0-9]+g|[0-9]*[.,]?[0-9]+mg|[0-9]*[.,]?[0-9]+\w|[0-9]*[.,]?[0-9]+mcg\d+% DV)|Includes [0-9]*[.,]?[0-9]+g {})'.format(key, key)
    )
    hits = labelled_pattern.findall(entries[0])
    if not hits:
        return None

    # findall yields one tuple per match; element 0 is the outer group,
    # i.e. the whole labelled value.
    labelled_value = hits[0][0]
    if not labelled_value:
        return None
    return re.findall('[0-9]*[.,]?[0-9]+', labelled_value)[0]

In [135]:
# Template of "no nutrition data": every field process_nutrition produces, set to None.
# Keys are kept in the same order as the dict built in process_nutrition.
base_nutrition = {
    'servings_per_container': None,
    'serving_size': None,
    'calories': None,
    'total_fat': None,
    'saturated_fat': None,
    'trans_fat': None,
    'cholesterol': None,
    'sodium': None,
    'total_carbs': None,
    'fiber': None,
    'sugars': None,
    'total_sugars': None,
    'added_sugars': None,
    'protein': None,
    'vit_a': None,
    'vit_c': None,
    'vit_d': None,
    'iron': None,
    'calcium': None,
    'potassium': None
}

In [165]:
def process_nutrition(nutrition_string):
    """Parse a '|'-separated nutrition string into a dict of fields.

    The string is split on '|' into up to three sections:
    servings per container, serving size, and a comma-separated list of
    nutrition facts. Missing trailing sections yield None for the fields
    that depend on them.

    Returns a new dict on every call. When `nutrition_string` is empty/None
    or parsing raises, a copy of `base_nutrition` is returned, so callers may
    mutate the result without touching the module-level template.
    """
    # (output field, label searched for in the nutrition facts), in output order.
    fact_labels = (
        ('calories', 'Calories'),
        ('total_fat', 'Total Fat'),
        ('saturated_fat', 'Saturated Fat'),
        ('trans_fat', 'Trans Fat'),
        ('cholesterol', 'Cholesterol'),
        ('sodium', 'Sodium'),
        ('total_carbs', 'Total Carbohydrate'),
        ('fiber', 'Dietary Fiber'),
        ('sugars', 'Sugars'),
        ('total_sugars', 'Total Sugars'),
        ('added_sugars', 'Added Sugars'),
        ('protein', 'Protein'),
        ('vit_a', 'Vitamin A'),
        ('vit_c', 'Vitamin C'),
        ('vit_d', 'Vitamin D'),
        ('iron', 'Iron'),
        ('calcium', 'Calcium'),
        ('potassium', 'Potassium'),
    )

    if not nutrition_string:
        return dict(base_nutrition)

    try:
        servings = nutrition_string.split('|')
        servings_container = servings[0].split(':')[1].strip() if ":" in servings[0] else servings[0]
        serving_size = servings[1].strip() if len(servings) > 1 else None
        # The facts live in the THIRD section, so it needs more than two
        # sections; guarding with "> 1" raised IndexError on two-section
        # strings and threw away the servings information.
        nf = servings[2].split(',') if len(servings) > 2 else None

        nutrition = {
            'servings_per_container': servings_container,
            'serving_size': serving_size,
        }
        for field, label in fact_labels:
            nutrition[field] = get_nf(nf, label)
        return nutrition
    except Exception:
        # Best effort: an unparseable string yields the empty template.
        return dict(base_nutrition)

In [176]:
def make_final_data(string_data):
    """
    Creates a dictionary of the final data for the food.

    `string_data` is the 6-tuple
    `(id, url, name, price_string, ingredients_string, nutrition_string)`.
    The result holds id, url, name and numeric price, every field returned by
    `process_nutrition`, plus `servings`, `cost_per_serving` and the 0/1 flags
    `vegan`, `gluten_free` and `kosher` derived from `find_similar`.

    `find_similar` must already be defined when this function is called.
    """
    flyer_id, url, name_string, price_string, ingredients_string, nutrition_string = string_data

    # Get final price data
    price = get_price(price_string)

    final_dict = {
        'id': flyer_id,
        'url': url,
        'name': name_string,
        'price': price
    }

    # Work on a copy: process_nutrition may hand back a shared template dict,
    # and the two keys added below must not leak into it.
    nutrition_dict = dict(process_nutrition(nutrition_string))
    servings = get_servings(nutrition_dict.get('servings_per_container'))

    nutrition_dict['servings'] = servings if servings else None
    nutrition_dict['cost_per_serving'] = price / servings if price and servings else None

    final_dict.update(nutrition_dict)
    final_dict['vegan'] = 1 if find_similar(name_string, 'vegan') else 0
    final_dict['gluten_free'] = 1 if find_similar(name_string, 'gluten-free') else 0
    final_dict['kosher'] = 1 if find_similar(name_string, 'kosher') else 0

    return final_dict

### Fetching, Processing, and Exporting the Data

In [177]:
# Generate a list of urls of Flyer pages
# urls = ["https://www.traderjoes.com/fearless-flyer/article/{}".format(number) for number in flyer_numbers]

In [178]:
# Build `soups`, one soup object per flyer number, in the same order as flyer_numbers.
# Load them from the pickle cache when it exists; otherwise fetch every page once
# and write the cache so later runs do not hit the site again.
# NOTE(review): pickle.load can execute arbitrary code -- only load a cache file
# that this notebook wrote itself.
soups_cache_path = "../data/fearless_flyer_soups.pkl"
try:
    with open(soups_cache_path, "rb") as cache_file:
        soups = pickle.load(cache_file)
except FileNotFoundError:
    soups = [
        make_soup("https://www.traderjoes.com/fearless-flyer/article/{}".format(number))
        for number in flyer_numbers
    ]
    with open(soups_cache_path, "wb") as cache_file:
        pickle.dump(soups, cache_file)

In [41]:
# pickle.dump(soups, open("../data/fearless_flyer_soups.pkl", "wb" ))

In [137]:
# Pair each flyer number with its soup and extract the raw tags for every page.
tag_data = list(itertools.starmap(make_tag_data, zip(flyer_numbers, soups)))

In [179]:
# Turn every tuple of raw tags into a tuple of plain strings.
string_data = list(map(make_string_data, tag_data))

In [180]:
# Build one structured dict per product from the string tuples.
final_data = list(map(make_final_data, string_data))

In [181]:
# One row per product dict.
df = pd.DataFrame(data=final_data)

In [185]:
# Columns written to the exported CSV, in output order.
# Each name appears once: a repeated name would duplicate that column in the file.
columns = [
    'id', 'url', 'name', 'price', 'servings',
    'serving_size', 'cost_per_serving',
    'calories', 'protein', 'total_fat',
    'trans_fat', 'saturated_fat', 'cholesterol',
    'total_carbs', 'sugars', 'total_sugars', 'added_sugars',
    'sodium', 'fiber', 'iron', 'potassium',
    'vit_a', 'vit_c', 'calcium',
    'vegan', 'gluten_free', 'kosher'
]

In [190]:
# Export the selected columns for every row that has a non-empty product name.
df.loc[df.name != "", columns].to_csv('../data/clean_flyer_data3.csv', index=False, encoding='utf-8')

In [188]:
# df[df.name != ""][columns]

### Dietary Lists

In [75]:
def get_dietary_list_foods(dietary_type):
    """Return the food names listed on a Trader Joe's dietary-list page.

    Fetches `https://www.traderjoes.com/dietary-lists/<dietary_type>` via
    `make_soup`, finds every <p class="subheader4"> header and reads the node
    two siblings after it (the node directly after the header is skipped --
    presumably whitespace between tags; confirm against the page markup).
    Headers that have no such node are ignored.

    Returns a flat list of cleaned, non-empty strings.
    """
    soup = make_soup("https://www.traderjoes.com/dietary-lists/{}".format(dietary_type))
    headers = soup.find_all('p', {'class': 'subheader4'})

    list_tags = []
    for header in headers:
        separator = header.next_sibling
        list_tag = separator.next_sibling if separator is not None else None
        # A header at the end of its parent has nothing after it; skip it
        # instead of crashing on None.
        if list_tag is not None:
            list_tags.append(list_tag)

    contents = [get_base_content(tag) for tag in list_tags]
    return get_clean_content(contents)

In [76]:
# Download the three dietary lists used for matching product names.
vegan_foods = get_dietary_list_foods(dietary_type='vegan')
gf_foods = get_dietary_list_foods(dietary_type='gluten-free')
k_foods = get_dietary_list_foods(dietary_type='kosher')

In [97]:
# df = pd.read_csv('../data/clean_fearless_flyer_with_interest.csv')

In [77]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections, treated as sets.

    The result is |intersection| / |union|, a float between 0 and 1.
    Two empty inputs have an empty union; 0.0 is returned in that case
    instead of dividing by zero.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

In [78]:
# Lookup table: dietary-restriction key -> list of food names fetched for it.
dietary_restrictions = dict([
    ('vegan', vegan_foods),
    ('gluten-free', gf_foods),
    ('kosher', k_foods),
])

In [112]:
def find_similar(name, key, threshold=.6):
    """Return the dietary-list food most similar to `name`, if similar enough.

    Parameters
    ----------
    name : str
        Product name; compared word-by-word (split on whitespace).
    key : str
        Key into `dietary_restrictions` selecting which food list to search.
    threshold : float, optional
        The best Jaccard score must be strictly greater than this to count
        as a match.

    Returns
    -------
    str or None
        The best-matching food name, or None when the key is unknown, the
        list is empty, or no food scores above `threshold`.
    """
    foods = dietary_restrictions.get(key)
    if not foods:
        # Unknown key or empty list: np.argmax would raise on an empty sequence.
        return None

    name_words = name.split()
    scores = [jaccard_similarity(name_words, food.split()) for food in foods]
    best_index = int(np.argmax(scores))
    return foods[best_index] if scores[best_index] > threshold else None

In [130]:
# Label each product with its closest entry in the vegan list (None when nothing is similar enough).
df['vegan'] = df.name.apply(lambda n: find_similar(n, 'vegan'))

In [131]:
# Label each product with its closest entry in the gluten-free list (None when nothing is similar enough).
df['gluten_free'] = df.name.apply(lambda n: find_similar(n, 'gluten-free'))

In [132]:
# Label each product with its closest entry in the kosher list (None when nothing is similar enough).
df['kosher'] = df.name.apply(lambda n: find_similar(n, 'kosher'))

In [139]:
# Write the full frame, including the dietary-restriction columns, to CSV without the index.
df.to_csv(
    path_or_buf='../data/clean_fearless_flyer_with_interest_and_dietary_restrictions.csv',
    index=False,
    encoding='utf-8',
)