In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import os
import re
import pandas as pd
import altair as alt
import itertools
import sys
from sklearn.pipeline import Pipeline
from gensim import corpora

sys.path.append("..")
from seefood.data import BasicTextTransformer

pd.set_option("max_colwidth", 400)

In [2]:
import glob
import random
import base64

from PIL import Image
from io import BytesIO
from IPython.display import HTML

pd.set_option("display.max_colwidth", -1)


def get_thumbnail(path):
    i = Image.open(path).convert("RGB")
    i.thumbnail((224, 224), Image.LANCZOS)
    return i


def image_base64(im):
    if isinstance(im, str):
        im = get_thumbnail(im)
    with BytesIO() as buffer:
        im.save(buffer, "jpeg")
        return base64.b64encode(buffer.getvalue()).decode()


def image_formatter(im):
    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'

In [3]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [4]:
df = pd.read_csv("../data/seriouseats/seriouseats.csv")[
    ["title", "thumbnail", "ingredients"]
]
df["ingredients"] = df["ingredients"].map(lambda x: str(x).split("||||"))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8312 entries, 0 to 8311
Data columns (total 3 columns):
title          8312 non-null object
thumbnail      8312 non-null object
ingredients    8312 non-null object
dtypes: object(3)
memory usage: 194.9+ KB


In [5]:
df.head()

Unnamed: 0,title,thumbnail,ingredients
0,Jammy Fruit Bars,https://www.seriouseats.com/2019/05/20190429-fruit-oat-bars-vicky-wasik-20-625x469.jpg,"[For the Dough:, 4 1/2 ounces old fashioned rolled oats, not quick cooking, instant, or thick cut (about 1 1/3 cup; 128g), 4 1/2 ounces all-purpose flour (about 1 cup, spooned; 128g), 6 ounces light brown sugar (about 3/4 cup, firmly packed; 170g), 1 teaspoon baking powder, 1 teaspoon (4g) Diamond Crystal kosher salt, plus more for sprinkling; for table salt, use about half as much by volume or the same weight, plus additional salt for sprinkling, 1/4 teaspoon baking soda, 1/4 teaspoon ground cinnamon, 6 ounces cold, unsalted butter, cut into 1/2-inch cubes (about 3/4 cup; 170g), 1 large egg, straight from the fridge (about 1 3/4 ounces; 50g), For the Filling:, 8 ounces jam (volume will vary; 225g) or 12 ounces ""juicy"" fruit, such as whole blueberries or raspberries, pitted cherries, sliced strawberries, or peeled and diced peaches (volume will vary; 340g), Lemon juice, to taste]"
1,Balsamic Glazed Baby Back Ribs,https://www.seriouseats.com/recipes/images/2016/07/20130519-252806-balsamic-glazed-ribs-625x469.jpg,"[For the Rub:, 2 tablespoons dark brown sugar, 2 tablespoons Kosher salt, 1 tablespoon granulated sugar, 1 tablespoon paprika, 1/2 teaspoon ground white pepper, 1/2 teaspoon ground black pepper, 1/2 teaspoon ground mustard, 1/2 teaspoon dried thyme, 1/2 teaspoon garlic powder, 1/2 teaspoon ground Szechuan peppercorns, 1/4 teaspoon cayenne pepper, , 2 racks baby back ribs, 1 to 2 fist sized chunks of light smoking wood, like apple or cherry, 1 cup balsamic barbecue sauce, , Type of fire: Indirect, Grill heat: Low]"
2,The Best Meatball Pizza,https://www.seriouseats.com/recipes/images/2015/01/20150109-meatball-pizza-small-balls-8-625x469.jpg,"[1 recipe Italian-American Meatballs in Red Sauce, prepared through step 5, including sauce, 5 tablespoons (75ml) extra-virgin olive oil, divided, 1 medium bunch fresh basil leaves, divided, 1 recipe New York–Style Pizza Dough, divided into 3 balls, proofed, and ready to stretch and top, 1 pound grated full-fat dry mozzarella cheese (450g; about 4 cups), Kosher salt, 1 1/2 ounces (45g) finely grated Pecorino Romano or Parmesan cheese]"
3,Easy Pork Rillettes (Slow-Cooked Pork Spread),https://www.seriouseats.com/recipes/images/2017/02/20170215-pork-rillettes-12-625x469.jpg,"[2 pounds boneless, skinless pork shoulder, cut into 1 1/2-inch chunks (about 1kg), Kosher salt, 1/2 cup vegetable oil, lard, or duck fat (120ml), 4 bay leaves, 6 fresh thyme sprigs, 2 large shallots, very roughly chopped, 4 medium cloves garlic, split in half, Freshly ground nutmeg, to taste]"
4,Duck Pastrami,https://www.seriouseats.com/recipes/images/2013/02/20130227-242590-duck-pastrami.jpg,"[For the Cure:, 1/4 cup Kosher salt, 2 teaspoons ground black pepper, 2 teaspoons ground coriander, 2 teaspoons dark brown sugar, 1 teaspoon ground juniper berries, 3/4 teaspoon pink salt, 1/2 teaspoon ground ginger, 1/2 teaspoon granulated garlic, 1/2 teaspoon ground cloves, , 4 pounds skin-on boneless duck breast (about 4 large breasts each), , For the Rub:, 3 tablespoons coarsely ground black pepper, 1 1/2 tablespoons coarsely ground coriander seed, 2 teaspoons coarsely ground juniper berries, 1/2 teaspoon granulated garlic, , 1to 2 fist-size chunks of light smoking wood, such as apple or cherry, , Type of fire: Indirect, Grill heat: Low]"


In [6]:
all_ingredients = set(itertools.chain(*df["ingredients"]))

In [7]:
def remove_content_in_braces(ingredient):
    match = re.search(r".*(\s\(.+\)).*", ingredient)
    if match:
        return ingredient.replace(match.group(1), "")
    return ingredient


def get_unit(ingredient):
    match = re.search(r"^(([0-9]+\s?)?([0-9]\/[0-9])?\s)(.*)$", ingredient)
    if match:
        return match.group(4)
    return None

In [8]:
all_ingredients_no_braces = [remove_content_in_braces(i) for i in all_ingredients]

In [148]:
pipeline = Pipeline(
    [
        (
            "basic_text",
            BasicTextTransformer(
                "ingredients",
                "p_ingredients",
                bigram_threshold=30,
                trigram_threshold=50,
            ),
        )
    ]
)

ingreds_df = pipeline.fit_transform(
    pd.DataFrame({"ingredients": all_ingredients_no_braces})
)

[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [155]:
from collections import Counter

tokens = list(itertools.chain(*ingreds_df["p_ingredients"]))
counter = Counter(tokens)
counter.most_common(20)

[('cup', 12423),
 ('tablespoon', 7638),
 ('teaspoon', 5191),
 ('pound', 3337),
 ('inch', 2853),
 ('oil', 2332),
 ('ounce', 2307),
 ('onion', 1857),
 ('pepper', 1758),
 ('butter', 1646),
 ('juice', 1643),
 ('sugar', 1604),
 ('chicken', 1601),
 ('cheese', 1500),
 ('leave', 1451),
 ('lemon', 1440),
 ('piece', 1332),
 ('clove', 1305),
 ('ground', 1110),
 ('water', 1071)]

In [156]:
INGREDIENT_COUNT_THRESHOLD = 30
least_common_ingredients = set(
    [i for i, count in counter.items() if count <= INGREDIENT_COUNT_THRESHOLD]
)

In [157]:
len(least_common_ingredients)

3336

In [158]:
measure_unit = [
    "cup",
    "teaspoon",
    "tablespoon",
    "bund",
    "pound",
    "knob",
    "slice",
    "cut",
    "ounce",
    "inch",
    "leave",
    "piece",
    "ground",
    "clove",
    "dice",
    "bunch",
    "cubes",
    "medium",
    "sections",
    "cubed",
]

In [159]:
ingreds_df["pp_ingredients"] = ingreds_df["p_ingredients"].map(
    lambda x: [
        i for i in x if i not in measure_unit and i not in least_common_ingredients
    ]
)

In [160]:
ingreds_df.sample(20)

Unnamed: 0,ingredients,p_ingredients,pp_ingredients
46470,2 slices hardwood-smoked bacon,"[hardwood, bacon]",[bacon]
31312,"1 can peeled whole tomatoes, preferably San Marzano",[tomato],[tomato]
32645,"2 leeks, white parts only, chopped",[part],[part]
47683,Sugar for sprinkling the pie before baking,"[sugar, sprinkling, pie, baking]","[sugar, sprinkling, baking]"
25523,1 cup all-purpose flour,"[cup, purpose_flour]",[purpose_flour]
43943,1 recipe Santa Maria-Style Salsa,"[recipe, santa, maria, style, salsa]","[recipe, style, salsa]"
26226,"3/4 cup plus 2 tablespoons unbleached all purpose flour, sifted","[cup, tablespoon, purpose_flour]",[purpose_flour]
47698,"Lime wheels, mint leaves, and large ice block or large ice cubes, for garnish","[lime, wheel, mint, ice, ice, cube]","[lime, mint, ice, ice, cube]"
29234,"4 tablespoons unsalted butter or margarine, melted","[tablespoon, butter, margarine]",[butter]
26319,"3 tablespoons unsalted butter, cut into cubes","[tablespoon, butter, cut, cube]","[butter, cube]"


In [162]:
set(itertools.chain(*ingreds_df["pp_ingredients"]))

{'adobo',
 'allspice',
 'almond',
 'ancho',
 'anchovy',
 'anchovy_fillet',
 'apple',
 'apple_cider',
 'apricot',
 'arugula',
 'asian_fish',
 'asparagus',
 'avocado',
 'baby',
 'baby_arugula',
 'baby_spinach',
 'bacon',
 'bag',
 'baguette',
 'baking',
 'baking_soda',
 'ball',
 'balsamic_vinegar',
 'banana',
 'barbecue',
 'base',
 'basil',
 'bay',
 'bean',
 'bean_paste',
 'beef',
 'beef_chuck',
 'beer',
 'beet',
 'bell',
 'bell_pepper',
 'berry',
 'bias',
 'bite_size',
 'bittersweet_chocolate',
 'blood_orange',
 'blueberry',
 'boiling',
 'bone',
 'boneless',
 'boneless_skinless',
 'bottom',
 'bourbon',
 'bowl',
 'box',
 'brandy',
 'bread',
 'bread_crumbs',
 'breadcrumb',
 'breast',
 'breast_halve',
 'brine',
 'brisket',
 'broccoli',
 'broth',
 'brussels_sprout',
 'bun',
 'bunche',
 'butter',
 'buttermilk',
 'butternut_squash',
 'button_mushroom',
 'cabbage',
 'cake',
 'can',
 'canola',
 'cap',
 'caper',
 'cardamom',
 'carrot',
 'cashews',
 'casing',
 'cayenne',
 'celery',
 'center',
 'ch