In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import pickle
import json
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_kernels
from unidecode import unidecode

In [3]:
recipes = pd.read_pickle('recipes.pkl')

recipes_mod = pd.read_pickle('recipes_mod.pkl')

In [247]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18277 entries, 0 to 18276
Data columns (total 12 columns):
calories       18277 non-null int64
carb           18277 non-null int64
categories     18277 non-null object
date           18277 non-null datetime64[ns]
desc           13655 non-null object
directions     18277 non-null object
fat            18277 non-null int64
ingredients    18277 non-null object
protein        18277 non-null int64
rating         18277 non-null float64
sodium         18277 non-null int64
title          18277 non-null object
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 1.7+ MB


In [4]:
recipes_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18277 entries, 0 to 18276
Data columns (total 18 columns):
calories          18277 non-null int64
carb              18277 non-null int64
categories        18277 non-null object
date              18277 non-null datetime64[ns]
desc              18277 non-null object
directions        18277 non-null object
fat               18277 non-null int64
ingredients       18277 non-null object
protein           18277 non-null int64
rating            18277 non-null float64
sodium            18277 non-null int64
title             18277 non-null object
desc_mod          18277 non-null object
desc_mod_clean    18277 non-null object
dir_clean         18277 non-null object
ingr_clean        18277 non-null object
cat_clean         18277 non-null object
all_text          18277 non-null object
dtypes: datetime64[ns](1), float64(1), int64(5), object(11)
memory usage: 2.5+ MB


In [5]:
recipes_mod.head()

Unnamed: 0,calories,carb,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title,desc_mod,desc_mod_clean,dir_clean,ingr_clean,cat_clean,all_text
0,426,77,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01 04:00:00,,"[1. Place the stock, lentils, celery, carrot, ...",7,"[4 cups low-sodium vegetable or chicken stock,...",30,2.5,559,"Lentil, Apple, and Turkey Wrap","Lentil, Apple, and Turkey Wrap",lentil*apple*turkey*wrap,[place*stock*lentils*celery*carrot*thyme*salt*...,"[4 cup*low-sodium*vegetable*chicken*stock, 1 c...","[sandwich, bean, fruit, tomato, turkey, vegeta...",lentil*apple*turkey*wrap*sandwich*bean*fruit*t...
1,403,34,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20 04:00:00,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23,"[1 1/2 cups whipping cream, 2 medium onions, c...",18,4.375,1439,Boudin Blanc Terrine with Red Onion Confit,Boudin Blanc Terrine with Red Onion Confit Thi...,boudin*blanc*terrine*red*onion*confit*boudin*b...,[combine*9 ingredient*heavy*medium*saucepan*ad...,"[1 1/2 cup*whipping*cream, 2 medium*onions*cho...","[food processor, onion, pork, bake, bastille d...",boudin*blanc*terrine*red*onion*confit*boudin*b...
2,357,92,"[Candy, Citrus, Quick & Easy, Low Sodium, Grap...",2004-08-20 12:48:48,,[Quarter the grapefruits lengthwise and remove...,0,"[3 large grapefruits, 13 cups cold water, 4 cu...",1,5.0,17,Candied Grapefruit Zest,Candied Grapefruit Zest,candied*grapefruit*zest,[quarter*grapefruit*lengthwise*remove*pulp*res...,"[3 large*grapefruit, 13 cup*cold*water, 4 cup*...","[candy, citrus, quick & easy, low sodium, grap...",candied*grapefruit*zest*candy*citrus*quick & e...
3,691,21,"[Leafy Green, Shellfish, Marinate, Sauté, High...",2004-08-20 04:00:00,When Roy Yamaguchi invited me to cook at the 5...,"[Preheat oven to 350°F., Dip each crab in milk...",41,"[8 soft shell crabs, cleaned (See note), 1 cup...",57,3.75,1533,Soft-Shell Crabs with Wilted Spinach and Warm ...,Soft-Shell Crabs with Wilted Spinach and Warm ...,soft-shell*crab*wilted*spinach*warm*tomato-bas...,"[preheat*oven*degf, dip*crab*milk*lightly*coat...","[8 soft*shell*crabs*cleaned*note, 1 cup*milk, ...","[leafy green, shellfish, marinate, saute, high...",soft-shell*crab*wilted*spinach*warm*tomato-bas...
4,181,39,"[Sauce, Fruit, Ginger, Dessert, Low Sodium, Wh...",2004-08-20 04:00:00,A simple dessert sauce that is absolutely deli...,"[Combine Sauternes, sugar, water and whole clo...",1,[1/2 cup plus 2 tablespoons Sauternes or Late ...,1,5.0,11,Kumquat Compote with Sauternes and Ginger,Kumquat Compote with Sauternes and Ginger A si...,kumquat*compote*sauterne*ginger*simple*dessert...,[combine*sauternes*sugar*water*whole*clove*hea...,[1/2 cup*plus*2 tablespoon*sauterne*late*harve...,"[sauce, fruit, ginger, dessert, low sodium, wh...",kumquat*compote*sauterne*ginger*simple*dessert...


In [5]:
descriptions = recipes.loc[~recipes.desc.isnull()]

In [3]:
tok_pat = re.compile(r'[^\d\W][\w-]+|\d+.*?(?:(?=[.;:]\s|,\s[^\d\W])|\s[^\d\W]+)')

In [6]:
def calc_cos_sims(text_corp, vect_type = TfidfVectorizer , **kwargs):
    
    vect = vect_type(**kwargs)
    mat = vect.fit_transform(text_corp)
    cos_sims = cosine_similarity(mat, mat)
    np.fill_diagonal(cos_sims, 0)
    
    return cos_sims


def calc_jacc_sims(text_corp, **kwargs):
    
    vect = CountVectorizer(binary=True, **kwargs)
    mat = vect.fit_transform(text_corp).toarray()
    
    js = lambda v1, v2: (v1 * v2).sum() / (v1.sum() + v2.sum() - (v1 * v2).sum())
    jacc_sims = pairwise_kernels(mat, metric=js)
    
    return jacc_sims


def recommend(recipe, sim_mat, n_recs=20):
    
    ind = recipes_mod[recipes_mod.title == recipe].index[0]
    
    top_sims = pd.Series(sim_mat[ind]).sort_values(ascending=False)
    
    recommendations = []
    for i in list(top_sims[:n_recs].index):
        recommendations.append(recipes_mod.title[i])
        
    return recommendations, top_sims[:n_recs], top_sims[:n_recs].index

In [None]:
# cs_old_desc = calc_sims(recipes_mod.desc)
recommend('Spinach Ricotta Gnocchi with Tomato Sauce', cs_old_desc, 20)

In [None]:
# recipes.loc[recipes.title == 'Spinach Ricotta Gnocchi with Tomato Sauce']
recipes_mod.loc[18174].desc

In [None]:
recipes_mod.loc[14542].desc

In [None]:
# Recommend using cleaned descriptions + title

# cs_clean_desc = calc_sims(recipes_mod.desc_mod_clean) 

recommend('Spinach Ricotta Gnocchi with Tomato Sauce', cs_clean_desc, 20)

In [None]:
# cs_clean_desc_n = calc_sims(recipes_mod.desc_mod_clean, token_pattern=tok_pat)

recommend('Spinach Ricotta Gnocchi with Tomato Sauce', cs_clean_desc_n)

In [117]:
recipes_mod.loc[recipes_mod.title == 'Spinach Herb Sauce', 'desc_mod_clean']

4995    spinach herb sauce prepared 45 minute
Name: desc_mod_clean, dtype: object

In [121]:
recipes_mod.desc_mod_clean[18174]


'spinach ricotta gnocchi tomato sauce prepared 45 minute'

In [120]:
recipes_mod.desc_mod_clean[4995]

'spinach herb sauce prepared 45 minute'

In [116]:
# Including bigrams

# cs_clean_desc_bg = calc_sims(recipes_mod.desc_mod_clean, token_pattern=tok_pat, ngram_range=(1,2))
recommend('Spinach Ricotta Gnocchi with Tomato Sauce', cs_clean_desc_bg)

(["Cheat's Ricotta Gnocchi Dough",
  'Mexican-Style Tomato Sauce',
  'Spaghetti with Eggplant and Tomato Sauce',
  'Spinach Herb Sauce',
  'Spiced Cranberry Sauce',
  'Scallop Quenelles with Gingered Tomato Sauce',
  'Chocolate Coconut Sauce',
  'Gratineed Gnocchi with Spinach and Ricotta',
  'Spicy Cilantro Sauce',
  'Cranberry Raspberry Sauce',
  'Coconut Caramel Sauce',
  'Ginger Custard Sauce',
  'Jalapeño Garlic Sauce',
  'Apricot Caramel Sauce',
  'Sambuca Chocolate Sauce',
  'Chicken and Dumplings',
  'Mustard Watercress Sauce',
  'Rotelle with Mushroom Sauce',
  'Spicy Tomato Sauce',
  'Lemon-Pepper Dill Sauce'],
 17555    0.373182
 8520     0.342779
 7544     0.327798
 4995     0.307823
 7481     0.261232
 16706    0.256433
 16704    0.253060
 16731    0.250637
 6484     0.249570
 6365     0.245382
 17088    0.245082
 17487    0.243501
 14922    0.243001
 6559     0.242028
 8105     0.230292
 15892    0.227199
 6477     0.226597
 7059     0.224506
 353      0.223799
 11615    

In [233]:
# Using ngram_range of (1,2) seems to have yielded more recipes with 'Sauce' keyword, perhaps suggesting
# that sauces recipes are relatively uncommon (IDF upweighs 'Sauce')

recipes_mod.loc[recipes_mod.desc_mod.str.lower().str.contains('\*')].index

# tok_pat
# tp = re.compile('[^\d\W][\w-]+|\d+.*?(?:\s[^\d\W]+|(?=\.\s))')
# # tp.findall()
tp = re.compile(r'[^\d\W][\w-]+|\d+.*?(?:(?=[.;:]\s|,\s[^\d\W])|\s[^\d\W]+)')
# ([i for e in recipes_mod.directions for i in e if re.search('\d+:',i)])
# tp.findall('red 1 hour remove lid add 10 more briquette grill turkey 1 hour insert meat thermometer fleshy thigh thermometer register 180deg:f juice run clear')

In [244]:
# (re.search('\d+:','2 (4") coeur à la crème molds; 4: cheesecloth'))


In [167]:
# recipes_mod.directions[]
recipes_mod.dir_clean[0]

['place stock lentils celery carrot thyme salt medium saucepan bring boil reduce heat low simmer lentil tender 30 minutes depending lentils begin dry add water needed remove discard thyme drain transfer mixture bowl let cool',
 'fold tomato apple lemon juice olive oil season pepper',
 'assemble wrap place 1 lavash sheet clean work surface spread lentil mixture end nearest leaving 1-inch border slice turkey lettuce roll lavash slice crosswise serve using tortillas spread lentil center turkey lettuce fold left right rolling away']

In [131]:
recipes_mod.directions[654]

['Put oven rack in middle position and preheat oven to 375°F. Lightly oil a large shallow baking pan with vegetable oil.',
 'Arrange rhubarb in 1 layer in pan and sift confectioners sugar evenly over top. Bake, stirring occasionally, until rhubarb is very tender, 25 to 30 minutes.',
 'Transfer rhubarb to a food processor and purée until smooth. Force through a medium-mesh sieve into a bowl, discarding solids, and cool completely.']

In [256]:
# tp.findall(recipes_mod.ingredients[2673])

In [28]:
%timeit calc_jacc_sims(recipes_mod.cat_clean.loc[:10], tokenizer = lambda x:x, lowercase=False)

3.55 ms ± 564 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [49]:
# cs = calc_cos_sims(recipes_mod.cat_clean,tokenizer = lambda x:x,lowercase=False)
%timeit calc_cos_sims(recipes_mod.cat_clean.loc[:100],tokenizer = lambda x:x,lowercase=False)

4.32 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
%timeit calc_jacc_sims(recipes_mod.cat_clean.loc[:100], tokenizer = lambda x:x, lowercase=False)
# calc_jacc_sims(recipes_mod.cat_clean.loc[1], tokenizer = lambda x:x, lowercase=False)

116 ms ± 4.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [73]:
%timeit calc_jacc_sims(recipes_mod.cat_clean.loc[:1000], tokenizer = lambda x:x, lowercase=False)

7.6 s ± 68.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# js=calc_jacc_sims(recipes_mod.cat_clean, tokenizer = lambda x:x, lowercase=False)

In [98]:
# 7600/116
# (7600*130*2)/1000/60
calc_jacc_sims(recipes_mod.cat_clean.loc[:100], tokenizer = lambda x:x, lowercase=False)

array([[1.        , 0.        , 0.        , ..., 0.03125   , 0.        ,
        0.03448276],
       [0.        , 1.        , 0.05555556, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.05555556, 1.        , ..., 0.07142857, 0.15384615,
        0.08      ],
       ...,
       [0.03125   , 0.        , 0.07142857, ..., 1.        , 0.20833333,
        0.4137931 ],
       [0.        , 0.        , 0.15384615, ..., 0.20833333, 1.        ,
        0.18181818],
       [0.03448276, 0.        , 0.08      , ..., 0.4137931 , 0.18181818,
        1.        ]])

In [None]:
# from sklearn.metrics import pairwise_distances as pwd


# vect = CountVectorizer(token_pattern=tok_pat)
# mat = vect.fit_transform(recipes_mod.desc_mod_clean)
# mat = np.array([[0,0,1],[0,0,1], [1,1,0]])


In [101]:
vect = CountVectorizer(lowercase=False, tokenizer=lambda x: x)
mat = vect.fit_transform(recipes_mod.cat_clean.loc[:100])
mat = mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
%timeit (mat.toarray()[0] * mat.toarray()[1]).sum()

In [102]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import pairwise_distances
%timeit 1 - pairwise_distances(np.array(mat,dtype=bool),metric='jaccard')

3.6 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [103]:
1 - pairwise_distances(np.array(mat,dtype=bool),metric='jaccard')

array([[1.        , 0.        , 0.        , ..., 0.03125   , 0.        ,
        0.03448276],
       [0.        , 1.        , 0.05555556, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.05555556, 1.        , ..., 0.07142857, 0.15384615,
        0.08      ],
       ...,
       [0.03125   , 0.        , 0.07142857, ..., 1.        , 0.20833333,
        0.4137931 ],
       [0.        , 0.        , 0.15384615, ..., 0.20833333, 1.        ,
        0.18181818],
       [0.03448276, 0.        , 0.08      , ..., 0.4137931 , 0.18181818,
        1.        ]])

In [11]:
calc_jacc_sims(recipes_mod.cat_clean.loc[[0,4]], tokenizer = lambda x:x, lowercase=False)

array([[1.        , 0.03333333],
       [0.03333333, 1.        ]])

In [71]:
# mat = mat.toarray()
pairwise_kernels(mat,metric = 'cosine')
cosine_similarity(mat)
10000**2

100000000

In [57]:
%timeit pairwise_kernels(mat,metric = 'cosine')
# jaccard_score(mat[0], mat[1], average='samples')

195 µs ± 71.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [63]:
%timeit cosine_similarity(mat)

120 µs ± 7.56 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [90]:
jaccard_score(mat[0],mat[1])
# mat[0].toarray().flatten()


0.03333333333333333

In [247]:
# cs_new = calc_sims(recipes_mod.desc_mod_clean, token_pattern=tok_pat)

# cs_new[18174,9110]

In [244]:
recommend('Spinach Ricotta Gnocchi with Tomato Sauce', cs_new)

(['Spinach Gnocchi Gratin',
  'Spinach Herb Sauce',
  "Cheat's Ricotta Gnocchi Dough",
  'Gratineed Gnocchi with Spinach and Ricotta',
  'Rigatoni with Eggplant, Tomato, and Ricotta',
  'Sauteed Spinach',
  'Tomato, Arugula, and Ricotta Salata Salad',
  'Rice and Tomatoes',
  'Pasta with Ricotta and Fresh Herbs',
  'Spinach Risotto',
  'Herb Gnocchi',
  'Sauteed Spinach and Garlic',
  'Tomato and Spinach Soup',
  'Shiitake and Ricotta Patties',
  'Gnocchi with Mushrooms and Butternut Squash',
  'Tomato and Herb Salad',
  'Cauliflower and Spinach Vinaigrette',
  'Pasta with Roasted Eggplant, Ricotta, and Basil',
  'Spinach and Roasted Red Pepper Salad',
  'Mexican-Style Tomato Sauce'],
 9110     0.555805
 4995     0.531862
 17555    0.522014
 16731    0.498280
 8647     0.482718
 476      0.474219
 572      0.464995
 14472    0.451556
 16933    0.451152
 6837     0.448804
 6495     0.435978
 9624     0.425775
 8428     0.408936
 14602    0.402747
 4131     0.402331
 427      0.399661
 4

In [11]:
# descriptions.ingredients[1]

In [15]:
# descriptions.head()

In [6]:
# categories = []
# [categories.extend(cat) for cat in recipes.categories]
# categories = pd.Series(categories)
# categories.value_counts()

In [13]:
tfidf = TfidfVectorizer(tokenizer=lambda x:x, preprocessor=lambda x:x)
tfidf_mat = tfidf.fit_transform(recipes.categories)
# print(tfidf_mat)
# tfidf.vocabulary_

In [14]:
tfidf_mat.shape

(18277, 721)

In [19]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer = vectorizer.fit_transform(descriptions.desc)
word_sums = vectorizer.sum(axis=0)
vectorizer

<bound method spmatrix.get_shape of <13655x17865 sparse matrix of type '<class 'numpy.int64'>'
	with 225643 stored elements in Compressed Sparse Row format>>

In [10]:
tf_vec = TfidfVectorizer(stop_words='english')
tf_vec = tf_vec.fit_transform(descriptions.desc)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)