In [1]:
import pandas as pd
import numpy as np

In [45]:
def flatten(l):
    """Merge an iterable of iterables into one list of unique items.

    Items are returned in first-seen order, so the result is reproducible
    from run to run (a plain ``list(set(...))`` has no guaranteed order,
    which makes any basis built from it unstable).

    Parameters
    ----------
    l : iterable of iterables
        The inner items must be hashable.

    Returns
    -------
    list
        Every distinct inner item exactly once.
    """
    # A dict keeps insertion order and gives O(1) de-duplication.
    seen = {}
    for sub in l:
        for item in sub:
            seen.setdefault(item, None)
    return list(seen)


In [17]:
def tovec(subset,basis):
    """Count how often each basis element occurs in ``subset``.

    Returns a plain list of counts aligned with ``basis``. An element of
    ``subset`` that does not appear in ``basis`` raises ``KeyError``.
    """
    counts = dict.fromkeys(basis, 0)
    for element in subset:
        counts[element] = counts[element] + 1
    return list(map(counts.__getitem__, basis))

In [48]:
# Load the recipe table and the review table from local pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load these
# files if they come from a trusted source.
recipesdf = pd.read_pickle('data/recmini.pk')
reviewsdf = pd.read_pickle('data/reviewsred.pk')

Unnamed: 0_level_0,Name,RecipeCategory,RecipeIngredientParts,TotalTime
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
387,Chocolate Coffee Ice Cream Soda,Frozen Desserts,"[chocolate flavored coffee, milk, coffee ice c...",PT0S
707,"Tricolor Crustless ""Quiche""",Savory Pies,"[spinach, carrots, potatoes, low-fat cream che...",PT0S
721,Vidalia Onion Casserole,Vegetable,"[butter, herb seasoned stuffing mix, butter, e...",PT0S
736,Whole Grain Buttermilk Pancakes,Breakfast,"[whole wheat bread flour, cornmeal, light brow...",PT0S
742,Yolkless Noodles,Low Cholesterol,"[all-purpose flour, salt, water, olive oil, al...",PT0S
...,...,...,...,...
78355,Garlic Basil Bread,Yeast Breads,"[basil, salt, sugar, flour, garlic, olive oil,...",PT0S
80121,Cranberry Salad in Raspberry Jello with Cream ...,Gelatin,"[fresh cranberries, crushed pineapple, walnut ...",PT0S
91931,Homemade Gel Pack,Homeopathy/Remedies,[water],PT0S
183593,Peanut Butter Oreos,Dessert,[peanut butter],PT0S


In [6]:
# Group every review by its author: the ratings given and the recipes rated.
authors = list(set(reviewsdf['AuthorId'].values))
data_by_author = {a:{'ratings':[],'recids':[]} for a in authors}
# Walk the three columns in lockstep instead of doing a label lookup per row:
# ``reviewsdf[col][i]`` is slow, and returns a Series rather than a scalar
# when the index contains duplicates.
for author_id, rating, recipe_id in zip(reviewsdf['AuthorId'].values,
                                        reviewsdf['Rating'].values,
                                        reviewsdf['RecipeId'].values):
    author_data = data_by_author[author_id]
    author_data['ratings'].append(rating)
    author_data['recids'].append(recipe_id)

# Per-author summary statistics, computed once per author.
for a in authors:
    author_ratings = data_by_author[a]['ratings']
    data_by_author[a]['No_ratings'] = len(author_ratings)
    data_by_author[a]['Mean_rating'] = np.mean(author_ratings)
    data_by_author[a]['Std_ratings'] = np.std(author_ratings)

# Copy the author statistics onto each review row, reusing the values cached
# above rather than recomputing the mean/std for every single review.
reviewsdf['Auth_No_Ratings'] = [data_by_author[a]['No_ratings'] for a in reviewsdf['AuthorId'].values]
reviewsdf['Auth_Std_Ratings'] = [data_by_author[a]['Std_ratings'] for a in reviewsdf['AuthorId'].values]
reviewsdf['Auth_Avg_Rating'] = [data_by_author[a]['Mean_rating'] for a in reviewsdf['AuthorId'].values]

In [13]:
def get_authors(min_rev,min_std):
    """Select authors by number of ratings and rating spread.

    An author is kept when they have at least ``min_rev`` ratings and the
    standard deviation of their ratings is strictly greater than ``min_std``.
    Reads the module-level ``authors`` list and ``data_by_author`` dict and
    preserves the order of ``authors``.
    """
    selected = []
    for author in authors:
        stats = data_by_author[author]
        if stats['No_ratings'] >= min_rev:
            if stats['Std_ratings'] > min_std:
                selected.append(author)
    return selected

In [14]:
# Reviewers with at least 50 ratings whose ratings are not all identical (std > 0).
author_subset = get_authors(50,0)

In [15]:
def summarize(authorlist):
    """Summarise the review coverage of a group of authors.

    Parameters
    ----------
    authorlist : iterable
        Author ids; each must be a key of the module-level ``data_by_author``.

    Returns
    -------
    dict
        ``'Reviewers'``: number of authors given,
        ``'Recipes'``: number of distinct recipe ids they rated,
        ``'Ratings'``: total number of ratings,
        ``'Avg_no_by_rec'``: ratings per distinct recipe (``0.0`` when the
        authors rated no recipe at all, instead of dividing by zero).
    """
    authorlist = list(authorlist)
    recids = []
    for a in authorlist:
        recids.extend(data_by_author[a]['recids'])
    norevs = len(recids)
    norecs = len(set(recids))
    # An empty selection (e.g. thresholds nobody meets) has no recipes.
    avg_per_recipe = norevs/norecs if norecs else 0.0
    return {'Reviewers':len(authorlist),'Recipes':norecs,'Ratings':norevs,'Avg_no_by_rec':avg_per_recipe}

In [16]:
# Coverage statistics for the reviewers selected in author_subset.
summarize(author_subset)

{'Reviewers': 3382,
 'Recipes': 221276,
 'Ratings': 764847,
 'Avg_no_by_rec': 3.4565294021945445}

In [35]:
# Same statistics with a stricter cut-off: at least 100 ratings per reviewer.
summarize(get_authors(100,0))

{'Reviewers': 1724,
 'Recipes': 208761,
 'Ratings': 650005,
 'Avg_no_by_rec': 3.1136323355416002}

# Similar Recipes by Name

We will try to group together recipes with similar names.

To speed things up, we do the grouping within each recipe category first.

In [28]:
# All recipe names, and the distinct recipe categories.
recipenames = list(recipesdf['Name'].values)
recipecategories = list(set(recipesdf['RecipeCategory'].values))

# For every category, the names and the ids (index values) of its recipes,
# kept in matching order.
category_to_recnames = {c:[] for c in recipecategories}
category_to_recids = {c:[] for c in recipecategories}
# One pass over (id, category, name) triples instead of two label lookups per
# row; this also stays correct if the index ever contains duplicate ids.
for rec_id, category, rec_name in zip(recipesdf.index,
                                      recipesdf['RecipeCategory'].values,
                                      recipenames):
    category_to_recnames[category].append(rec_name)
    category_to_recids[category].append(rec_id)

We will use a sentence transformer to embed the recipe names as vectors and then cluster the names using the embedding.

In [30]:
from sentence_transformers import SentenceTransformer
# Shared sentence-embedding model; the clustering helpers below all call
# embedder.encode(...) on it.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

from sklearn.cluster import KMeans, DBSCAN

def kmeans(sentences,k,random_state=None):
    """Cluster sentences into ``k`` groups with k-means on their embeddings.

    Parameters
    ----------
    sentences : sequence
        Texts to embed with the module-level ``embedder`` and cluster.
    k : int
        Number of clusters requested from ``KMeans``.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to get reproducible clusters.

    Returns
    -------
    dict
        Maps every cluster id from ``0`` to the largest assigned label to the
        list of sentences assigned to it, in input order.
    """
    vecs = embedder.encode(sentences)
    clustering_model = KMeans(n_clusters=k, random_state=random_state)
    clustering_model.fit(vecs)
    cluster_assignment = clustering_model.labels_
    clusters = {i:[] for i in range(max(cluster_assignment)+1)}
    for label, sentence in zip(cluster_assignment, sentences):
        clusters[label].append(sentence)
    return clusters

def dbscan(sentences,ep,msam):
    """Cluster sentences with DBSCAN on their embeddings, keeping noise.

    ``ep`` and ``msam`` are passed to ``DBSCAN`` as ``eps`` and
    ``min_samples``; the metric is Euclidean. The result maps every label
    from ``-1`` up to the largest assigned label to the sentences carrying
    that label (``-1`` is DBSCAN's noise label, and its entry is present
    even when empty).
    """
    embeddings = embedder.encode(sentences)
    model = DBSCAN(eps=ep, min_samples=msam, metric='euclidean')
    model.fit(embeddings)
    labels = model.labels_

    grouped = {}
    for label in range(-1, max(labels)+1):
        grouped[label] = []
    for label, sentence in zip(labels, sentences):
        grouped[label].append(sentence)
    return grouped

def dbscan_no(sentences,ep,msam):
    """Cluster sentences with DBSCAN on their embeddings, dropping noise.

    Same clustering as a Euclidean ``DBSCAN(eps=ep, min_samples=msam)`` over
    the embeddings, but sentences with a negative label are left out of the
    result. Returns a dict from each label ``0..max(label)`` to its sentences.
    """
    embeddings = embedder.encode(sentences)
    model = DBSCAN(eps=ep, min_samples=msam, metric='euclidean')
    model.fit(embeddings)
    labels = model.labels_

    grouped = {label: [] for label in range(max(labels)+1)}
    for label, sentence in zip(labels, sentences):
        if label < 0:
            # Negative labels are skipped.
            continue
        grouped[label].append(sentence)
    return grouped

def closest(words,target):
    """Return the entry of ``words`` whose embedding is nearest to ``target``.

    Distance is the Euclidean norm between embeddings produced by the
    module-level ``embedder``. On a tie the earliest word wins. ``words``
    must be a non-empty list.
    """
    embedded = embedder.encode([target]+words)
    target_vec = embedded[0]
    word_vecs = embedded[1:]
    distances = [np.linalg.norm(target_vec-vec) for vec in word_vecs]

    best = 0
    for idx in range(1, len(words)):
        # Strict comparison keeps the first of several equally close words.
        if distances[idx] < distances[best]:
            best = idx
    return words[best]

def closest_to_center(words):
    """Return the word whose embedding lies nearest the mean embedding.

    A single-element list is returned as-is without embedding. Otherwise the
    words are embedded with the module-level ``embedder``, the mean vector is
    computed, and the word with the smallest Euclidean distance to that mean
    is returned (the earliest one on a tie).
    """
    if len(words)==1:
        return words[0]

    vecs = embedder.encode(words)
    center = (1/len(vecs))*sum(vecs)
    offsets = [np.linalg.norm(vec-center) for vec in vecs]

    best = 0
    for idx in range(1, len(words)):
        if offsets[idx] < offsets[best]:
            best = idx
    return words[best]

def dbscan_reorg(clusters):
    """Turn DBSCAN noise points into singleton clusters.

    Parameters
    ----------
    clusters : dict
        Maps integer labels to lists of members, as returned by ``dbscan``:
        non-negative keys are real clusters, key ``-1`` (optional) holds the
        noise points.

    Returns
    -------
    dict
        Every real cluster under its original id, followed by one new
        single-member cluster per noise point, numbered consecutively after
        the highest existing id.
    """
    if not clusters:
        return {}
    highest = max(clusters)
    # range(highest + 1) so the highest-numbered cluster is kept as well;
    # range(highest) silently dropped it and let the first noise point take
    # over its id.
    newclusters = {i: clusters[i] for i in range(highest + 1) if i in clusters}
    next_id = highest + 1
    for x in clusters.get(-1, []):
        newclusters[next_id] = [x]
        next_id += 1
    return newclusters
    
def relabel_clusters(clusters):
    """Re-key clusters by a representative member.

    Each non-empty cluster is keyed by ``closest_to_center`` of its members.
    When two clusters end up with the same representative their members are
    merged under that key rather than one cluster overwriting the other.
    Empty clusters are skipped because no representative can be chosen.

    Parameters
    ----------
    clusters : dict
        Maps any key to a list of members.

    Returns
    -------
    dict
        Maps representative member -> list of members (new lists; the input
        lists are not modified).
    """
    relabeled = {}
    for members in clusters.values():
        if not members:
            continue
        label = closest_to_center(members)
        relabeled.setdefault(label, []).extend(members)
    return relabeled

def dbscan_nice(sentences,ep,msam):
    """Run the full DBSCAN labelling pipeline on ``sentences``.

    Chains ``dbscan`` (cluster), ``dbscan_reorg`` (handle the ``-1`` group)
    and ``relabel_clusters`` (re-key the clusters), returning the result of
    the last step.
    """
    return relabel_clusters(dbscan_reorg(dbscan(sentences,ep,msam)))

In [33]:
# Label -> recipe names, accumulated over all categories. Clusters from
# different categories that receive the same label are merged.
recipe_labels = {}

# The inner loop previously reused the name ``c`` of the outer category loop;
# distinct names keep the category and the cluster label apart.
for category, names in category_to_recnames.items():
    # NOTE(review): categories with only one or two recipes are skipped, so
    # their recipes never receive a label here -- confirm this is intended.
    if len(names)>2:
        # eps=0.4, min_samples=2 on the name embeddings.
        category_clusters = dbscan_nice(names,0.4,2)
        for label, members in category_clusters.items():
            recipe_labels.setdefault(label, []).extend(members)

In [37]:
# Reverse lookup: recipe name -> label. A name listed under several labels
# keeps the label that is iterated last.
recipe_labels_inv = {n:l for l in recipe_labels for n in recipe_labels[l]}

In [92]:
# Load a stored label table (indexed by recipe id, with a 'Label' column as
# used below).
# NOTE(review): how this file relates to recipe_labels computed above is not
# shown in the visible cells -- confirm it was produced from them.
labelsdf = pd.read_pickle('data/recipelabels.pk')

In [97]:
# Attach a label to every recipe; recipes absent from labelsdf get the
# placeholder 'Not found'.
rid_to_label = dict.fromkeys(recipesdf.index, 'Not found')
# One pass over (recipe id, label) pairs instead of a label lookup per row,
# which is slow and yields a Series when the index has duplicate ids.
rid_to_label.update(zip(labelsdf.index, labelsdf['Label'].values))
recipesdf['Label'] = [rid_to_label[r] for r in recipesdf.index]

In [99]:
# Keep only the recipes that actually received a label.
recipes_w_labels = recipesdf[recipesdf['Label'] != 'Not found']

In [100]:
# Inspect the labelled recipes.
recipes_w_labels

Unnamed: 0_level_0,Name,RecipeCategory,RecipeIngredientParts,TotalTime,NoIngredients,Label
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
524289,Apple-Glazed Stuffed Pork Chops,Pork,"[bacon, leeks, other dry white wine, thyme, sa...",PT40M,8,Apple Glazed Pork Chops
40,Best Lemonade,Beverages,"[sugar, lemon rind, lemon, zest of, fresh wate...",PT35M,5,The Best Lemonade
42,Cabbage Soup,Vegetable,"[plain tomato juice, cabbage, onion, carrots, ...",PT50M,5,Cabbage Soup
47,Butter Pecan Cookies,Dessert,"[butter, light brown sugar, granulated sugar, ...",PT1H4M,6,Butter Pecan Cookies
48,Boston Cream Pie,Pie,"[margarine, cake flour, baking powder, salt, s...",PT2H15M,16,Boston Cream Pie
...,...,...,...,...,...,...
523058,Triple Chocolate Zucchini Muffins,Quick Breads,"[whole wheat bread flour, fine rolled oats, ba...",PT55M,14,Zucchini Muffins
523104,French Onion Soup,< 4 Hours,"[sweet unsalted butter, yellow onions, minced ...",PT3H50M,14,French Onion Soup
523120,Homemade Chicken & Rice Soup,Chicken,"[boneless skinless chicken breast half, onion,...",PT50M,9,Creamy Chicken and Rice Soup
523582,Garlic Cream Sauce,Sauces,"[butter, minced fresh garlic cloves, flour, h...",PT15M,6,Garlic Sauce


In [53]:
# Inverted index: ingredient -> ids of the recipes that use it (an id appears
# once per occurrence of the ingredient in that recipe's list).
# NOTE(review): `ingredients` is not defined in any visible cell; it must
# contain every ingredient that occurs in RecipeIngredientParts, otherwise the
# append below raises KeyError -- confirm where it is built.
ingredients_to_recipes = {ing:[] for ing in ingredients}
# Iterate (id, ingredient list) pairs directly instead of a label lookup per row.
for rec_id, rec_ingredients in zip(recipesdf.index, recipesdf['RecipeIngredientParts'].values):
    for ing in rec_ingredients:
        ingredients_to_recipes[ing].append(rec_id)

In [101]:
# Distinct labels present in labelsdf (set order, so not stable between runs).
labels = list(set(labelsdf['Label'].values))

In [88]:
# Distinct space-separated words occurring in the ingredient names.
# NOTE(review): `ingredients` is not defined in any visible cell.
ingredientwords = flatten([l.split(' ') for l in ingredients])

In [103]:
# Label -> list of ingredient lists, one entry per labelled recipe.
labels_to_ingredients = {l:[] for l in labels}
# Walk the two columns together instead of two label lookups per row.
for label, ingredient_list in zip(recipes_w_labels['Label'].values,
                                  recipes_w_labels['RecipeIngredientParts'].values):
    labels_to_ingredients[label].append(ingredient_list)

In [113]:
def to_vec(subset,basis):
    """Return the occurrence counts of ``subset`` over ``basis`` as an array.

    Position ``k`` of the result is how many times ``basis[k]`` appears in
    ``subset``. An element of ``subset`` missing from ``basis`` raises
    ``KeyError``.
    """
    tally = dict.fromkeys(basis, 0)
    for member in subset:
        tally[member] = tally[member] + 1
    counts = [tally[b] for b in basis]
    return np.array(counts)

def avg_vec(subsets,basis):
    """Average the count vectors of several subsets over a common basis.

    Parameters
    ----------
    subsets : sequence of iterables
        Each is converted with ``to_vec(subset, basis)``.
    basis : sequence
        Elements defining the vector positions.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the count vectors, length ``len(basis)``. For an
        empty ``subsets`` a zero array is returned, so callers always get an
        ndarray (previously a plain Python list in that case).
    """
    if len(subsets)==0:
        return np.zeros(len(basis))
    vecs = [to_vec(s,basis) for s in subsets]
    return sum(vecs)/len(vecs)

# Mean ingredient-count vector per label, over the `ingredients` basis.
# NOTE(review): `ingredients` is not defined in any visible cell.
labels_to_ingrvecs = {l:avg_vec(labels_to_ingredients[l],ingredients) for l in labels}

array([0.        , 0.        , 0.        , ..., 0.71428571, 0.42857143,
       1.        ])

In [65]:
# Keys of `vs` whose value holds more than one element.
# NOTE(review): `vs` is not defined in any visible cell -- this cell relies on
# leftover kernel state and will fail on a fresh Restart & Run All.
vsrep = [v for v in vs if len(vs[v])>1]

In [69]:
# The multi-element values of `vs` selected above (the output shown further
# down suggests these are groups of recipe ids -- TODO confirm).
vrids = [vs[v] for v in vsrep]

In [81]:
# Number of entries in each recipe's ingredient list.
recipesdf['NoIngredients'] = list(map(len, recipesdf['RecipeIngredientParts'].values))

In [85]:
# Recipes with more than five ingredients.
recipesdf.loc[recipesdf.NoIngredients>5]

Unnamed: 0_level_0,Name,RecipeCategory,RecipeIngredientParts,TotalTime,NoIngredients
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
524289,Apple-Glazed Stuffed Pork Chops,Pork,"[bacon, leeks, other dry white wine, thyme, sa...",PT40M,8
524301,Bob's Taco Soup With Pasta,One Dish Meal,"[ground beef, onion, diced tomatoes, red kidne...",PT40M,8
524315,Crispy Chicken Thighs in Convection Oven,Poultry,"[skinless chicken thighs, light sour cream, mi...",PT1H5M,11
39,Biryani,Chicken Breast,"[saffron, milk, green chili peppers, onions, g...",PT4H25M,25
41,Carina's Tofu-Vegetable Kebabs,Soy/Tofu,"[extra firm tofu, baby eggplant, zucchini, mus...",PT24H20M,14
...,...,...,...,...,...
524062,Bisquick Crepes,Breakfast,"[eggs, Bisquick, milk, butter, cream cheese, s...",PT45M,7
524149,Hidden Veggie Mac and Cheese,< 60 Mins,"[carrots, cauliflower, low-fat milk, salt, che...",PT40M,8
524229,Coconut Curried Butternut Squash Soup,Lactose Free,"[olive oil, salt, ground white pepper, carrot,...",PT1H40M,10
524235,Paleo Party Pork Carnitas,Pork,"[canola oil, dried oregano, minced fresh garl...",PT8H30M,13


In [74]:
# Display the groups collected in vrids.
vrids

[[9870, 158139],
 [42974, 91633, 142287, 223553],
 [43222, 66810, 351699],
 [51528, 121602, 210303, 213261, 267945, 293717, 355939],
 [98438, 103960, 111777],
 [153225, 325358],
 [157045, 212365, 337754]]