In [1]:
import pandas as pd
import numpy as np

In [12]:
def flatten(l):
    """Concatenate the iterables in ``l`` and drop duplicate items.

    Parameters
    ----------
    l : iterable of iterables
        The inner items must be hashable.

    Returns
    -------
    list
        Each distinct item exactly once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while preserving insertion order, so the
    # result is reproducible across kernel restarts (a plain set() of strings
    # is ordered by randomized hashes).
    return list(dict.fromkeys(item for sub in l for item in sub))


In [17]:
def tovec(subset,basis):
    """Count how often each element of ``basis`` occurs in ``subset``.

    Returns a list aligned with ``basis`` (one count per basis entry).
    Raises ``KeyError`` if ``subset`` contains an item that is not in ``basis``.
    """
    counts = dict.fromkeys(basis, 0)
    for item in subset:
        # Plain indexing (not .get) so unknown items still raise KeyError.
        counts[item] = counts[item] + 1
    return list(map(counts.get, basis))

In [20]:
# Load the recipe and review tables from local pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load these files
# if they come from a trusted source.
recipesdf = pd.read_pickle('data/recmini.pk')
reviewsdf = pd.read_pickle('data/reviewsred.pk')

In [21]:
recipesdf

Unnamed: 0_level_0,Name,RecipeCategory,RecipeIngredientParts
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
524289,Apple-Glazed Stuffed Pork Chops,Pork,"[bacon, leek, dry white wine, thyme, salt, pep..."
524301,Bob's Taco Soup With Pasta,One Dish Meal,"[ground beef, onion, diced tomatoes, red kidne..."
524315,Crispy Chicken Thighs in Convection Oven,Poultry,"[boneless skinless chicken thighs, light sour ..."
38,Low-Fat Berry Blue Frozen Dessert,Frozen Desserts,"[blueberries, granulated sugar, vanilla yogurt..."
39,Biryani,Chicken Breast,"[saffron, milk, hot green chili peppers, onion..."
...,...,...,...
524149,Hidden Veggie Mac and Cheese,< 60 Mins,"[carrots, cauliflower, low-fat milk, salt, che..."
524197,Refreshing Ice Cubes,Very Low Carbs,"[water, water]"
524229,Coconut Curried Butternut Squash Soup,Lactose Free,"[olive oil, salt, ground black pepper, carrot,..."
524235,Paleo Party Pork Carnitas,Pork,"[canola oil, dried oregano, garlic cloves, car..."


In [6]:
# Group every review's rating and recipe id under the author who wrote it.
author_ids = reviewsdf['AuthorId'].values
authors = list(set(author_ids))
data_by_author = {a:{'ratings':[],'recids':[]} for a in authors}
# Walk the three column arrays positionally. This avoids one label-based
# Series lookup per row per column (slow, and it returns a Series instead of
# a scalar when the index contains duplicate labels).
for ai, rating, recid in zip(author_ids,
                             reviewsdf['Rating'].values,
                             reviewsdf['RecipeId'].values):
    data_by_author[ai]['ratings'].append(rating)
    data_by_author[ai]['recids'].append(recid)

# Per-author summary statistics, computed once per author.
for a in authors:
    ratings = data_by_author[a]['ratings']
    data_by_author[a]['No_ratings'] = len(ratings)
    data_by_author[a]['Mean_rating'] = np.mean(ratings)
    data_by_author[a]['Std_ratings'] = np.std(ratings)

# Broadcast the author statistics back onto each review row, reusing the
# values computed above instead of recomputing them for every row.
reviewsdf['Auth_No_Ratings'] = [data_by_author[a]['No_ratings'] for a in author_ids]
reviewsdf['Auth_Std_Ratings'] = [data_by_author[a]['Std_ratings'] for a in author_ids]
reviewsdf['Auth_Avg_Rating'] = [data_by_author[a]['Mean_rating'] for a in author_ids]

In [13]:
def get_authors(min_rev,min_std):
    """Select authors by review count and rating spread.

    Returns the authors (in the order of the global ``authors`` list) that
    have at least ``min_rev`` ratings and a rating standard deviation
    strictly greater than ``min_std``.
    """
    selected = []
    for author in authors:
        stats = data_by_author[author]
        if stats['No_ratings'] >= min_rev:
            if stats['Std_ratings'] > min_std:
                selected.append(author)
    return selected

In [14]:
# Authors with at least 50 ratings whose ratings are not all identical (std > 0).
author_subset = get_authors(50,0)

In [15]:
def summarize(authorlist):
    """Summarize the review coverage of a list of authors.

    Parameters
    ----------
    authorlist : list
        Author ids; each must be a key of the global ``data_by_author``.

    Returns
    -------
    dict
        ``'Reviewers'``: number of authors given,
        ``'Recipes'``: number of distinct recipe ids they reviewed,
        ``'Ratings'``: total number of reviews,
        ``'Avg_no_by_rec'``: reviews per distinct recipe (``0.0`` when there
        are no recipes, instead of raising ``ZeroDivisionError``).
    """
    recids = []
    for a in authorlist:
        recids.extend(data_by_author[a]['recids'])
    norevs = len(recids)
    norecs = len(set(recids))
    # Guard the ratio: an empty selection has no recipes to divide by.
    avg_per_recipe = norevs/norecs if norecs else 0.0
    return {'Reviewers':len(authorlist),'Recipes':norecs,'Ratings':norevs,'Avg_no_by_rec':avg_per_recipe}

In [16]:
summarize(author_subset)

{'Reviewers': 3382,
 'Recipes': 221276,
 'Ratings': 764847,
 'Avg_no_by_rec': 3.4565294021945445}

In [35]:
summarize(get_authors(100,0))

{'Reviewers': 1724,
 'Recipes': 208761,
 'Ratings': 650005,
 'Avg_no_by_rec': 3.1136323355416002}

# Similar Recipes by Name

We will try to group together recipes with similar names.

To speed things up, we do the grouping within each recipe category first.

In [28]:
recipenames = list(recipesdf['Name'].values)
recipecategories = list(set(recipesdf['RecipeCategory'].values))

# For every category, collect the names and the ids (index labels) of its
# recipes; the two lists of a category stay aligned position by position.
category_to_recnames = {c:[] for c in recipecategories}
category_to_recids = {c:[] for c in recipecategories}
# One positional pass over the index and the two columns. This replaces two
# label-based Series lookups per row, which are slow and return a Series
# rather than a scalar if an index label is duplicated.
for recid, name, category in zip(recipesdf.index,
                                 recipenames,
                                 recipesdf['RecipeCategory'].values):
    category_to_recnames[category].append(name)
    category_to_recids[category].append(recid)

We will use a sentence transformer to embed the recipe names as vectors and then cluster the names using the embedding.

In [30]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, loaded by name; the helper functions in this cell
# call embedder.encode(...) to turn recipe names into vectors.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

from sklearn.cluster import KMeans, DBSCAN

def kmeans(sentences,k,random_state=None):
    """Cluster ``sentences`` into ``k`` groups with KMeans on their embeddings.

    Parameters
    ----------
    sentences : list of str
        Texts to embed with the global ``embedder`` and cluster.
    k : int
        Number of clusters requested from KMeans.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so a run can be reproduced. The default
        ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    dict
        Maps each cluster id ``0 .. max(label)`` to the list of sentences
        assigned to it, in input order.
    """
    clustering_model = KMeans(n_clusters=k, random_state=random_state)
    vecs = embedder.encode(sentences)
    clustering_model.fit(vecs)
    labels = clustering_model.labels_
    clusters = {i:[] for i in range(max(labels)+1)}
    for label, sentence in zip(labels, sentences):
        clusters[label].append(sentence)
    return clusters

def dbscan(sentences,ep,msam):
    """Cluster ``sentences`` with DBSCAN on their embeddings.

    ``ep`` and ``msam`` are passed to DBSCAN as ``eps`` and ``min_samples``;
    the metric is Euclidean. Returns a dict with one key per label from
    ``-1`` up to the largest label found, each mapping to the sentences that
    received that label (key ``-1`` is always present, possibly empty).
    """
    model = DBSCAN(eps=ep, min_samples=msam, metric='euclidean')
    model.fit(embedder.encode(sentences))
    labels = model.labels_
    grouped = {}
    for cluster_id in range(-1, max(labels) + 1):
        grouped[cluster_id] = []
    for label, sentence in zip(labels, sentences):
        grouped[label].append(sentence)
    return grouped

def dbscan_no(sentences,ep,msam):
    """DBSCAN clustering of ``sentences`` that leaves out label ``-1``.

    ``ep`` and ``msam`` are passed to DBSCAN as ``eps`` and ``min_samples``;
    the metric is Euclidean. Returns a dict keyed ``0 .. max(label)`` whose
    values are the sentences with that label; sentences labelled ``-1`` are
    omitted from the result.
    """
    model = DBSCAN(eps=ep, min_samples=msam, metric='euclidean')
    model.fit(embedder.encode(sentences))
    labels = model.labels_
    grouped = {cluster_id: [] for cluster_id in range(max(labels) + 1)}
    for label, sentence in zip(labels, sentences):
        if label < 0:
            continue  # negative label: not part of any cluster
        grouped[label].append(sentence)
    return grouped

def closest(words,target):
    """Return the entry of ``words`` whose embedding is nearest to ``target``.

    Distance is the Euclidean norm between embeddings produced by the global
    ``embedder``. On ties the earliest word wins. Raises ``IndexError`` when
    ``words`` is empty.
    """
    embedded = embedder.encode([target]+words)
    target_vec = embedded[0]
    word_vecs = embedded[1:]
    # min() keeps the first of equally small keys, like a strict `<` scan.
    # default=0 makes an empty list fall through to words[0] -> IndexError.
    best = min(
        range(len(words)),
        key=lambda idx: np.linalg.norm(target_vec-word_vecs[idx]),
        default=0,
    )
    return words[best]

def closest_to_center(words):
    """Return the word whose embedding lies nearest to the mean embedding.

    A single-word list is returned as-is without embedding. Otherwise all
    words are embedded with the global ``embedder``, their mean vector is
    computed, and the word with the smallest Euclidean distance to it is
    returned (the earliest word on ties).
    """
    if len(words)==1:
        return words[0]
    vecs = embedder.encode(words)
    true_center = (1/len(vecs))*sum(vecs)
    # min() keeps the first of equally small keys, like a strict `<` scan.
    best = min(
        range(len(words)),
        key=lambda idx: np.linalg.norm(vecs[idx]-true_center),
        default=0,
    )
    return words[best]

def dbscan_reorg(clusters):
    """Turn every noise item into its own single-element cluster.

    Parameters
    ----------
    clusters : dict
        Cluster id -> list of items, as returned by ``dbscan``; key ``-1``
        (if present) holds the items that were not assigned to any cluster.

    Returns
    -------
    dict
        All clusters with a non-negative id, unchanged, followed by one new
        ``[item]`` cluster per noise item, numbered from ``max id + 1``
        (from ``0`` when there were no real clusters).
    """
    # Keep *all* real clusters. Iterating range(max_key) would stop one short
    # and silently drop the highest-numbered cluster.
    newclusters = {label: clusters[label] for label in sorted(clusters) if label != -1}
    next_label = max(newclusters, default=-1) + 1
    for x in clusters.get(-1, []):
        newclusters[next_label] = [x]
        next_label += 1
    return newclusters
    
def relabel_clusters(clusters):
    """Re-key each cluster by its most central member.

    The new key of a cluster is ``closest_to_center`` of its members; the
    member list itself is reused unchanged. If two clusters yield the same
    key, the one iterated later replaces the earlier one.
    """
    relabeled = {}
    for members in clusters.values():
        relabeled[closest_to_center(members)] = members
    return relabeled

def dbscan_nice(sentences,ep,msam):
    """Run ``dbscan``, then ``dbscan_reorg``, then ``relabel_clusters``.

    ``sentences``, ``ep`` and ``msam`` are forwarded to ``dbscan`` unchanged;
    the result of ``relabel_clusters`` is returned.
    """
    return relabel_clusters(dbscan_reorg(dbscan(sentences,ep,msam)))

In [33]:
# Map each cluster label (a representative recipe name) to the recipe names
# grouped under it, accumulated over all categories.
recipe_labels = {}

for category, names in category_to_recnames.items():
    if len(names)>2:
        # eps=0.4, min_samples=2 on the name embeddings of this category.
        clus = dbscan_nice(names,0.4,2)
    else:
        # Too few names to cluster: each name labels itself, so recipes from
        # small categories still end up with a label instead of being dropped.
        clus = {}
        for name in names:
            clus.setdefault(name, []).append(name)
    # Distinct loop variable: the category variable is no longer overwritten.
    for label, members in clus.items():
        # extend() into a list owned by recipe_labels, so merging the same
        # label from another category never mutates a cluster's own list.
        recipe_labels.setdefault(label, []).extend(members)

In [37]:
# Invert recipe_labels: map each recipe name to the label of the cluster that lists it.
# If a name is listed under several labels, the label iterated last wins.
recipe_labels_inv = {n:l for l in recipe_labels for n in recipe_labels[l]}

KeyError: 'Deep Fried Chicken Livers'

In [40]:
recipesdf

Unnamed: 0_level_0,Name,RecipeCategory,RecipeIngredientParts
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
524289,Apple-Glazed Stuffed Pork Chops,Pork,"[bacon, leek, dry white wine, thyme, salt, pep..."
524301,Bob's Taco Soup With Pasta,One Dish Meal,"[ground beef, onion, diced tomatoes, red kidne..."
524315,Crispy Chicken Thighs in Convection Oven,Poultry,"[boneless skinless chicken thighs, light sour ..."
38,Low-Fat Berry Blue Frozen Dessert,Frozen Desserts,"[blueberries, granulated sugar, vanilla yogurt..."
39,Biryani,Chicken Breast,"[saffron, milk, hot green chili peppers, onion..."
...,...,...,...
524149,Hidden Veggie Mac and Cheese,< 60 Mins,"[carrots, cauliflower, low-fat milk, salt, che..."
524197,Refreshing Ice Cubes,Very Low Carbs,"[water, water]"
524229,Coconut Curried Butternut Squash Soup,Lactose Free,"[olive oil, salt, ground black pepper, carrot,..."
524235,Paleo Party Pork Carnitas,Pork,"[canola oil, dried oregano, garlic cloves, car..."


In [41]:
# Distinct ingredient names across all recipes: each recipe's ingredient array
# is converted to a list, and flatten() concatenates and de-duplicates them.
ingredients = flatten([list(l) for l in recipesdf['RecipeIngredientParts'].values])

In [44]:
recipesdf['RecipeIngredientParts'].values

array([array(['bacon', 'leek', 'dry white wine', 'thyme', 'salt', 'pepper',
              'dijon-style mustard', 'boneless pork loin chops'], dtype=object),
       array(['ground beef', 'onion', 'diced tomatoes', 'red kidney beans',
              'corn', 'green chilies', 'bow tie pasta', 'sour cream'],
             dtype=object)                                                 ,
       array(['boneless skinless chicken thighs', 'light sour cream',
              'garlic cloves', 'harissa', 'salt', 'pepper', 'onion powder',
              'paprika', 'panko breadcrumbs', 'parmesan cheese', 'butter'],
             dtype=object)                                                 ,
       ...,
       array(['olive oil', 'salt', 'ground black pepper', 'carrot', 'celery',
              'curry powder', 'garlic cloves', 'onion', 'coconut milk',
              'flat leaf parsley'], dtype=object)                            ,
       array(['canola oil', 'dried oregano', 'garlic cloves', 'carrots',
      