In [1]:
import datacleaning
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import itertools
import matplotlib
import numpy as np
# https://realpython.com/nltk-nlp-python/


In [2]:
data = datacleaning.clean_recipedata("recipes_raw_nosource_ar.json")
print(np.shape(data))
data = data.sample(n=5000, random_state=2024)
data
# data.to_csv("data_small.csv")



(39522, 7)


Unnamed: 0,title,ingredients,instructions,picture_link,clean_ingredients,clean_instructions,clean_instructions_masked
5eA5nRW8VgbOry0hsA.SVnrGkt2AdzO,Southern-Style Chocolate Pound Cake,1 cup butter ADVERTISEMENT 1/2 cup shortening ...,Preheat oven to 350 degrees F (175 degrees C)....,v.IiJhm4GZSZAGtMDWpTfAe6vspLCiu,cup butter cup shortening cups white suga...,preheat oven to degrees f degrees c grease a...,preheat oven to degrees f degrees c grease and...
DIzfv.NycIqtwo58y7fteog1dKRw12O,Pumpkin Pie I,1 egg ADVERTISEMENT 1 tablespoon all-purpose f...,Preheat oven to 450 degrees F (230 degrees C)....,EbVR3lftwDSDeE1MRSGdi1evygKF/D6,egg tablespoon allpurpose flour cup white...,preheat oven to degrees f degrees c add the ...,preheat oven to degrees f degrees c add the gr...
TFxp0RPK/1PxAnjYiZUS0KztkgYb3iW,Cinnamon Oatmeal Zucchini Cookies,"1 1/2 cups butter, softened ADVERTISEMENT 1 1/...",Preheat oven to 350 degrees F (175 degrees C)....,jG7CnWbsc8cYsoHbreIorq8Uvyzzk/6,cups butter softened cups white sugar e...,preheat oven to degrees f degrees c grease b...,preheat oven to degrees f degrees c grease she...
3yH5Jc6HLmEnJw5ggLqI11oeBiSpwBy,Plum-Oat Drop Biscuits,2 tablespoons white sugar ADVERTISEMENT 1/2 te...,Preheat oven to 450 degrees F (230 degrees C)....,qE58a7Z1Au0GXvPO188iHZZVqna9hLa,tablespoons white sugar teaspoon ground cin...,preheat oven to degrees f degrees c grease a...,preheat oven to degrees f degrees c grease a s...
8lZak.EVdLP9/dukyN72DYHyjseFwV2,Ghirardelli Milk Chocolate Chip Cookies,2 1/4 cups all-purpose flour ADVERTISEMENT 1 t...,Preheat the oven to 375 degrees F. Stir togeth...,50UyVqBQayirbQ47M.8oJvkETheUGFW,cups allpurpose flour teaspoon baking soda...,preheat the oven to degrees f stir together t...,preheat the oven to degrees f stir together th...
...,...,...,...,...,...,...,...
z5ZKH66nGOR2cgkzDl5vGj4DPX/J95O,Sun-Dried Tomato With Fresh Basil Spread,1/2 cup oil-packed sun-dried tomatoes ADVERTIS...,Place sun-dried tomatoes in the work bowl of a...,IV7I3CRDxEMTrdHzOCE3YHxewYs0JQW,cup oilpacked sundried tomatoes cup basil l...,place sundried tomatoes in the work bowl of a ...,place in the work bowl of a food processor pul...
g1mQzPtIoBSaOaaHSkMj7cObWt0r5.K,Banana Split Martini,ice as needed ADVERTISEMENT 2 fluid ounces whi...,Fill a cocktail shaker with ice; add white cho...,qE58a7Z1Au0GXvPO188iHZZVqna9hLa,ice as needed fluid ounces white chocolate l...,fill a cocktail shaker with ice add white choc...,fill a cocktail shaker with add creme de and c...
dsDOddsQtDp7xomWY2mQwcoTyLeUfYa,Citrus Glazed Banana Squash,"1 1/2 pounds banana squash, peeled and cubed A...",Melt butter in a large skillet over medium hea...,SkAAXlzdZZcQ6UmAO72KI82FiqpAG9y,pounds banana squash peeled and cubed cup ...,melt butter in a large skillet over medium hea...,melt in a large skillet over medium heat add c...
CdxL1ghjiOgZTF8mPLi/JdaUW74ARmq,Classic Spanish Sangria,1 lemon ADVERTISEMENT 1 lime ADVERTISEMENT 1 o...,"Have the fruit, rum, wine, and orange juice we...",xCSyOeooKYofbXHuZpV5h7prJrWUDhK,lemon lime orange cups rum cup white...,have the fruit rum wine and orange juice well ...,have the fruit and well chilled slice the and ...


In [3]:
data["ingredient_words"] = data.clean_ingredients.apply(word_tokenize)
data["instruction_words"] = data.clean_instructions_masked.apply(word_tokenize)

In [4]:
ingredient_units = {'inch', 'ml', 'milliliter','milliliters','liters','teaspoons', 'l','liter','teaspoon','t','tsp','tablespoon','tablespoons','tbl','tbs','tbsp','ounce','oz','fl','cup','cups','c','pint','pints','pt','p','quart','quarts','qt','gal','gals','gallon','gallons','g','mg','milligram','milligrams','gram','grams','pound','pounds','lb','lbs','c','f'}

In [5]:
stop_words = set(stopwords.words("english"))
stop_words = stop_words.union(ingredient_units)

def filter_stop_words(words):
    output = list()
    for word in words:
        if word.casefold() not in stop_words:
            output.append(word)
    return(output)

data["instruction_words"] = data["instruction_words"].apply(filter_stop_words)
data["ingredient_words"] = data["ingredient_words"].apply(filter_stop_words)
data['index'] = data.index
data.to_csv("data_small.csv")


In [6]:
data["ingredient_words"]

5eA5nRW8VgbOry0hsA.SVnrGkt2AdzO    [butter, shortening, white, sugar, eggs, vanil...
DIzfv.NycIqtwo58y7fteog1dKRw12O    [egg, allpurpose, flour, white, sugar, salt, p...
TFxp0RPK/1PxAnjYiZUS0KztkgYb3iW    [butter, softened, white, sugar, eggs, vanilla...
3yH5Jc6HLmEnJw5ggLqI11oeBiSpwBy    [white, sugar, ground, cinnamon, allpurpose, f...
8lZak.EVdLP9/dukyN72DYHyjseFwV2    [allpurpose, flour, baking, soda, salt, unsalt...
                                                         ...                        
z5ZKH66nGOR2cgkzDl5vGj4DPX/J95O    [oilpacked, sundried, tomatoes, basil, leaves,...
g1mQzPtIoBSaOaaHSkMj7cObWt0r5.K    [ice, needed, fluid, ounces, white, chocolate,...
dsDOddsQtDp7xomWY2mQwcoTyLeUfYa    [banana, squash, peeled, cubed, butter, water,...
CdxL1ghjiOgZTF8mPLi/JdaUW74ARmq    [lemon, lime, orange, rum, white, sugar, bottl...
zDZUSIRODtz86LgwPyFH3AoMoPO8p.2    [cooked, seasoned, frozen, beef, strips, thawe...
Name: ingredient_words, Length: 5000, dtype: object

In [7]:
all_instruction_words = list(itertools.chain.from_iterable(data["instruction_words"]))
all_ingredient_words = list(itertools.chain.from_iterable(data["ingredient_words"]))

In [8]:
all_ingredient_words

['butter',
 'shortening',
 'white',
 'sugar',
 'eggs',
 'vanilla',
 'extract',
 'milk',
 'allpurpose',
 'flour',
 'dutch',
 'process',
 'cocoa',
 'powder',
 'baking',
 'powder',
 'egg',
 'allpurpose',
 'flour',
 'white',
 'sugar',
 'salt',
 'pumpkin',
 'puree',
 'evaporated',
 'milk',
 'ground',
 'cinnamon',
 'ground',
 'ginger',
 'ground',
 'nutmeg',
 'light',
 'corn',
 'syrup',
 'unbaked',
 'pie',
 'crust',
 'butter',
 'softened',
 'white',
 'sugar',
 'eggs',
 'vanilla',
 'extract',
 'allpurpose',
 'flour',
 'ground',
 'cinnamon',
 'baking',
 'soda',
 'salt',
 'finely',
 'shredded',
 'zucchini',
 'quickcooking',
 'oats',
 'chopped',
 'pecans',
 'package',
 'cinnamon',
 'chips',
 'white',
 'sugar',
 'ground',
 'cinnamon',
 'allpurpose',
 'flour',
 'rolled',
 'oats',
 'baking',
 'powder',
 'salt',
 'butter',
 'chopped',
 'pitted',
 'prunes',
 'dried',
 'plums',
 'fatfree',
 'milk',
 'allpurpose',
 'flour',
 'baking',
 'soda',
 'salt',
 'unsalted',
 'butter',
 'room',
 'temperature',
 '

In [9]:
# Instructions
common_instruction_words = []

for item in FreqDist(all_instruction_words).items():
    if (item[1]) > (len(data)*.01): # remove words that appear less than once per hundred recipes
        common_instruction_words.append(item)


common_instruction_words = sorted(common_instruction_words, key=lambda x: x[1], reverse=True)

common_instruction_words_dict = {}
for i in range(len(common_instruction_words)):
    common_instruction_words_dict[common_instruction_words[i][0]] = i



# Ingredients
common_ingredient_words = []
for item in FreqDist(all_ingredient_words).items():
    if (item[1]) > (len(data)*.01): # remove words that appear less than once per hundred recipes
        common_ingredient_words.append(item)

common_ingredient_words = sorted(common_ingredient_words, key=lambda x: x[1], reverse=True)

common_ingredient_words_dict = {}
for i in range(len(common_ingredient_words)):
    common_ingredient_words_dict[common_ingredient_words[i][0]] = i

In [10]:
common_ingredient_words_dict

{'chopped': 0,
 'salt': 1,
 'ground': 2,
 'pepper': 3,
 'sugar': 4,
 'white': 5,
 'taste': 6,
 'butter': 7,
 'oil': 8,
 'fresh': 9,
 'cheese': 10,
 'garlic': 11,
 'flour': 12,
 'onion': 13,
 'black': 14,
 'powder': 15,
 'water': 16,
 'sliced': 17,
 'allpurpose': 18,
 'cream': 19,
 'milk': 20,
 'package': 21,
 'minced': 22,
 'sauce': 23,
 'diced': 24,
 'eggs': 25,
 'chicken': 26,
 'olive': 27,
 'extract': 28,
 'vanilla': 29,
 'baking': 30,
 'dried': 31,
 'cut': 32,
 'red': 33,
 'juice': 34,
 'green': 35,
 'shredded': 36,
 'cloves': 37,
 'peeled': 38,
 'large': 39,
 'egg': 40,
 'vegetable': 41,
 'brown': 42,
 'lemon': 43,
 'ounces': 44,
 'drained': 45,
 'optional': 46,
 'cinnamon': 47,
 'pinch': 48,
 'tomatoes': 49,
 'grated': 50,
 'softened': 51,
 'vinegar': 52,
 'bell': 53,
 'chocolate': 54,
 'beef': 55,
 'dry': 56,
 'finely': 57,
 'frozen': 58,
 'mix': 59,
 'crushed': 60,
 'slices': 61,
 'soda': 62,
 'divided': 63,
 'tomato': 64,
 'small': 65,
 'onions': 66,
 'whole': 67,
 'boneless':

In [11]:
common_instruction_words_dict

{'minutes': 0,
 'heat': 1,
 'stir': 2,
 'degrees': 3,
 'bowl': 4,
 'oven': 5,
 'mixture': 6,
 'cook': 7,
 'add': 8,
 'place': 9,
 'large': 10,
 'together': 11,
 'pour': 12,
 'mix': 13,
 'bake': 14,
 'medium': 15,
 'preheat': 16,
 'skillet': 17,
 'cover': 18,
 'pan': 19,
 'remove': 20,
 'top': 21,
 'cool': 22,
 'boil': 23,
 'preheated': 24,
 'combine': 25,
 'baking': 26,
 'bring': 27,
 'set': 28,
 'dish': 29,
 'simmer': 30,
 'beat': 31,
 'well': 32,
 'lightly': 33,
 'saucepan': 34,
 'pot': 35,
 'remaining': 36,
 'serve': 37,
 'smooth': 38,
 'water': 39,
 'dough': 40,
 'drain': 41,
 'sprinkle': 42,
 'tender': 43,
 'stirring': 44,
 'small': 45,
 'sheet': 46,
 'reduce': 47,
 'brown': 48,
 'let': 49,
 'hours': 50,
 'browned': 51,
 'season': 52,
 'spread': 53,
 'center': 54,
 'aside': 55,
 'onto': 56,
 'prepared': 57,
 'low': 58,
 'serving': 59,
 'grease': 60,
 'hot': 61,
 'evenly': 62,
 'whisk': 63,
 'golden': 64,
 'batter': 65,
 'side': 66,
 'cut': 67,
 'mediumhigh': 68,
 'refrigerate': 69

In [12]:
instruction_vectorizer = CountVectorizer(vocabulary=common_instruction_words_dict)

ingredient_vectorizer = CountVectorizer(vocabulary=common_ingredient_words_dict)

In [13]:
A = instruction_vectorizer.fit_transform(data["instructions"])
A_ingredients = ingredient_vectorizer.fit_transform(data["ingredients"])

In [14]:
# B = cosine_similarity(A, A[1,:])

In [15]:
# np.histogram(B, bins=4)

In [16]:
# A2 = A[0:100,:]
# print(np.shape(A2)[0])
# G2 = nx.Graph()
# for i, attr in data[0:100].iterrows():
#     G2.add_node(i, title = attr[0], ingredients = attr[1], instructions = attr[2])

# for i in range((np.shape(A2))[0]):
#     # if i%1000 == 0: 
#     #     print(i/(np.shape(A2))[0])
#     current_node = data.index[i]
#     current_node_similarity = cosine_similarity(A2, A2[i,:])
#     for j in range((np.shape(A2))[0]):
#         target_node = data.index[j]
#         similarity = current_node_similarity[j]
#         if (similarity > 0.5 and current_node != target_node): #arbitrary cutoff
#             G2.add_edge(current_node, target_node, weight = float(similarity))

In [17]:
# A2 = A_ingredients[0:100,:]
# G2 = nx.Graph()
# for i, attr in data[0:100].iterrows():
#     G2.add_node(i, title = attr[0])

# for i in range((np.shape(A2))[0]):
#     if i%1000 == 0: 
#         print(i/(np.shape(A2))[0])
#     current_node = data.index[i]
#     current_node_similarity = cosine_similarity(A2, A2[i,:])
#     edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
#     for j in edges_to_add:
#         target_node = data.index[j]
#         similarity = current_node_similarity[j]
#         if (current_node != target_node): 
#             G2.add_edge(current_node, target_node, weight = float(similarity))

In [18]:
# current_node_similarity = cosine_similarity(A, A[1,:])



In [19]:
# thing = np.vstack((np.ravel(current_node_similarity),np.array(range(39522))))

# np.shape(thing[:,] > .5)

In [20]:
G = nx.Graph()
for i, attr in data.iterrows():
    G.add_node(i, title = attr[0])

for i in range((np.shape(A))[0]):
    if i%1000 == 0: 
        print(i/(np.shape(A))[0])
    current_node = data.index[i]
    current_node_similarity = cosine_similarity(A, A[i,:])
    edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
    for j in edges_to_add:
        target_node = data.index[j]
        similarity = current_node_similarity[j]
        if (current_node != target_node): 
            G.add_edge(current_node, target_node, weight = float(similarity))

  G.add_node(i, title = attr[0])


0.0


  G.add_edge(current_node, target_node, weight = float(similarity))


0.2
0.4
0.6
0.8


In [21]:
nx.write_gexf(G, "recipe_instruction_small.gexf")

In [22]:
G_ingr = nx.Graph()
for i, attr in data[0:100].iterrows():
    G_ingr.add_node(i, title = attr[0])

for i in range((np.shape(A_ingredients))[0]):
    if i%1000 == 0: 
        print(i/(np.shape(A_ingredients))[0])
    current_node = data.index[i]
    current_node_similarity = cosine_similarity(A_ingredients, A_ingredients[i,:])
    edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
    for j in edges_to_add:
        target_node = data.index[j]
        similarity = current_node_similarity[j]
        if (current_node != target_node): 
            G_ingr.add_edge(current_node, target_node, weight = float(similarity))

  G_ingr.add_node(i, title = attr[0])
  G_ingr.add_edge(current_node, target_node, weight = float(similarity))


0.0
0.2
0.4
0.6
0.8


In [23]:
nx.write_gexf(G_ingr, "recipe_ingredient_small.gexf")