In [1]:
import datacleaning
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import itertools
import matplotlib
import numpy as np
# https://realpython.com/nltk-nlp-python/


In [3]:
data = datacleaning.clean_recipedata("recipes_raw_nosource_ar.json")
print(np.shape(data))
data = data.sample(n=5000, random_state=2024)
data
data.to_csv("data_small.csv")



(39522, 7)


In [4]:
data["ingredient_words"] = data.clean_ingredients.apply(word_tokenize)
data["instruction_words"] = data.clean_instructions_masked.apply(word_tokenize)

In [5]:
ingredient_units = {'ml', 'milliliter','milliliters','liters','teaspoons', 'l','liter','teaspoon','t','tsp','tablespoon','tablespoons','tbl','tbs','tbsp','ounce','oz','fl','cup','cups','c','pint','pints','pt','p','quart','quarts','qt','gal','gals','gallon','gallons','g','mg','milligram','milligrams','gram','grams','pound','pounds','lb','lbs','c','f'}

In [6]:
stop_words = set(stopwords.words("english"))
stop_words = stop_words.union(ingredient_units)

def filter_stop_words(words):
    output = list()
    for word in words:
        if word.casefold() not in stop_words:
            output.append(word)
    return(output)

data["instruction_words"] = data["instruction_words"].apply(filter_stop_words)
data["ingredient_words"] = data["ingredient_words"].apply(filter_stop_words)


In [7]:
data['index'] = data.index

In [8]:
all_instruction_words = list(itertools.chain.from_iterable(data["instruction_words"]))
all_ingredient_words = list(itertools.chain.from_iterable(data["ingredient_words"]))

In [9]:
# Instructions
common_instruction_words = []

for item in FreqDist(all_instruction_words).items():
    if (item[1]) > (len(data)*.01): # remove words that appear less than once per hundred recipes
        common_instruction_words.append(item)


common_instruction_words = sorted(common_instruction_words, key=lambda x: x[1], reverse=True)

common_instruction_words_dict = {}
for i in range(len(common_instruction_words)):
    common_instruction_words_dict[common_instruction_words[i][0]] = i



# Ingredients
common_ingredient_words = []
for item in FreqDist(all_ingredient_words).items():
    if (item[1]) > (len(data)*.01): # remove words that appear less than once per hundred recipes
        common_ingredient_words.append(item)

common_ingredient_words = sorted(common_ingredient_words, key=lambda x: x[1], reverse=True)

common_ingredient_words_dict = {}
for i in range(len(common_ingredient_words)):
    common_ingredient_words_dict[common_ingredient_words[i][0]] = i

In [14]:
np.shape(common_ingredient_words)

(326, 2)

In [11]:
instruction_vectorizer = CountVectorizer(vocabulary=common_instruction_words_dict)

ingredient_vectorizer = CountVectorizer(vocabulary=common_ingredient_words_dict)

In [12]:
A = instruction_vectorizer.fit_transform(data["instructions"])
A_ingredients = ingredient_vectorizer.fit_transform(data["ingredients"])

In [12]:
# B = cosine_similarity(A, A[1,:])

In [13]:
# np.histogram(B, bins=4)

In [14]:
# A2 = A[0:100,:]
# print(np.shape(A2)[0])
# G2 = nx.Graph()
# for i, attr in data[0:100].iterrows():
#     G2.add_node(i, title = attr[0], ingredients = attr[1], instructions = attr[2])

# for i in range((np.shape(A2))[0]):
#     # if i%1000 == 0: 
#     #     print(i/(np.shape(A2))[0])
#     current_node = data.index[i]
#     current_node_similarity = cosine_similarity(A2, A2[i,:])
#     for j in range((np.shape(A2))[0]):
#         target_node = data.index[j]
#         similarity = current_node_similarity[j]
#         if (similarity > 0.5 and current_node != target_node): #arbitrary cutoff
#             G2.add_edge(current_node, target_node, weight = float(similarity))

In [15]:
# A2 = A_ingredients[0:100,:]
# G2 = nx.Graph()
# for i, attr in data[0:100].iterrows():
#     G2.add_node(i, title = attr[0])

# for i in range((np.shape(A2))[0]):
#     if i%1000 == 0: 
#         print(i/(np.shape(A2))[0])
#     current_node = data.index[i]
#     current_node_similarity = cosine_similarity(A2, A2[i,:])
#     edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
#     for j in edges_to_add:
#         target_node = data.index[j]
#         similarity = current_node_similarity[j]
#         if (current_node != target_node): 
#             G2.add_edge(current_node, target_node, weight = float(similarity))

In [16]:
# current_node_similarity = cosine_similarity(A, A[1,:])



In [17]:
# thing = np.vstack((np.ravel(current_node_similarity),np.array(range(39522))))

# np.shape(thing[:,] > .5)

In [13]:
G = nx.Graph()
for i, attr in data.iterrows():
    G.add_node(i, title = attr[0])

for i in range((np.shape(A))[0]):
    if i%1000 == 0: 
        print(i/(np.shape(A))[0])
    current_node = data.index[i]
    current_node_similarity = cosine_similarity(A, A[i,:])
    edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
    for j in edges_to_add:
        target_node = data.index[j]
        similarity = current_node_similarity[j]
        if (current_node != target_node): 
            G.add_edge(current_node, target_node, weight = float(similarity))

  G.add_node(i, title = attr[0])


0.0


  G.add_edge(current_node, target_node, weight = float(similarity))


0.2
0.4
0.6
0.8


In [37]:
nx.write_gexf(G, "recipe_instruction_small.gexf")

In [38]:
G_ingr = nx.Graph()
for i, attr in data[0:100].iterrows():
    G_ingr.add_node(i, title = attr[0])

for i in range((np.shape(A_ingredients))[0]):
    if i%1000 == 0: 
        print(i/(np.shape(A_ingredients))[0])
    current_node = data.index[i]
    current_node_similarity = cosine_similarity(A_ingredients, A_ingredients[i,:])
    edges_to_add = np.argwhere(current_node_similarity > .5)[:,0] #arbitrary cutoff
    for j in edges_to_add:
        target_node = data.index[j]
        similarity = current_node_similarity[j]
        if (current_node != target_node): 
            G_ingr.add_edge(current_node, target_node, weight = float(similarity))

  G_ingr.add_node(i, title = attr[0])
  G_ingr.add_edge(current_node, target_node, weight = float(similarity))


0.0
0.2
0.4
0.6
0.8


In [39]:
nx.write_gexf(G_ingr, "recipe_ingredient_small.gexf")