In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount("/content/drive")

In [None]:
# Base folder for every file this notebook reads or writes.
# The trailing slash is required: paths are built by plain string concatenation (mydir + filename).
mydir = "/content/drive/MyDrive/Dataset/"

## Extraction of relations between objects and verbs

In [None]:
# Load the raw recipes table; only its "steps" column is used by the extraction loop below.
df = pd.read_csv(mydir + "RAW_recipes.csv")

In [None]:
import spacy
# Shared spaCy pipeline used by extract_relations, which reads sentence boundaries,
# part-of-speech tags, dependency labels and lemmas from the parsed document.
nlp = spacy.load("en_core_web_sm")

def extract_relations(recipe_text):
    """Extract every (direct object, verb) pair found in a recipe text.

    The text is parsed with the module-level ``nlp`` pipeline. For each token
    tagged ``VERB``, every child whose dependency label is ``dobj`` yields one
    pair.

    Args:
        recipe_text: Text to parse.

    Returns:
        A list of ``(object_lemma, verb_lemma)`` tuples in document order;
        empty when no verb has a direct object.
    """
    parsed = nlp(recipe_text)
    pairs = []
    for sentence in parsed.sents:
        for tok in sentence:
            # Only verbs can head a relation.
            if tok.pos_ != "VERB":
                continue
            # A verb without any "dobj" child simply contributes nothing.
            for child in tok.children:
                if child.dep_ == "dobj":
                    pairs.append((child.lemma_, tok.lemma_))
    return pairs

In [None]:
# Imported in this cell because it runs before the later cell that imports defaultdict;
# without it a fresh "Restart & Run All" stops here with a NameError.
from collections import defaultdict

# Maps each direct-object lemma to the list of verb lemmas applied to it,
# one list entry per occurrence (the next cell turns these lists into counts).
relation_dict = defaultdict(list)

#Here I extracted all the VERB relations in 200k rows from the recipes dataset
for steps in df["steps"][:200000]:
    # Drop the first two and last two characters before parsing
    # (presumably the "['" and "']" wrapper of a stringified list - TODO confirm against the CSV).
    recipe = steps[2:-2]
    for ingredient, method in extract_relations(recipe):
        relation_dict[ingredient].append(method)

In [None]:
#Here I counted the number of times a VERB is referred to an object
from collections import Counter

# Counter tallies each verb list in a single pass, instead of calling list.count()
# once per distinct verb. Counter(mapping) keeps the mapping's existing counts, so
# re-running this cell on already-counted entries leaves them unchanged rather than
# raising AttributeError (a dict has no .count method).
for ingredient in relation_dict:
    relation_dict[ingredient] = dict(Counter(relation_dict[ingredient]))

In [None]:
#I used the library 'pickle' to save the dictionary, so the slow extraction need not be repeated
import pickle
# NOTE(review): relation_dict was created as defaultdict(list), so the unpickled object
# will presumably still be a defaultdict (looking up a missing key would insert an empty
# list) - confirm whether a plain dict is wanted before dumping.
with open(mydir + "relation_dict_completo.pkl", "wb") as f:
    pickle.dump(relation_dict, f)

In [None]:
import pickle
# Reload the saved relation dictionary. The context manager closes the file handle,
# which the previous bare open() left open for the rest of the session.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open(mydir + "relation_dict_completo.pkl", 'rb') as x:
    relation_dict = pickle.load(x)

## To get a better analysis, I searched for the most-used verbs in the dataset across all objects

In [None]:
from collections import defaultdict

# all_relation[0] gathers every verb once per recorded occurrence, across all objects.
# It is kept as a flat list because the following cell tallies that list.
all_relation = defaultdict(list)

for ingredient in relation_dict:
    verbs = relation_dict[ingredient]
    for verb, times in verbs.items():
        # One extend per verb replaces `times` separate appends and the repeated
        # relation_dict[ingredient][verb] lookup; the resulting list is identical.
        all_relation[0].extend([verb] * times)

In [None]:
from collections import Counter

# Collapse the flat verb list into {verb: total occurrences} in one pass.
# The previous list.count()-per-distinct-verb approach rescanned the whole list for every verb.
# Counter(mapping) preserves existing counts, so re-running this cell is harmless
# instead of failing because a dict has no .count method.
all_relation[0] = dict(Counter(all_relation[0]))

In [None]:
#Show the 15 most used verbs and how many times each is used
# (sorted by count, highest first; the bare expression is what the cell displays)
sorted(all_relation[0].items(), key=lambda x: x[1], reverse=True)[:15]

[('add', 345964),
 ('combine', 101662),
 ('mix', 66252),
 ('stir', 59915),
 ('cook', 54499),
 ('remove', 50831),
 ('use', 49900),
 ('preheat', 48331),
 ('cut', 44102),
 ('bake', 42462),
 ('bring', 40230),
 ('put', 37588),
 ('place', 35116),
 ('make', 28587),
 ('sprinkle', 27231)]