In [None]:
!pip install transformers
!pip install torch
!pip install datasets

In [None]:
import pandas as pd
from google.colab import drive
drive.mount("/content/drive")

In [1]:
mydir = "/content/drive/MyDrive/Dataset/"

### Import the data structure containing the information about verbs relation using the library pickle

In [4]:
import pickle
z = open(mydir + "relation_dict_completo.pkl",'rb')
relation_dict = pickle.load(z)

## Remember that the most used verb with the ingredients is "Add"


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name= "moro01525/T5_FineTuning"
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [5]:
final = pd.read_csv(mydir + "intermedio.csv").drop(columns=["Unnamed: 0"])
final['ingredients'] = 'Ingredients: ' + final['ingredients']

In [None]:
from datasets import Dataset
final[:100000] = final[:100000].sample(frac=1).reset_index(drop=True)
dataset = Dataset.from_pandas(final[:1000])
evaluation = Dataset.from_pandas(final[101000:106000].sample(frac=1).reset_index(drop=True)[:20])
test = final[100000:101000]

In [6]:
model_dir = mydir + "T5_FineTuning"

In [None]:
def generate_recipe(model, ingredients):
    input_text = f"Ingredients: {ingredients}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    outputs = model.generate(input_ids, max_length=150, num_beams=5, repetition_penalty=2.5, no_repeat_ngram_size=2, early_stopping=True)
    recipe = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recipe

### To avoid wasting time generating the sentence every time I save 300 rows of generated text, with relative input and expected text

In [None]:
import pandas as pd
import random

#This will be used for future evaluation
x = pd.DataFrame(columns=["input", "text", "label"])
seen = []
for i in range(0, 300):
  flag = True
  while(flag):
    index = random.randint(0, 999) + 100000
    if(index not in seen):
      seen.append(index)
      flag = False
  input = test.loc[index]["ingredients"]
  generated_text = generate_recipe(input)
  reference_text = test.loc[index]["steps"]

  x.loc[index, "input"] = input
  x.loc[index, "text"] = generated_text
  x.loc[index, "label"] = reference_text

In [None]:
import pickle
with open(mydir + "generations.pkl", "wb") as f:
    pickle.dump(x, f)

## Analysis

In [None]:
import pickle
x = open(mydir + "generations.pkl",'rb')
x = pickle.load(x)

In [None]:
import pickle
z = open(mydir + "relation_dict_completo.pkl",'rb')
relation_dict = pickle.load(z)

## To evaluate the model I created a metric:
For every generated sentences:

*   If the verb is 'add' (the most used) the model gain 0.75 point
*   If the verb is contained in the top 10 most used verbs of the model gain 1 full point
*   0 in the other cases

Finally the score is averaged





In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def find_conjuncts(token):
    conjuncts = [token]
    for child in token.children:
        if child.dep_ == "conj":
            conjuncts.extend(find_conjuncts(child))
    return conjuncts

def get_relations(text):
  relations = {}

  for p in text.split("; "):
    doc = nlp(p)
    for token in doc:
      if token.pos_ == "VERB":
        objects = []
        for child in token.children:
            if child.dep_ in ("dobj", "obj", "obl"):
                objects.extend(find_conjuncts(child))
        relations[token.lemma_] = [obj.lemma_ for obj in objects]
  return relations

def get_score(relationships, relation_dict):
  score = 0
  n = 0
  for verb in relationships.keys():
    ingredients = relationships[verb]
    if(len(ingredients)>0):
      for ingredient in ingredients:
        n += 1
        if(verb == "add"):            #The most used verb gives a light penalty
          score += 0.75
          continue
        #If the verb is contained in the top 10 most used verbs of the object then the model gain a full point
        verbs = sorted(relation_dict[ingredient].items(), key=lambda x: x[1], reverse=True)[:10]
        contained = False
        for i in verbs:
          if(verb in i):
            contained = True
        if(contained):
          score += 1

  if(n != 0):
    return score/n
  else:
    return 1

In [None]:
import random

relationships = []
scores = []

# Iterate through the sentences generated

for i in range(0, 300):
  input = x.loc[i]["input"]
  output = x.loc[i]["label"]
  predict = x.loc[i]["text"]

  relationships = get_relations(predict)
  score = get_score(relationships, relation_dict)
  scores.append(score)

In [None]:
print("Len:", len(scores))
res_sum = sum(scores)
print("final score: ", res_sum/len(scores))


Len: 300
final score:  0.8587559523809525


### The final score:

*   0.8587559523809525



### If 'add' gives 1 full point:
*   0.9622010582010586