# Similarity Evaluation
### Measure the Word Mover's Distance between a sample target recipe and the model's output to evaluate their similarity.

In [2]:
import gensim.downloader as api
import nltk
import requests

from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.corpus import stopwords



We will use the pre-trained word vectors published by Google (https://code.google.com/archive/p/word2vec/), which are hosted in gensim. They were trained on part of the Google News dataset (about 100 billion words) and contain 300-dimensional vectors for 3 million words and phrases.

In [3]:
# Fetch the pre-trained Google News word vectors through gensim's downloader API
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
model = api.load(WORD2VEC_MODEL_NAME)

Create a couple of helper functions

In [4]:
def webscrapping(url, timeout=10):
    '''
    This function performs web scraping on a foodnetwork.com recipe.

    Parameters
    ----------
    url : str
        Address of the recipe page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout, requests.get can block indefinitely.

    Returns
    -------
    list of str
        The stripped text of every <li class="o-Method__m-Step"> element
        found on the page. Empty when the page contains no such element.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    '''
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page and
    # silently returning an empty recipe.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    steps = soup.find_all('li', attrs={'class': 'o-Method__m-Step'})
    return [step.text.strip() for step in steps]

In [5]:
def preprocess(list):
    '''
    Tokenise and filter a collection of strings into one flat list of tokens.

    Each string is split with nltk.word_tokenize. Tokens that are not purely
    alphabetic (numbers, punctuation) are discarded; the remaining tokens are
    lower-cased and lemmatised, then discarded if they are English stopwords
    or shorter than three characters.

    Requires the NLTK tokenizer, stopwords and WordNet data to be installed
    (e.g. via nltk.download).

    Parameters
    ----------
    list : iterable of str
        Strings to process (string length can vary). The parameter name
        shadows the built-in ``list``; it is kept so existing callers keep
        working.

    Returns
    -------
    list of str
        The surviving tokens of every input string, in order of appearance.
    '''
    # A set gives constant-time membership tests; the raw stopword list would
    # be scanned linearly for every single token.
    english_stopwords = set(stopwords.words("english"))
    lemmatizer = nltk.WordNetLemmatizer()
    tokens_list = []
    for item in list:
        for token in nltk.word_tokenize(item):
            if not token.isalpha():   # Remove numbers and punctuation.
                continue
            lemma = lemmatizer.lemmatize(token.lower())
            # The stopword and length filters apply to the lemma, not the raw token.
            if lemma in english_stopwords or len(lemma) < 3:
                continue
            tokens_list.append(lemma)

    return tokens_list

Evaluate the actual top 5 most similar recipes returned by the model.
![2021-09-24 21_49_55-Window](https://user-images.githubusercontent.com/19281828/134685571-b73ad552-98f0-463a-a572-8b777918eeb3.png)

In [6]:
# Scrape the method steps of the target recipe page
TARGET_URL = "https://www.foodnetwork.com/recipes/oven-baked-salmon-recipe-1911951"
target = webscrapping(TARGET_URL)

# Turn the scraped steps into a flat list of filtered tokens
preprocessed_target = preprocess(target)

In [7]:
# Recipe steps of the model's top 5 results, copied by hand from the dataset
# for simplicity's sake (one string per step).
result1 = [
    'preheat oven to 400f',
    'place the fish on a baking sheet',
    'rub both sides of fish with olive oil and sprinkle with salt and pepper',
    'roast for about 25 minutes , until the fish is firm and fully cooked',
    'allow to cool at room temperature for about 30 minutes',
    'remove and discard the skin and bones',
    'when the fish is cool , flake the flesh into a bowl in large pieces',
    'add the celery , red onion , fennel , dill , lemon juice , vinegar , capers , mayonnaise , 1 teaspoons salt , and 1 / 2 teaspoons pepper',
    'mix gently and refrigerate for at about 30 minutes',
    'taste for seasoning and serve at room temperature',
    'enjoy !',
]
result2 = [
    'preheat oven to 350f',
    'spread bread cubes on baking sheet',
    'bake 15-20 minutes , or until golden brown',
    'in small saucepan , heat oil over med-low heat',
    'add garlic and cook 2 minutes , or until fragrant',
    'place all remaining ingredients in a large serving bowl',
    'add the cooked olive oil and garlic',
    'toss to coat',
    'season to taste with salt and pepper',
    'allow to stand at least 20 minutes , to allow the flavors to develop',
    'adjust seasonings , if necessary , and serve',
]
result3 = [
    'to toast walnuts: place in baking pan and bake in a 350 degree oven until golden , about 10 minutes',
    'pour 1 tablespoon olive oil into a 12- by 15-inch baking pan',
    'add asparagus , sprinkle with salt , and mix to coat',
    'spread in a single layer and bake in a 400 degree oven , stirring often , until tender when pierced , 15 to 20 minutes',
    'let cool about 15 minutes',
    'meanwhile , in a large bowl , mix vinegar and remaining 2 tablespoons oil',
    'add spinach , strawberries , toasted walnuts , and cooled asparagus',
    'mix to coat',
    'add more salt and pepper to taste',
    'enjoy',
]
result4 = [
    'heat oven to 400 degrees',
    'line a baking sheet with foil',
    'brush with olive oil',
    'season both sides of the salmon filets with salt and pepper',
    'bake until salmon flakes easily , about 20 - 25 minutes',
    'meanwhile in a heavy saucepan , simmer shallots , vinegar and wine until shallots are soft and very little liquid remains',
    'add the butter and 2 tablespoons water , whisking constantly over high heat until the butter is melted and incorporated',
    'stir in the parsley , capers and lemon zest',
    'serve over the salmon',
]
result5 = [
    'preheat oven to 350 degrees',
    'place bread cubes on baking sheet and toast until crisp',
    'meanwhile , in a large bowl , toss together tomatoes , peppers , cucumber , garlic , capers , parsley and basil',
    'drizzle in the vinegars , oil and season with salt and pepper',
    'toss again',
    'let sit for 15 to 20 minutes in the refrigerator',
    'when ready to serve , toss together with the toasted bread cubes',
    'gently fold in goat or feta cheese , avocado and adjust seasoning , if necessary',
    'serve',
]

# Run every result through the same preprocessing as the target recipe
(
    preprocessed_result1,
    preprocessed_result2,
    preprocessed_result3,
    preprocessed_result4,
    preprocessed_result5,
) = (
    preprocess(steps)
    for steps in (result1, result2, result3, result4, result5)
)

In [10]:
# Word Mover's Distance between the target recipe and each of the five
# results. A single loop replaces five copy-pasted compute/print pairs; the
# printed text is unchanged.
distances = []
for rank, candidate in enumerate(
    [
        preprocessed_result1,
        preprocessed_result2,
        preprocessed_result3,
        preprocessed_result4,
        preprocessed_result5,
    ],
    start=1,
):
    score = model.wmdistance(preprocessed_target, candidate)
    print('distance %d = %.4f' % (rank, score))
    distances.append(score)

# Keep the individual names available for any later cell that uses them.
distance1, distance2, distance3, distance4, distance5 = distances

distance 1 = 2.1079
distance 2 = 2.0876
distance 3 = 2.1179
distance 4 = 2.0636
distance 5 = 2.0868
