In [1]:
import os
import pandas as pd
import numpy as np
import spacy


from matplotlib import pyplot as plt
from tqdm import tqdm

# from operator import itemgetter
# from collections import Counter, OrderedDict

# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem import SnowballStemmer
# from nltk.corpus import stopwords
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import PCA

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

BASE_PATH = "../data"

First, import the wine dataset.

In [None]:

# Collect the CSV files that sit directly inside BASE_PATH.
# os.listdir() also returns sub-directories (BASE_PATH contains 'food_reviews'),
# which pd.read_csv cannot open, so anything that is not a regular .csv file is skipped.
# Sorting makes the load order - and therefore which duplicate row survives - deterministic.
wine_csv_paths = sorted(
    os.path.join(BASE_PATH, entry)
    for entry in os.listdir(BASE_PATH)
    if entry.lower().endswith('.csv') and os.path.isfile(os.path.join(BASE_PATH, entry))
)
if not wine_csv_paths:
    raise FileNotFoundError("No wine CSV files found in " + BASE_PATH)

# Every file is read with the same options. Previously whichever file os.listdir()
# happened to return first was decoded with the default encoding and the rest as latin-1.
wine_frames = [
    pd.read_csv(csv_path, low_memory=False, encoding='latin-1')
    for csv_path in wine_csv_paths
]

# Concatenate once instead of growing the frame inside the loop.
wine_dataframe = pd.concat(wine_frames, axis=0)

# Keep a single row per wine name (the first occurrence wins).
wine_dataframe.drop_duplicates(subset=['Name'], inplace=True)

geographies = ['Subregion', 'Region', 'Province', 'Country']

# Cast each geography column to str and trim surrounding whitespace.
for geo in geographies:
    wine_dataframe[geo] = wine_dataframe[geo].apply(lambda x : str(x).strip())

print(wine_dataframe.shape)


Then, the food dataset.

In [2]:
# Load the raw recipe table from the food_reviews folder and print its
# (rows, columns) shape as a quick check that the read succeeded.
food_review_dataset = pd.read_csv(BASE_PATH + '/food_reviews/RAW_recipes.csv')
print(food_review_dataset.shape)

(231637, 12)


In [17]:
# Small English spaCy pipeline with the parser and NER components disabled;
# the code in this notebook only reads token.lemma_ from it.
model = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Module-level accumulators appended to by process_ingredient():
# clean_ingredients gets the lemmatized, underscore-joined form,
# base_form_ingredient gets the text exactly as it was passed in.
clean_ingredients = []
base_form_ingredient = []

def process_ingredient(ingredient):
    """Record one ingredient in both its raw and its lemmatized form.

    The text is run through the spaCy ``model``; the untouched input is
    appended to ``base_form_ingredient`` and the space-joined lemmas, with
    every space turned into an underscore, are appended to
    ``clean_ingredients``. Nothing is returned.
    """
    parsed = model(ingredient)
    base_form_ingredient.append(ingredient)
    lemma_phrase = " ".join(token.lemma_ for token in parsed)
    clean_ingredients.append(lemma_phrase.replace(' ', '_'))

def extract_ingredients(all_raw_ingredients):
    """Lemmatize every ingredient of every recipe and write the unique ones to CSV.

    Parameters
    ----------
    all_raw_ingredients : sequence of str
        One entry per recipe; each entry is the string form of a Python list
        of ingredient names.

    Side effects
    ------------
    Fills the module-level ``base_form_ingredient`` / ``clean_ingredients``
    lists through ``process_ingredient`` and writes
    ``BASE_PATH + '/food_reviews/ingredients.csv'`` with the columns
    ``base_form`` and ``ingredients`` (de-duplicated on ``ingredients``).
    Returns None.
    """
    for ingredients in tqdm(all_raw_ingredients, total=len(all_raw_ingredients)):
        # NOTE(security): eval() executes whatever expression the CSV cell contains.
        # ast.literal_eval would parse a list literal without running code - switch
        # to it if the input file is not fully trusted.
        for ingredient in eval(ingredients):
            # Split compound entries on the connectives themselves ("salt and pepper",
            # "olive oil or butter"). The previous version only replaced ' and ' and
            # then split on every space, which broke multi-word ingredients apart and
            # turned the word "or" into an ingredient of its own.
            for part in ingredient.replace(' or ', ' and ').split(' and '):
                part = part.strip()
                if part:  # skip empty fragments
                    process_ingredient(part)

    ingredients_df = pd.DataFrame({'base_form': base_form_ingredient, 'ingredients': clean_ingredients})
    ingredients_df.drop_duplicates(subset=['ingredients'], inplace=True)
    ingredients_df.to_csv(BASE_PATH + '/food_reviews/ingredients.csv')

# Only the first 500 recipes are processed here (the slice keeps the run short).
extract_ingredients(food_review_dataset.ingredients.to_numpy()[:500])

100%|██████████| 500/500 [00:09<00:00, 50.17it/s]


In [35]:
# Reload the ingredient table written by extract_ingredients(), indexed by the
# lemmatized 'ingredients' column.
# NOTE(review): set_index() moves 'ingredients' out of the columns, so a later
# mapping.at[word, 'ingredients'] lookup on this frame would raise KeyError -
# confirm which column the lookup is meant to return.
descriptor_mapping = pd.read_csv(BASE_PATH + '/food_reviews/ingredients.csv').set_index('ingredients')

def remove_columns(df):
    """Expand ``nutrition`` into seven float columns and drop unused metadata.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``nutrition`` holding seven
        comma-separated numbers wrapped in square brackets, plus the columns
        ``minutes``, ``contributor_id``, ``submitted``, ``tags`` and ``n_steps``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the seven nutrition columns added as floats and
        the six metadata columns (including ``nutrition``) removed.
    """
    nutrition_fields = [
        'calories',
        'total fat (PDV)',
        'sugar (PDV)',
        'sodium (PDV)',
        'protein (PDV)',
        'saturated fat (PDV)',
        'carbohydrates (PDV)',
    ]
    unused_fields = ['minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps']

    # One piece per nutrition field after splitting the string on commas.
    df[nutrition_fields] = df.nutrition.str.split(",", expand=True)

    # The list brackets are still attached to the first and last pieces.
    first_field, last_field = nutrition_fields[0], nutrition_fields[-1]
    df[first_field] = df[first_field].apply(lambda piece: piece.replace('[', ''))
    df[last_field] = df[last_field].apply(lambda piece: piece.replace(']', ''))

    df[nutrition_fields] = df[nutrition_fields].astype('float')
    df.drop(columns=unused_fields, inplace=True)
    return df

def return_mapped_descriptor(word, mapping):
    """Return the normalized form of ``word``, or ``word`` itself when unmapped.

    Parameters
    ----------
    word : str
        Token to look up in the index of ``mapping``.
    mapping : pandas.DataFrame
        Lookup table whose index holds the raw tokens and whose
        ``'ingredients'`` column holds the normalized value.

    Returns
    -------
    The value of ``mapping.at[word, 'ingredients']`` when ``word`` is in the
    index, otherwise ``word`` unchanged.

    NOTE(review): the frame built in this cell uses 'ingredients' as its index,
    which removes it from the columns; passing that frame would raise KeyError
    on a hit - confirm the intended column.
    """
    # Membership is tested on the Index directly (hash lookup) instead of
    # rebuilding list(mapping.index) on every call.
    if word in mapping.index:
        return mapping.at[word, 'ingredients']
    return word

def custom_removel_component(steps):
    """Lemmatize recipe instructions and drop measurement/punctuation tokens.

    Parameters
    ----------
    steps : str
        The instruction text of one recipe.

    Returns
    -------
    list of str
        The lemma of every token produced by the spaCy ``model``, excluding
        lemmas listed in ``words_to_remove``.
    """
    # Measurement units and stray symbols to filter out of the instructions.
    # The list used to be built but never applied, so nothing was removed.
    words_to_remove = {'/', '-', 'ounce', 'cup', 'teaspoon', 'tbsp', 'tsp', 'tablespoon', 'sm', 'c', 'cube', 'tbsp.', 'sm.', 'c.', 'oz'}

    # Collapse space-padded commas before tokenizing.
    steps = steps.replace(' , ', ' ')
    docs = model(steps)

    # Lemmatize the set of instructions, skipping the unwanted lemmas.
    return [token.lemma_ for token in docs if token.lemma_ not in words_to_remove]

def normalize_instructions(instructions_list):
    """Turn each recipe's list of steps into one list of lemmatized tokens.

    Parameters
    ----------
    instructions_list : sequence of str
        One entry per recipe; each entry is the string form of a Python list
        of step strings.

    Returns
    -------
    list of list of str
        For every recipe, the result of ``custom_removel_component`` applied
        to its stripped steps joined by single spaces.
    """
    token_lists = []
    progress = tqdm(instructions_list, total=len(instructions_list))
    for raw_steps in progress:
        # NOTE(security): eval() executes whatever expression the string contains;
        # ast.literal_eval is the non-executing alternative for list literals.
        parsed_steps = eval(raw_steps)
        joined_steps = " ".join(single_step.strip() for single_step in parsed_steps)
        token_lists.append(custom_removel_component(joined_steps))
    return token_lists


# The nutrition/metadata clean-up is currently disabled (call left commented out).
# Only the steps of the first 500 recipes are lemmatized.
# food_review_dataset = remove_columns(food_review_dataset)
normalized_instructions_token = normalize_instructions(food_review_dataset['steps'].to_numpy()[:500])



100%|██████████| 500/500 [00:03<00:00, 144.30it/s]


In [28]:
# First pass: learn two-word phrases from the lemmatized instructions.
# min_count=20 ignores pairs seen fewer than 20 times; threshold=1 is the
# scoring cut-off for accepting a pair as a phrase.
food_bigram_model = Phrases(normalized_instructions_token, min_count=20, threshold=1)
# Apply the bigram model to every recipe's token list.
food_bigrams = [food_bigram_model[sent] for sent in normalized_instructions_token]
# Second pass over the already-phrased text, so a bigram can merge with a
# neighbouring token; uses min_count=5 and the library's default threshold.
food_trigram_model = Phrases(food_bigrams, min_count=5)
phrased_food_sentences = [food_trigram_model[sent] for sent in food_bigrams]
# NOTE: saving is disabled, but a later cell loads 'food_trigrams.pkl' -
# that file must already exist from an earlier run.
# food_trigram_model.save('food_trigrams.pkl')

In [39]:
# Re-assemble each phrased token list into one space-separated string per recipe.
normalized_instructions = [" ".join(sentence) for sentence in phrased_food_sentences]

In [40]:
# Preview a handful of recipes instead of dumping the whole list into the cell output.
normalized_instructions[:5]

['make a choice and proceed with recipe depend_on size of squash , cut_into half or fourth remove seed for spicy squash , drizzle olive_oil or melt_butter over each cut squash piece season_with mexican seasoning mix ii for sweet squash , drizzle melt honey , butter , grated piloncillo over each cut squash piece season_with sweet mexican spice mix bake_at 350_degree , again depend_on size , for 40_minute up to an_hour , until a fork can easily pierce the skin be_careful not to burn the squash especially if_you opt to use sugar or butter if_you feel more comfortable , cover the squash with aluminum_foil the first half hour , give or take , of baking if_desire , season_with salt',
 'preheat_oven_to 425 degree_f press dough into_the bottom and side_of a 12 inch pizza pan bake_for 5_minute until set but_not brown cut sausage into small_piece whisk egg and milk in_a_bowl until frothy spoon sausage over baked crust and sprinkle_with cheese pour egg mixture slowly over sausage and cheese s & p

### 1. Training our Word Embeddings

First, we need to train a Word2Vec model on all the words in our corpus. We will process our wine and food terms separately - some of the wine terms will be standardized to account for commonalities in the colorful language of the world of wine.

If the trigram model has already been trained, simply retrieve it.

In [None]:
# Load previously saved phrase models from the working directory.
# NOTE(review): 'wine_trigrams.pkl' is not produced anywhere in this notebook -
# presumably it comes from a separate wine notebook; confirm.
wine_trigram_model = Phraser.load('wine_trigrams.pkl')
# NOTE: this rebinds food_trigram_model, replacing the model trained above;
# the save call that would write 'food_trigrams.pkl' is commented out there.
food_trigram_model = Phraser.load('food_trigrams.pkl')

Now for the most important part. Drawing on existing wine theory, the work of others such as Bernard Chen, existing wine descriptor mappings and the UC Davis wine wheel, we reviewed the 5000 most frequent wine terms to determine (i) whether each term is a descriptor that can be derived by blind tasting, and (ii) whether it is informative (judgments like 'tasty' and 'great' are not considered informative). The roughly 1000 descriptors that remain were then mapped onto a normalized descriptor, a category and a class:

In [None]:
# Load the hand-built wine descriptor table, indexed by the raw descriptor.
# NOTE: this rebinds descriptor_mapping, replacing the ingredient table that was
# loaded under the same name from ingredients.csv earlier in the notebook.
descriptor_mapping = pd.read_csv('descriptor_mapping.csv', encoding='latin1').set_index('raw descriptor')

def return_mapped_descriptor(word, mapping):
    """Return the normalized descriptor for ``word``, or ``word`` if unmapped.

    Parameters
    ----------
    word : str
        Token to look up in the index of ``mapping``.
    mapping : pandas.DataFrame
        Descriptor table indexed by raw descriptor, with a ``'level_3'``
        column holding the normalized descriptor.

    Returns
    -------
    The value of ``mapping.at[word, 'level_3']`` when ``word`` is in the
    index, otherwise ``word`` unchanged.
    """
    # Membership is tested on the Index directly (hash lookup) instead of
    # rebuilding list(mapping.index) for every single word.
    if word in mapping.index:
        return mapping.at[word, 'level_3']
    return word

# Replace every token of every phrased wine sentence by its normalized
# descriptor (tokens without a mapping are kept) and cast the result to str.
# NOTE(review): phrased_wine_sentences is not defined in this notebook - it is
# presumably produced by a separate wine notebook; confirm.
normalized_wine_sentences = [
    [str(return_mapped_descriptor(token, descriptor_mapping)) for token in phrased_sentence]
    for phrased_sentence in phrased_wine_sentences
]

We will go through the same process for food, but without normalizing the nonaroma descriptors.

In [None]:
# Restrict the descriptor table to rows whose 'type' is 'aroma', so that only
# aroma descriptors get normalized in the food text.
is_aroma = descriptor_mapping['type'] == 'aroma'
aroma_descriptor_mapping = descriptor_mapping.loc[is_aroma]

# Map every token of every phrased food sentence through the aroma-only table;
# unmapped tokens pass through unchanged, and everything is cast to str.
normalized_food_sentences = [
    [str(return_mapped_descriptor(token, aroma_descriptor_mapping)) for token in phrased_sentence]
    for phrased_sentence in phrased_food_sentences
]

Now, let's combine the wine dataset with our food dataset so we can train our embeddings. We want to make sure that the food and wine embeddings are calculated in the same feature space so that we can compute similarity vectors later on.

In [None]:
# One combined corpus: wine sentences first, then food sentences (list concatenation).
normalized_sentences = normalized_wine_sentences + normalized_food_sentences

We are ready to train our Word2Vec model!

In [None]:
# Train one Word2Vec model on the combined wine + food corpus:
# 300-dimensional vectors, words seen fewer than 8 times are ignored, 15 passes.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.x
# renamed them to `vector_size` and `epochs` - confirm the installed version.
wine_word2vec_model = Word2Vec(normalized_sentences, size=300, min_count=8, iter=15)
print(wine_word2vec_model)

# NOTE: despite the variable name, the model is saved as 'food_word2vec_model.bin'.
wine_word2vec_model.save('food_word2vec_model.bin')

In [None]:
# If the Word2Vec model has already been trained, load it from disk instead of
# retraining (same file name the training cell saves to).
wine_word2vec_model = Word2Vec.load("food_word2vec_model.bin")

### 3. Preparing our Food Dataset

Now that we have our wine aroma vectors and the nonaroma scalars, we can turn our attention to food. 

We will want to generate nonaroma vectors for any type of food that we want a wine pairing with. For food, we don't have the luxury of being able to define nonaroma vs. aroma descriptors, so the approach we take will be slightly different:

The aroma vector will be the full food embedding.

We will define an embedding for each of our core nonaromas (sweet, acid, salt, piquant, fat and bitter), and for the weight/body of the food. We will then measure the cosine similarity between each of these nonaroma embeddings and a range of commonly appearing foods. The foods that least and most resemble each nonaroma will eventually allow us to create a normalized scale between 0 (very dissimilar) and 1 (very similar) to say how much a food reflects each nonaroma. 

First, let's load this list of common foods.

In [None]:
# Read the sample foods from the 'Food' column.
foods = pd.read_csv('list_of_foods.csv')
foods_list = list(foods['Food'])
# NOTE(review): normalize_text is not defined in this notebook - presumably it
# returns a token list for each food name; confirm where it comes from.
foods_list_normalized = [normalize_text(f) for f in foods_list]
# Run each food through the phrase model and keep only the first resulting token.
foods_list_preprocessed = [food_trigram_model[f][0] for f in foods_list_normalized]
# De-duplicate, then sort: list(set(...)) alone yields a different order on every
# run, which changes dictionary order and tie-breaking downstream.
foods_list_preprocessed = sorted(set(foods_list_preprocessed))

Load the word embedding for each food in the list of sample foods, and save to a dictionary. 

In [None]:
# Embedding of every sample food that exists in the Word2Vec vocabulary.
foods_vecs = dict()

word_vectors = wine_word2vec_model.wv
for f in foods_list_preprocessed:
    try:
        food_vec = word_vectors[f]
        foods_vecs[f] = food_vec
    except KeyError:
        # The food is not in the vocabulary; skip it. Only KeyError is caught so
        # that any other failure surfaces instead of being silently swallowed.
        continue

Now, we can define the nonaroma embeddings + the weight embedding as the average of foods that represent each nonaroma characteristic.

In [None]:
from scipy import spatial

# Seed words whose embeddings are averaged to represent each core taste
# (plus 'weight'). Some words appear more than once in a list and are
# therefore counted more than once in the average.
core_tastes_revised = {
    'weight': ['heavy', 'cassoulet', 'cassoulet', 'full_bodied', 'thick', 'milk', 'fat', 'mincemeat', 'steak', 'bold', 'pizza', 'pasta', 'creamy', 'bread'],
    'sweet': ['sweet', 'sugar', 'cake', 'mango', 'stevia'],
    'acid': ['acid', 'sour', 'vinegar', 'yoghurt', 'cevich', 'cevich'],
    'salt': ['salty', 'salty', 'parmesan', 'oyster', 'pizza', 'bacon', 'cured_meat', 'sausage', 'potato_chip'],
    'piquant': ['spicy'],
    'fat': ['fat', 'fried', 'creamy', 'cassoulet', 'foie_gras', 'buttery', 'cake', 'foie_gras', 'sausage', 'brie', 'carbonara'],
    'bitter': ['bitter', 'kale'],
}

average_taste_vecs = {}      # taste -> mean embedding of its seed words
core_tastes_distances = {}   # taste -> {food: cosine similarity to that mean}
for taste_name, seed_words in core_tastes_revised.items():
    # Every seed word must be in the vocabulary; a missing one raises here.
    seed_vectors = [word_vectors[seed_word] for seed_word in seed_words]
    taste_centroid = np.average(seed_vectors, axis=0)
    average_taste_vecs[taste_name] = taste_centroid

    # Despite the "distances" name, the stored value is a similarity:
    # 1 - cosine distance, so larger means closer to the taste.
    core_tastes_distances[taste_name] = {
        food_name: 1 - spatial.distance.cosine(taste_centroid, food_embedding)
        for food_name, food_embedding in foods_vecs.items()
    }

We can now find out which foods most and least resemble each nonaroma.

In [None]:
# For each core taste, find the sample food with the lowest and the highest
# similarity to the taste embedding. These two values are the end points of
# the 0-1 scale built later.
food_nonaroma_infos = {}
for taste_name in core_tastes_revised:
    similarities = core_tastes_distances[taste_name]
    least_similar_food = min(similarities, key=similarities.get)
    most_similar_food = max(similarities, key=similarities.get)
    print(taste_name, least_similar_food, most_similar_food)
    food_nonaroma_infos[taste_name] = {
        'farthest': similarities[least_similar_food],
        'closest': similarities[most_similar_food],
        'average_vec': average_taste_vecs[taste_name],
    }

Now, let's save the average embedding for each nonaroma, as well as the similarity of the least and the most similar food to each nonaroma embedding (stored as 'farthest' and 'closest') - we will use these to scale the nonaroma scalars that we obtain for any foods we try to pair wine with.

In [None]:
# Transpose so that each taste is a row, with the columns 'farthest', 'closest'
# and 'average_vec'.
food_nonaroma_infos_df = pd.DataFrame(food_nonaroma_infos).T
# NOTE(review): 'average_vec' holds a numpy array per row, so to_csv writes its
# string representation - confirm the consuming notebook can parse it back.
food_nonaroma_infos_df.to_csv('average_nonaroma_vectors.csv')

We have all the pieces we need to build our wine recommendations. We will continue with this in a separate notebook.