In [131]:
# Import necessary libraries
import numpy as np
import pandas as pd
import gensim as gs
from gensim import utils
import torch

from functools import reduce

In [132]:
# Import data
data = pd.read_csv('cleaneddata.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,summary,instructions,ingredients,ingredient types,diets,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings
0,0,orange fig teacake with caramel glaze,orange fig teacake with caramel glaze is a veg...,you will need a 9 springform pan or a cake ...,ap flour; baking powder; cardamom; eggs; fresh...,Beverages Milk Eggs Other Dairy Spices and Sea...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,3.0,75.55,45,10
1,1,poached eggs on a bed of fried mushrooms and c...,poached eggs on a bed of fried mushrooms and c...,in a frying pan heat up oil then add mushroom...,bread; butter; eggs; eggs; mushrooms; oil; sal...,Beverages Milk Eggs Other Dairy Spices and Sea...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,15.0,147.7,45,2
2,2,pandan chiffon cake,for 26 cents per serving this recipe covers ...,preheat the oven to 170c blend the pandan le...,all purpose flour; bay leaves; coconut milk; c...,Ethnic Foods Produce Spices and Seasonings Bev...,dairy free; lacto ovo vegetarian,True,False,False,True,False,False,False,False,1.0,26.06,45,9
3,3,pork chop with honey mustard and apples,pork chop with honey mustard and apples might...,pre heat your oven to 200c 400f line a roa...,apples; dijon mustard; garlic cloves; honey; j...,Meat Spices and Seasonings Condiments Oil Vine...,gluten free; dairy free; paleolithic; primal,False,False,True,True,False,False,False,False,17.0,242.23,45,4
4,4,beet gnocchi with steak and brown butter sauce,the recipe beet gnocchi with steak and brown b...,cooking beets heat oven to 400 degrees wash be...,gnocchi; beets; olive oil; s p; goat cheese; r...,Produce Spices and Seasonings Meat Spices and ...,,False,False,False,False,False,False,False,False,12.0,417.69,45,4


In [133]:
# For each column, check how many NAN values exist
data.isnull().sum()

Unnamed: 0             0
title                  0
summary                5
instructions         201
ingredients            0
ingredient types       0
diets               1078
vegetarian             0
vegan                  0
glutenFree             0
dairyFree              0
veryHealthy            0
cheap                  0
veryPopular            0
sustainable            0
healthScore            0
pricePerServing        0
readyInMinutes         0
servings               0
dtype: int64

# Word2Vec

In [134]:
# Classification

# 1. For each input recipe, have a result vector of it's properties
# 2. For example, [isVegetarian, isVegan, isGlutenFree, isDairyFree, isLowSugar, isVeryHealthy, isCheap, isVeryPopular, isSustainable]


In [135]:
# Get a list of all "possible" ingredients
ingredients = data["ingredients"].to_list()

all_ingredients = []
for i in ingredients: 
    items = i.split("; ")
    all_ingredients += items

all_ingredients = list(set(all_ingredients))

In [136]:
# GET X and Y

# For each row in the datframe data
propertyColumns = ['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy', 'cheap', 'veryPopular', 'sustainable']

Y = torch.zeros(len(data), len(propertyColumns))
X = []

# For each recipe, get a list of ingredients (input) and a 
# binary vector of the properties (output)
for i, row in data.iterrows():
    # Get the ingredients for each recipe
    ingredient = row['ingredients'].split("; ")
    # For each ingredient, replace whitespaces with a hyphen
    ingredient = [i.replace(" ", "-") for i in ingredient]
    X.append(ingredient)
    
    # Get a binary vector for each property to be predicted
    for j in range(len(propertyColumns)):
        columnName = propertyColumns[j]
        if row[columnName] == True:
            Y[i][j] = 1

print(X[:5])
print(Y[:5])
assert len(X) == len(Y)

[['ap-flour', 'baking-powder', 'cardamom', 'eggs', 'fresh-figs', 'heavy-cream', 'navel-oranges', 'salt', 'sugar', 'unsalted-butter', 'vanilla-extract', 'water'], ['bread', 'butter', 'eggs', 'eggs', 'mushrooms', 'oil', 'salt', 'salt', 'vinegar', 'water'], ['all-purpose-flour', 'bay-leaves', 'coconut-milk', 'corn-oil', 'cream-of-tartar', 'curry-paste', 'egg-whites', 'egg-yolks', 'salt', 'sugar', 'water'], ['apples', 'dijon-mustard', 'garlic-cloves', 'honey', 'juice-of-lemon', 'olive-oil', 'pork-chops', 'salt-and-pepper', 'white-onion'], ['gnocchi', 'beets', 'olive-oil', 's-p', 'goat-cheese', 'ricotta', 'flour', 'steak', 'butter', 'shallot', 'butter', 'fresh-thyme', 'walnuts']]
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])


In [137]:
# Convert X to Word2Vec

# Tokenize, lowercase, and remove punctuation, numbers, and alphanumeric characters
w2vec = gs.models.Word2Vec(X, vector_size=100, workers=4)
w2vec.save("w2vec_1.model")

In [138]:
word_vectors = w2vec.wv
all_ingredients = word_vectors.key_to_index
embeddings = word_vectors.vectors
print(embeddings.shape)
print(len(all_ingredients))

(750, 100)
750


In [139]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Filter X, Y to only keep those ingredients in X that have embeddings
X_final = []
Y_final = []
for i in range(len(X)):
    x = [a for a in X[i] if a in all_ingredients]
    if x != []:
        X_final.append(x)
        Y_final.append(np.array(Y[i]))

X = np.array(X_final)
Y = np.array(Y_final)

print(X.shape)
print(Y.shape)

(4299,)
(4299, 8)


  X = np.array(X_final)


In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Trained word2vec. 
# Now, train a neural network to predict properties
model = MLPClassifier(hidden_layer_sizes=(50, 25, 10), solver= "adam", max_iter=1000, random_state=1)

# Training the model
# For each recipe, get the average embedding of its ingredient list
X_embedding_train = []
for recipe in X_train:
    for ingredient in recipe:
        e = []
        try:
            a = np.array(w2vec.wv[ingredient])
            e.append(a)
        except:
            pass
    e = np.array(e)
    avg_emb = np.average(e, axis=0)

    X_embedding_train.append(avg_emb)
    
    
X_embedding_train = np.array(X_embedding_train)
y_train = np.array(y_train)
print(X_embedding_train.shape)
print(y_train.shape)
# model.fit(X_embedding_train, y_train)
print(X_embedding_train[0])
print(y_train[0])

# prediction = model.predict(embedding)

(3439, 100)
(3439, 8)
[-0.18059078  0.34228677 -0.30207455 -0.00533764 -0.0374865  -0.47131056
  0.37753576  0.18378316 -0.79107     0.09286666 -0.20051491 -0.395642
  0.16111849  0.382102   -0.08139008 -0.41169694 -0.09023887 -0.16439267
  0.11145539 -0.59541297  0.38597038  0.00894879  0.25511956 -0.28313038
  0.17430067  0.06952149 -0.1828475  -0.30113685 -0.13362797 -0.22394526
  0.45854896  0.11858353  0.22010475  0.11835691 -0.13748641  0.39917758
 -0.04176695  0.04680825  0.19782577 -0.85688454 -0.11450142 -0.12954219
  0.30807334 -0.17557888  0.40472862  0.0344989  -0.29920813 -0.1677821
  0.21044278 -0.17289107  0.3569338  -0.21477485  0.05858085  0.13258383
  0.00506248 -0.0622467  -0.03926927  0.22899216 -0.72934157 -0.03464407
  0.1706075   0.23083025  0.00419992 -0.1284201  -0.71663797  0.0905507
  0.09400386  0.03550544 -0.4046216   0.66603    -0.4205427   0.08438339
  0.31017599  0.07598463 -0.03003437  0.05058973 -0.09639854  0.22768298
 -0.1381442   0.39992276 -0.23375

In [141]:
# Now, I have an input matrix and a label vector. Why am I not able to train the model?
model.fit(X_embedding_train, y_train)

MLPClassifier(hidden_layer_sizes=(50, 25, 10), max_iter=1000, random_state=1)

In [142]:
X_embedding_test = []
for recipe in X_test:
    for ingredient in recipe:
        e = []
        try:
            a = np.array(w2vec.wv[ingredient])
            e.append(a)
        except:
            pass
    e = np.array(e)
    avg_emb = np.average(e, axis=0)

    X_embedding_test.append(avg_emb)

In [143]:
prediction = model.predict(X_embedding_test)
prediction

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [144]:
y_test

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [150]:
from sklearn.metrics import accuracy_score, hamming_loss

score = accuracy_score(y_test, prediction)
test_loss = hamming_loss(y_test, prediction)

print(score)
print(test_loss)

0.1313953488372093
0.19912790697674418


In [152]:
training_pred = model.predict(X_embedding_train)

training_loss = hamming_loss(y_train, training_pred)
print(training_loss)

0.19326112241930793


AttributeError: 'MLPClassifier' object has no attribute 'save'

In [146]:
import random
# Examples of Predictions

random_index = random.sample(range(0, len(X_test)), 5)
# In training dataset
print("Input: ", X_train[random_index])

# Get predicted properties
# For each recipe
y_pred = model.predict(X_embedding_train[random_index])
for j in range(len(y_pred)):
    # For each property
    p = []
    for k in range(len(y_pred[j])):
        print(y_pred)
        if y_pred[j][k] == 1:
            p.append(propertyColumns[k])
    print("Predicted: ", p)
print("\n")

# Get actual properties
y_true = y_train[random_index]
for j in range(len(y_true)):
    p = []
    for k in range(len(y_true[j])):
        if y_true[j][k] == 1:
            p.append(propertyColumns[k])
    print("Actual: ", p)
print("\n")


# In test dataset

Input:  [list(['ground-flaxseed', 'water', 'wheat-bran', 'bran', 'spelt', 'spelt-flour', 'psyllium-husks', 'stevia-extract', 'flaxseeds', 'chia-seeds', 'baking-soda', 'sea-salt', 'nutmeg', 'cinnamon', 'demerara-sugar', 'agave-nectar', 'applesauce', 'ener-g-egg-replacer', 'milk', 'canola-oil', 'raisins', 'prunes'])
 list(['peanut-butter', 'bananas', 'cocoa-powder', 'agave-nectar', 'vanilla-extract'])
 list(['coconut-flour', 'baking-powder', 'baking-soda', 'dark-brown-sugar', 'kosher-salt', 'unsweetened-shredded-coconut', 'eggs', 'coconut-oil', 'coconut-milk', 'vanilla-extract', 'blueberries'])
 list(['mayonnaise', 'capers', 'horseradish', 'dijon-mustard', 'shallot', 'parsley', 'cod-fillets', 'bay-leaves', 'milk', 'water', 'potatoes', 'salt', 'lemon-zest', 'fresh-parsley', 'chives', 'pepper', 'flour', 'egg', 'breadcrumbs', 'sunflower-oil', 'lemon-wedges'])
 list(['apricot', 'baby-spinach', 'button-mushrooms', 'chili-paste', 'dried-apricots', 'fettuccini', 'garlic', 'olive-oil', 'romano-c

In [147]:
print("Input: ", X_test[random_index])
random_index = random.sample(range(0, len(X_test)), 5)

X_embedding_test = np.array(X_embedding_test)
# Get predicted properties
# For each recipe
y_pred = model.predict(X_embedding_test[random_index])
for j in range(len(y_pred)):
    # For each property
    p = []
    for k in range(len(y_pred[j])):
        print(y_pred)
        if y_pred[j][k] == 1:
            p.append(propertyColumns[k])
    print("Predicted: ", p)
print("\n")

# Get actual properties
y_true = y_test[random_index]
for j in range(len(y_true)):
    p = []
    for k in range(len(y_true[j])):
        if y_true[j][k] == 1:
            p.append(propertyColumns[k])
    print("Actual: ", p)
print("\n")

Input:  [list(['bay-leaves', 'canned-beans', 'canned-tomato-sauce', 'canned-tomatoes', 'carrots', 'cayenne-pepper', 'celery', 'chili-powder', 'corn', 'cumin', 'ears-corn', 'garlic', 'green-bell-pepper', 'onion', 'oregano', 'salt', 'tomatoes', 'vegetable-stock'])
 list(['baking-soda', 'baking-powder', 'salt', 'cinnamon', 'ground-nutmeg', 'applesauce', 'vanilla', 'egg-whites', 'coconut-oil', 'cranberries', 'pecans'])
 list(['bananas', 'juice-of-orange', 'crackers', 'semi-sweet-chocolate-baking-chips', 'margarine', 'walnuts', 'no-calorie-sweetener', 'pb-cups'])
 list(['all-purpose-flour', 'baking-powder', 'cherries', 'eggs', 'granulated-sugar', 'milk', 'unsalted-butter', 'vanilla-sugar'])
 list(['peanut-oil', 'cremini-mushrooms', 'ground-turkey', 'black-pepper', 'ground-cinnamon', 'allspice', 'ground-ginger', 'ground-coriander', 'garlic', 'soy-sauce', 'rice-wine', 'oyster-sauce', 'granny-smith-apple', 'scallions', 'hoisin-sauce'])]
[[0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
