In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import gensim as gs
from gensim import utils
import torch

from functools import reduce

In [2]:
# Import data
data = pd.read_csv('cleaneddata.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,summary,instructions,ingredients,ingredient types,diets,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,healthScore,pricePerServing,readyInMinutes,servings
0,0,orange fig teacake with caramel glaze,orange fig teacake with caramel glaze is a veg...,you will need a 9 springform pan or a cake ...,ap flour; baking powder; cardamom; eggs; fresh...,Beverages Milk Eggs Other Dairy Spices and Sea...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,3.0,75.55,45,10
1,1,poached eggs on a bed of fried mushrooms and c...,poached eggs on a bed of fried mushrooms and c...,in a frying pan heat up oil then add mushroom...,bread; butter; eggs; eggs; mushrooms; oil; sal...,Beverages Milk Eggs Other Dairy Spices and Sea...,lacto ovo vegetarian,True,False,False,False,False,False,False,False,15.0,147.7,45,2
2,2,pandan chiffon cake,for 26 cents per serving this recipe covers ...,preheat the oven to 170c blend the pandan le...,all purpose flour; bay leaves; coconut milk; c...,Ethnic Foods Produce Spices and Seasonings Bev...,dairy free; lacto ovo vegetarian,True,False,False,True,False,False,False,False,1.0,26.06,45,9
3,3,pork chop with honey mustard and apples,pork chop with honey mustard and apples might...,pre heat your oven to 200c 400f line a roa...,apples; dijon mustard; garlic cloves; honey; j...,Meat Spices and Seasonings Condiments Oil Vine...,gluten free; dairy free; paleolithic; primal,False,False,True,True,False,False,False,False,17.0,242.23,45,4
4,4,beet gnocchi with steak and brown butter sauce,the recipe beet gnocchi with steak and brown b...,cooking beets heat oven to 400 degrees wash be...,gnocchi; beets; olive oil; s p; goat cheese; r...,Produce Spices and Seasonings Meat Spices and ...,,False,False,False,False,False,False,False,False,12.0,417.69,45,4


In [3]:
# For each column, check how many NAN values exist
data.isnull().sum()

Unnamed: 0             0
title                  0
summary                5
instructions         201
ingredients            0
ingredient types       0
diets               1078
vegetarian             0
vegan                  0
glutenFree             0
dairyFree              0
veryHealthy            0
cheap                  0
veryPopular            0
sustainable            0
healthScore            0
pricePerServing        0
readyInMinutes         0
servings               0
dtype: int64

# Word2Vec

In [4]:
# Classification

# 1. For each input recipe, have a result vector of it's properties
# 2. For example, [isVegetarian, isVegan, isGlutenFree, isDairyFree, isLowSugar, isVeryHealthy, isCheap, isVeryPopular, isSustainable]


In [5]:
# Get a list of all "possible" ingredients
ingredients = data["ingredients"].to_list()

all_ingredients = []
for i in ingredients: 
    items = i.split("; ")
    all_ingredients += items

all_ingredients = list(set(all_ingredients))

In [6]:
# GET X and Y

# For each row in the datframe data
propertyColumns = ['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy', 'cheap', 'veryPopular', 'sustainable']

Y = torch.zeros(len(data), len(propertyColumns))
X = []

# For each recipe, get a list of ingredients (input) and a 
# binary vector of the properties (output)
for i, row in data.iterrows():
    # Get the ingredients for each recipe
    ingredient = row['ingredients'].split("; ")
    # For each ingredient, replace whitespaces with a hyphen
    ingredient = [i.replace(" ", "-") for i in ingredient]
    X.append(ingredient)
    
    # Get a binary vector for each property to be predicted
    for j in range(len(propertyColumns)):
        columnName = propertyColumns[j]
        if row[columnName] == True:
            Y[i][j] = 1

print(X[:5])
print(Y[:5])
assert len(X) == len(Y)

[['ap-flour', 'baking-powder', 'cardamom', 'eggs', 'fresh-figs', 'heavy-cream', 'navel-oranges', 'salt', 'sugar', 'unsalted-butter', 'vanilla-extract', 'water'], ['bread', 'butter', 'eggs', 'eggs', 'mushrooms', 'oil', 'salt', 'salt', 'vinegar', 'water'], ['all-purpose-flour', 'bay-leaves', 'coconut-milk', 'corn-oil', 'cream-of-tartar', 'curry-paste', 'egg-whites', 'egg-yolks', 'salt', 'sugar', 'water'], ['apples', 'dijon-mustard', 'garlic-cloves', 'honey', 'juice-of-lemon', 'olive-oil', 'pork-chops', 'salt-and-pepper', 'white-onion'], ['gnocchi', 'beets', 'olive-oil', 's-p', 'goat-cheese', 'ricotta', 'flour', 'steak', 'butter', 'shallot', 'butter', 'fresh-thyme', 'walnuts']]
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])


In [7]:
# Convert X to Word2Vec

# Tokenize, lowercase, and remove punctuation, numbers, and alphanumeric characters
w2vec = gs.models.Word2Vec(X, vector_size=100, workers=4)
w2vec.save("w2vec_1.model")

In [8]:
word_vectors = w2vec.wv
all_ingredients = word_vectors.key_to_index
embeddings = word_vectors.vectors
print(embeddings.shape)
print(len(all_ingredients))

(750, 100)
750


In [9]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Filter X, Y to only keep those ingredients in X that have embeddings
X_final = []
Y_final = []
for i in range(len(X)):
    x = [a for a in X[i] if a in all_ingredients]
    if x != []:
        X_final.append(x)
        Y_final.append(np.array(Y[i]))

X = np.array(X_final)
Y = np.array(Y_final)

print(X.shape)
print(Y.shape)

(4299,)
(4299, 8)


  X = np.array(X_final)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Trained word2vec. 
# Now, train a neural network to predict properties
model = MLPClassifier(hidden_layer_sizes=(50, 25, 10), solver= "adam", max_iter=1000, random_state=1)

# Training the model
# For each recipe, get the average embedding of its ingredient list
X_embedding_train = []
for recipe in X_train:
    for ingredient in recipe:
        e = []
        try:
            a = np.array(w2vec.wv[ingredient])
            e.append(a)
        except:
            pass
    e = np.array(e)
    avg_emb = np.average(e, axis=0)

    X_embedding_train.append(avg_emb)
    
    
X_embedding_train = np.array(X_embedding_train)
y_train = np.array(y_train)
print(X_embedding_train.shape)
print(y_train.shape)
# model.fit(X_embedding_train, y_train)
print(X_embedding_train[0])
print(y_train[0])

(3439, 100)
(3439, 8)
[-1.11182168e-01  3.62482309e-01 -3.01345289e-01 -3.49534489e-02
 -1.44745214e-02 -4.78322804e-01  4.05416429e-01  1.70084625e-01
 -7.77252495e-01  8.84021595e-02 -1.72145262e-01 -3.99690390e-01
  1.61188453e-01  3.66060466e-01 -8.15672353e-02 -3.60926569e-01
 -7.73423389e-02 -1.79382145e-01  7.73228109e-02 -5.76307714e-01
  3.69724452e-01 -1.84747670e-03  2.17143014e-01 -3.25623900e-01
  1.78207055e-01  4.28164788e-02 -1.30135000e-01 -2.57006258e-01
 -1.60830468e-01 -2.25592643e-01  4.48741496e-01  1.82673961e-01
  1.35758922e-01  1.35211140e-01 -9.83311161e-02  3.17319810e-01
 -5.53197376e-02 -4.59723640e-03  1.90990701e-01 -8.85578871e-01
 -1.28730044e-01 -9.67239961e-02  3.04675281e-01 -1.74578235e-01
  4.04140830e-01  5.88542074e-02 -2.73872346e-01 -1.39174089e-01
  1.72759384e-01 -1.40973419e-01  3.22027653e-01 -2.48675749e-01
  3.59665081e-02  1.25527978e-01  4.00148472e-03 -6.03583157e-02
  2.00766288e-02  2.34513953e-01 -6.48447812e-01 -9.72093195e-02
  1

In [11]:
# Now, I have an input matrix and a label vector. Why am I not able to train the model?
model.fit(X_embedding_train, y_train)

MLPClassifier(hidden_layer_sizes=(50, 25, 10), max_iter=1000, random_state=1)

In [13]:
X_embedding_test = []
for recipe in X_test:
    for ingredient in recipe:
        e = []
        try:
            a = np.array(w2vec.wv[ingredient])
            e.append(a)
        except:
            pass
    e = np.array(e)
    avg_emb = np.average(e, axis=0)

    X_embedding_test.append(avg_emb)

In [14]:
prediction = model.predict(X_embedding_test)
prediction

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [18]:
from sklearn.metrics import hamming_loss

test_loss = hamming_loss(y_test, prediction)
print(test_loss)

0.2002906976744186


In [19]:
training_pred = model.predict(X_embedding_train)
training_loss = hamming_loss(y_train, training_pred)
print(training_loss)

0.19384268682756614
