In [2]:
import numpy as np
import csv
import collections
from scipy import sparse
import torch
from transformers import *
import progressbar
import sys
#sys.path.append('train.py')
#from train import Linear, Gaussian, load_data, Square
import matplotlib.pyplot as plt
import progressbar

In [3]:
# Load recipe data
with open("RAW_recipes.csv", 'r') as f:
    reader = csv.reader(f)
    raw_read = list(reader)

# Get the ingredients for each recipe and associate them with the recipe's unique ID
id_to_ing = dict()
id_to_rating = dict()
for i,recipe in enumerate(raw_read[1:]):
    # Remove junk characters from ingredients
    id_to_ing[recipe[1]] = [a.strip("'").strip('"') for a in recipe[10].strip('"[').strip(']"').split(", ")]
    id_to_rating[recipe[1]] = []

del raw_read

# Find ingredients that appear in fewer than five recipes
wordCounter = collections.Counter()
for id in id_to_ing:
    for word in set(id_to_ing[id]):
        wordCounter[word] += 1
for key in wordCounter:
    if wordCounter[key] < 5:
        wordCounter[key] = -1
wordCounter = +wordCounter
goodWords = set(wordCounter.keys())

to_delete = []

# Delete ingredients that appear in fewer than five recipes
for id in id_to_ing:
    id_to_ing[id] = [a for a in id_to_ing[id] if a in goodWords]
    if id_to_ing == []:
        to_delete += [id]
for item in to_delete:
    del id_to_rating[item]

# Load the user interaction data
with open("RAW_interactions.csv", 'r') as f:
    reader = csv.reader(f)
    raw_ratings = list(reader)

# Get the ratings for ecah recipe
for i, interaction in enumerate(raw_ratings[1:]):
    if interaction[1] in id_to_rating.keys() and interaction[3] != "0":
        id_to_rating[interaction[1]] += [int(interaction[3])]

del raw_ratings

to_delete = []

# Delete recipes that have no rating
for recipe in id_to_rating:
    if id_to_rating[recipe] == []:
        to_delete += [recipe]
    else:
        id_to_rating[recipe] = sum(id_to_rating[recipe])/len(id_to_rating[recipe])
for item in to_delete:
    del id_to_rating[item]


# Get rid of the recipe ID, instead associating the ingredients with the rating
X = [0 for i in range(len(id_to_rating))]
Y = [0 for i in range(len(id_to_rating))]
for i, id in enumerate(id_to_rating.keys()):
    X[i] = id_to_ing[id]
    Y[i] = id_to_rating[id]

# Save as a training set, valdiation set, and test set (70/15/15 split)
Y = np.array(Y)
with open('X_train.txt', 'w') as f:
    for ingredients in X[:158612]:
        for ing in ingredients:
            f.write(ing+'#')
        f.write('\n')
with open('X_val.txt', 'w') as f:
    for ingredients in X[158612:192601]:
        for ing in ingredients:
            f.write(ing+'#')
        f.write('\n')
with open('X_test.txt', 'w') as f:
    for ingredients in X[192601:]:
        for ing in ingredients:
            f.write(ing+'#')
        f.write('\n')
np.save('Y_train',Y[:158612])
np.save('Y_val',Y[158612:192601])
np.save('Y_test',Y[192601:])



In [4]:
def loss(predictions, Y):
    """ Return the MSE loss. """
    return 0.5 * np.linalg.norm(Y - predictions)**2 /len(Y)

def avg_dif(predictions, Y):
    """ Calculate the average L1 distance between predictions and true ratings. """
    return sum([abs(Y[i] - predictions[i]) for i in range(len(Y))])[0] / len(Y)

def lin_reg_prediction(X, theta):
    """ Return the prediction vector for inputs X using theta output from linear regression. """
    return np.dot(X, theta)

def kernel_prediction(beta, K):
    """ Return the prediction vector for input kernel K using beta output from kernel methods. """
    return np.dot(beta.T, K).T

def calc_grad(X,Y,theta):
    """ Calculate the gradient for use in gradient descent for linear regression. """
    return np.dot((Y - np.dot(X,theta)).T,X).T / (X.shape[0])

def linear_regression(data, embedding, iters, learning_rate = 0.1, makePlot = True):
    """ Perform linear regression. """
    (X, X_val, X_test, Y, Y_val, Y_test) = data
    theta = np.zeros((X.shape[1],1))

    tloss = []
    vloss = []

    for i in progressbar.progressbar(iters):
        theta += learning_rate * calc_grad(X, Y, theta)

        tloss += [loss(lin_reg_prediction(X, theta), Y)]
        vloss += [loss(lin_reg_prediction(X_val, theta), Y_val)]
    if makePlot:
        plot(iters, tloss, vloss, embedding, "Linear Regression")
    test_predictions = lin_reg_prediction(X_test, theta)
    print("Average difference in star rating: %.4f" %avg_dif(test_predictions, Y_test))
    return tloss, vloss, loss(test_predictions, Y_test)

def Linear(xi, xj):
    """ Linear kernel function. """
    return np.dot(xi,xj.T)

def Square(xi, xj):
    """ Square kernel function. """
    return np.dot(xi,xj.T)**2

def Gaussian(xi, xj, sigma = 1):
    """ Gaussian kernel function. """
    return np.exp( - np.dot((xi-xj), (xi-xj).T) / (2 * sigma**2))

def kernel_method(data, kernel, embedding,  iters, learning_rate = 0.0001, makePlot = True):
    """ Use kernel method to learn parameter vector beta for prediction. """
    (K_train, K_val, K_test, Y_train, Y_val, Y_test) = data
    beta = np.zeros((K_train.shape[0],1))

    tloss = []
    vloss = []

    for i in progressbar.progressbar(iters):
        beta += learning_rate * ( Y_train - np.dot(beta.T, K_train).T )
        tloss += [loss(kernel_prediction(beta, K_train), Y_train)]
        vloss += [loss(kernel_prediction(beta, K_val), Y_val)]

    if makePlot:
        plot(iters, tloss, vloss, embedding, kernel.__name__+" Kernel")
    test_predictions = kernel_prediction(beta, K_test)
    print("Average difference in star rating: %.4f" %avg_dif(test_predictions, Y_test))
    return tloss, vloss, loss(test_predictions, Y_test)

def plot(iters, tloss, vloss, embedding, method):
    """ Make a plot of training and validation loss during training. """
    plt.plot(iters,tloss,label='training loss')
    plt.plot(iters,vloss,label='validation loss')

    plt.title('Average Loss per Example During Training for \n'+embedding+' Embedding with '+method)
    plt.xlabel('iteration number')
    plt.ylabel('average loss')
    plt.legend()
    plt.show()

def learning_rate_sweep(vals_to_test, func):
    """ Meta-function to sweep over possible learning rates and plot validation set loss. """
    outputs = []
    for i in vals_to_test:
        _, vloss, _ = func(i)
        outputs += [vloss[-1]]
    plt.plot(vals_to_test, outputs, 'o--')
    plt.title('Average Loss over Validation Set')
    plt.xlabel('learning rate')
    plt.ylabel('loss')
    plt.show()
    print(outputs)

def load_data(embedding, trim = True):
    """ Load features as saved by features_X.py. """
    Y_train = np.expand_dims(np.load('Y_train.npy'), axis=1)
    Y_val = np.expand_dims(np.load('Y_val.npy'), axis=1)
    Y_test = np.expand_dims(np.load('Y_test.npy'), axis=1)

    if embedding == "One-Hot":
        X_train = sparse.load_npz('X_train_one_hot.npz').todense()
        X_val = sparse.load_npz('X_val_one_hot.npz').todense()
        X_test = sparse.load_npz('X_test_one_hot.npz').todense()

    if embedding == "BERT":
        X_train = np.load('X_train_bert.npy')
        X_val = np.load('X_val_bert.npy')
        X_test = np.load('X_test_bert.npy')

    if trim:
        X_train = X_train[:5000,:]
        X_val = X_val[:1000,:]
        X_test = X_test[:1000,:]

        Y_train = Y_train[:5000,:]
        Y_val = Y_val[:1000,:]
        Y_test = Y_test[:1000,:]

    return (X_train, X_val, X_test, Y_train, Y_val, Y_test)

def load_kernel(embedding, kernel):
    """ Load kernel matrix as saved by features_X.py. """
    Y_train = np.expand_dims(np.load('Y_train.npy'), axis=1)[:5000,:]
    Y_val = np.expand_dims(np.load('Y_val.npy'), axis=1)[:1000,:]
    Y_test = np.expand_dims(np.load('Y_test.npy'), axis=1)[:1000,:]

    K_train = np.load('K_train_'+embedding+'_'+kernel.__name__+'.npy')
    K_val = np.load('K_val_'+embedding+'_'+kernel.__name__+'.npy')
    K_test = np.load('K_test_'+embedding+'_'+kernel.__name__+'.npy')

    return (K_train, K_val, K_test, Y_train, Y_val, Y_test)

def pretty_print(inps):
    """ Print out training, validation, and test set losses. """
    train_loss, validation_loss, test_loss = inps
    print("Training loss: %.4f   Validation loss: %.4f   Test loss: %.4f" %(train_loss[-1], validation_loss[-1], test_loss))


In [5]:
def load_raw(filename, size):
    """ Load the text files output by the clean_data.py script. """
    with open(filename, 'r') as f:
        X = [a.rstrip('#\n').split('#') for a in list(f)]
    return X[:size]

In [6]:
def make_vocab(X_train, X_val, X_test):
    """ Create a set of the unique ingredients. """
    vocab = set()
    for X in [X_train, X_val, X_test]:
        for ing_list in X:
            for ing in ing_list:
                vocab.add(ing)
    vocab = list(vocab)
    print(len(vocab))
    return vocab

In [7]:
def one_hot(X, vocab, filename):
    """ Save the design matrix with a one-hot encoding. """
    X_one_hot = np.append(np.ones((len(X), 1)), np.zeros((len(X), len(vocab))),axis=1)
    for i,recipe in enumerate(X):
        for ing in recipe:
            X_one_hot[i, vocab.index(ing)] = 1

    X_one_hot = sparse.coo_matrix(X_one_hot)
    sparse.save_npz(filename, X_one_hot)

In [8]:
def bert(X, filename):
    """ Save the design matrix with BERT embedding. """
    X_out = np.zeros((len(X),768))

    with torch.no_grad():
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        X_preprocess = ["[CLS] " + ' ; '.join(x)+" [SEP]" for x in X]
        tokens = [tokenizer.tokenize(x) for x in X_preprocess]
        indexed_tokens = [tokenizer.convert_tokens_to_ids(recipe) for recipe in tokens]
        model = BertModel.from_pretrained('bert-base-uncased')
        model.eval()
        for i, recipe_embedding in enumerate(indexed_tokens):
            if i % 100 == 0:
                print(i)
            tokens_tensor = torch.tensor([recipe_embedding])
            segments_tensor = torch.ones_like(tokens_tensor)
            encoded_layers, _ = model(tokens_tensor, segments_tensor)

            X_out[i, :] = np.array(torch.mean(encoded_layers[0], dim=0))
    np.save(filename, X_out)

In [9]:
def get_kernel(X_train, X_test, kernel_func, symmetric = False):
    """ Create the kernel matrix. Use symmetric to speed up computation if X_train and X_test are identical. """
    K = np.zeros((X_train.shape[0], X_test.shape[0]))
    if symmetric:
        for i in progressbar.progressbar(range(X_train.shape[0])):
            for j in range(i+1):
                K[i,j] = K[j,i] = kernel_func(X_train[i,:], X_test[j,:])
    else:
        for i in progressbar.progressbar(range(X_train.shape[0])):
            for j in range(X_test.shape[0]):
                K[i,j] = kernel_func(X_train[i,:], X_test[j,:])
    return K


In [10]:
def kernelize(embedding, kernel):
    """ Create the kernel matrices for the training, validation, and test sets. """
    (X_train, X_val, X_test, Y_train, Y_val, Y_test) = load_data(embedding)
    np.save('K_train_'+embedding+'_'+kernel.__name__, get_kernel(X_train, X_train, kernel, symmetric = True))
    np.save('K_val_'+embedding+'_'+kernel.__name__, get_kernel(X_train, X_val, kernel, symmetric = False))
    np.save('K_test_'+embedding+'_'+kernel.__name__, get_kernel(X_train, X_test, kernel, symmetric = False))

In [None]:
if __name__ == '__main__':
    """ Load the outputs of clean_data and build and save the feature vectors and kernels used in training. """
    X_train = load_raw('X_train.txt', 5000)
    X_val = load_raw('X_train.txt', 1000)
    X_test = load_raw('X_train.txt', 1000)

    vocab = make_vocab(X_train, X_val, X_test)

    one_hot(X_train, vocab, 'X_train_one_hot')
    one_hot(X_val, vocab, 'X_val_one_hot')
    one_hot(X_test, vocab, 'X_test_one_hot')

    bert(X_train, 'X_train_bert')
    bert(X_val, 'X_val_bert')
    bert(X_test, 'X_test_bert')

    kernelize("One-Hot", Square)
    kernelize("One-Hot", Gaussian)
    kernelize("BERT", Square)
    kernelize("BERT", Gaussian)

3866


loading file vocab.txt from cache at /Users/yizhechen/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/yizhechen/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /Users/yizhechen/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer

In [11]:
if __name__ == "__main__":
    pretty_print(linear_regression(load_data("One-Hot"), "One-Hot", range(50), 0.5))

    pretty_print(kernel_method(load_kernel("One-Hot", Linear), Linear, "One-Hot", range(50), 0.0002))

    pretty_print(kernel_method(load_kernel("One-Hot", Gaussian), Gaussian, "One-Hot", range(100), 0.015))

    pretty_print(linear_regression(load_data("BERT"), "BERT", range(100), 0.02))

    pretty_print(kernel_method(load_kernel("BERT", Linear), Linear, "BERT", range(100), 4*10**-6))

    pretty_print(kernel_method(load_kernel("BERT", Gaussian), Gaussian, "BERT", range(100), 0.05))

    pretty_print(kernel_method(load_kernel("One-Hot", Square), Square, "One-Hot", range(100), 0.5*10**-4))

    pretty_print(kernel_method(load_kernel("BERT", Square), Square, "BERT", range(100), 3*10**-8))



TypeError: 'module' object is not callable