In [None]:
# CLONE THE GIT REPOSITORY FOR FOODBERT
# Clones the reference repository and makes it the notebook's working directory.
# Later cells open files through relative 'foodbert/data/...' paths, so they
# depend on the %cd below having been executed.
!git clone https://github.com/ChantalMP/Exploiting-Food-Embeddings-for-Ingredient-Substitution.git
%cd Exploiting-Food-Embeddings-for-Ingredient-Substitution/
# Download the data archive attached to release 0.1 and extract it quietly (-qq).
!wget https://github.com/ChantalMP/Exploiting-Food-Embeddings-for-Ingredient-Substitution/releases/download/0.1/foodbert_data.zip
!unzip -qq foodbert_data.zip
# Move the extracted files into the repository's data folder, then delete the
# extracted directory.
!mv foodbert_data/* foodbert/data/
!rm -r foodbert_data

In [None]:
pip install transformers

In [7]:
import re
 # define function for preprocessing ingredients

def preprocess_ingredients(file_path):
    """Read a text file holding a bracketed list and return its cleaned items.

    The file is expected to contain a Python-style list such as
    ``['salt', 'black pepper', ...]``. The text between the first ``[`` and the
    following ``]`` is split on commas, and each piece is stripped of
    surrounding whitespace and quotes.

    Known limitations, kept on purpose so that previously generated keys stay
    the same: items are split on *every* comma, and *every* single quote is
    removed (``baker's yeast`` becomes ``bakers yeast``).

    Args:
        file_path: Path of the text file to parse.

    Returns:
        list[str]: Cleaned, non-empty ingredient names in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file contains no non-empty ``[...]`` section.
    """
    try:
        # Decode as UTF-8 explicitly: this function is also used for the Korean
        # list, and the platform's default encoding is not guaranteed to be UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents = file.read()
    except FileNotFoundError as exc:
        # Same exception type as before, but keep the path and the original cause.
        raise FileNotFoundError(
            f"The specified file path was not found: {file_path}"
        ) from exc

    # Capture everything between the first '[' and the next ']'.
    match = re.search(r"\[([^\]]+)\]", file_contents)
    if not match:
        raise ValueError("Couldn't find the list of ingredients in the file")

    cleaned_ingredients = []
    for raw_item in match.group(1).split(','):
        # Same cleaning steps, in the same order, as the original three passes.
        item = raw_item.strip(" '")
        item = item.replace("\n", "").strip().replace("'", "")
        item = item.strip('"')
        # Drop empty entries (e.g. produced by a trailing comma) instead of
        # returning '' as an ingredient.
        if item:
            cleaned_ingredients.append(item)

    return cleaned_ingredients

 # load english ingredients and korean ingredients
file_path = '/content/english_unique.txt'
cleaned_ingredients_eng = preprocess_ingredients(file_path)

file_path = '/content/kor_unique.txt'
cleaned_ingredients_kor = preprocess_ingredients(file_path)

# load russian ingredients
# This file is read as one ingredient per line rather than as a bracketed list.
# UTF-8 is requested explicitly (the content is presumably Cyrillic and the
# platform default encoding may differ), and blank lines are skipped so that
# no empty string ends up in the ingredient list.
with open("/content/rus_unique.txt", "r", encoding="utf-8") as file:
    cleaned_ingredients_rus = [line.strip() for line in file if line.strip()]



In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
import pickle
from transformers import AutoTokenizer, AutoModel

import os

from transformers import BertTokenizer, BertModel

#CREATE FUNCTION WHICH GENERATES EMBEDDINGS AND SAVES THEM

def generate_and_save_embeddings(cleaned_ingredients, model, tokenizer, language_code, model_name,
                                 output_dir="/content/drive/MyDrive/MyEmbeddings/"):
    """Embed every ingredient with ``model`` and pickle the result.

    Each ingredient string is encoded on its own (batch size 1) and represented
    by the last-hidden-state vector at position 0, i.e. the [CLS] token for
    BERT-style tokenizers whose ``encode`` adds special tokens.

    Args:
        cleaned_ingredients: Iterable of ingredient strings.
        model: Model called as ``model(input_ids)`` whose first output is the
            last hidden state. It is switched to eval mode.
        tokenizer: Object whose ``encode(word)`` returns a list of token ids.
        language_code: Tag used in the output file name (e.g. ``"KOR"``).
        model_name: Model label used in the output file name.
        output_dir: Existing directory in which the pickle is written.
            Defaults to the Google Drive folder used by this notebook.

    Returns:
        None. Writes ``{model_name}_{language_code}_embeddings.pkl`` containing
        a ``dict`` that maps each ingredient to a 1-D NumPy array, and prints
        the destination path.

    Raises:
        FileNotFoundError: If ``output_dir`` does not exist.
    """
    # Fail before the (potentially slow) embedding loop instead of after it.
    # The directory is deliberately not created here: if Google Drive is not
    # mounted, silently creating the path would hide that problem.
    if not os.path.isdir(output_dir):
        raise FileNotFoundError(f"Output directory does not exist: {output_dir}")

    model.eval()

    # The callers move the model to CUDA when available, so the inputs must be
    # created on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def get_word_embedding(word: str):
        input_ids = torch.tensor(tokenizer.encode(word), device=device).unsqueeze(0)  # Batch size 1
        with torch.no_grad():
            outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden state is the first output element
        # .cpu() is required before .numpy() when the model runs on a GPU.
        return last_hidden_states[0][0].detach().cpu().numpy()

    # Calculate embeddings for all ingredients
    ingredient_embeddings = {
        ingredient: get_word_embedding(ingredient) for ingredient in cleaned_ingredients
    }

    # File name carries both the model label and the language tag.
    file_name = f"{model_name}_{language_code}_embeddings.pkl"
    file_path = os.path.join(output_dir, file_name)

    # Save the embeddings to a .pkl file
    with open(file_path, "wb") as file:
        pickle.dump(ingredient_embeddings, file)

    print(f"Embeddings saved to {file_path}")



# Example usage (a loaded model and tokenizer must be passed explicitly):
# generate_and_save_embeddings(cleaned_ingredients_eng, model, tokenizer, "EN", "xlm-roberta-base")



BERT BASE MULTILINGUAL UNCASED

In [None]:
# Multilingual BERT (uncased): fetch the pretrained tokenizer and encoder by name.
model_name = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Write one embeddings pickle per language.
# NOTE(review): English is tagged "EN" here but "ENG" in the FoodBERT cell, so the
# two models produce differently named English files. The tag is left unchanged
# because it is part of the output file name; confirm which one is intended.
for lang_code, lang_ingredients in (
    ("EN", cleaned_ingredients_eng),
    ("KOR", cleaned_ingredients_kor),
    ("RUS", cleaned_ingredients_rus),
):
    generate_and_save_embeddings(lang_ingredients, model, tokenizer, lang_code, model_name)

FOODBERT

In [None]:
from transformers import BertModel, BertForMaskedLM, BertTokenizer
import torch
import json
# Ingredient list shipped with the FoodBERT data; it is handed to the tokenizer as
# `never_split`. The relative paths below rely on the working directory being the
# cloned repository root.
with open('foodbert/data/used_ingredients.json', 'r') as f:
    used_ingredients = json.load(f)

# Cased tokenizer built from the FoodBERT vocabulary file.
tokenizer = BertTokenizer(
    vocab_file='foodbert/data/bert-base-cased-vocab.txt',
    do_lower_case=False,
    max_len=128,
    never_split=used_ingredients,
)

# Encoder weights come from the final masked-LM checkpoint of the FoodBERT data.
model = BertModel.from_pretrained('foodbert/data/mlm_output/checkpoint-final')

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Write one embeddings pickle per language, all labelled "foodbert".
for lang_code, lang_ingredients in (
    ("ENG", cleaned_ingredients_eng),
    ("KOR", cleaned_ingredients_kor),
    ("RUS", cleaned_ingredients_rus),
):
    generate_and_save_embeddings(lang_ingredients, model, tokenizer, lang_code, "foodbert")

