Import libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from pathlib import Path
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the project root that all data paths are built from.
# When `__file__` is defined (run as a script) the root is the parent of the
# directory containing this file; when it is not (e.g. a notebook kernel) the
# current working directory is used instead.
if "__file__" in globals():
    base_dir = Path(__file__).resolve().parent.parent
else:
    base_dir = Path().resolve()

1. Creating equal classes of 335 rows per eco-grade, as shown in the EDA

In [3]:
# Input
file_path = base_dir / "data" / "final" / "1_cleaned_ingredients.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")
# Drop rows graded 'a-plus' or 'f' before balancing.
df = df[~df['ecoscore_grade'].isin(['a-plus', 'f'])]

# Target number of rows per eco-score grade.
samples_per_grade = 335

# Group by 'ecoscore_grade' and down-sample every grade that has at least
# `samples_per_grade` rows to exactly that many. The threshold must equal the
# sample size: a higher threshold would leave mid-sized grades un-sampled and
# the classes unequal. Grades with fewer rows are kept whole. The fixed
# random_state keeps the draw reproducible.
df_subset = df.groupby('ecoscore_grade').apply(
    lambda x: x.sample(n=samples_per_grade, random_state=42)
    if len(x) >= samples_per_grade
    else x
)

# Reset the index after the groupby operation
df_subset = df_subset.reset_index(drop=True)

# Output
output_path = base_dir / "data" / "final" / "2_cleaned_ingredients_ecograde_subset_test.xlsx"
df_subset.to_excel(output_path, index=False)


2. BERT tokenizer and vector embedding

In [4]:
# Input: read the eco-grade subset workbook
subset_name = "2_cleaned_ingredients_ecograde_subset_test.xlsx"
file_path = base_dir.joinpath("data", "final", subset_name)
df = pd.read_excel(file_path, engine="openpyxl")

# --- Load tokenizer and model ---
# RecipeBERT is used in place of the earlier 'bert-base-uncased' checkpoint.
checkpoint = "alexdseo/RecipeBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Vectorisation function using the "ingredients: ..." input format
def bert_vectorize(text):
    """Return the [CLS] token embedding of ``text`` as a NumPy vector.

    The text is prefixed with ``"ingredients: "`` and tokenised with
    truncation and padding to a fixed length of 128 tokens. Uses the
    module-level ``tokenizer`` and ``model``.
    """
    encoded = tokenizer(
        f"ingredients: {text}",
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # One string in -> batch index 0; token position 0 is [CLS].
    return hidden[0, 0, :].numpy()

# Add BERT vector column to DataFrame.
# Each row is embedded exactly once: bert_vectorize runs a model forward pass
# per row, so repeating this step only repeats the cost for identical results.
df['bert_vector'] = df['cleaned_root_ingredients'].apply(bert_vectorize)

# Stack the per-row vectors into one array (one row per DataFrame row).
X = np.stack(df['bert_vector'].values)

# Print an example shape
print(df['bert_vector'].iloc[0].shape)

# Output
# output_dir = file_path.parent
# np.save(output_dir / "X_embeddings.npy", X)

Some weights of BertModel were not initialized from the model checkpoint at alexdseo/RecipeBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(768,)


In [5]:
# One row per sample: embedding components as columns, plus the grade label.
vector_df = pd.DataFrame(X.tolist()).assign(ecoscore_grade=df['ecoscore_grade'].values)

# Write the table into the same directory as `file_path`.
vectors_csv = file_path.parent / "3_bert_vectors_with_eco_grades_test.csv"
vector_df.to_csv(vectors_csv, index=False)


**References**

How to create BERT vector embeddings (and which model to use): https://airbyte.com/data-engineering-resources/bert-vector-embedding


Preparing data for BERT: https://plainenglish.io/blog/bert-pytorch-implementation-prepare-dataset-part-1