Import libraries

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from pathlib import Path
tqdm.pandas()

In [None]:
# Work out the project base directory.
# As a script, `__file__` is defined and the base is the parent of the directory
# that contains this file. In a notebook kernel `__file__` is undefined, so the
# resolved current working directory is used instead.
try:
    _source_file = __file__
except NameError:
    base_dir = Path().resolve()
else:
    base_dir = Path(_source_file).resolve().parent.parent

1. Creating equal classes - from the cleaned ingredients dataset, create a balanced subset of the data by sampling 335 rows from each 'ecoscore_grade' group. 335 was taken from the EDA which revealed that the ecoscore_grade with the fewest rows had 335 rows. Save resulting subset of data as a new Excel file for further analysis.

In [None]:
# Input: cleaned ingredients workbook
file_path = base_dir / "data" / "final" / "1_cleaned_ingredients.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Rows to keep per grade and the seed used for every per-group draw
SAMPLES_PER_GRADE = 335
SAMPLE_SEED = 42

# Group by 'ecoscore_grade' and sample SAMPLES_PER_GRADE rows from each group;
# groups with fewer rows than that are kept whole.
# The groups are iterated explicitly instead of using groupby().apply(): newer
# pandas versions deprecate handing the grouping column to apply(), which would
# drop 'ecoscore_grade' from the result once the index is reset. Iteration
# visits the groups in the same sorted-key order, so the selected rows and
# their order are unchanged.
balanced_parts = [
    grade_rows.sample(n=SAMPLES_PER_GRADE, random_state=SAMPLE_SEED)
    if len(grade_rows) >= SAMPLES_PER_GRADE
    else grade_rows
    for _, grade_rows in df.groupby('ecoscore_grade')
]

# Concatenate with a fresh 0..n-1 index. pd.concat rejects an empty list, so an
# input with no groups falls back to an empty frame with the same columns.
if balanced_parts:
    df_subset = pd.concat(balanced_parts, ignore_index=True)
else:
    df_subset = df.iloc[0:0].copy()

# Output
output_path = base_dir / "data" / "final" / "2_cleaned_ingredients_ecograde_subset.xlsx"
df_subset.to_excel(output_path, index=False)


2. BERT tokenizer and vector embedding - load the balanced subset of the cleaned ingredients dataset, initialise the RecipeBERT tokenizer and model, and define a function to generate BERT vector embeddings for each ingredient list. Apply this function to the DataFrame, creating a new column with the embeddings and stacking them into a NumPy array for further analysis.

In [None]:
# Input: the subset workbook written by the previous step
file_path = base_dir.joinpath("data", "final", "2_cleaned_ingredients_ecograde_subset.xlsx")
df = pd.read_excel(file_path, engine="openpyxl")

# Load tokenizer and model from the same pretrained checkpoint
RECIPEBERT_CHECKPOINT = "alexdseo/RecipeBERT"
tokenizer = AutoTokenizer.from_pretrained(RECIPEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(RECIPEBERT_CHECKPOINT)

# Vectorisation function with expected RecipeBERT input format
def bert_vectorize(text):
    """Embed one ingredient list and return its [CLS] vector as a NumPy array.

    The text is prefixed with "ingredients: ", tokenised with truncation and
    padding to a fixed length of 128 tokens, and passed through the
    module-level `model` with gradient tracking disabled.

    Parameters
    ----------
    text : object
        The ingredient list; it is interpolated into the prompt string.

    Returns
    -------
    numpy.ndarray
        The hidden state at sequence position 0, with size-1 dimensions
        squeezed away.
    """
    prompt = f"ingredients: {text}"
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Sequence position 0 is the [CLS] token
    cls_state = hidden_states[:, 0, :]
    return cls_state.squeeze().numpy()

# Add BERT vector column to DataFrame.
# `progress_apply` is the tqdm-aware counterpart of `apply`, made available by
# the `tqdm.pandas()` call in the imports cell. It returns the same values but
# shows a progress bar, which matters here because every row triggers its own
# model forward pass.
df['bert_vector'] = df['cleaned_root_ingredients'].progress_apply(bert_vectorize)

# Stack the per-row vectors into one array (first axis = rows of df)
X = np.stack(df['bert_vector'].values)

# Print an example shape
print(df['bert_vector'].iloc[0].shape)

3. Create final dataframe - create a DataFrame from the BERT vector embeddings (`X`) and add the corresponding `ecoscore_grade` labels from the original DataFrame. Save the resulting DataFrame as a CSV file so that the BERT embeddings can be used alongside their ecoscore labels in the predictive models.

In [None]:
# One column per embedding dimension (BERT vectors), with the label column appended last
vector_df = pd.DataFrame(X.tolist()).assign(ecoscore_grade=df['ecoscore_grade'].values)

# Write the CSV into the same directory as the workbook `file_path` points to
vectors_csv_path = file_path.with_name("3_bert_vectors_with_eco_grades.csv")
vector_df.to_csv(vectors_csv_path, index=False)

**References**

Prepping data for BERT - https://plainenglish.io/blog/bert-pytorch-implementation-prepare-dataset-part-1

BERT
https://airbyte.com/data-engineering-resources/bert-vector-embedding