# INDEX
* [Imports](#Imports)
* [Configuration](#Configuration)
* [Prepare dataset](#Prepare-dataset)
* [Build model](#Build-model)
    * [Model inputs](#Model-inputs)
    * [Model output](#Model-output)
    * [Model](#Model)
* [Train model](#Train-model)
    * [Save model and resources](#Save-model-and-resources)
    * [Training stats](#Training-stats)
* [Test model](#Test-model)
    * [Predict with training model](#Predict-with-training-model)
    * [Predict with serving model](#Predict-with-serving-model)

In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
# eventual initialization for colab notebooks
if IN_COLAB:
  # we try hard to be re-entrant,
  # that is to be able to rerun this without cloning repository more than once
  COLAB_BRANCH = "master"
  !curl https://raw.githubusercontent.com/openfoodfacts/off-category-classification/$COLAB_BRANCH/lib/colab.py --output /content/colab.py
  !cd /content && python /content/colab.py $COLAB_BRANCH
  %cd /content/off-category-classification/experiments

In [2]:
# codecarbon - start tracking
# (the matching `tracker.stop()` call is in the last cell of the notebook)
from codecarbon import EmissionsTracker

tracker = EmissionsTracker(
    log_level="WARNING",
    save_to_api=True,
    experiment_id="6d2c8401-afba-42de-9600-6e95bea5fd80",
)
tracker.start()

[codecarbon ERROR @ 20:26:33] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/mlco2/codecarbon/issues/244 for workarounds : [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj'
[codecarbon ERROR @ 20:26:33] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/mlco2/codecarbon/issues/244 for workarounds : [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj'
[codecarbon ERROR @ 20:26:34] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/mlco2/codecarbon/issues/244 for workarounds : [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj'
[codecarbon ERROR @ 20:26:34] Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power. Please view https://github.com/

# Imports

In [3]:
import sys
# The project packages imported below (`lib`, `datasets`) are expected to live
# one directory above this notebook; the path is relative, so it depends on the
# kernel's current working directory.
sys.path.append('../') #append a relative path to the top package to the search path

In [4]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from tensorflow.keras import callbacks, layers
from tensorflow.keras.utils import plot_model

from lib.dataset import *
from lib.directories import init_cache_dir, init_model_dir
from lib.io import load_model, save_model
from lib.model import top_labeled_predictions, top_predictions_table
from lib.plot import plot_training_stats

# Configuration

In [5]:
# Base locations; they are passed to `init_model_dir` / `init_cache_dir`
# in the training section before being used.
MODEL_BASE_DIR = pathlib.Path('../model')
CACHE_DIR = pathlib.Path('../tensorflow_cache')

PREPROC_BATCH_SIZE = 10_000  # some large value, only affects execution time

# splits are handled by `tfds.load`, see doc for more elaborate ways to sample
# Both bounds of every slice carry an explicit `%`, so the three splits read
# unambiguously as disjoint 5% windows of the `train` split.
TRAIN_SPLIT = 'train[70%:75%]'
VAL_SPLIT = 'train[80%:85%]'
TEST_SPLIT = 'train[90%:95%]'
MAX_EPOCHS = 5

# Prepare dataset

Run this once to fetch, build and cache the dataset.
Further runs will be no-ops, unless you force operations (see TFDS doc).

Once this is done, call `load_dataset('off_categories', ...)` to access the dataset.

In [6]:
# Imported for its side effect only: the name is not referenced in this cell.
# Presumably this registers the 'off_categories' builder with TFDS — TODO confirm.
import datasets.off_categories

# Look the builder up by name, then fetch/build the dataset
# (per the note above, a no-op once it has been prepared).
builder = tfds.builder('off_categories')
builder.download_and_prepare()

# Or run via command line (if `tfds` is in the path):
# !cd ../datasets && tfds build off_categories

# Build model

In [7]:
# Set TensorFlow's global random seed before any of the model layers below are created.
tf.random.set_seed(42)

# Taxonomy information

In [8]:
import json
from lib.taxonomy import Taxonomy
! ls category_taxonomy.json || wget https://github.com/openfoodfacts/robotoff-models/releases/download/keras-category-classifier-xx-2.0/category_taxonomy.json

taxo = Taxonomy.from_data(json.load(open('category_taxonomy.json')))

category_taxonomy.json


## Model inputs

In [9]:
# Registries keyed by feature name: each model-input cell below writes its own
# entry, so rerunning a single cell overwrites that entry instead of adding a
# duplicate (i.e. reruns are idempotent).
inputs = dict()        # feature name -> Keras input
input_graphs = dict()  # feature name -> graph built on top of that input

In [10]:
# Training split, loaded without the `features` / `as_supervised` options used
# for training; the cells below read it to adapt the text vectorizer and to
# build the ingredient and category vocabularies.
ds = load_dataset('off_categories', split=TRAIN_SPLIT)

In [11]:
%%time

# Input branch for the product name: string -> token ids -> embedding -> BiLSTM.
feature_name = 'product_name'

# One string per example.
product_name_input = tf.keras.Input(shape=(1,), dtype=tf.string, name=feature_name)

# Whitespace tokenizer; the vocabulary is capped at `max_tokens` and every
# output sequence has length 30.
product_name_vectorizer = layers.TextVectorization(
    split = 'whitespace',
    max_tokens = 93_000,
    output_sequence_length = 30)

# Learn the vocabulary from the product names of the training split (`ds`).
product_name_vectorizer.adapt(
    select_feature(ds, feature_name).batch(PREPROC_BATCH_SIZE))

x = product_name_vectorizer(product_name_input)

# Embedding table sized to the adapted vocabulary.
# NOTE(review): mask_zero is False, so padded positions are presumably fed to
# the LSTM like any other token — confirm this is intended.
x = layers.Embedding(
    input_dim = product_name_vectorizer.vocabulary_size(),
    output_dim = 64,
    mask_zero = False)(x)

product_name_graph = layers.Bidirectional(layers.LSTM(
    units = 64,
    recurrent_dropout = 0.2,
    dropout = 0.0))(x)

# Register this branch under its feature name (overwrites on rerun).
inputs[feature_name] = product_name_input
input_graphs[feature_name] = product_name_graph

# Cell output: size of the vocabulary actually learned.
len(product_name_vectorizer.get_vocabulary())

CPU times: user 3.51 s, sys: 1.14 s, total: 4.65 s
Wall time: 1.32 s


22002

In [12]:
%%time

# Input branch for the ingredient tags: list of strings -> multi-hot vector.
feature_name = 'ingredients_tags'

# Variable number of tags per example (shape=(None,)).
ingredients_input = tf.keras.Input(shape=(None,), dtype=tf.string, name=feature_name)

# Vocabulary built from the training split (`ds`), restricted by `min_freq`
# and `max_tokens`.
ingredients_vocab = get_vocabulary(
    flat_batch(select_feature(ds, feature_name), batch_size=PREPROC_BATCH_SIZE),
    min_freq = 3,
    max_tokens = 5_000)

# Multi-hot encoding of the tags over that vocabulary.
ingredients_graph = layers.StringLookup(
    vocabulary = ingredients_vocab,
    output_mode = 'multi_hot')(ingredients_input)

# Register this branch under its feature name (overwrites on rerun).
inputs[feature_name] = ingredients_input
input_graphs[feature_name] = ingredients_graph

# Cell output: number of ingredient tags kept.
len(ingredients_vocab)

CPU times: user 4.36 s, sys: 1.59 s, total: 5.96 s
Wall time: 1.18 s


5000

## Model output

In [15]:
%%time

# Name of the label feature.
# NOTE(review): `labels` is rebound later in the notebook to the vocabulary
# returned by `load_model`; it holds a string only until then.
labels = 'categories_tags'

# Category vocabulary built from the training split (`ds`); categories seen
# fewer than `min_freq` times are presumably dropped — confirm in `get_vocabulary`.
categories_vocab = get_vocabulary(
    flat_batch(select_feature(ds, labels), batch_size=PREPROC_BATCH_SIZE),
    min_freq = 10)

# StringLookup(output_mode='multi_hot') mode requires num_oov_indices >= 1.
# We don't want OOVs in the categories_tags output layer, since it wouldn't make
# sense to predict OOV. So we'll drop the OOV in _transform below.
# Be careful when using StringLookup methods, some of them will return values
# based on a vocabulary with OOV (e.g. vocabulary_size()). Keep this in mind when
# mapping predictions back to the original vocabulary.
categories_multihot = layers.StringLookup(
    vocabulary = categories_vocab,
    output_mode = 'multi_hot',
    num_oov_indices = 1)

# Cell output: number of categories the model can predict (OOV excluded).
len(categories_vocab)

CPU times: user 4.42 s, sys: 1.8 s, total: 6.23 s
Wall time: 1.26 s


1241

## Model

In [18]:
# a specific model class that does not penalize certain categories
from lib.taxonomy_mask import MaskingModel

In [19]:
# Sorting the feature names makes the input order independent of the order in
# which the input cells were executed or inserted.
features = sorted(inputs.keys())

# Values shared by the output layer and the metrics.
threshold = 0.5
num_labels = len(categories_vocab)

# Head of the network: concat -> dropout -> dense -> dropout -> relu -> sigmoid.
ordered_graphs = [input_graphs[name] for name in features]
x = layers.Concatenate()(ordered_graphs)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64)(x)
x = layers.Dropout(0.2)(x)
x = layers.Activation('relu')(x)
# One sigmoid unit per category in the vocabulary (multi-label output).
output = layers.Dense(num_labels, activation='sigmoid')(x)

ordered_inputs = [inputs[name] for name in features]
model = MaskingModel(inputs=ordered_inputs, outputs=[output])

training_metrics = [
    tf.metrics.Precision(thresholds=threshold, name='precision'),
    tf.metrics.Recall(thresholds=threshold, name='recall'),
    tfa.metrics.F1Score(average='micro', threshold=threshold, num_classes=num_labels, name='f1_score_micro'),
    tfa.metrics.F1Score(average='macro', threshold=threshold, num_classes=num_labels, name='f1_score_macro'),
]

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.0),
    metrics=training_metrics,
)

In [20]:
model.summary()

Model: "masking_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 product_name (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 30)          0           ['product_name[0][0]']           
 ization)                                                                                         
                                                                                                  
 ingredients_tags (InputLayer)  [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 30, 64)       1408128     ['text_vectorization[

In [21]:
plot_model(model, show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


# Train model

In [22]:
# helpers to add features and encode
from lib.taxonomy_mask import TaxonomyTransformer, binarize_compat 

# Bound method of a transformer built from the taxonomy loaded above; it is
# applied to the train/val datasets before `categories_encode`.
add_compatible_categories = TaxonomyTransformer(taxo).add_compatible_categories

In [23]:
def categories_encode(ds: tf.data.Dataset):
    """Encode the category information of each example in `ds`.

    - the labels `y` are turned into a multi-hot vector (OOV slot removed)
    - the "compat" feature of `x` is turned into a binary mask

    Examples are then passed through `filter_empty_labels`.
    `ds` must not be batched yet.
    """
    @tf.function
    @tf.autograph.experimental.do_not_convert
    def _transform(x, y):
        multi_hot = categories_multihot(y)
        # index 0 is the OOV slot of the lookup layer: slice it off
        multi_hot = multi_hot[1:]
        # the compatibility feature is binarized with the same lookup layer
        encoded_x = binarize_compat(x, categories_multihot, "compat")
        return (encoded_x, multi_hot)

    # element-wise encoding first, then the empty-label filter
    encoded = ds.map(_transform, num_parallel_calls=tf.data.AUTOTUNE, deterministic=True)
    return encoded.apply(filter_empty_labels)

In [24]:
# Remember to clean obsolete dirs once in a while
MODEL_DIR = init_model_dir(MODEL_BASE_DIR)
CACHE_DIR = init_cache_dir(CACHE_DIR)

batch_size = 128


def _encoded_batches(split, cache_name):
    """Load `split`, add compatible categories, encode, pad-batch and cache it.

    The cache is written under `CACHE_DIR / cache_name`.
    """
    examples = load_dataset('off_categories', split=split, features=features, as_supervised=True)
    examples = examples.apply(add_compatible_categories)
    examples = examples.apply(categories_encode)
    batches = examples.padded_batch(batch_size)
    return batches.cache(str(CACHE_DIR / cache_name))


# Train and validation sets go through exactly the same pipeline.
ds_train = _encoded_batches(TRAIN_SPLIT, 'train')
ds_val = _encoded_batches(VAL_SPLIT, 'val')

Model directory: ../model.20220824-205334
Cache directory: ../tensorflow_cache


In [25]:
%%time

# Train for at most MAX_EPOCHS epochs, validating on `ds_val` after each epoch.
history = model.fit(
    ds_train,
    epochs = MAX_EPOCHS,
    validation_data = ds_val,
    callbacks = [
        # abort training as soon as the loss becomes NaN
        callbacks.TerminateOnNaN(),
        # keep a checkpoint under MODEL_DIR each time `val_loss` improves;
        # epoch number and val_loss are embedded in the directory name
        # NOTE(review): confirm `save_format` is honoured by ModelCheckpoint
        # in the installed Keras version.
        callbacks.ModelCheckpoint(
            filepath = str(MODEL_DIR / "weights.{epoch:02d}-{val_loss:.4f}"),
            monitor = 'val_loss',
            save_best_only = True,
            save_format = 'tf',
        ),
        # stop when `val_loss` has not improved for 4 consecutive epochs
        callbacks.EarlyStopping(monitor='val_loss', patience=4),
        # per-epoch metrics; read back in the "Training stats" section
        callbacks.CSVLogger(str(MODEL_DIR / 'training.log')),
        # NOTE(review): Keras presumably attaches a History callback on its own,
        # which would make this entry redundant — confirm before removing.
        callbacks.History()
    ]
)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


    326/Unknown - 335s 1s/step - loss: 0.0603 - binary_accuracy: 0.9885 - precision: 0.0049 - recall: 0.0102INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.01-0.0178/assets


INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.01-0.0178/assets


Epoch 2/5


INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.02-0.0172/assets


Epoch 3/5


INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.03-0.0167/assets


Epoch 4/5


INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.04-0.0160/assets


Epoch 5/5


INFO:tensorflow:Assets written to: ../model.20220824-205334/weights.05-0.0154/assets


CPU times: user 25min 56s, sys: 1min 16s, total: 27min 13s
Wall time: 13min 27s


## Training stats

In [26]:
# Read back the per-epoch metrics written by the CSVLogger callback during training.
training_log_path = MODEL_DIR / 'training.log'
stats = pd.read_csv(training_log_path)
stats

Unnamed: 0,epoch,binary_accuracy,loss,precision,recall,val_binary_accuracy,val_loss,val_precision,val_recall
0,0,0.988514,0.060309,0.004866,0.010168,0.99627,0.017836,0.0,0.0
1,1,0.99626,0.018097,0.583765,0.001749,0.996275,0.017249,0.625438,0.002792
2,2,0.996277,0.017504,0.626408,0.012372,0.996305,0.016658,0.711507,0.015816
3,3,0.996322,0.016878,0.69662,0.030607,0.996372,0.016045,0.797069,0.036546
4,4,0.996387,0.01628,0.704906,0.059554,0.996434,0.015445,0.809166,0.057301


In [27]:
# Plot the training statistics loaded from `training.log` in the previous cell.
plot_training_stats(stats)

## Save model and resources

In [28]:
# Final export location, inside the model directory created for this run.
SAVED_MODEL_DIR = MODEL_DIR / 'saved_model'

@tf.function
def serving_func(*args, **kwargs):
    """Run the model and return its top 50 predictions with their labels.

    All arguments are forwarded unchanged to `model`; the raw predictions are
    post-processed by `top_labeled_predictions` using `categories_vocab`.
    """
    preds = model(*args, **kwargs)
    return top_labeled_predictions(preds, categories_vocab, k=50)

# Save the model together with the category vocabulary and the serving function.
save_model(SAVED_MODEL_DIR, model, categories_vocab, serving_func)

INFO:tensorflow:Assets written to: ../model.20220824-205334/saved_model/assets


INFO:tensorflow:Assets written to: ../model.20220824-205334/saved_model/assets


# Test model

In [29]:
# Reload what was just saved, to test the exported artefact rather than the
# in-memory training model.
# NOTE(review): this rebinds `labels`, which earlier held the string
# 'categories_tags'; from here on it holds the second value returned by
# `load_model` (used below as the label list for `top_labeled_predictions`).
m, labels = load_model(SAVED_MODEL_DIR)

In [30]:
# Held-out test split (TEST_SPLIT), loaded without the label encoding used for training.
ds_test = load_dataset('off_categories', split=TEST_SPLIT)

## Predict with serving model

In [31]:
%%time

# Predict on the test split with the reloaded model, in padded batches of 128.
preds_test = m.predict(ds_test.padded_batch(128))
# Cell output: the prediction array.
preds_test

  inputs = self._flatten_to_reference_inputs(inputs)


CPU times: user 24.6 s, sys: 1.83 s, total: 26.5 s
Wall time: 5.41 s


array([[2.47663021e-01, 2.17880249e-01, 1.16226524e-01, ...,
        1.43647194e-04, 1.61200762e-04, 1.63912773e-04],
       [3.13569546e-01, 2.74577320e-01, 8.84172618e-02, ...,
        3.95774841e-04, 4.25547361e-04, 4.45872545e-04],
       [3.36490154e-01, 3.01163048e-01, 6.55404329e-02, ...,
        3.38256359e-04, 4.05877829e-04, 4.16815281e-04],
       ...,
       [2.86023498e-01, 2.50528723e-01, 9.69553292e-02, ...,
        2.57015228e-04, 2.80886889e-04, 2.90870667e-04],
       [3.50041270e-01, 3.05147827e-01, 7.35341907e-02, ...,
        5.99324703e-04, 6.28679991e-04, 6.70313835e-04],
       [2.58788496e-01, 2.27135181e-01, 1.09421253e-01, ...,
        1.69306993e-04, 1.88469887e-04, 1.92373991e-04]], dtype=float32)

In [32]:
# This is the function exported as the default serving function in our saved model
top_preds_test = top_labeled_predictions(preds_test, labels, k=3)
top_preds_test

(<tf.Tensor: shape=(42880, 3), dtype=float32, numpy=
 array([[0.24766302, 0.21788025, 0.11622652],
        [0.31356955, 0.27457732, 0.13280305],
        [0.33649015, 0.30116305, 0.1177021 ],
        ...,
        [0.2860235 , 0.25052872, 0.11411557],
        [0.35004127, 0.30514783, 0.15778136],
        [0.2587885 , 0.22713518, 0.10942125]], dtype=float32)>,
 <tf.Tensor: shape=(42880, 3), dtype=string, numpy=
 array([[b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         b'en:snacks'],
        [b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         b'en:beverages'],
        [b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         b'en:meats'],
        ...,
        [b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         b'en:beverages'],
        [b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         b'en:beverages'],
        [b'en:plant-based-foods-and-beverages', b'en:plant-based-foods',
         

In [33]:
%%time

# Same data, but pretty
pred_table_test = top_predictions_table(top_preds_test)

# Add some interpretable features to the final table
# Table must be row-aligned with predictions above (= taken from same data sample)
extra_cols_test = as_dataframe(select_features(ds_test, ['code', 'product_name']))

pd.concat([extra_cols_test, pred_table_test], axis=1)

CPU times: user 14.6 s, sys: 1.87 s, total: 16.5 s
Wall time: 9.09 s


Unnamed: 0,code,product_name,top prediction 1,top prediction 2,top prediction 3
0,0637793029422,Blueberry preserve,en:plant-based-foods-and-beverages: 24.77%,en:plant-based-foods: 21.79%,en:snacks: 11.62%
1,5400141381191,Mousseline de pommes,en:plant-based-foods-and-beverages: 31.36%,en:plant-based-foods: 27.46%,en:beverages: 13.28%
2,3250392084080,Choux-Fleurs Bio,en:plant-based-foods-and-beverages: 33.65%,en:plant-based-foods: 30.12%,en:meats: 11.77%
3,8425275510085,Aceite de oliva virgen extra,en:plant-based-foods-and-beverages: 36.90%,en:plant-based-foods: 32.29%,en:beverages: 17.45%
4,4260193516047,Yörem Ayran,en:plant-based-foods-and-beverages: 30.26%,en:plant-based-foods: 26.39%,en:beverages: 12.39%
...,...,...,...,...,...
42875,2325616003473,Muslo de pato en confit,en:plant-based-foods-and-beverages: 27.41%,en:plant-based-foods: 24.06%,en:beverages: 10.70%
42876,8711542001906,Drop,en:snacks: 27.84%,en:sweet-snacks: 22.86%,en:plant-based-foods-and-beverages: 21.36%
42877,0754527010720,Xperimental IPA,en:plant-based-foods-and-beverages: 28.60%,en:plant-based-foods: 25.05%,en:beverages: 11.41%
42878,3596710347087,Fleur de sel de l'île de Ré recoltée à la main,en:plant-based-foods-and-beverages: 35.00%,en:plant-based-foods: 30.51%,en:beverages: 15.78%


In [34]:
# codecarbon - stop tracking
# The value returned by `stop()` is displayed as the cell output — presumably
# the emissions estimate for this run; see the codecarbon docs for the unit.
tracker.stop()

9.896983892725066e-06