# Appendix 1 : Filter the whole OpenFoodFacts database

In [1]:
import csv
import itertools
import os
from typing import Sequence

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

In [2]:
# Path to the OpenFoodFacts CSV export.
# Set the OFF_FILE environment variable to point to your own copy of the file ; when
# it is not set, the original location of the 2024-07-19 export is used.
OFF_FILE = os.environ.get(
    "OFF_FILE",
    "/home/mathieu/datasets/openfoodfacts/2024-07-19/en.openfoodfacts.org.products.csv",
)

The OpenFoodFacts website links to a file that [describes the data fields](https://static.openfoodfacts.org/data/data-fields.txt).

The first important piece of information is that the OpenFoodFacts CSV file is encoded in UTF-8 and uses the TAB (`\t`) character as its delimiter.

We will read a sample of the file (the first 200,000 rows, see `load_off_csv` below) and see what data types are inferred by `read_csv`.

Then we will assign explicit data types or convert values following a few guidelines :

* Most of the columns in OpenFoodFacts contain nutritional values. Their name ends with `_100g` and their values are [decimal numerals](https://en.wikipedia.org/wiki/Decimal) ;
* Columns ending with `_t` contain [Unix times](https://en.wikipedia.org/wiki/Unix_time) ;
* Columns ending with `_datetime` contain date and time in [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) in the [ISO 8601 format](https://en.wikipedia.org/wiki/ISO_8601) ;
* Columns ending with `_tags` are comma-separated lists of (string) values.

The `object` data type is the default data type in pandas; it is used whenever a column contains non-numerical values.
There is, however, a `string` data type that you can set explicitly for columns that contain textual values (strings).

In [3]:
# Explicit data types for some columns of the OpenFoodFacts CSV export.
# This mapping is passed as the `dtype` argument of `load_off_csv`, which forwards it to
# `pd.read_csv`. Conventions used below:
# - free text, URLs and comma-separated lists are read as "string" ;
# - user names, food groups and similar labels are read as "category", with an ordered
#   `CategoricalDtype` for the grades and the NOVA group ;
# - Unix times and ISO 8601 datetimes are read as "object" and converted after loading ;
# - the "_100g" nutritional columns are not listed here: `load_off_csv` sets every
#   "_100g" column that is absent from the mapping to "float".
OFF_DTYPES = {
    "code": "string",  # barcode
    "url": "string",  # URL on OFF
    # metadata
    # - OFF user names are "category"
    # - Unix times and ISO 8601 datetimes are read as "object" to be cast explicitly later
    "creator": "category",  # OFF user
    "created_t": "object",  # post-processed
    "created_datetime": "object",  # post-processed
    "last_modified_t": "object",  # post-processed
    "last_modified_datetime": "object",  # post-processed
    "last_modified_by": "category",  # OFF user
    "last_updated_t": "object",  # post-processed
    "last_updated_datetime": "object",  # post-processed
    #
    "product_name": "string",
    "abbreviated_product_name": "string",  # non-normalized, noisy ?
    "generic_name": "string",
    "quantity": "string",
    # packaging has a taxonomy
    "packaging": "string",  # list ; sep=','
    "packaging_tags": "string",  # list ; sep=',' ; slightly normalized version of 'packaging'
    "packaging_en": "string",  # list[str] ; sep="," ; really a list of categories
    "packaging_text": "string",  # plain text
    # brands has no taxonomy
    "brands": "string",  # list ; sep="," (to normalize because ",  " occurs)
    "brands_tags": "string",  # list ; sep=',' ; slightly normalized version of 'brands'
    # categories have a taxonomy and are (or should be) normalized
    # => candidates for 'category', list of str, list of 'category' ?
    # https://wiki.openfoodfacts.org/Data_fields#Categories
    "categories": "string",  # list ; sep="," (or ", "? to normalize)
    "categories_tags": "string",  # list ; sep="," ; slightly normalized version of 'categories' ?
    "categories_en": "string",  # list ; sep="," ; plain text EN version of 'categories_tags' ?
    # origins has no taxonomy
    "origins": "string",  # list ; sep=',' ; not normalized
    "origins_tags": "string",  # list ; sep=',' ; slightly normalized version of 'origins' but not enough
    "origins_en": "string",  # list ; sep=',' ; plain text EN version of 'origins_tags' ? not enough
    # manufacturing_places has no taxonomy
    "manufacturing_places": "string",  # list ; sep=',' ; not normalized
    "manufacturing_places_tags": "string",  # list ; sep=',' ; slightly normalized version of 'manufacturing_places'
    # labels has a taxonomy
    "labels": "string",  # list ; sep=',' (or ', '?); not normalized
    "labels_tags": "string",  # list ; sep=',' (or ', '?); slightly normalized version of 'labels'
    "labels_en": "string",  # list ; sep=',' ; plain text EN version of 'labels_tags'
    #
    "emb_codes": "string",  # list ; sep=',' (or ', '?); not normalized
    "emb_codes_tags": "string",  # list ; sep=',' ; slightly normalized version of 'emb_codes'
    # '(lat, lon)'
    "first_packaging_code_geo": "string",  # FIXME
    #
    "cities": "string",  #  list[str] ; sep=',' ; currently empty
    "cities_tags": "string",  # list[str] ; sep=',' ; slightly normalized
    #
    "purchase_places": "string",  # list ; sep=',' ; not normalized
    "stores": "string",  # list ; sep=',' ; not normalized
    #
    "countries": "string",  # list ; sep=',' (or ', ' ?); not normalized
    "countries_tags": "string",  # list ; sep=',' ; normalized version of 'countries'
    "countries_en": "string",  # list ; sep=',' ; plain text EN version of 'countries_tags'
    #
    "ingredients_text": "string",
    "ingredients_tags": "string",  # list ; sep=',' ; normalized version of 'ingredients'
    "ingredients_analysis_tags": "string",  # list ; sep=',' ; normalized values eg. "en:palm-oil-free,en:non-vegan"
    # ...
    # TODO handle like states ? maybe use a cutoff to keep only the most frequent ones, and put the rest in a text column "others" ?
    "allergens": "string",  # list[str] ; sep=',' (or ', ' ?); very noisy, so not currently a list of (normalized) categories
    "allergens_en": "string",  # list[str] ; sep=',' (or ', ' ?); plain text EN version of 'allergens'  # currently empty
    # TODO same (less urgent)
    "traces": "string",  #  list ; sep=',' (or ', ' ?); normalized ?
    "traces_tags": "string",  # list ; sep=',' (or ', ' ?); traces + automatic enrichment?
    "traces_en": "string",  # list ; sep=',' (or ', ' ?); plain text EN version of 'traces_tags'
    # https://wiki.openfoodfacts.org/Data_fields#Serving_size
    "serving_size": "string",  # (2024-07-19: this is column number 50)
    "serving_quantity": "float",  # computed from serving_size
    # https://wiki.openfoodfacts.org/API_Fields
    "no_nutrition_data": "category",  # former name: 'no_nutriments' ; values = "on", "off", "true", "null" ?
    # additives
    "additives_n": "UInt8",  # col 53
    "additives": "string",  # list[str] ; sep=',' ; currently empty !?
    "additives_tags": "string",  # list[str] ; sep=',' (or ', ' ?); normalized
    "additives_en": "string",  # list[str] ; sep=',' (or ', ' ?); plain text EN version of 'additives_tags'
    # palm: grouped and subsumed by new field "ingredients_analysis_tags"
    # 'ingredients_from_palm_oil_n': 'UInt8',  # dropped
    ## 'ingredients_from_palm_oil': 'string',  # list ; sep=',' ; currently empty !?
    # 'ingredients_from_palm_oil_tags': 'string',  # list ; sep=',' (or ', ' ?); normalized
    # 'ingredients_that_may_be_from_palm_oil_n': 'UInt8',
    ## 'ingredients_that_may_be_from_palm_oil': 'string',  # list ; sep=',' ; currently empty !?
    # 'ingredients_that_may_be_from_palm_oil_tags': 'string',  # list ; sep=',' (or ', ' ?); normalized
    # synthetic scores, high-level information for customers
    "nutriscore_score": "Int8",  # or Int64 ? ; col 57
    "nutriscore_grade": CategoricalDtype(
        categories=["a", "b", "c", "d", "e"], ordered=True
    ),  # "unknown" for missing values
    "nova_group": CategoricalDtype(
        categories=["1", "2", "3", "4"], ordered=True
    ),  # empty string for missing values
    "pnns_groups_1": "category",  # there's an 'unknown' cat but also some NaN values !?
    "pnns_groups_2": "category",  # id !?
    "food_groups": "category",  # NEW : str ; either empty string or an atomic value like "en:vegetables"
    "food_groups_tags": "category",  # NEW: list[str] ; sep="," ; ex: "en:fruits-and-vegetables,en:vegetables"
    "food_groups_en": "category",  # NEW: list[str] ; sep="," ; ex: "en:fruits-and-vegetables,en:vegetables"
    # states
    "states": "string",  # post-processed ; list[str] ; sep=", "
    "states_tags": "string",  # post-processed ;  list[str] ; sep=","
    "states_en": "string",  # post-processed ;  list[str] ; sep=","
    #
    "brand_owner": "string",  # ?
    # eco score
    "ecoscore_score": "float64",  # should maybe be Int8 or Int64, but at least 1 float value stored as of 2024-07-19 ; renamed from "ecoscore_score_fr"  # col 69
    "ecoscore_grade": CategoricalDtype(
        categories=["a", "b", "c", "d", "e"], ordered=True
    ),  # renamed from "ecoscore_grade_fr"
    # ?
    "nutrient_levels_tags": "string",  # list[str] ; sep="," ; really a list of categories
    "product_quantity": "float64",  # new ; could be Int64 but many float values as of 2024-07
    "owner": "category",  # new ; a few values with big arity, lots of values with small arity
    "data_quality_errors_tags": "string",  # list[str] ; sep="," ; really a list of categories
    "unique_scans_n": "Int64",
    "popularity_tags": "string",  # list[str] ; sep="," ; really a list of categories
    "completeness": "float64",  # in [0.0, 1.0] ; maximum 4 decimal digits
    # metadata on images
    "last_image_t": "object",  # post-processed
    "last_image_datetime": "object",  # post-processed
    # TODO categories are messy
    "main_category": "string",
    "main_category_en": "string",
    # URLs
    "image_url": "string",
    "image_small_url": "string",
    "image_ingredients_url": "string",
    "image_ingredients_small_url": "string",
    "image_nutrition_url": "string",
    "image_nutrition_small_url": "string",
    #
    # 'nutrition-score-fr_100g': 'Int64',  # Int8 ?
}

In [4]:
# Set-valued columns and the keyword arguments used to expand each of them with
# `replace_set_column` ("sep", and optionally "alt_seps" and "nb_uniq_vals").
SET_COLS = {
    # 'states' and its variants share the same specification:
    # 'states' cols have 42 unique values (as of 2021-08-16) ; we can have 41 at this
    # stage, because we filtered entries with no 'image_small_url' (hence no
    # 'image_url'), which should amount to filtering out the tag
    # 'Front-photo-not-selected'
    **{
        states_col: {
            "sep": ",",
            "alt_seps": r",\s+",
            "nb_uniq_vals": (42, 41),
        }
        for states_col in ("states", "states_tags", "states_en")
    },
    # other columns: canonical separator only, no quality control on the number of values
    "popularity_tags": {"sep": ","},
    "nutrient_levels_tags": {"sep": ","},
}

In [5]:
def find_empty_columns(df, threshold=0.01, verbose=False):
    """List the columns of a DataFrame that are empty or nearly empty.

    A column is reported when its number of non-null values is strictly lower than
    `threshold` times the number of rows.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to inspect.
    threshold : float
        Minimal proportion of non-null values for a column to be considered non-empty.
    verbose : boolean
        If True, print the number of rows, then the name and the non-null count of each
        reported column.

    Returns
    -------
    empty_cols : List[str]
        Names of the (near-)empty columns, in the order of `df.columns`.
    """
    n_total = len(df)
    if verbose:
        print("nb_rows: ", n_total)
    min_filled = threshold * n_total
    sparse_cols = []
    for name in df.columns:
        n_filled = df[name].count()
        if n_filled >= min_filled:
            # enough values: the column is kept
            continue
        if verbose:
            print(name, n_filled)
        sparse_cols.append(name)
    return sparse_cols

In [6]:
def replace_set_column(
    df: pd.DataFrame,
    col_name: str,
    sep: str,
    alt_seps: str | None = None,
    nb_uniq_vals: Sequence[int] | None = None,
) -> pd.DataFrame:
    """Replace a set-valued column with a set of boolean-valued columns.

    A set column has values that are sets of categorical values (concretely in OFF:
    lists of strings, whose linear order in a value is meaningless or can be recovered
    from a taxonomy, and that are taken from a small inventory of possible values).
    A set column can therefore be replaced with a semantically equivalent set of boolean
    columns.
    The process is somehow time-consuming and memory-intensive, but the resulting
    DataFrame has a much smaller memory footprint, and is arguably easier to query.

    Parameters
    ----------
    df :
        Original DataFrame
    col_name :
        Name of the set-valued column to replace.
    sep :
        Canonical separator between values.
    alt_seps :
        Pattern (regex) to capture non-canonical separators between values and replace
        them with the canonical separator, as a preliminary canonicalization step.
        This enables clean processing of non-canonical (legacy?) rows using eg. ", "
        instead of ",".
    nb_uniq_vals :
        Expected numbers of unique values, for quality control.

    Returns
    -------
    upd_df :
        Updated DataFrame.
    """
    # replace non-canonical separators with the canonical one, then split into atomic
    # values
    col_split = df[col_name].str.replace(alt_seps, sep).str.split(sep, expand=False)
    # gather the set of unique atomic values, across all rows
    uniq_vals = set(itertools.chain.from_iterable(col_split))
    # control that the total number of unique values matches our expectation
    # this is a quality check aimed at avoiding generating columns with non-canonical
    # atomic values (ex: " en:toto") due to bad separators that escaped "alt_seps"
    if nb_uniq_vals:
        try:
            assert len(uniq_vals) in nb_uniq_vals
        except AssertionError:
            print(len(uniq_vals))
            # print(uniq_vals_dbg[col_name] - uniq_vals)
            raise
    # create a column for each atomic value in the set
    new_cols = {
        col_name + "__" + uniq_val: col_split.apply(lambda x: uniq_val in x)
        for uniq_val in uniq_vals
    }
    upd_df = df.assign(**new_cols)
    # drop the original column
    upd_df.drop(columns=[col_name], inplace=True)
    return upd_df


In [7]:
def _drop_unusable_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop, in place, the OpenFoodFacts entries that we do not want to keep.

    The filters are applied one after the other, each on the rows that are left.

    Parameters
    ----------
    df :
        DataFrame with the columns 'product_name', 'brands', 'image_small_url', 'code'
        and 'states_en'. It is modified in place.

    Returns
    -------
    df :
        The same DataFrame object, to allow chaining.
    """
    # * entries with missing product_name or brands
    df.drop(df[df["product_name"].isna() | df["brands"].isna()].index, inplace=True)
    # * entries that don't have an image for the (general aspect of the) product
    df.drop(df[df["image_small_url"].isna()].index, inplace=True)
    # (NB : .str methods in pandas are very slow, so it's way faster to apply them after
    # the filters above have removed rows)
    # * barcodes that are not of 8 or 13 characters long (EAN-8 or EAN-13)
    code_len = df["code"].str.len()
    df.drop(df[(code_len != 8) & (code_len != 13)].index, inplace=True)
    # * ambiguous barcodes, because they result in entries we cannot trust:
    # incomplete, mixes of products...
    #   - EAN-8: GS1-8 prefixes 000-099 and 200-299 "Used to issue GS1 restricted circulation number within a company"
    #   (source: https://www.gs1.org/sites/default/files/docs/barcodes/WR15-006%20Updating%20Figures%20in%20General%20Specification_errataAnkurComment.pdf)
    df.drop(df[df["code"].str.fullmatch(r"[02]\d{7}")].index, inplace=True)
    #   - EAN-13: GS1-13 prefix 00000 "Reserved for GS1 Company Prefix equivalent of GS1-8 Prefix"
    df.drop(df[df["code"].str.fullmatch(r"00000[02]\d{7}")].index, inplace=True)
    #   - and just all those starting with 00000 because their URLs on the OFF website are all bad
    df.drop(df[df["code"].str.startswith("00000")].index, inplace=True)
    # * entries that don't have complete categories, then entries that don't have
    # complete nutritional values
    for state in ("Categories to be completed", "Nutrition facts to be completed"):
        df.drop(df[df["states_en"].str.contains(state)].index, inplace=True)
    return df


def load_off_csv(
    filepath_or_buffer,
    dtype=None,
    nrows: int | None = None,
    threshold=0.01,
    replace_set_columns=True,
    split_geo=False,
):
    """Load the OpenFoodFacts CSV file.

    The file is read twice: a sample (the first 200,000 rows) is read first to select
    the columns worth loading, then the requested number of rows is read with these
    columns only. Unwanted rows are dropped from both (see `_drop_unusable_rows`).

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Valid string path to the CSV file. A seekable file-like object is rewound
        between the two reads.
    dtype : dict, optional
        Mapping from column names to data types, passed to `pd.read_csv`. Columns
        whose name ends with "_100g" and that are absent from the mapping are read as
        "float". If a mapping is given, all the other columns of the file are expected
        to be in it. The mapping itself is never modified.
    nrows : int, optional
        Number of rows that should be read ; if None, read all.
    threshold : float
        Threshold on the proportion of non-null values to consider a column non-empty.
        0.01 means we keep all columns where at least 1% of the values are non-empty.
    replace_set_columns : boolean
        If True, replace each set column listed in `SET_COLS` with a list of boolean
        columns.
    split_geo : boolean
        If True, replace the geocode (string) column 'first_packaging_code_geo' with two
        (float) columns for latitude and longitude. Nothing is done if this column was
        filtered out.

    Returns
    -------
    df : pd.DataFrame
        DataFrame containing the OpenFoodFacts tabular data.

    Raises
    ------
    AssertionError
        If `dtype` is given and the file has columns that are neither in `dtype` nor
        named "*_100g" ; diagnostics are printed before the exception is re-raised.
    """
    if (nrows is not None) and (not isinstance(nrows, int)):
        # should not happen
        print(f"nrows is {nrows} but should be an int or None. Assumed None.")
        nrows = None

    # 1. read a sample of the dataset, to get a sense of the columns: dtype, sparseness,
    # number of unique values etc. and filter columns that will be of very little to no
    # use
    # by default, read the first 200_000 lines
    df_sample = pd.read_csv(
        filepath_or_buffer,
        sep="\t",
        dtype=dtype,
        nrows=200_000,
        quoting=csv.QUOTE_NONE,
        low_memory=False,
    )
    # a file-like object was (partly) consumed by the read above: rewind it, otherwise
    # the second read would start in the middle of the data
    seekable = getattr(filepath_or_buffer, "seekable", None)
    if callable(seekable) and seekable():
        filepath_or_buffer.seek(0)

    # 2. all columns were read from the header, and there are probably columns missing
    # from the dtype specification (if any was given), especially among the "_100g" for
    # nutritional values
    if dtype is None:
        dtype = {}
    else:
        # work on a copy, so that the mapping of the caller is left untouched when the
        # "_100g" columns are added below
        dtype = dict(dtype)
        cols_sample_only = set(df_sample.columns.values) - set(dtype.keys())
        try:
            assert all(x.endswith("_100g") for x in cols_sample_only)
        except AssertionError:
            print("Column names absent from dtype that do not end with '_100g':")
            for col_name in cols_sample_only:
                if not col_name.endswith("_100g"):
                    uniq_vals = df_sample[col_name].unique()
                    nuniq_vals = df_sample[col_name].nunique()
                    nonnull_vals = df_sample[col_name].notnull().sum()
                    print(
                        f"- {col_name}: "
                        + f"{nonnull_vals} non-null values, "
                        + f"{nuniq_vals} unique values\n"
                        + f"  {uniq_vals}"
                    )
            print("All column names absent from dtype:")
            print(cols_sample_only)
            print("Column names in dtype absent from the data sample:")
            print(set(dtype.keys()) - set(df_sample.columns.values))
            raise
    # explicitly set all (not yet defined) _100g to float
    for col_name in df_sample.columns:
        if col_name.endswith("_100g"):
            dtype.setdefault(col_name, "float")

    # filter rows of the sample, so that the sparseness of the columns is evaluated on
    # the kind of rows we keep
    _drop_unusable_rows(df_sample)

    # filter columns (to avoid loading) :
    usecols = list(df_sample.columns)
    # * URLs except 'url' and 'image_small_url'
    extra_url_cols = [
        x for x in df_sample.columns if x.endswith("_url") and x != "image_small_url"
    ]
    df_sample.drop(columns=extra_url_cols, inplace=True)
    usecols = [x for x in usecols if x not in extra_url_cols]
    # TODO check they are effective duplicates
    dup_cols = [
        # "_t" are redundant with "_datetime"
        "created_t",
        "last_modified_t",
        "last_updated_t",
        "last_image_t",
        # categories : we only keep the '_en' version of those
        "categories",
        "categories_tags",
        "main_category",
        "origins",
        "origins_tags",
        "countries",
        "countries_tags",
        "brands_tags",
        "labels",
        "labels_tags",
        "packaging_tags",
        "states",
        "states_tags",
        "manufacturing_places_tags",
        "traces",
        "traces_tags",
    ]
    df_sample.drop(columns=dup_cols, inplace=True)
    usecols = [x for x in usecols if x not in dup_cols]
    # * columns that are > 99% empty
    empty_cols = find_empty_columns(df_sample, threshold=threshold, verbose=False)
    df_sample.drop(columns=empty_cols, inplace=True)
    usecols = [x for x in usecols if x not in empty_cols]

    # read the requested amount of data
    df = pd.read_csv(
        filepath_or_buffer,
        sep="\t",
        usecols=usecols,
        dtype=dtype,
        nrows=nrows,
        quoting=csv.QUOTE_NONE,
    )
    # convert columns with unix timestamps and datetimes
    # (the "_t" columns are currently in dup_cols, hence never loaded)
    for col_name in ("created_t", "last_modified_t", "last_updated_t", "last_image_t"):
        if col_name not in usecols:
            continue
        # Unix timestamps
        # NB : adding ".dt.tz_localize('UTC')" results in the same value as in the _datetime field
        df[col_name] = pd.to_datetime(df[col_name], unit="s")
    for col_name in (
        "created_datetime",
        "last_modified_datetime",
        "last_updated_datetime",
        "last_image_datetime",
    ):
        if col_name not in usecols:
            continue
        # ISO 8601 dates
        df[col_name] = pd.to_datetime(df[col_name])

    # filter rows, with the same rules as for the sample
    _drop_unusable_rows(df)
    # remove unused categories for 'creator' and 'last_modified_by'
    # df['creator'].cat.remove_unused_categories()
    # df['last_modified_by'].cat.remove_unused_categories()
    # I'm not entirely sure how to use this and the user guide might not be up-to-date here
    # https://pandas.pydata.org/docs/reference/api/pandas.Series.cat.remove_unused_categories.html

    # filter columns again, because we read the requested data (and not only the first
    # 200k lines) and we dropped entries:
    # * columns that are > 99% empty
    empty_cols = find_empty_columns(df, threshold=threshold, verbose=False)
    df.drop(columns=empty_cols, inplace=True)
    usecols = [x for x in usecols if x not in empty_cols]

    if replace_set_columns:
        # replace each set column with a set of boolean columns ;
        # 'states' columns have 42 atomic values, much lower than the 5903 distinct occurring
        # combinations (as of 2021-08-16)
        for col_name, col_specs in SET_COLS.items():
            if col_name not in usecols:
                continue
            df = replace_set_column(df=df, col_name=col_name, **col_specs)

    if split_geo:
        # first_packaging_code_geo: split latitude and longitude, cast as floats
        col_name = "first_packaging_code_geo"
        # the column may have been filtered out as (nearly) empty: skip it, like the
        # datetime and set columns above
        if col_name in df.columns:
            # keep exactly the first two parts, even when no value could be split
            df_col = df[col_name].str.split(",", expand=True).reindex(columns=[0, 1])
            df_col[0] = pd.to_numeric(df_col[0], errors="coerce")
            df_col[1] = pd.to_numeric(df_col[1], errors="coerce")
            df_col.rename(
                columns={0: col_name + "__" + "lat", 1: col_name + "__" + "lon"},
                inplace=True,
            )
            df = df.join(df_col)
            df.drop(columns=[col_name], inplace=True)
    #
    return df

In [8]:
# Load and filter the whole file (nrows=None), keeping the set-valued columns as plain
# strings (replace_set_columns=False).
df = load_off_csv(
    OFF_FILE, dtype=OFF_DTYPES, nrows=None, replace_set_columns=False
)  # 1_546_028 ok, 1_546_029 not ok
# NOTE(review): the trailing comment above presumably records values of nrows tried
# while debugging a parsing problem — confirm or remove.
# Figures from previous runs (rows kept, columns kept, memory usage, load time):
# old: 416_552 rows, 66 columns, 1.0 GB (76 s)
# 2024:
# - nrows=2_000_000: 411940 rows, 85 columns, 1.5 GB (148 s)
# - nrows=3_333_516: 778725 rows, 81 columns, 2.8 GB (248 to 298 s)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 778725 entries, 1074 to 3333499
Data columns (total 81 columns):
 #   Column                                                 Non-Null Count   Dtype              
---  ------                                                 --------------   -----              
 0   code                                                   778725 non-null  string             
 1   url                                                    778725 non-null  string             
 2   creator                                                778724 non-null  category           
 3   created_datetime                                       778725 non-null  datetime64[ns, UTC]
 4   last_modified_datetime                                 778725 non-null  datetime64[ns, UTC]
 5   last_modified_by                                       772572 non-null  category           
 6   last_updated_datetime                                  766338 non-null  datetime64[ns, UTC]
 7   product_name

In [9]:
# the ten largest entries of the (deep) memory usage, in ascending order
df.memory_usage(deep=True).sort_values().tail(10).to_frame()

Unnamed: 0,0
ingredients_analysis_tags,67191862
product_name,67330594
image_small_url,109971672
url,114245846
nutrient_levels_tags,119973664
categories_en,120609884
ingredients_text,184003965
ingredients_tags,209272621
popularity_tags,363863154
states_en,380210073


In [10]:
# new set columns: "popularity_tags", "ingredients_tags" (?)
# NOTE(review): "ingredients_analysis_tags", previewed here, also looks like a set column
# with a small inventory of values — confirm before adding it to SET_COLS
df["ingredients_analysis_tags"].iloc[:5].values

<StringArray>
[                                                                              <NA>,
                                          'en:palm-oil-free,en:vegan,en:vegetarian',
 'en:palm-oil-content-unknown,en:vegan-status-unknown,en:vegetarian-status-unknown',
                               'en:may-contain-palm-oil,en:non-vegan,en:vegetarian',
 'en:palm-oil-content-unknown,en:vegan-status-unknown,en:vegetarian-status-unknown']
Length: 5, dtype: string

In [11]:
# "ingredients_tags" is a list of categorical values: each ingredient generates 1 to n
# language-prefixed tags
# ex:
# * ingredients_text = "Hafervollkornflocken, Rohrzucker"
# * ingredients_tags = "en:whole-grain-oat-flakes,en:cereal,en:oat,en:oat-flakes,en:cane-sugar,en:added-sugar,en:disaccharide,en:sugar"
# preview of the raw text of the ingredients for the first five entries
df["ingredients_text"].iloc[:5].values

<StringArray>
[                                                                                                                                                                                                                                                                                                        <NA>,
                                                                        'Water, Lime Juice from Concentrate (30%), Acid: Citric Acid; Preservatives: Potassium Sorbate, Sodium Metabisulphite (Sulphites); Flavourings, Antioxidant: Ascorbic Acid; Acidity Regulator: Sodium Citrate; Sweetener: Sucralose.',
                                                                                                                                'pâte 66.7%. farine de BLE  beurre (lait) 18,6%. eau sel levure désactivée.  Garniture compote de pommes 30,5%: purée de pommes concentrée, sirop de glucose fructose (BLE).',
                                                                             

In [12]:
# number of entries with a non-null 'nutriscore_score'
df["nutriscore_score"].count()

652630

In [13]:
# number of non-null values of the 10 sparsest columns (among the columns that passed
# the emptiness threshold of load_off_csv)
df.count().sort_values().head(10)

vitamin-b2_100g              8718
vitamin-b1_100g              8831
vitamin-d_100g              11431
monounsaturated-fat_100g    15327
polyunsaturated-fat_100g    15345
packaging_text              22995
alcohol_100g                23116
potassium_100g              26339
vitamin-a_100g              27497
data_quality_errors_tags    28242
dtype: int64

In [14]:
# display the second entry (head(2) keeps the first two rows, tail(1) the last of them)
df.head(2).tail(1).values

array([['0000131327786',
        'http://world-en.openfoodfacts.org/product/0000131327786/lime-cordial-sainsbury-s',
        'blakejones99', Timestamp('2024-06-01 20:48:44+0000', tz='UTC'),
        Timestamp('2024-06-01 22:13:50+0000', tz='UTC'), 'roboto-app',
        Timestamp('2024-06-01 22:13:50+0000', tz='UTC'), 'Lime Cordial',
        <NA>, '1l', <NA>, <NA>, <NA>, "Sainsbury's", 'Lime-cordial',
        <NA>, <NA>, 'Vegetarian,Vegan,No added sugar', <NA>, <NA>, <NA>,
        "Sainsbury's", 'United Kingdom',
        'Water, Lime Juice from Concentrate (30%), Acid: Citric Acid; Preservatives: Potassium Sorbate, Sodium Metabisulphite (Sulphites); Flavourings, Antioxidant: Ascorbic Acid; Acidity Regulator: Sodium Citrate; Sweetener: Sucralose.',
        'en:water,en:lime-juice-concentrate,en:fruit,en:citrus-fruit,en:juice,en:fruit-juice,en:lime,en:lime-juice,en:acid,en:preservative,en:e223,en:flavouring,en:antioxidant,en:acidity-regulator,en:sweetener,en:e330,en:e202,en:e300,en:sodium-

In [15]:
# data type of each column, as a dict from column name to dtype
df.dtypes.to_dict()

{'code': string[python],
 'url': string[python],
 'creator': CategoricalDtype(categories=['a-avilaaa', 'abdoubasbas', 'acistopogm', 'agamitsudo',
                   'alaetien', 'alban14121999', 'allergies-app-chakib',
                   'allfitnessfactory-de', 'aly22', 'andre',
                   ...
                   'lion76', 'lookatchu', 'malard', 'mordragt', 'purplesam',
                   'randy1156', 'serayet', 'town1997', 'deeulyana', 'jrg2024'],
 , ordered=False),
 'created_datetime': datetime64[ns, UTC],
 'last_modified_datetime': datetime64[ns, UTC],
 'last_modified_by': CategoricalDtype(categories=['5m4u9', 'acistopogm', 'agamitsudo', 'akaandrew', 'aleene',
                   'alexfauquette', 'alexouille', 'alibunt',
                   'allergies-app-chakib', 'angharadpike',
                   ...
                   'philanne', 'themindasrimal', 'xhxhd', 'yamum', 'zangele',
                   'afracnicus', 'backpedal', 'fitster', 'soefo', 'yanisro'],
 , ordered=False),
 'la

In [16]:
def dump_dtype(df: pd.DataFrame, f_out: str) -> None:
    """Dump the dtype of a DataFrame to a text file, usable for read_csv.

    The file contains a Python dict literal (one column per line) that maps each column
    name to a dtype. It can be read back with `eval`, provided that `CategoricalDtype`
    is imported.

    The dtype objects cannot be dumped as is, because we use pandas new types for
    strings, categories etc. whose textual representation cannot always be evaluated
    back. Each dtype is therefore written as follows:

    * ordered category: an explicit `CategoricalDtype(categories=[...], ordered=True)`
      expression (the only non-string values of the dict) ;
    * unordered category: "category" (the categories are inferred again on reading) ;
    * string: "string" ;
    * datetime (with or without time zone): "object", because datetimes are dumped
      as text and must be parsed again after reading ;
    * any other dtype: its name, eg. "float64", "UInt8", "Int8", "Int64".

    Parameters
    ----------
    df :
        DataFrame
    f_out :
        Path to the text file to which the dtype will be written.
    """
    entries = []
    for col_name, col_dtype in df.dtypes.items():
        if isinstance(col_dtype, pd.CategoricalDtype):
            if col_dtype.ordered:
                # build the expression ourselves rather than rely on repr(), whose
                # output is not guaranteed to be accepted by the constructor
                categories = col_dtype.categories.tolist()
                dtype_expr = (
                    f"CategoricalDtype(categories={categories!r}, ordered=True)"
                )
            else:
                dtype_expr = repr("category")
        elif isinstance(col_dtype, pd.StringDtype):
            dtype_expr = repr("string")
        elif pd.api.types.is_datetime64_any_dtype(col_dtype):
            # dumped and read as objects, re-parsed to datetimes each time
            dtype_expr = repr("object")
        else:
            # numeric, boolean and other dtypes: their name is understood by read_csv
            dtype_expr = repr(str(col_dtype))
        entries.append(f"    {col_name!r}: {dtype_expr},")

    with open(f_out, "w") as f:
        f.write("{\n" + "\n".join(entries) + "\n}\n")

In [17]:
# dump the dtypes of the filtered dataset, so that the dumped dataset can be read back
# with the same dtypes
DTYPE_FILE = "../data/processed/dtype.txt"
dump_dtype(df=df, f_out=DTYPE_FILE)

In [18]:
# path of the file to which the filtered dataset is dumped (in the next cell)
FILTERED_OFF = "../data/processed/off_products_subset.csv"


In [19]:
# same conventions as the original file: TAB as delimiter, no quoting ; the index is
# not written
df.to_csv(FILTERED_OFF, sep="\t", index=False, quoting=csv.QUOTE_NONE)

In [20]:
# open new files to check consistency with the original
# WARNING: eval() executes whatever code the file contains. It is used here because the
# dump is a dict that may hold `CategoricalDtype(...)` expressions (hence the import of
# `CategoricalDtype` at the top), which `ast.literal_eval` cannot parse. Only evaluate a
# dtype file that you generated yourself.
with open(DTYPE_FILE) as f:
    new_dtype = eval(f.read())

In [21]:
# read the dumped dataset back, with the dumped dtypes
new_df = pd.read_csv(FILTERED_OFF, sep="\t", dtype=new_dtype, quoting=csv.QUOTE_NONE)
# datetimes are read as text: parse (ISO 8601 dates) those that are in the dataset
datetime_cols = (
    "created_datetime",
    "last_modified_datetime",
    "last_updated_datetime",
    "last_image_datetime",
)
for col_name in [x for x in datetime_cols if x in new_df.columns]:
    new_df[col_name] = pd.to_datetime(new_df[col_name])

In [22]:
# first two entries of the dataset read back from the dump
new_df.head(2)

Unnamed: 0,code,url,creator,created_datetime,last_modified_datetime,last_modified_by,last_updated_datetime,product_name,generic_name,quantity,...,vitamin-a_100g,vitamin-d_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,potassium_100g,calcium_100g,iron_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,nutrition-score-fr_100g
0,101209159,http://world-en.openfoodfacts.org/product/0000...,kiliweb,2018-02-22 10:56:57+00:00,2023-04-28 23:59:01+00:00,roboto-app,2024-02-09 14:48:49+00:00,Véritable pâte à tartiner noisettes chocolat noir,,350 g,...,,,,,,,,,,23.0
1,131327786,http://world-en.openfoodfacts.org/product/0000...,blakejones99,2024-06-01 20:48:44+00:00,2024-06-01 22:13:50+00:00,roboto-app,2024-06-01 22:13:50+00:00,Lime Cordial,,1l,...,,,,,,,,,30.0,
