# Appendix 1 : Filter the whole OpenFoodFacts database

In [1]:
import itertools

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

In [2]:
# Local path to the OpenFoodFacts CSV export (the directory name records the 2024-07-19 snapshot) ;
# this is an absolute path on the author's machine, adjust it to your environment.
OFF_FILE = "/home/mathieu/datasets/openfoodfacts/2024-07-19/en.openfoodfacts.org.products.csv"

The OpenFoodFacts website links to a file that [describes the data fields](https://static.openfoodfacts.org/data/data-fields.txt).

The first important information is that the OpenFoodFacts CSV file is encoded in UTF-8 and uses the TAB (`\t`) character as delimiter.

We will read the first 1,000 rows and see what data types are inferred by `read_csv`.

Then we will assign explicit data types or convert values following a few guidelines :

* Most of the columns in OpenFoodFacts contain nutritional values. Their name ends with `_100g` and their values are [decimal numerals](https://en.wikipedia.org/wiki/Decimal) ;
* Columns ending with `_t` contain [Unix times](https://en.wikipedia.org/wiki/Unix_time) ;
* Columns ending with `_datetime` contain date and time in [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) in the [ISO 8601 format](https://en.wikipedia.org/wiki/ISO_8601) ;
* Columns ending with `_tags` contain comma-separated lists of (string) values.

The `object` data type is the default data type in pandas: it is used whenever a column contains non-numerical values.
There is, however, a dedicated `string` data type that you can set explicitly for columns that contain textual values (strings).

In [3]:
# Explicit data types for some columns: maps a column name to the dtype passed to `pd.read_csv`.
# Columns that are not listed here get the dtype inferred by pandas, except the remaining
# `_100g` columns, which `load_off_csv` sets to float.
# Entries that are commented out are kept as notes on columns that are empty or dropped.
OFF_DTYPES = {
    'code': 'string',  # barcode
    'url': 'string',  # URL on OFF
    # metadata
    # - OFF user names are "category"
    # - Unix times and ISO 8601 datetimes are read as "object" to be cast explicitly later
    'creator': 'category',  # OFF user
    'created_t': 'object',  # post-processed
    'created_datetime': 'object',  # post-processed
    'last_modified_t': 'object',  # post-processed
    'last_modified_datetime': 'object',  # post-processed
    'last_modified_by': 'category',  # OFF user
    'last_updated_t': 'object',  # post-processed
    'last_updated_datetime': 'object',  # post-processed
    #
    'product_name': 'string',
    # 'abbreviated_product_name': 'string',  # currently empty (?)
    'generic_name': 'string',
    'quantity': 'string',
    # packaging has a taxonomy
    'packaging': 'string',  # list ; sep=','
    'packaging_tags': 'string',  # list ; sep=',' ; slightly normalized version of 'packaging'
    "packaging_en": "string",  # list[str] ; sep="," ; really a list of categories
    'packaging_text': 'string',  # plain text
    # brands has no taxonomy
    'brands': 'string',  # list ; sep="," (to normalize because ",  " occurs)
    'brands_tags': 'string',  # list ; sep=',' ; slightly normalized version of 'brands'
    # categories have a taxonomy and are (or should be) normalized
    # => candidates for 'category', list of str, list of 'category' ?
    # https://wiki.openfoodfacts.org/Data_fields#Categories
    'categories': 'string',  # list ; sep="," (or ", "? to normalize)
    'categories_tags': 'string',  # list ; sep="," ; slightly normalized version of 'categories' ?
    'categories_en': 'string',  # list ; sep="," ; plain text EN version of 'categories_tags' ?
    # origins has no taxonomy
    'origins': 'string',  # list ; sep=',' ; not normalized
    'origins_tags': 'string',  # list ; sep=',' ; slightly normalized version of 'origins' but not enough
    'origins_en': 'string',  # list ; sep=',' ; plain text EN version of 'origins_tags' ? not enough
    # manufacturing_places has no taxonomy
    'manufacturing_places': 'string',  # list ; sep=',' ; not normalized
    'manufacturing_places_tags': 'string',  # list ; sep=',' ; slightly normalized version of 'manufacturing_places'
    # labels has a taxonomy
    'labels': 'string',  # list ; sep=',' (or ', '?); not normalized
    'labels_tags': 'string',  # list ; sep=',' (or ', '?); slightly normalized version of 'labels'
    'labels_en': 'string',  # list ; sep=',' ; plain text EN version of 'labels_tags'
    #
    'emb_codes': 'string',  # list ; sep=',' (or ', '?); not normalized
    'emb_codes_tags': 'string',  # list ; sep=',' ; slightly normalized version of 'emb_codes'
    # '(lat, lon)'
    'first_packaging_code_geo': 'string',  # FIXME
    #
    # 'cities': 'string',  #  list ; sep=',' ; currently empty !?
    'cities_tags': 'string',  # list ; sep=',' ; slightly normalized
    #
    'purchase_places': 'string',  # list ; sep=',' ; not normalized
    'stores': 'string',  # list ; sep=',' ; not normalized
    #
    'countries': 'string',  # list ; sep=',' (or ', ' ?); not normalized
    'countries_tags': 'string',  # list ; sep=',' ; normalized version of 'countries'
    'countries_en': 'string',  # list ; sep=',' ; plain text EN version of 'countries_tags'
    #
    'ingredients_text': 'string',
    "ingredients_tags": "string",  # list ; sep=',' ; normalized version of 'ingredients'
    "ingredients_analysis_tags": "string",  # list ; sep=',' ; normalized values eg. "en:palm-oil-free,en:non-vegan"
    # ...
    # TODO handle like states ? maybe use a cutoff to keep only the most frequent ones, and put the rest in a text column "others" ?
    'allergens': 'string',  # list ; sep=',' (or ', ' ?); normalized ?
    # 'allergens_en': 'string',  # list ; sep=',' (or ', ' ?); plain text EN version of 'allergens'  # currently empty
    # TODO same (less urgent)
    'traces': 'string',  #  list ; sep=',' (or ', ' ?); normalized ?
    'traces_tags': 'string',  # list ; sep=',' (or ', ' ?); traces + automatic enrichment?
    'traces_en': 'string',  # list ; sep=',' (or ', ' ?); plain text EN version of 'traces_tags'
    # https://wiki.openfoodfacts.org/Data_fields#Serving_size
    'serving_size': 'string',  # (2024-07-19: this is column number 50)
    'serving_quantity': 'float',  # computed from serving_size
    # https://wiki.openfoodfacts.org/API_Fields
    "no_nutrition_data": "category",  # former name: 'no_nutriments' ; values = "on", "off", "true", "null" ?
    # additives
    'additives_n': 'UInt8',  # col 53
    # 'additives': 'string',  # list ; sep=',' ; currently empty !?
    'additives_tags': 'string',  # list ; sep=',' (or ', ' ?); normalized
    'additives_en': 'string',  # list ; sep=',' (or ', ' ?); plain text EN version of 'additives_tags'
    # palm: dropped and subsumed by other fields (?)
    # 'ingredients_from_palm_oil_n': 'UInt8',  # dropped
    ## 'ingredients_from_palm_oil': 'string',  # list ; sep=',' ; currently empty !?
    # 'ingredients_from_palm_oil_tags': 'string',  # list ; sep=',' (or ', ' ?); normalized
    # 'ingredients_that_may_be_from_palm_oil_n': 'UInt8',
    ## 'ingredients_that_may_be_from_palm_oil': 'string',  # list ; sep=',' ; currently empty !?
    # 'ingredients_that_may_be_from_palm_oil_tags': 'string',  # list ; sep=',' (or ', ' ?); normalized
    # synthetic scores, high-level information for customers
    'nutriscore_score': 'Int8',    # or Int64 ? ; col 57
    'nutriscore_grade': CategoricalDtype(categories=['a', 'b', 'c', 'd', 'e'], ordered=True), # "unknown" for missing values
    'nova_group': CategoricalDtype(categories=['1', '2', '3', '4'], ordered=True),  # empty string for missing values
    'pnns_groups_1': 'category',  # there's an 'unknown' cat but also some NaN values !?
    'pnns_groups_2': 'category',  # id !?
    "food_groups": "category",  # NEW : str ; either empty string or an atomic value like "en:vegetables"
    "food_groups_tags": "category",  # NEW: list[str] ; sep="," ; ex: "en:fruits-and-vegetables,en:vegetables"
    "food_groups_en": "category",  # NEW: list[str] ; sep="," ; ex: "en:fruits-and-vegetables,en:vegetables"
    # states
    'states': 'string',  # post-processed ; list[str] ; sep=", "
    'states_tags': 'string',  # post-processed ;  list[str] ; sep=","
    'states_en': 'string',  # post-processed ;  list[str] ; sep=","
    #
    'brand_owner': 'string',  # ?
    # eco score
    'ecoscore_score': 'float64',  # should maybe be Int8 or Int64, but at least 1 float value stored as of 2024-07-19 ; renamed from "ecoscore_score_fr"  # col 69
    'ecoscore_grade': CategoricalDtype(categories=['a', 'b', 'c', 'd', 'e'], ordered=True),  # renamed from "ecoscore_grade_fr"
    # ?
    "nutrient_levels_tags": "string",  # list[str] ; sep="," ; really a list of categories
    # "product_quantity": "Int64",  # new, ?
    # "owner": "", # new, ??
    "data_quality_errors_tags": "string",  # list[str] ; sep="," ; really a list of categories
    "unique_scans_n": "Int64",
    "popularity_tags": "string",  # list[str] ; sep="," ; really a list of categories
    "completeness": "float64",  # in [0.0, 1.0] ; maximum 4 decimal digits
    # metadata on images
    "last_image_t": "object",  # post-processed
    "last_image_datetime": "object",  # post-processed
    # TODO categories are messy
    'main_category': 'string',
    'main_category_en': 'string',
    # URLs
    'image_url': 'string',
    'image_small_url': 'string',
    'image_ingredients_url': 'string',
    'image_ingredients_small_url': 'string',
    'image_nutrition_url': 'string',
    'image_nutrition_small_url': 'string',
    #
    # 'nutrition-score-fr_100g': 'Int64',  # Int8 ?
}

In [4]:
def find_empty_columns(df, threshold=0.01, verbose=False):
    """List the columns that hold (almost) no values.

    A column is reported when its number of non-null values is strictly
    lower than `threshold` times the number of rows.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to inspect.
    threshold : float
        Minimal proportion of non-null values for a column to count as non-empty.
    verbose : boolean
        If True, print the number of rows, then the name and non-null count
        of each column that is reported.

    Returns
    -------
    empty_cols : List[str]
        Names of the empty columns, in the order of `df.columns`.
    """
    total_rows = len(df.index)
    if verbose:
        print('nb_rows: ', total_rows)
    # lowest number of non-null values for a column to be kept
    min_filled = threshold * total_rows
    sparse_columns = []
    for name in df.columns:
        filled = df[name].count()
        if not (filled < min_filled):
            continue
        if verbose:
            print(name, filled)
        sparse_columns.append(name)
    return sparse_columns

In [37]:
def _drop_untrusted_rows(df):
    """Drop, in place, the entries that we cannot use or trust.

    The same filters are applied to the sample and to the full dataset.
    Missing values never match a text filter (`na=False`), so an entry is only
    dropped by such a filter when its value is present and matches.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with (at least) the columns 'product_name', 'brands',
        'image_small_url', 'code' and 'states_en'. It is modified in place.
    """
    # * entries with missing product_name or brands
    df.drop(df[df['product_name'].isna() | df['brands'].isna()].index,
            inplace=True)
    # * entries that don't have an image for the (general aspect of the) product
    df.drop(df[df['image_small_url'].isna()].index,
            inplace=True)
    # * barcodes that are not of 8 or 13 characters long (EAN-8 or EAN-13)
    code_len = df['code'].str.len()
    df.drop(df[(code_len != 8) & (code_len != 13)].index,
            inplace=True)
    # * ambiguous barcodes, because they result in entries we cannot trust:
    # incomplete, mixes of products...
    #   - EAN-8: GS1-8 prefixes 000-099 and 200-299 "Used to issue GS1 restricted circulation number within a company"
    #   (source: https://www.gs1.org/sites/default/files/docs/barcodes/WR15-006%20Updating%20Figures%20in%20General%20Specification_errataAnkurComment.pdf)
    df.drop(df[df['code'].str.fullmatch(r'[02]\d{7}', na=False)].index,
            inplace=True)
    #   - EAN-13: all those starting with 00000 because their URLs on the OFF website are all bad ;
    #   this includes the GS1-13 prefix 00000 "Reserved for GS1 Company Prefix equivalent of
    #   GS1-8 Prefix" (codes matching 00000[02]\d{7}), so no separate filter is needed for it
    df.drop(df[df['code'].str.startswith('00000', na=False)].index,
            inplace=True)
    # * entries that don't have complete categories
    df.drop(df[df['states_en'].str.contains('Categories to be completed', na=False)].index,
            inplace=True)
    # * entries that don't have complete nutritional values
    df.drop(df[df['states_en'].str.contains('Nutrition facts to be completed', na=False)].index,
            inplace=True)


def load_off_csv(filepath_or_buffer, nrows=None, threshold=0.01,
                 replace_set_columns=True, split_geo=False):
    """Load the OpenFoodFacts CSV file.

    The file is read twice: a sample of (at most) the first 200,000 rows is used to
    decide which columns are worth loading, then the requested number of rows is
    read with these columns only. Untrusted entries and near-empty columns are
    filtered out.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Valid string path to the CSV file. A file-like object must be seekable
        because the data is read twice.
    nrows : int, optional
        Number of rows that should be read ; if None, read all.
    threshold : float
        Threshold on the proportion of non-null values to consider a column non-empty.
        0.01 means we keep all columns where at least 1% of the values are non-empty.
    replace_set_columns : boolean
        If True, replace each set column (currently 'states', 'states_tags' and 'states_en')
        with a list of boolean columns.
    split_geo : boolean
        If True, replace a geocode (string) column with two (float) columns for latitude and
        longitude.

    Returns
    -------
    df : pd.DataFrame
        DataFrame containing the OpenFoodFacts tabular data.

    Raises
    ------
    AssertionError
        If `replace_set_columns` is True and a set column does not have the expected
        number of distinct values (41 or 42).
    """
    # work on a copy, so that the module-level OFF_DTYPES is not modified by the call
    dtype = dict(OFF_DTYPES)
    # read the first 200_000 lines of the dataset to get a good sample of the values in each column
    # (nrows=None means "read all", so it cannot be compared with min())
    sample_nrows = 200_000 if nrows is None else min(nrows, 200_000)
    df_sample = pd.read_csv(
        filepath_or_buffer,
        sep="\t",
        dtype=dtype,
        nrows=sample_nrows,
        low_memory=False
    )
    # explicitly set all (remaining) _100g to float
    for col_name in df_sample.columns:
        if col_name.endswith('_100g') and col_name not in dtype:
            dtype[col_name] = 'float'
    # filter rows of the sample, so that column emptiness is measured on the entries we keep
    _drop_untrusted_rows(df_sample)

    # filter columns (to avoid loading) :
    usecols = list(df_sample.columns)
    # * URLs except 'url' and 'image_small_url'
    extra_url_cols = [x for x in df_sample.columns if x.endswith('_url') and x != 'image_small_url']
    df_sample.drop(columns=extra_url_cols, inplace=True)
    usecols = [x for x in usecols if x not in extra_url_cols]
    # TODO check they are effective duplicates
    dup_cols = [
        # "_t" are redundant with "_datetime"
        'created_t', 'last_modified_t', "last_updated_t", "last_image_t",
        # categories : we only keep the '_en' version of those
        'categories', 'categories_tags', 'main_category',
        'origins', 'origins_tags',
        'countries', 'countries_tags',
        'brands_tags',
        'labels', 'labels_tags',
        'packaging_tags',
        'states', 'states_tags',
        'manufacturing_places_tags',
        'traces', 'traces_tags',
    ]
    df_sample.drop(columns=dup_cols, inplace=True)
    usecols = [x for x in usecols if x not in dup_cols]
    # * columns that are > 99% empty
    empty_cols = find_empty_columns(df_sample, threshold=threshold, verbose=False)
    df_sample.drop(columns=empty_cols, inplace=True)
    usecols = [x for x in usecols if x not in empty_cols]

    # a file-like object has been (partly) consumed by the first read: rewind it
    if hasattr(filepath_or_buffer, 'seek'):
        filepath_or_buffer.seek(0)
    # read the requested amount of data
    df = pd.read_csv(
        filepath_or_buffer,
        sep='\t',
        usecols=usecols,
        dtype=dtype,
        nrows=nrows,
    )
    # convert columns with unix timestamps and datetimes
    for col_name in ("created_t", "last_modified_t", "last_updated_t", "last_image_t"):
        if col_name not in usecols:
            continue
        # Unix timestamps
        # NB : adding ".dt.tz_localize('UTC')" results in the same value as in the _datetime field
        df[col_name] = pd.to_datetime(df[col_name], unit='s')
    for col_name in ("created_datetime", "last_modified_datetime", "last_updated_datetime", "last_image_datetime"):
        if col_name not in usecols:
            continue
        # ISO 8601 dates
        df[col_name] = pd.to_datetime(df[col_name])

    # filter rows (same filters as for the sample)
    _drop_untrusted_rows(df)
    # remove unused categories for 'creator' and 'last_modified_by'
    # df['creator'].cat.remove_unused_categories()
    # df['last_modified_by'].cat.remove_unused_categories()
    # I'm not entirely sure how to use this and the user guide might not be up-to-date here
    # https://pandas.pydata.org/docs/reference/api/pandas.Series.cat.remove_unused_categories.html

    # filter columns (again, because we read the full dataset (and not the first 200k lines)
    # and we dropped entries):
    # * columns that are > 99% empty
    empty_cols = find_empty_columns(df, threshold=threshold, verbose=False)
    df.drop(columns=empty_cols, inplace=True)
    usecols = [x for x in usecols if x not in empty_cols]

    if replace_set_columns:
        # replace each set column with a set of boolean columns ;
        # 'states' columns have 42 atomic values, much lower than the 5903 distinct occurring
        # combinations (as of 2021-08-16)
        for col_name in ('states', 'states_tags', 'states_en'):
            if col_name not in usecols:
                continue
            # lists of values in OFF should use ',' as separator but some fields contain
            # variants such as ', ', ',  ' (either for all values, or just some outliers !) ;
            # missing values are left out here and get False in every boolean column
            col_split = (df[col_name].dropna()
                         .str.replace(r',\s+', ',', regex=True)
                         .str.split(',', expand=False))
            uniq_vals = set(itertools.chain.from_iterable(col_split))
            # 'states' cols have 42 unique values (as of 2021-08-16)
            # we can have 41 because we filtered entries with no 'image_small_url' (hence no 'image_url')
            # which should amount to filtering out the tag 'Front-photo-not-selected'
            assert len(uniq_vals) in (41, 42), (
                f"unexpected number of distinct values in '{col_name}': {len(uniq_vals)}"
            )
            # create a column for each atomic value in the set (sorted, so that the order
            # of the new columns is the same from one run to the next)
            for uniq_val in sorted(uniq_vals):
                has_val = col_split.apply(lambda x, val=uniq_val: val in x)
                df[col_name + '__' + uniq_val] = has_val.reindex(df.index, fill_value=False)
            # drop the original column
            df.drop(columns=[col_name], inplace=True)

    # first_packaging_code_geo: split latitude and longitude, cast as floats ;
    # nothing to split if the column has been dropped as (near) empty
    geo_col = 'first_packaging_code_geo'
    if split_geo and geo_col in df.columns:
        # keep exactly two parts, even when a value has no comma or more than one
        df_col = df[geo_col].str.split(',', expand=True).reindex(columns=[0, 1])
        df_col[0] = pd.to_numeric(df_col[0], errors='coerce')
        df_col[1] = pd.to_numeric(df_col[1], errors='coerce')
        df_col.rename(columns={0: geo_col + '__' + 'lat', 1: geo_col + '__' + 'lon'},
                      inplace=True)
        df = df.join(df_col)
        df.drop(columns=[geo_col], inplace=True)
    #
    return df

In [38]:
# Load and filter the dataset ; replace_set_columns=False keeps the 'states_en' column as is.
# Full read, currently disabled:
# df = load_off_csv(OFF_FILE, nrows=None, replace_set_columns=False)
# NOTE(review): the row cap below looks like the bisection of a loading problem (reading
# 1_546_028 rows is noted as ok, 1_546_029 as not ok) -- confirm what "not ok" refers to.
df = load_off_csv(OFF_FILE, nrows=1_546_029, replace_set_columns=False)  # 1_546_028 ok, 1_546_029 not ok
# figures noted for another run (they do not match the output below): 416_552 rows, 66 columns, 1.0 GB (76 s)
df.info(memory_usage='deep')

  df = pd.read_csv(


<class 'pandas.core.frame.DataFrame'>
Index: 81335 entries, 1074 to 1535997
Data columns (total 84 columns):
 #   Column                                                 Non-Null Count  Dtype              
---  ------                                                 --------------  -----              
 0   code                                                   81335 non-null  object             
 1   url                                                    81335 non-null  object             
 2   creator                                                81335 non-null  object             
 3   created_datetime                                       81335 non-null  datetime64[ns, UTC]
 4   last_modified_datetime                                 81335 non-null  datetime64[ns, UTC]
 5   last_modified_by                                       80687 non-null  object             
 6   last_updated_datetime                                  79449 non-null  datetime64[ns, UTC]
 7   product_name          

In [44]:
# display the last entry of the filtered dataset
df.iloc[-1]

code                                                                                         3543971371009
url                                                      http://world-en.openfoodfacts.org/product/3543...
creator                                                                                            kiliweb
created_datetime                                                                 2017-08-13 07:05:38+00:00
last_modified_datetime                                                           2023-11-20 02:16:50+00:00
                                                                               ...                        
calcium_100g                                                                                           NaN
iron_100g                                                                                              NaN
magnesium_100g                                                                                         NaN
fruits-vegetables-nuts-estimate-from-

In [None]:
# number of entries that have a (non-null) nutriscore_score
df['nutriscore_score'].count()

348265

In [None]:
# number of non-null values of the 10 sparsest columns (sparsest first)
df.count().sort_values().head(10)

magnesium_100g                     4295
vitamin-b1_100g                    4485
polyunsaturated-fat_100g           7649
monounsaturated-fat_100g           7655
potassium_100g                     7689
ingredients_from_palm_oil_tags     9397
vitamin-a_100g                    11760
brand_owner                       12689
trans-fat_100g                    13074
cholesterol_100g                  13098
dtype: int64

In [None]:
# display the second entry (head(2) keeps the first two rows, tail(1) the last of them)
# as a raw array of values
df.head(2).tail(1).values

array([['0000159487776',
        'http://world-en.openfoodfacts.org/product/0000159487776/milkyway-magic-stars-chocolates',
        'usda-ndb-import',
        Timestamp('2017-03-09 16:01:56+0000', tz='UTC'),
        Timestamp('2020-04-22 20:31:56+0000', tz='UTC'),
        'Milkyway, magic stars chocolates', <NA>, <NA>, <NA>, 'Milkyway',
        'Snacks,Sweet snacks,Cocoa and its products,Confectioneries,Chocolate candies',
        <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, 'United States',
        'Sugar, cocoa butter, skimmed milk powder, cocoa mass, whey powder (from milk), lactose, milk fat, emulsifier (soya lecithin), natural vanilla extract.',
        <NA>, <NA>, '100 g (100 g)', 100.0, 1, 'en:e322,en:e322i',
        'E322 - Lecithins,E322i - Lecithin', 0, <NA>, 0, <NA>, <NA>, nan,
        '4', 'Sugary snacks', 'Sweets',
        'To be completed,Nutrition facts completed,Ingredients completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be compl

In [None]:
# data type of each column, as a dict {column name: dtype}
df.dtypes.to_dict()

{'code': StringDtype,
 'url': StringDtype,
 'creator': CategoricalDtype(categories=['a-avilaaa', 'acistopogm', 'agamitsudo', 'alaetien',
                   'alban14121999', 'aleene', 'allfitnessfactory-de', 'andre',
                   'averment', 'b7',
                   ...
                   'bitcoding', 'callumnsmith', 'gwla10', 'lion76', 'locness3',
                   'malard', 'melissa123456', 'pducrot', 'serayet', 'tabea'],
                  ordered=False),
 'created_datetime': datetime64[ns, UTC],
 'last_modified_datetime': datetime64[ns, UTC],
 'product_name': StringDtype,
 'generic_name': StringDtype,
 'quantity': StringDtype,
 'packaging': StringDtype,
 'brands': StringDtype,
 'categories_en': StringDtype,
 'origins_en': StringDtype,
 'manufacturing_places': StringDtype,
 'labels_en': StringDtype,
 'emb_codes': StringDtype,
 'emb_codes_tags': StringDtype,
 'purchase_places': StringDtype,
 'stores': StringDtype,
 'countries_en': StringDtype,
 'ingredients_text': StringDtype,
 

In [None]:
# dump the dtype of the filtered dataset
# the dtypes cannot be used as is, because we use pandas new types (strings, categories...)
# *unless* we import all the necessary dtypes at the beginning, but that would be...
# (the only exception we make is for the ordered categories)
# => every dtype is dumped as a plain string that `read_csv` understands, except the ordered
# categories that are dumped as a `CategoricalDtype(...)` expression
dump_dtype = {}
dump_dtype['creator'] = 'category'
cols_string = [(col_name, 'string') for col_name in df.select_dtypes('string').columns.values]
dump_dtype.update(cols_string)
# datetimes are dumped as 'object' because they are parsed after loading
cols_datetime = [(col_name, 'object') for col_name in df.select_dtypes('datetime64[ns, UTC]').columns.values]
dump_dtype.update(cols_datetime)
cols_float = [(col_name, 'float') for col_name in df.select_dtypes('float').columns.values]
dump_dtype.update(cols_float)
cols_uint8 = [(col_name, 'UInt8') for col_name in df.select_dtypes('UInt8').columns.values]
dump_dtype.update(cols_uint8)
cols_int8 = [(col_name, 'Int8') for col_name in df.select_dtypes('Int8').columns.values]
dump_dtype.update(cols_int8)
# remaining columns
for col_name, col_dtype in df.dtypes.to_dict().items():
    if col_name in dump_dtype:
        continue
    if isinstance(col_dtype, CategoricalDtype):
        # unordered categories (OFF user names, food groups...) are dumped as 'category' :
        # their list of values can be long, and its repr is abbreviated and spans several lines
        dump_dtype[col_name] = col_dtype if col_dtype.ordered else 'category'
    else:
        # eg. 'Int64', 'object', 'bool' : the name of the dtype can be evaluated back,
        # whereas the repr of the dtype object (eg. "Int64Dtype()") needs more imports
        dump_dtype[col_name] = str(col_dtype)


def _dtype_literal(col_dtype):
    """Return a Python expression (as text) that evaluates back to `col_dtype`."""
    if isinstance(col_dtype, CategoricalDtype):
        # written out explicitly rather than with repr(), which is meant for display and
        # is not guaranteed to be a valid call to the constructor
        return 'CategoricalDtype(categories={!r}, ordered={!r})'.format(
            list(col_dtype.categories), bool(col_dtype.ordered))
    return repr(col_dtype)


#
DTYPE_FILE = '../data/processed/dtype.txt'
# the dump is a dict literal on a single line, without the line breaks that used to
# confuse python on Google colab
with open(DTYPE_FILE, 'w') as f:
    dump_items = ', '.join('{!r}: {}'.format(col_name, _dtype_literal(col_dtype))
                           for col_name, col_dtype in dump_dtype.items())
    print('{' + dump_items + '}', file=f)

In [None]:
# dump the filtered dataset
# path of the output file, relative to the notebook
FILTERED_OFF = '../data/processed/off_products_subset.csv'


In [None]:
# write the filtered dataset with the same delimiter (TAB) as the original file, without the index
df.to_csv(FILTERED_OFF, sep='\t', index=False)

In [None]:
# open new files to check consistency with the original
# WARNING: eval() executes whatever Python code the file contains, so this must only be
# used on a dtype file written by this notebook, never on a file from an untrusted source.
# The names used in the dump (eg. CategoricalDtype) must be in scope for eval() to succeed.
with open(DTYPE_FILE) as f:
    new_dtype = eval(f.read())

In [None]:
# reload the filtered dataset with the dumped dtypes
new_df = pd.read_csv(FILTERED_OFF, sep='\t', dtype=new_dtype)
# convert columns with datetimes : every column ending with '_datetime' (not only
# 'created_datetime' and 'last_modified_datetime') holds ISO 8601 dates that were
# read back as plain text
for col_name in new_df.columns:
    if col_name.endswith('_datetime'):
        # ISO 8601 dates
        new_df[col_name] = pd.to_datetime(new_df[col_name])

In [None]:
# display the first two entries of the reloaded dataset
new_df.head(2)

Unnamed: 0,code,url,creator,created_datetime,last_modified_datetime,product_name,generic_name,quantity,packaging,brands,...,sodium_100g,alcohol_100g,vitamin-a_100g,vitamin-c_100g,vitamin-b1_100g,potassium_100g,calcium_100g,iron_100g,magnesium_100g,nutrition-score-fr_100g
0,101209159,http://world-en.openfoodfacts.org/product/0000...,kiliweb,2018-02-22 10:56:57+00:00,2020-01-18 19:26:31+00:00,Véritable pâte à tartiner noisettes chocolat noir,,350 g,,Bovetti,...,0.004,,,,,,,,,23.0
1,159487776,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,2017-03-09 16:01:56+00:00,2020-04-22 20:31:56+00:00,"Milkyway, magic stars chocolates",,,,Milkyway,...,,,,,,,,,,
