In [1]:
# `autoreload` re-imports modules that changed on disk before each cell is executed,
# so edits to project code are picked up without restarting the kernel.
# `%autoreload 2` selects the mode that applies this to all imported modules.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from urllib import request

from pandas.io.json import json_normalize
import pandas as pd

from robotoff.products import ProductDataset

import os
import sys
# Add the parent directory of `sys.path[0]` to the module search path (at position 1),
# presumably so that the project-local `src` package imported below can be resolved.
# NOTE(review): this relies on `sys.path[0]` being the notebook's directory when the
# kernel starts — confirm; the insert must stay before the `src` import.
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from src.data.explore_data import extract_values

In [3]:
URL_LABELS_FULL = "https://world.openfoodfacts.org/data/taxonomies/labels.full.json"
# Socket timeout in seconds. Without one, a stalled connection blocks the
# kernel indefinitely instead of raising an error.
REQUEST_TIMEOUT_SECONDS = 30

# Download the full labels taxonomy and parse the JSON payload into a dict
# keyed by label id.
with request.urlopen(URL_LABELS_FULL, timeout=REQUEST_TIMEOUT_SECONDS) as response:
    # Decode explicitly as UTF-8 (same as the implicit default of `bytes.decode`).
    labels_full = json.loads(response.read().decode("utf-8"))

In [4]:
# Count the entries stored under the "name" and "image" keys of the taxonomy.
# NOTE(review): `extract_values` is a project helper; presumably it collects
# every value found under the given key in the nested structure — confirm.
label_names = extract_values(labels_full, 'name')
print(f"How many labels registered ?\t{len(label_names)}")
image_links = extract_values(labels_full, 'image')
print(f"How many links to images ?\t{len(image_links)}")

How many labels registered ?	1015
How many links to images ?	10


In [7]:
# List the flattened columns whose name starts with "image".
# NOTE(review): in the cell order shown here, `df_sparse_labels` is first
# assigned in a later cell, so this cell fails on a fresh kernel
# (Restart & Run All). That later cell also drops `image.en`, so this check
# only lists it when run on the frame before the drop.
is_image_column = df_sparse_labels.columns.str.startswith("image")
df_sparse_labels.columns[is_image_column]

Index(['image.en'], dtype='object')

In [8]:
# Convert the nested taxonomy JSON into a structured pandas DataFrame.
# Each taxonomy entry is normalized on its own, which yields dotted column
# names such as "image.en"; `concat` then aligns the pieces on the union of
# all columns, so properties a label does not define become NaN.
df_sparse_labels = pd.concat(
    (
        json_normalize(label_properties)
        for label_properties in labels_full.values()
    ),
    sort=True,
    ignore_index=True
)

# Use the taxonomy keys as the index. This relies on every entry producing
# exactly one row, in the same order as the dict is iterated.
df_sparse_labels.index = list(labels_full)
df_sparse_labels.index.name = "label_id"

# Drop the columns that are not needed for the analysis.
is_synonym_column = df_sparse_labels.columns.str.startswith('synonyms.')
list_synonyms = df_sparse_labels.columns[is_synonym_column].tolist()
columns_to_drop = [
    'address.en',
    'city.en',
    'country.en',
    'postalcode.en',
    'image.en',
    'region.en',
    'website.en',
] + list_synonyms
# errors="ignore": the taxonomy is downloaded live, so any of these optional
# columns may be absent; a missing one must not abort the whole cell.
df_sparse_labels = df_sparse_labels.drop(columns=columns_to_drop, errors="ignore")

# Combine the sparse per-language "name.<lang>" columns into a single `name`
# column holding one {language: name} dict per label.
NAME_PREFIX = 'name.'
starts_with_name = df_sparse_labels.columns.str.startswith(NAME_PREFIX)
sparse_columns = df_sparse_labels.columns[starts_with_name].tolist()
# Strip the prefix by slicing instead of `str.replace`: depending on the
# pandas version the pattern is treated as a regex (where "." matches any
# character and the match is not anchored to the start) or as a literal.
languages = [column[len(NAME_PREFIX):] for column in sparse_columns]
# Languages a label has no name for are kept in its dict with a NaN value.
df_sparse_labels['name'] = [
    dict(zip(languages, names))
    for names in df_sparse_labels.loc[:, sparse_columns].values.tolist()
]
df_sparse_labels = df_sparse_labels.drop(columns=sparse_columns)

In [9]:
# Show the labels whose id starts with "fr:" and that have no children,
# without the (all-null for this selection) `children` column.
is_french_label = df_sparse_labels.index.str.startswith("fr:")
has_no_children = df_sparse_labels.children.isnull()
french_leaf_labels = df_sparse_labels.loc[is_french_label & has_no_children, :]
french_leaf_labels.drop('children', axis=1)

Unnamed: 0_level_0,parents,wikidata.en,name
label_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fr:entreprise-du-patrimoine-vivant,,Q3214304,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:mondial-du-rose,[fr:vinalies],,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:eleve-en-fut-de-chene,,,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:mis-en-bouteille-a-la-propriete,,,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:medaille-de-bronze-du-concours-de-bordeaux-2012,[fr:medaille-de-bronze-du-concours-de-bordeaux],,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:viande-ovine-francaise,[en:french-meat],,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:medaille-d-argent-du-concours-mondial-de-bruxelles-2013,[en:silver-medal-of-the-concours-mondial-de-br...,,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:viande-de-veau-francais,[en:french-meat],,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:savourez-l-alsace,,,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
fr:medaille-de-bronze-du-concours-de-bordeaux-2013,[fr:medaille-de-bronze-du-concours-de-bordeaux],,"{'ar': nan, 'bg': nan, 'ca': nan, 'cs': nan, '..."
