-
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
…gory (#543)
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,161 +1,107 @@ | ||
import os | ||
import re | ||
import json | ||
from openfoodfacts.taxonomy import get_taxonomy | ||
|
||
|
||
# Ids of the taxonomy nodes whose descendants are kept by filter_categories().
# NOTE(review): the trailing numbers look like the descendant count of each
# parent at the time the list was written - confirm before relying on them.
PARENT_CATEGORIES_ID = [
    "en:vegetables",  # 391
    "en:fruits",  # 287
    "en:culinary-plants",  # 152
    "en:nuts",  # 77
    "en:potatoes",  # 27
    "en:textured-vegetable-protein",  # 2
]
|
||
# Ids of individual taxonomy nodes that are added to the result on top of the
# descendants of PARENT_CATEGORIES_ID.
EXTRA_CHILDREN = [
    "en:rolled-oats",
    "en:ginger",
    "en:mushrooms",
    "en:candies",
]
|
||
# Words that exclude a category when they appear as a whole word in its
# English name. No trailing space: the match is done with regex word
# boundaries, not with a prefix comparison.
ADDITIONAL_FILTERING = [
    "Cooked",
    "Fresh",
    "Frozen",
    "Canned",
]
|
||
# Absolute directory of this script; data files are resolved relative to it so
# the script does not depend on the current working directory.
script_path = os.path.dirname(os.path.abspath(__file__))
|
||
def get_languages():
    """Load the list of supported languages from the i18n data file.

    The path is resolved relative to this script's directory, so the current
    working directory does not matter.

    Returns:
        The parsed content of ``languages.json``. Callers in this file iterate
        over it and read a ``"code"`` key from each entry.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    languages_path = os.path.join(script_path, "../../src/i18n/data/languages.json")
    # Explicit UTF-8: the platform default encoding is not reliable for JSON.
    with open(languages_path, encoding="utf-8") as f:
        return json.load(f)
|
||
|
||
def get_category_taxonomy():
    """Return the "category" taxonomy obtained from ``get_taxonomy``."""
    return get_taxonomy("category")
|
||
|
||
def get_taxonomy_node_by_id(taxonomy, node_id):
    """Return the first node of ``taxonomy`` whose ``id`` equals ``node_id``.

    Args:
        taxonomy: object exposing ``iter_nodes()``.
        node_id: id to look for.

    Returns:
        The matching node, or ``None`` when no node has that id.
    """
    for node in taxonomy.iter_nodes():
        if node.id == node_id:
            return node
    return None
|
||
|
||
def get_taxonomy_node_list_by_id_list(taxonomy, node_id_list):
    """Return the nodes of ``taxonomy`` whose id is listed in ``node_id_list``.

    The taxonomy is walked once; ids that match no node are ignored.

    Args:
        taxonomy: object exposing ``iter_nodes()``.
        node_id_list: iterable of node ids (hashable values) to select.

    Returns:
        A list of matching nodes in taxonomy iteration order (not in the
        order of ``node_id_list``).
    """
    # A set gives O(1) membership tests instead of rescanning the id list
    # for every node in the taxonomy.
    wanted_ids = set(node_id_list)
    return [node for node in taxonomy.iter_nodes() if node.id in wanted_ids]
|
||
|
||
def taxonomy_node_list_to_dict_list(node_list, delete_parents=False):
    """Convert taxonomy nodes into plain dicts that also carry the node id.

    Args:
        node_list: iterable of nodes exposing ``id`` and ``to_dict()``.
        delete_parents: when true, drop the ``"parents"`` key from each dict.

    Returns:
        A list with one dict per node: ``{"id": node.id, **node.to_dict()}``.
    """
    node_dict_list = []
    for node in node_list:
        node_dict = {"id": node.id, **node.to_dict()}
        if delete_parents:
            # pop() with a default: a node without a "parents" key must not
            # abort the whole conversion with a KeyError.
            node_dict.pop("parents", None)
        node_dict_list.append(node_dict)
    return node_dict_list
|
||
|
||
def get_taxonomy_node_children_full_list(taxonomy, node_parent):
    """Return every node of ``taxonomy`` that has ``node_parent`` as an ancestor.

    Args:
        taxonomy: object exposing ``iter_nodes()``.
        node_parent: the ancestor to look for in each node's
            ``get_parents_hierarchy()`` result.

    Returns:
        A list of matching nodes in taxonomy iteration order.
    """
    children_node_list = []
    for node in taxonomy.iter_nodes():
        # Membership test rather than the truthiness of the matched ancestor,
        # so a match is never lost because the ancestor object is falsy.
        if node_parent in node.get_parents_hierarchy():
            children_node_list.append(node)
    return children_node_list
|
||
def get_all_descendants(taxonomy, node_parent):
    """Return every node of ``taxonomy`` that has ``node_parent`` as an ancestor.

    Args:
        taxonomy: object exposing ``iter_nodes()``.
        node_parent: the ancestor to look for in each node's
            ``get_parents_hierarchy()`` result.

    Returns:
        A list of matching nodes in taxonomy iteration order.
    """
    return [
        node
        for node in taxonomy.iter_nodes()
        if node_parent in node.get_parents_hierarchy()
    ]
|
||
def filter_categories(categories, parent_categories):
    """Select the taxonomy nodes to export.

    Steps:
      1. collect every descendant of each node in ``parent_categories``;
      2. add the nodes listed in ``EXTRA_CHILDREN``;
      3. drop nodes whose id does not start with ``"en:"``;
      4. drop nodes whose English name contains one of the
         ``ADDITIONAL_FILTERING`` words as a whole word (case-insensitive);
      5. remove duplicates.

    Args:
        categories: the full taxonomy (object exposing ``iter_nodes()``).
        parent_categories: iterable of parent nodes.

    Returns:
        A list of unique nodes in first-seen order. A list rather than a set
        keeps the order stable from one run to the next.
    """
    candidates = [
        descendant
        for parent_node in parent_categories
        for descendant in get_all_descendants(categories, parent_node)
    ]

    print("Add extra nodes:", EXTRA_CHILDREN)
    candidates.extend(get_taxonomy_node_list_by_id_list(categories, EXTRA_CHILDREN))

    print("Additional filtering on:", ADDITIONAL_FILTERING)
    if ADDITIONAL_FILTERING:
        # One pattern for all words, compiled once. re.escape keeps a filter
        # word containing regex metacharacters from changing the pattern.
        alternatives = "|".join(re.escape(word) for word in ADDITIONAL_FILTERING)
        excluded_words = re.compile(r"\b(?:{})\b".format(alternatives), flags=re.IGNORECASE)
    else:
        # An empty alternation would match at every word boundary and drop
        # everything; no filter words means no name-based filtering.
        excluded_words = None

    kept = []
    for node in candidates:
        if not node.id.startswith("en:"):
            continue
        if excluded_words and excluded_words.search(node.get_localized_name("en")):
            continue
        kept.append(node)

    # A node can be reached through several parents; dict.fromkeys removes the
    # duplicates while preserving first-seen order (nodes must be hashable).
    return list(dict.fromkeys(kept))
|
||
|
||
def write_categories_to_files(categories, delete_parents=False):
    """Write one ``<language code>.json`` category file per supported language.

    Each output entry holds the category ``id``, its ``name`` in the target
    language (falling back to the English name when there is no translation)
    and, unless disabled or empty, its ``parents``. Entries are sorted by name.

    Args:
        categories: iterable of dicts with an ``"id"``, a ``"name"`` mapping of
            language code to label, and optionally ``"parents"``.
        delete_parents: when true, never write the ``"parents"`` key.

    Raises:
        KeyError: if a category has neither a translation for the language
            nor an ``"en"`` name.
        OSError: if an output file cannot be written.
    """
    output_dir = os.path.join(script_path, "../../src/data/categories")
    for language in get_languages():
        language_code = language["code"]
        language_categories = []
        for category in categories:
            names = category["name"]
            # The English fallback is only looked up when it is needed.
            name = names[language_code] if language_code in names else names["en"]
            entry = {"id": category["id"], "name": name}
            parents = category.get("parents")
            if parents and not delete_parents:
                entry["parents"] = parents
            language_categories.append(entry)
        language_categories.sort(key=lambda entry: entry["name"])
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is set explicitly instead of using the platform default.
        output_path = os.path.join(output_dir, f"{language_code}.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(language_categories, f, ensure_ascii=False)
|
||
|
||
def compare_new_categories_with_old_categories():
    """Print the differences between the old and the new English category lists.

    Reads ``category-tags.json`` (old) and ``categories/en.json`` (new), both
    resolved relative to this script, and prints:
      - the size of each list;
      - the old entries whose id is absent from the new list (count and list);
      - the number of new entries whose id is absent from the old list.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    with open(os.path.join(script_path, "../../src/data/category-tags.json"), encoding="utf-8") as f:
        old_categories = json.load(f)
    print("old_categories", len(old_categories))

    with open(os.path.join(script_path, "../../src/data/categories/en.json"), encoding="utf-8") as f:
        new_categories = json.load(f)
    print("new_categories", len(new_categories))

    # Id sets give O(1) lookups instead of rescanning the other list per entry.
    old_ids = {category["id"] for category in old_categories}
    new_ids = {category["id"] for category in new_categories}

    # check missing in new
    category_missing_in_new_list = [
        category for category in old_categories if category["id"] not in new_ids
    ]
    print("missing in new", len(category_missing_in_new_list))
    print(category_missing_in_new_list)

    # check missing in old: walk the NEW list and look each id up in the OLD
    # one (comparing the old list with itself can never find anything).
    category_missing_in_old_list = [
        category for category in new_categories if category["id"] not in old_ids
    ]
    print("missing in old", len(category_missing_in_old_list))
|
||
if __name__ == "__main__":
    # How to run:
    #   > pip install openfoodfacts
    #   > python filter_categories.py
    # Data files are resolved relative to this script, so it can be launched
    # from any working directory.

    # init
    CATEGORIES_FULL = get_taxonomy("category")
    print("Total number of categories:", len(CATEGORIES_FULL))
    PARENT_CATEGORIES = get_taxonomy_node_list_by_id_list(CATEGORIES_FULL, PARENT_CATEGORIES_ID)
    print("Filter with the following parent categories:", [node.id for node in PARENT_CATEGORIES])

    # select the nodes to export and turn them into plain dicts
    categories_filtered = filter_categories(CATEGORIES_FULL, PARENT_CATEGORIES)
    categories_filtered_to_dict_list = [{"id": node.id, **node.to_dict()} for node in categories_filtered]
    print("Categories remaining:", len(categories_filtered_to_dict_list))

    # one JSON file per language, without the "parents" key
    write_categories_to_files(categories_filtered_to_dict_list, delete_parents=True)
    print("Wrote to language files")

    # Optional sanity check against the previous category list:
    # compare_new_categories_with_old_categories()
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.