Skip to content

Commit

Permalink
refactor: Simplified category filtering script and added candies cate…
Browse files Browse the repository at this point in the history
…gory (#543)
  • Loading branch information
odin-h committed Apr 19, 2024
1 parent 0a4d73c commit dd03eda
Show file tree
Hide file tree
Showing 151 changed files with 188 additions and 241 deletions.
130 changes: 38 additions & 92 deletions data/categories/filter_categories.py
Original file line number Diff line number Diff line change
@@ -1,161 +1,107 @@
import os
import re
import json
from openfoodfacts.taxonomy import get_taxonomy


# Ids of the root categories whose descendants are kept.
# NOTE(review): the trailing numbers are presumably the number of
# sub-categories found under each root when the list was written -- confirm.
PARENT_CATEGORIES_ID = [
    "en:vegetables",  # 391
    "en:fruits",  # 287
    "en:culinary-plants",  # 152
    "en:nuts",  # 77
    "en:potatoes",  # 27
    "en:textured-vegetable-protein",  # 2
]

# Ids of individual categories that are added on top of the descendants of
# PARENT_CATEGORIES_ID.
EXTRA_CHILDREN = [
    "en:rolled-oats",
    "en:ginger",
    "en:mushrooms",
    "en:candies",
]

# Words that exclude a category when they appear in its English name.
# Every entry ends with a comma on purpose: a missing comma between two
# adjacent string literals silently concatenates them into a single entry.
ADDITIONAL_FILTERING = [
    "Cooked",
    "Fresh",
    "Frozen",
    "Canned",
]

# Directory containing this script; data files are resolved relative to it.
script_path = os.path.dirname(os.path.abspath(__file__))

def get_languages():
    """Load the language list from ``src/i18n/data/languages.json``.

    The file is located relative to this script's directory (``script_path``)
    rather than the current working directory.

    Returns:
        The decoded JSON document. Callers in this file iterate over it and
        read a ``"code"`` key from every entry.
    """
    languages_file = os.path.join(script_path, "../../src/i18n/data/languages.json")
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(languages_file, encoding="utf-8") as f:
        return json.load(f)


def get_category_taxonomy():
    """Return the result of ``get_taxonomy("category")``."""
    return get_taxonomy("category")


def get_taxonomy_node_by_id(taxonomy, node_id):
    """Return the first node of ``taxonomy`` whose ``id`` equals ``node_id``.

    Args:
        taxonomy: Object exposing ``iter_nodes()``.
        node_id: Id to look for.

    Returns:
        The first matching node in iteration order, or ``None`` when no node
        has that id.
    """
    for node in taxonomy.iter_nodes():
        if node.id == node_id:
            return node
    return None


def get_taxonomy_node_list_by_id_list(taxonomy, node_id_list):
    """Return the taxonomy nodes matching the ids of ``node_id_list``.

    The taxonomy is scanned once, whatever the number of requested ids.

    Args:
        taxonomy: Object exposing ``iter_nodes()``.
        node_id_list: Iterable of node ids to look up.

    Returns:
        A list of nodes in the order of ``node_id_list``. Ids that match no
        node are skipped; when several nodes share an id, the first one in
        iteration order is used.
    """
    node_id_list = list(node_id_list)
    wanted_ids = set(node_id_list)
    first_node_by_id = {}
    for node in taxonomy.iter_nodes():
        if node.id in wanted_ids:
            # setdefault keeps the first node seen for a given id.
            first_node_by_id.setdefault(node.id, node)
    return [
        first_node_by_id[node_id]
        for node_id in node_id_list
        if node_id in first_node_by_id
    ]


def taxonomy_node_list_to_dict_list(node_list, delete_parents=False):
    """Convert taxonomy nodes to plain dictionaries.

    Args:
        node_list: Iterable of nodes exposing ``id`` and ``to_dict()``.
        delete_parents: When true, the ``"parents"`` key is removed from every
            dictionary.

    Returns:
        One dictionary per node: an ``"id"`` entry followed by the entries of
        ``node.to_dict()``.
    """
    node_dict_list = []
    for node in node_list:
        node_dict = {"id": node.id, **node.to_dict()}
        if delete_parents:
            # pop() instead of del: a dict without a "parents" key must not
            # raise KeyError.
            node_dict.pop("parents", None)
        node_dict_list.append(node_dict)
    return node_dict_list


def get_taxonomy_node_children_full_list(taxonomy, node_parent):
    """Return every node of ``taxonomy`` that has ``node_parent`` as an ancestor.

    Args:
        taxonomy: Object exposing ``iter_nodes()``.
        node_parent: Node searched for in each node's
            ``get_parents_hierarchy()`` result.

    Returns:
        The matching nodes, in iteration order.
    """
    return [
        node
        for node in taxonomy.iter_nodes()
        if any(ancestor == node_parent for ancestor in node.get_parents_hierarchy())
    ]

def get_all_descendants(taxonomy, node_parent):
    """Return the nodes of ``taxonomy`` that descend from ``node_parent``.

    Args:
        taxonomy: Object exposing ``iter_nodes()``.
        node_parent: Node that must appear in a node's
            ``get_parents_hierarchy()`` result for the node to be returned.

    Returns:
        The matching nodes, in iteration order.
    """
    return [
        node
        for node in taxonomy.iter_nodes()
        if node_parent in node.get_parents_hierarchy()
    ]

def filter_categories(categories, parent_categories, *, extra_children=None, additional_filtering=None):
    """Select the categories to keep from the full taxonomy.

    A node is kept when all of the following hold:

    * its id starts with ``"en:"``;
    * it descends from one of ``parent_categories`` (one of them appears in
      its ``get_parents_hierarchy()`` result) or its id is listed in
      ``extra_children``;
    * its English name contains none of the ``additional_filtering`` words,
      matched case-insensitively between regex word boundaries.

    Args:
        categories: Taxonomy object exposing ``iter_nodes()``.
        parent_categories: Iterable of parent nodes.
        extra_children: Ids of extra nodes to include; defaults to the module
            constant ``EXTRA_CHILDREN``.
        additional_filtering: Words that exclude a node; defaults to the
            module constant ``ADDITIONAL_FILTERING``.

    Returns:
        A set of the retained nodes.
    """
    if extra_children is None:
        extra_children = EXTRA_CHILDREN
    if additional_filtering is None:
        additional_filtering = ADDITIONAL_FILTERING

    print("Add extra nodes:", extra_children)
    print("Additional filtering on:", additional_filtering)

    parent_nodes = list(parent_categories)
    extra_ids = set(extra_children)
    # Compile each word once; re.escape keeps a word containing regex
    # metacharacters from being interpreted as a pattern.
    excluded_patterns = [
        re.compile(r"\b{}\b".format(re.escape(filter_word)), flags=re.IGNORECASE)
        for filter_word in additional_filtering
    ]

    filtered_descendants = set()
    # Single pass over the taxonomy instead of one full scan per parent.
    for node in categories.iter_nodes():
        if not node.id.startswith("en:"):
            continue
        if node.id not in extra_ids:
            hierarchy = node.get_parents_hierarchy()
            if not any(parent in hierarchy for parent in parent_nodes):
                continue
        english_name = node.get_localized_name("en")
        if any(pattern.search(english_name) for pattern in excluded_patterns):
            continue
        filtered_descendants.add(node)
    return filtered_descendants


def write_categories_to_files(categories, delete_parents=False):
    """Write one JSON file of categories per language.

    For every language returned by ``get_languages()``, a list of
    ``{"id", "name"}`` entries is built, sorted by name and written to
    ``src/data/categories/<code>.json`` (resolved relative to ``script_path``).

    Args:
        categories: Iterable of dictionaries with an ``"id"``, a ``"name"``
            mapping of language code to translated name, and optionally a
            ``"parents"`` entry.
        delete_parents: When true, ``"parents"`` is never written. Otherwise
            it is written only for categories that have a non-empty value.

    Raises:
        KeyError: If a category has neither a name for the language nor an
            ``"en"`` name.
    """
    languages = get_languages()
    for language in languages:
        language_code = language["code"]
        language_categories = []
        for category in categories:
            names = category["name"]
            # Use the translation when available, otherwise fall back to
            # English. The fallback is only looked up when it is needed.
            entry = {
                "id": category["id"],
                "name": names[language_code] if language_code in names else names["en"],
            }
            parents = category.get("parents")
            if parents and not delete_parents:
                entry["parents"] = parents
            language_categories.append(entry)
        # order by name
        language_categories.sort(key=lambda entry: entry["name"])
        # ensure_ascii=False emits non-ASCII characters as-is, so the file
        # must be opened as UTF-8 rather than with the platform default.
        output_file = os.path.join(script_path, f"../../src/data/categories/{language_code}.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(language_categories, f, ensure_ascii=False)


def compare_new_categories_with_old_categories():
    """Print the differences between the old and the new category lists.

    Reads ``src/data/category-tags.json`` (old) and
    ``src/data/categories/en.json`` (new), both resolved relative to
    ``script_path``, and compares them by ``"id"``. Prints the size of each
    list, the categories missing from the new list, and the number of
    categories missing from the old list.

    NOTE(review): ids are assumed to be hashable (strings) -- confirm against
    the data files.
    """
    with open(os.path.join(script_path, "../../src/data/category-tags.json"), encoding="utf-8") as f:
        old_categories = json.load(f)
    print("old_categories", len(old_categories))

    with open(os.path.join(script_path, "../../src/data/categories/en.json"), encoding="utf-8") as f:
        new_categories = json.load(f)
    print("new_categories", len(new_categories))

    # Id sets make each membership test constant-time.
    old_ids = {category["id"] for category in old_categories}
    new_ids = {category["id"] for category in new_categories}

    # check missing in new
    category_missing_in_new_list = [
        category for category in old_categories if category["id"] not in new_ids
    ]
    print("missing in new", len(category_missing_in_new_list))
    print(category_missing_in_new_list)

    # check missing in old: iterate the NEW list and look each id up in the
    # old one (comparing the old list with itself can never find anything).
    category_missing_in_old_list = [
        category for category in new_categories if category["id"] not in old_ids
    ]
    print("missing in old", len(category_missing_in_old_list))

if __name__ == "__main__":
    # How to run:
    #   > pip install openfoodfacts
    #   > python filter_categories.py

    # init
    CATEGORIES_FULL = get_taxonomy("category")
    print("Total number of categories:", len(CATEGORIES_FULL))
    PARENT_CATEGORIES = get_taxonomy_node_list_by_id_list(CATEGORIES_FULL, PARENT_CATEGORIES_ID)
    # Print ids rather than the node objects themselves.
    print("Filter with the following parent categories:", [node.id for node in PARENT_CATEGORIES])

    # filter
    categories_filtered = filter_categories(CATEGORIES_FULL, PARENT_CATEGORIES)
    categories_filtered_to_dict_list = [
        {"id": node.id, **node.to_dict()} for node in categories_filtered
    ]
    print("Categories remaining:", len(categories_filtered_to_dict_list))

    # write, leaving the "parents" entry out of the generated files
    write_categories_to_files(categories_filtered_to_dict_list, delete_parents=True)
    print("Wrote to language files")

    # compare_new_categories_with_old_categories()
2 changes: 1 addition & 1 deletion src/data/categories/aa.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ach.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/af.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ak.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/am.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ar.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/as.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ast.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/az.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/be.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ber.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/bg.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/bm.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/bn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/bo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/br.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/bs.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ca.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ce.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/chr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/co.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/crs.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/cs.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/cv.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/cy.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/da.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/de.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/el.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/en.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/en_AU.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/en_GB.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/eo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/es.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/et.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/eu.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/fa.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/fi.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/fil.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/fo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/fr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ga.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/gd.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/gl.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/gu.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ha.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/he.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/hi.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/hr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ht.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/hu.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/hy.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/id.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ii.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/is.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/it.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/iu.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ja.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/jv.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ka.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/kab.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/kk.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/km.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/kmr_TR.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/kn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ko.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/kw.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ky.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/la.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/lb.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/lo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/lt.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/lv.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/mg.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/mi.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ml.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/mn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/mr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ms.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/mt.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/my.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/nb.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ne.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/nl_BE.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/nl_NL.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/nn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/no.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/nr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/oc.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/or.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/pa.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/pl.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/pt_BR.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/pt_PT.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/qu.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/rm.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ro.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ru.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sa.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sat.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sc.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sco.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sd.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sg.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/si.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sk.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sl.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sma.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/so.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/son.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sq.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sr_CS.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sr_RS.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ss.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/st.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sv.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/sw.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ta.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/te.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tg.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/th.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ti.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tl.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tn.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ts.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tt.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tw.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ty.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/tzl.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ug.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/uk.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ur.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/uz.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/val.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/ve.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/vec.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/vi.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/data/categories/vls.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/wa.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/wo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/xh.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/yi.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/yo.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/zea.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/zh_CN.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/zh_HK.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/zh_TW.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/data/categories/zu.json

Large diffs are not rendered by default.

0 comments on commit dd03eda

Please sign in to comment.