In [None]:
from robotoff.products import ProductDataset

ds = ProductDataset.load()
product_iter = (ds.stream()
                  .filter_by_country_tag('en:france')
                  .iter())

# Highest enumerate index that is still processed before the scan stops.
MAX_PRODUCT_INDEX = 200000

# Accumulators for the per-product tag lists. They are created here so the
# cell runs on a fresh kernel and does not pile up duplicates when re-run.
labels = []
brands = []

for i, product in enumerate(product_iter):
    # Each entry appended is the product's whole tag list, not a single tag.
    if 'labels_tags' in product:
        labels.append(product['labels_tags'])

    if 'brands_tags' in product:
        brands.append(product['brands_tags'])

    # Checked after the appends, so the product at index
    # MAX_PRODUCT_INDEX + 1 is still collected before stopping.
    if i > MAX_PRODUCT_INDEX:
        break

In [12]:
import re

# Three groups of exactly three characters followed by whatever remains.
BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")


def split_barcode(barcode: str):
    """Split a numeric barcode into its path components.

    A barcode of at least nine characters is cut into three 3-character
    chunks plus the remainder; an empty remainder is dropped. Shorter
    barcodes are returned unsplit as a single-element list.

    Args:
        barcode: the barcode string; must consist of digits only.

    Returns:
        A list of strings holding the components in order.

    Raises:
        ValueError: if ``barcode`` is empty or contains a non-digit character.
    """
    if barcode.isdigit():
        matched = BARCODE_PATH_REGEX.fullmatch(barcode)
        if matched is None:
            # Fewer than nine characters: nothing to split.
            return [barcode]
        # filter(None, ...) discards the empty trailing group of a
        # nine-character barcode.
        return list(filter(None, matched.groups()))

    raise ValueError("unknown barcode format: {}".format(barcode))

In [13]:
def get_images(product):
    """Build the static image URLs for the numbered images of a product.

    Only keys of ``product['images']`` that consist solely of digits are
    used as image ids. The product's ``'code'`` is split with
    ``split_barcode`` and the resulting components are joined with ``/`` to
    form the folder part of each URL.

    Args:
        product: mapping describing a product; ``'images'`` is optional,
            ``'code'`` is required whenever ``'images'`` is present.

    Returns:
        A list of URL strings, one per numbered image. The list is empty
        when the product has no ``'images'`` entry or no numbered image.

    Raises:
        KeyError: if ``'images'`` is present but ``'code'`` is missing.
        ValueError: propagated from ``split_barcode`` for a non-digit code.
    """
    urls = []
    if 'images' not in product:
        # Always return a list so callers never have to handle None.
        return urls

    image_ids = [key for key in product['images'].keys() if key.isdigit()]
    # One join covers every component count split_barcode can produce;
    # a single component is used as-is.
    barcode_path = '/'.join(split_barcode(product['code']))

    for image_id in image_ids:
        urls.append('https://static.openfoodfacts.org/images/products/%s/%s.jpg' % (barcode_path, image_id))
    return urls


In [14]:
# Number of collected per-product tag lists: labels first, then brands,
# one value per line.
print(len(labels), len(brands), sep='\n')

82904
97869


In [15]:
# A single product can carry several brands/labels, so merge the per-product
# tag lists into one flat list per kind.
flat_list_labels = [
    tag
    for product_labels in labels
    for tag in product_labels
]
flat_list_brands = [
    tag
    for product_brands in brands
    for tag in product_brands
]

In [16]:
from collections import Counter
# Tally how many times each tag occurs in the flattened lists.
top_labels = Counter()
top_labels.update(flat_list_labels)

top_brands = Counter()
top_brands.update(flat_list_brands)

In [17]:
# Keep the most frequent label and brand tags, each mapped to an initial
# count for the next steps.
# NOTE(review): the count is presumably a per-tag quota; nothing in the
# visible cells decrements it -- confirm before relying on it.
TOP_TAG_COUNT = 30
INITIAL_TAG_COUNT = 30

# The ranking is recomputed from the flattened tag lists instead of from the
# `top_labels` / `top_brands` names that this cell overwrites, so re-running
# the cell does not fail on a dict that has no `most_common`.
top_labels = {
    tag: INITIAL_TAG_COUNT
    for tag, _occurrences in Counter(flat_list_labels).most_common(TOP_TAG_COUNT)
}
top_brands = {
    tag: INITIAL_TAG_COUNT
    for tag, _occurrences in Counter(flat_list_brands).most_common(TOP_TAG_COUNT)
}


In [18]:
# Inspect the retained label tags and the count attached to each of them.
print(top_labels)

{'en:organic': 30, 'en:green-dot': 30, 'en:eu-organic': 30, 'en:gluten-free': 30, 'en:no-preservatives': 30, 'en:made-in-france': 30, 'fr:ab-agriculture-biologique': 30, 'en:no-colorings': 30, 'en:vegetarian': 30, 'en:carbon-footprint': 30, 'en:fr-bio-01': 30, 'fr:viande-francaise': 30, 'en:vegan': 30, 'en:french-meat': 30, 'en:no-added-sugar': 30, 'en:french-pork': 30, 'en:palm-oil-free': 30, 'fr:eco-emballages': 30, 'en:superior-quality': 30, 'en:eu-non-eu-agriculture': 30, 'en:no-artificial-flavors': 30, 'en:pdo': 30, 'fr:viande-bovine-francaise': 30, 'en:nutriscore': 30, 'en:kosher': 30, 'en:pgi': 30, 'fr:deconseille-aux-femmes-enceintes': 30, 'en:made-in-germany': 30, 'en:contains-gmos': 30, 'en:low-or-no-sugar': 30}


In [19]:
# Inspect the retained brand tags; kept as a bare last expression so the
# notebook shows the dict as the cell output.
top_brands

{'marks-spencer': 30,
 'casino': 30,
 'carrefour': 30,
 'leclerc': 30,
 'la-nouvelle-agriculture': 30,
 'lidl': 30,
 'le-gaulois': 30,
 'm-s': 30,
 'danone': 30,
 'nestle': 30,
 'auchan': 30,
 'bonduelle': 30,
 'fleury-michon': 30,
 'maitre-coq': 30,
 'charal': 30,
 'u': 30,
 'milbona': 30,
 'haribo': 30,
 'aldi': 30,
 'labeyrie': 30,
 'sans-marque': 30,
 'lindt': 30,
 'panzani': 30,
 'leader-price': 30,
 'delpeyrat': 30,
 'bledina': 30,
 'ducros': 30,
 'sainte-lucie': 30,
 'hema': 30,
 'e-leclerc': 30}

In [20]:
def get_top_brands_and_labels(product):
    """Return the product's tags that belong to the selected top brands/labels.

    Brand tags found in the module-level ``top_brands`` come first, followed
    by label tags found in ``top_labels``, each in the order they appear on
    the product. Every kept tag is prefixed with a single space, so a
    non-empty result starts with a space.

    Args:
        product: mapping describing a product; ``'brands_tags'`` and
            ``'labels_tags'`` are optional and may be ``None``.

    Returns:
        The space-prefixed tags concatenated into one string, or ``""`` when
        the product carries none of the selected brands or labels.
    """
    # A missing key and an explicit None are both treated as "no tags".
    brand_tags = product.get('brands_tags') or []
    label_tags = product.get('labels_tags') or []

    kept = [tag for tag in brand_tags if tag in top_brands]
    kept += [tag for tag in label_tags if tag in top_labels]

    return "".join(" " + tag for tag in kept)

In [21]:
# Stream the whole dataset again and, for each product that carries at least one of the brands/labels we are interested in, write one row per image URL.
from robotoff.products import ProductDataset

# A fresh dataset handle and stream are created for this second pass over
# the French products.
ds = ProductDataset.load()

product_iter = (ds.stream()
                  .filter_by_country_tag('en:france')
                  .iter())

# Explicit UTF-8 so the written file does not depend on the platform's
# default text encoding.
with open('off_barcode.csv', 'w', encoding='utf-8') as file:
    for product in product_iter:
        brand_and_labels = get_top_brands_and_labels(product)
        if not brand_and_labels:
            # None of the selected brands/labels: skip building URLs.
            continue

        # `or []` makes a falsy result (product without images) yield no
        # rows instead of failing the loop.
        for url in get_images(product) or []:
            # One row per image: "<tags>, <url>".
            file.write('%s, %s\n' % (brand_and_labels, url))
                
               

EOFError: Compressed file ended before the end-of-stream marker was reached