In [27]:
import ast
import json
import re
import urllib.request

import numpy as np
import pandas as pd

In [56]:
from PIL import Image
import requests
import io
def text_cleaner(text):
    """Reduce raw label/OCR text to its meaningful alphabetic words.

    The text is flattened to a single line, then web addresses, tokens that
    contain a digit (measurements such as ``250ml``), isolated single ASCII
    letters and a fixed set of punctuation/symbol characters are removed.
    Finally only purely alphabetic tokens that are not stop words are kept.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving words, in their original order and casing, separated
        by single spaces (empty string when nothing survives).
    """
    # Label boilerplate to drop: units plus English and French filler words.
    # A frozenset gives O(1) membership tests and removes the duplicates.
    dont_take = frozenset((
        "kj", "kcal", "total", "free", "net", "ingredients", "ingredient",
        "et", "de", "fat", "mg", "cg", "g", "kg", "ml", "cl", "l", "kl",
        "per", "pour", "valeur", "or", "le", "la", "dont", "consommer",
        "poids", "www", "com", "which", "of", "wt",
    ))
    text_cleaned = text.replace("\n", " ")  # remove line breaks
    # Regex patterns are raw strings so that "\S", "\w" and "\." are not
    # treated as (invalid) string escape sequences.
    text_cleaned = re.sub(r"\S*(www\.|\.com|\.net|\.fr|\.co\.uk|\.org)\S*", "", text_cleaned)  # remove websites
    #text_cleaned = re.sub("[^A-Za-z0-9 \-àâäéèêëîïôöùûüÿçÂÊÎÔÛÄËÏÖÜÀÆæÇÉÈŒœÙ]", " ", text_cleaned) #keep alphanum and accents
    text_cleaned = re.sub(r"\w*([0-9]{0,}[,|\.]{0,}[0-9])\w*", " ", text_cleaned)  # remove measurements
    text_cleaned = re.sub(r"\b([a-zA-Z]{1})\b", " ", text_cleaned)  # remove isolated letters ex --> g g g g g
    text_cleaned = re.sub("( +- +)", " ", text_cleaned)  # remove free-standing dashes
    text_cleaned = re.sub(r"[\·|/|\-|\\|(|)|\+|\*|\[|\]|™|ᴿˣ|\*|\—|\^|\"|®|>|<|″|\||\&|\#|\,|\;|⭐|\xa0|\?|\%|\'|©|\@|\$|\€|\:|\}|\{|\°]", " ", text_cleaned)
    text_cleaned = re.sub(r" +", " ", text_cleaned)  # remove multiple spaces

    # Tokens that still contain non-letters (e.g. a trailing ".") are dropped.
    kept_words = [
        w for w in text_cleaned.split()
        if w.isalpha() and w.lower() not in dont_take
    ]
    return " ".join(kept_words)
  
def make_barcode(x):
    """Split a barcode into the ``AAA/BBB/CCC/rest`` path form used in image URLs.

    ``x`` is converted with ``str``; the first three groups hold three
    characters each and the fourth holds whatever remains (possibly empty).
    """
    digits = str(x)
    chunks = [digits[start:start + 3] for start in (0, 3, 6)]
    chunks.append(digits[9:])
    return "{}/{}/{}/{}".format(*chunks)

def make_link_from_barcode(barcode, df, file = "image", keys = None):
    """Build the Open Food Facts URLs of every image (or OCR JSON) of a product.

    NOTE(review): a later cell re-defines ``make_link_from_barcode`` with the
    signature ``(barcode, keys, file)``; once that cell has run, callers that
    rely on this ``df``-based signature will fail.

    Parameters
    ----------
    barcode
        Product code; also used to look the keys up in ``df`` when ``keys``
        is None.
    df : pandas.DataFrame
        Frame with a ``"code"`` and a ``"keys"`` column. Only read when
        ``keys`` is None.
    file : str
        ``"image"`` produces ``.jpg`` links; any other value (e.g. ``"json"``)
        is used verbatim as the file extension.
    keys : iterable or str, optional
        Image identifiers, or the string representation of such a list.

    Returns
    -------
    list of str
        One URL per key.

    Raises
    ------
    IndexError
        If ``keys`` is None and ``barcode`` is not present in ``df``.
    ValueError, SyntaxError
        If ``keys`` is a string that is not a valid Python literal.
    """
    if keys is None:
        keys = df.loc[df["code"]==barcode, "keys"].values[0]
    if isinstance(keys, str):
        # SECURITY: this used to be eval(keys). literal_eval parses the same
        # list literals but cannot execute arbitrary code stored in the data.
        keys = ast.literal_eval(keys)

    extension = "jpg" if file == "image" else file
    barcode_with_slash = make_barcode(barcode)
    return [
        "https://world.openfoodfacts.org/images/products/{}/{}.{}".format(barcode_with_slash, key, extension)
        for key in keys
    ]

def show_images(links, timeout = 30):
    """Download every image URL in ``links`` and display it with ``Image.show()``.

    Parameters
    ----------
    links : iterable of str
        Image URLs.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that an unresponsive server
        cannot block the notebook forever.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or an HTTP error status.
    """
    for link in links:
        response = requests.get(link, timeout=timeout)
        # Fail with the real HTTP error instead of letting PIL choke on an
        # HTML error page.
        response.raise_for_status()
        img = Image.open(io.BytesIO(response.content))
        img.show()



def show_images_from_barcode(barcode, df, keys = None):
    """Resolve the image URLs of ``barcode`` and display each of them.

    ``df`` and ``keys`` are forwarded unchanged to ``make_link_from_barcode``.
    """
    show_images(make_link_from_barcode(barcode, df=df, keys = keys))

import math
import requests
def get_score_from_verticles(txt_annotations:dict):
    """Score one OCR text annotation by bounding-box area per character.

    Large print yields a big box for few characters, hence a high score.

    Parameters
    ----------
    txt_annotations : dict
        A single annotation holding ``"description"`` (the text) and
        ``["boundingPoly"]["vertices"]`` (a list of ``{"x": .., "y": ..}``
        dicts). Presumably one entry of a Cloud Vision ``textAnnotations``
        list - verify against the data source.

    Returns
    -------
    tuple
        ``(score, txt)`` where ``score`` is the axis-aligned bounding-box
        area divided by ``len(txt)``; ``0.0`` when the text or the vertex
        list is empty.
    """
    txt = txt_annotations["description"]
    vertices = txt_annotations['boundingPoly']['vertices']
    if not txt or not vertices:
        # Avoids ZeroDivisionError on empty text and an infinite score on an
        # empty polygon.
        return 0.0, txt

    # Vertices may omit 'x' or 'y': the JSON encoding drops zero-valued
    # coordinates, so a missing key means 0 (a box touching the image edge),
    # not "unknown". Ignoring it would shrink the box.
    xs = [vertex.get('x', 0) for vertex in vertices]
    ys = [vertex.get('y', 0) for vertex in vertices]
    area = (max(xs) - min(xs)) * (max(ys) - min(ys))
    return area / len(txt), txt

def get_n_most_important_words(results, word_count_limit = 10):
    """Return the first ``word_count_limit`` distinct cleaned words of ``results``.

    Parameters
    ----------
    results : iterable
        Items whose element ``[1]`` is a text; they are consumed in the given
        order, so callers pass them sorted by decreasing score.
    word_count_limit : int
        Number of distinct (case-insensitive) words after which the function
        stops early.

    Returns
    -------
    str
        The kept words joined by single spaces, each in the casing of its
        first occurrence.
    """
    to_keep = {}
    for items in results:
        for word in text_cleaner(items[1]).split():
            # De-duplicate on the lower-cased form. The previous check used
            # the original casing against lower-cased keys, so a later
            # "MILK" silently replaced an earlier, higher-ranked "Milk".
            key = word.lower()
            if key not in to_keep:
                to_keep[key] = word
                if len(to_keep) == word_count_limit:
                    return " ".join(to_keep.values())
    return " ".join(to_keep.values())


def get_big_words_from_image(barcode, df, keys = None):
    """Fetch the OCR JSON of each image of a product and extract its big words.

    Parameters
    ----------
    barcode, df, keys
        Forwarded to ``make_link_from_barcode`` (with ``file="json"``).

    Returns
    -------
    str
        The per-image word strings joined by spaces; an image whose JSON
        cannot be downloaded or parsed contributes an empty string.
    """
    texts = []
    links = make_link_from_barcode(barcode, df, file = "json", keys = keys)
    for link in links:
        try:
            # The context manager closes the HTTP response even on failure.
            with urllib.request.urlopen(link) as response:
                js = json.loads(response.read())
            txt_annotations = js['responses'][0]['textAnnotations']
            results = sorted([get_score_from_verticles(txt_a) for txt_a in txt_annotations], reverse = True)
            text = get_n_most_important_words(results, word_count_limit = 10)
        except Exception:
            # Best effort per image, but no longer a bare ``except:`` so that
            # KeyboardInterrupt / SystemExit still stop the run.
            text = ""
        texts.append(text)
    return " ".join(texts)

def get_big_words_from_image_clean(barcode, df, keys = None):
    """Return the big words of a product's images after a ``text_cleaner`` pass."""
    raw_words = get_big_words_from_image(barcode, df, keys = keys)
    return text_cleaner(raw_words)

def get_big_words_from_txt_annotations(txt_annotations):
    """Extract up to 10 of the highest-scoring words from a list of annotations.

    Parameters
    ----------
    txt_annotations : iterable of dict
        Annotations accepted by ``get_score_from_verticles``.

    Returns
    -------
    str
        The selected words joined by spaces, or ``""`` if the annotations
        cannot be scored (malformed or missing data).
    """
    try:
        results = sorted([get_score_from_verticles(txt_a) for txt_a in txt_annotations], reverse = True)
        return get_n_most_important_words(results, word_count_limit = 10)
    except Exception:
        # Deliberately best effort, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return ""

def get_code_from_link(link):
    """Recover the barcode from a product file URL.

    Everything after the fifth ``/`` is taken, the final path component (the
    file name) is dropped, and the remaining slashes are removed.
    """
    product_path = link.split("/",5)[-1]
    slashed_code = product_path.rsplit("/",1)[0]
    return re.sub("/", "", slashed_code)


In [28]:
import pickle 
# Load the barcode -> big-words mapping computed so far.
path = "big_words_dict.pkl"
with open(path, 'rb') as file:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    big_words_dict = pickle.load(file)
# Product table; the cells below read its "code" and "keys" columns.
df = pd.read_pickle("dataset.pkl")
# Codes that still have no entry in the mapping, kept in df row order.
codes_to_fetch = [code for code in df["code"] if code not in big_words_dict]
len(codes_to_fetch)

233003

In [41]:
def make_link_from_barcode(barcode, keys, file = "image"):
    """Build the Open Food Facts URLs of every image (or OCR JSON) of a product.

    NOTE(review): this definition shadows the earlier
    ``make_link_from_barcode(barcode, df, file, keys)``; helpers written
    against that older signature stop working once this cell has run.

    Parameters
    ----------
    barcode
        Product code, turned into a path by ``make_barcode``.
    keys : iterable or str
        Image identifiers, or the string representation of such a list.
    file : str
        ``"image"`` produces ``.jpg`` links; any other value (e.g. ``"json"``)
        is used verbatim as the file extension.

    Returns
    -------
    list of str
        One URL per key.

    Raises
    ------
    ValueError, SyntaxError
        If ``keys`` is a string that is not a valid Python literal.
    """
    if isinstance(keys, str):
        # SECURITY: this used to be eval(keys). literal_eval parses the same
        # list literals but cannot execute arbitrary code stored in the data.
        keys = ast.literal_eval(keys)

    extension = "jpg" if file == "image" else file
    barcode_with_slash = make_barcode(barcode)
    return [
        "https://world.openfoodfacts.org/images/products/{}/{}.{}".format(barcode_with_slash, key, extension)
        for key in keys
    ]

In [43]:
from tqdm import tqdm
# One list of OCR-JSON URLs per product that still has to be fetched.
links_list = []
# The zip pairs each code with its "keys" cell positionally: this works only
# because codes_to_fetch was built by iterating df["code"] in row order, the
# same order in which the isin() mask selects the rows.
for barcode, keys in zip(tqdm(codes_to_fetch), df.loc[df["code"].isin(codes_to_fetch), "keys"]):
    links = make_link_from_barcode(barcode, keys, file = "json")
    links_list.append(links)
    

100%|██████████| 233003/233003 [00:00<00:00, 444628.49it/s]


In [49]:
# Flatten the per-product URL lists into one list, preserving order.
links_to_fetch = [link for list_ in links_list for link in list_]
len(links_to_fetch)

521462

In [50]:
import asyncio
import time 
import aiohttp
from aiohttp.client import ClientSession
# NOTE(review): this module-level connector is not used by download_all below,
# which creates its own connector with the same limit.
my_conn = aiohttp.TCPConnector(limit=10)
import nest_asyncio
# Presumably needed so asyncio.run() can be called while the notebook's own
# event loop is already running - TODO confirm.
nest_asyncio.apply()

async def download_link(url:str,session:ClientSession):
    """GET ``url`` through ``session`` and return the response body as text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body

async def download_all(urls:list):
    """Download every URL concurrently and return the bodies in input order.

    A connector created with ``limit=10`` is used for the session. Failed
    downloads do not raise: because ``gather`` is called with
    ``return_exceptions=True`` the exception object takes the place of the
    body in the returned list.
    """
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        pending = [
            asyncio.ensure_future(download_link(url=url,session=session))
            for url in urls
        ]
        # gather must be awaited while the session is still open
        return await asyncio.gather(*pending,return_exceptions=True)

url_list = links_to_fetch

# Download everything and report the wall-clock time.
start = time.time()
# `results` is aligned with url_list; entries may be exception objects because
# download_all gathers with return_exceptions=True.
results = asyncio.run(download_all(url_list))
end = time.time()
print(f'download {len(url_list)} links in {end - start} seconds')

  super().__init__(loop=loop)


download 521462 links in 2622.443365097046 seconds


In [81]:
# barcode -> big-words text for the newly fetched products (filled by the next cell)
dic = {}

In [82]:
# Takes approx. 6 min.
# Turn every downloaded OCR JSON into its "big words" and collect them per
# barcode; a product with several images gets their words concatenated.
for link, json_string in zip(tqdm(links_to_fetch), results):
    barcode = get_code_from_link(link)
    # download_all gathers with return_exceptions=True, so a failed download
    # shows up here as an exception object rather than a string.
    if isinstance(json_string, BaseException) or str(json_string) == "":
        continue
    try:
        txt_annotations = json.loads(json_string)['responses'][0]['textAnnotations']
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON (e.g. an error page) or holds no text annotations.
        continue
    big_words = str(get_big_words_from_txt_annotations(txt_annotations))
    if barcode not in dic:
        dic[barcode] = big_words
    elif len(dic[barcode]) <= 500:
        # Append (was `=`, which threw away the words of the earlier images);
        # stop growing once the entry exceeds 500 characters.
        dic[barcode] += " " + big_words



100%|██████████| 521462/521462 [05:55<00:00, 1465.50it/s]  


In [86]:
# Persist only the newly extracted entries.
with open('new_barcodes_dict.pkl', 'wb') as file:
    # A new file will be created
    pickle.dump(dic, file)


In [87]:
# Sanity check: barcodes present in both mappings. Expected to be empty, since
# only codes missing from big_words_dict were fetched.
[barcode for barcode in dic if barcode in big_words_dict]

[]

In [88]:
# Merge the new entries into the main mapping; the size is printed before and
# after so the growth is visible.
print(len(big_words_dict))
big_words_dict.update(dic)
print(len(big_words_dict))

363547
375415


In [89]:
# Persist the merged mapping (previous entries plus the new ones).
with open('barcodes_dict_with_new_ones.pkl', 'wb') as file:
    # A new file will be created
    pickle.dump(big_words_dict, file)
