In [67]:
def text_detection_full_response(path):
    """Run Google Cloud Vision text detection (OCR) on a local image file.

    Args:
        path: Path of the image file whose bytes are sent to the Vision API.

    Returns:
        The response object returned by ``ImageAnnotatorClient.text_detection``.
        Callers read the detected text from ``response.text_annotations``.

    Raises:
        RuntimeError: If the response carries a non-empty ``error.message``.
    """
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # A failed annotation is reported inside the response instead of being
    # raised, so surface it here rather than letting callers trip over an
    # empty ``text_annotations`` list later.
    if response.error.message:
        raise RuntimeError(
            f'text detection failed for {path}: {response.error.message}'
        )
    return response

In [68]:
def strip_menu(response, min_length=4, data_dir='../data'):
    """Reduce an OCR'd menu to a de-duplicated list of candidate item names.

    The full detected text is lower-cased, split into lines, and each line is
    passed through these stages in order:

    1. replace every string listed in ``chars_to_remove.csv`` with a space,
       then strip surrounding whitespace;
    2. drop lines that exactly match an entry of ``drop_exact.csv``;
    3. remove individual words listed in ``words_to_remove.csv``;
    4. drop lines containing any entry of ``drop_contain.csv``;
    5. drop lines starting with any entry of ``drop_start.csv``;
    6. repeat the exact-match drop (word removal can create new matches);
    7. strip whitespace and drop lines shorter than ``min_length``;
    8. drop duplicates, keeping the first occurrence.

    Every CSV is read from ``data_dir`` and must have a ``name`` column.
    All word and phrase lists are compared in lower case because the menu
    text is lower-cased; ``chars_to_remove`` is used verbatim.

    Args:
        response: Object exposing ``text_annotations``; the first annotation's
            ``description`` is taken as the full menu text.
        min_length: Minimum number of characters a line must have to be kept.
        data_dir: Directory containing the filter CSV files.

    Returns:
        list[str]: The cleaned menu lines, in their original order. An empty
        list is returned when ``response`` has no text annotations.

    Side effects:
        Prints the original (split) menu and the stripped menu.
    """
    import os

    import pandas as pd

    def load_names(filename, lowercase=True):
        """Return the non-empty ``name`` entries of one filter CSV as strings."""
        # dtype=str and keep_default_na=False keep entries such as "n/a",
        # "null" or "100" as literal text instead of NaN or numbers, which
        # would otherwise break the string operations below.
        frame = pd.read_csv(
            os.path.join(data_dir, filename), dtype=str, keep_default_na=False
        )
        # An empty entry would match every line in the contains/starts-with
        # stages, so discard it.
        names = [name for name in frame['name'].tolist() if name]
        if lowercase:
            names = [name.lower() for name in names]
        return names

    chars_to_remove = load_names('chars_to_remove.csv', lowercase=False)
    drop_exact_words = set(load_names('drop_exact.csv'))
    words_to_remove = set(load_names('words_to_remove.csv'))
    drop_contain_words = load_names('drop_contain.csv')
    # str.startswith accepts a tuple of prefixes (an empty tuple never matches).
    drop_start_words = tuple(load_names('drop_start.csv'))

    annotations = response.text_annotations
    if len(annotations) > 0:
        menu_original = annotations[0].description.lower().split('\n')
    else:
        # No text was detected in the image: nothing to clean.
        menu_original = []

    def blank_out_chars(line):
        """Replace every unwanted character sequence with a space and strip."""
        for char in chars_to_remove:
            line = line.replace(char, ' ')
        return line.strip()

    def without_unwanted_words(line):
        """Remove unwanted words; splitting on ' ' preserves the spacing."""
        return ' '.join(
            word for word in line.split(' ') if word not in words_to_remove
        )

    menu_chars_removed = [blank_out_chars(line) for line in menu_original]
    menu_exact_dropped = [
        line for line in menu_chars_removed if line not in drop_exact_words
    ]
    menu_words_removed = [
        without_unwanted_words(line) for line in menu_exact_dropped
    ]
    menu_contains_dropped = [
        line for line in menu_words_removed
        if not any(phrase in line for phrase in drop_contain_words)
    ]
    menu_starts_dropped = [
        line for line in menu_contains_dropped
        if not line.startswith(drop_start_words)
    ]
    menu_exact_dropped_again = [
        line for line in menu_starts_dropped if line not in drop_exact_words
    ]
    whitespace_trimmed = [line.strip() for line in menu_exact_dropped_again]
    too_short_dropped = [
        line for line in whitespace_trimmed if len(line) >= min_length
    ]
    # dict keys keep insertion order, so this drops later duplicates in one pass.
    stripped_menu = list(dict.fromkeys(too_short_dropped))

    print('original menu:')
    print()
    print(menu_original)
    print()
    print('stripped menu:')
    print()
    print(stripped_menu)
    print()
    return stripped_menu

In [66]:
def optimized_image_fetch_and_check(query):
    """Find an image URL for a menu item and verify that it shows food or drink.

    Workflow:

    1. Search Google Images for ``"<query> recipe"`` (or ``"<query> beverage"``
       when the lower-cased query is listed in ``../data/stripped_drinks.csv``)
       and take the first result.
    2. If the lower-cased query is already in ``../data/verified.csv``, return
       that URL without further checks.
    3. Otherwise run Vision label detection and text detection on the image.
       It is accepted when the top label is one of the accepted labels, its
       score is above 0.955 and fewer than 200 characters of text are detected.
    4. If the first image is rejected, fetch three results for the same search,
       skip the first one and return the first remaining image that passes
       the same filter.

    Args:
        query: Menu item name to search for.

    Returns:
        The URL (str) of the accepted image, or ``None`` when no image was
        found or none passed the filter.

    Side effects:
        Prints progress. Appends the lower-cased query to
        ``../data/verified.csv`` when the first image is accepted, to
        ``../data/stripped_drinks.csv`` when that image has a beverage label
        and the query is not listed yet, and to ``../data/drop_exact.csv``
        when no acceptable image is found. Reads ``GOOGLE_API_KEY`` and
        ``GOOGLE_CX`` from the environment after loading a ``.env`` file.
    """
    import csv
    import os

    import pandas as pd
    from dotenv import load_dotenv, find_dotenv
    from google.cloud import vision
    from google_images_search import GoogleImagesSearch

    accepted_labels = ['Food', 'Tableware', 'Bottle', 'Beverage can', 'Liquid', 'Water']
    beverage_labels = ['Bottle', 'Beverage can', 'Liquid', 'Water']
    min_score = .955
    max_chars = 200

    # Every CSV lookup and CSV write uses the lower-cased query.
    name = query.lower()

    load_dotenv(find_dotenv())
    api_key = os.getenv('GOOGLE_API_KEY')
    search_engine_id = os.getenv('GOOGLE_CX')

    def append_name(csv_path, message):
        """Append the lower-cased query as one row of ``csv_path`` and report it."""
        with open(csv_path, 'a', newline='') as handle:
            csv.writer(handle).writerow([name])
            print(message)

    def search_urls(search_params):
        """Run one image search and return the result URLs in order."""
        gis = GoogleImagesSearch(api_key, search_engine_id)
        gis.search(search_params=search_params)
        return [result.url for result in gis.results()]

    def inspect_image(client, image_url):
        """Return (top-label list, top-score list, detected char count) for a URL."""
        image = vision.Image()
        image.source.image_uri = image_url
        label_response = client.label_detection(image=image, max_results=1)
        label = [lab.description for lab in label_response.label_annotations]
        score = [lab.score for lab in label_response.label_annotations]
        texts = client.text_detection(image=image).text_annotations
        n_chars = len(texts[0].description) if len(texts) > 0 else 0
        return label, score, n_chars

    def passes_filter(label, score, n_chars):
        """True when the image has an accepted, confident label and little text."""
        return (
            len(label) > 0
            and label[0] in accepted_labels
            and score[0] > min_score
            and n_chars < max_chars
        )

    beverages = pd.read_csv('../data/stripped_drinks.csv')['name'].tolist()
    helper_word = 'beverage' if name in beverages else 'recipe'
    search_text = f'{query} {helper_word}'

    print(f'searching for {query} ({helper_word})...')
    print()

    first_urls = search_urls({
        'q': search_text,
        'num': 1,
        'imgType': 'photo',
        'imgColorType': 'color',
    })
    print('fetching image:')
    if len(first_urls) == 0:
        print('no image found, not verified as food.')
        append_name('../data/drop_exact.csv', 'added to drop item list')
        print()
        return None

    url = first_urls[0]
    print(url)
    print()

    verified_queries = pd.read_csv('../data/verified.csv')['name'].tolist()
    if name in verified_queries:
        print(f'{query} already in verified list, no need to verify!')
        print()
        return url

    client = vision.ImageAnnotatorClient()
    label, score, n_chars = inspect_image(client, url)

    print('verification filter:')
    print('label must be Food, Tableware, Bottle, Beverage can, Liquid, or Water')
    print('score must be above .955')
    print('number of chars must be below 200')
    print()
    print(f'label: {label}')
    print(f'label score: {score}')
    print(f'chars detected: {n_chars}')
    print()

    if passes_filter(label, score, n_chars):
        print('verified!')
        # The query is known not to be in the verified list at this point
        # (that case returned above), so it is always recorded.
        append_name('../data/verified.csv', 'added to verified list')
        if label[0] in beverage_labels and name not in beverages:
            append_name('../data/stripped_drinks.csv', 'added to beverage list')
        print()
        print(url)
        print()
        return url
    if len(label) == 0:
        print('label missing, not verified.')

    # Fallback: same search text with more results. The first result is
    # skipped because it is expected to be the image rejected above.
    candidate_urls = search_urls({
        'q': search_text,
        'num': 3,
        'imgType': 'photo',
        'imgColorType': 'color',
        'safe': 'medium',
    })
    print('fetching additional images:')
    if len(candidate_urls) <= 1:
        print('no additional images found, not verified.')
        append_name('../data/drop_exact.csv', 'added to drop item list')
        print()
        return None
    candidate_urls = candidate_urls[1:]
    for candidate in candidate_urls:
        print(candidate)
    print()

    inspections = [inspect_image(client, candidate) for candidate in candidate_urls]
    labels = [result[0] for result in inspections]
    scores = [result[1] for result in inspections]
    char_counts = [result[2] for result in inspections]

    print(f'labels: {labels}')
    print(f'label scores: {scores}')
    print(f'chars detected: {char_counts}')
    print()

    # Pair each URL with its own inspection result. Looking the URL up by
    # label value would return the first candidate sharing that label even
    # if that candidate failed the filter.
    for candidate, (label, score, n_chars) in zip(candidate_urls, inspections):
        if passes_filter(label, score, n_chars):
            print('verified!')
            print()
            print(candidate)
            print()
            return candidate

    print('not verified.')
    append_name('../data/drop_exact.csv', 'added to drop item list')
    print()
    return None

In [69]:
# Run text detection on one sample menu image and keep the full API response.
response = text_detection_full_response('../raw_data/all_menus/spanish_menu_1.jpg')

In [70]:
# Filter the detected text down to candidate menu item names; strip_menu also
# prints the original and the stripped menu.
stripped_menu = strip_menu(response)

original menu:

['tapas', 'bacalao frito', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'escalivada', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'patatas bravas', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'queso manchego', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'ensalada rusa', 'pellentesque nec nulla non urna faucibus', 'maximus in tellus.', '7.00', '4.50', '6.00', '7.00', '6.00', 'menu', 'main', 'paella mixta', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'gambas al ajillo', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'lenguado', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'pollo al horno', 'pellentesque nec nulla non urna', 'faucibus maximus in tellus.', 'bacalao frito', 'lorem ipsum dolor sit amet, sit audire', 'recusabo complectitur eu.', 'lomo de salmón', 'lorem ipsum dolor sit amet, sit 

In [73]:
# Look up and verify an image for every remaining menu entry. The returned
# URLs are not kept; the call is run for its printed report and the CSV
# updates it performs.
for menu_entry in stripped_menu:
    optimized_image_fetch_and_check(menu_entry)

searching for tapas (recipe)...

fetching image:
https://www.recipetineats.com/wp-content/uploads/2016/08/Easy-Spanish-Tapas.jpg

verification filter:
label must be Food, Tableware, Bottle, Beverage can, Liquid, or Water
score must be above .955
number of chars must be below 200

label: ['Food']
label score: [0.9854244589805603]
chars detected: 75

verified!
added to verified list

https://www.recipetineats.com/wp-content/uploads/2016/08/Easy-Spanish-Tapas.jpg

searching for bacalao frito (recipe)...

fetching image:
https://lh3.googleusercontent.com/EiUfkEoA8xGjMYknQT8WDewGxnIhDO4CjHlJxNOtqWvHNTwGV1I6xrfkwXS8J5u7hMRz268PujDlL96RGR7nuaA6nFC2ZbKkf2gRhMQ=w1200-rj-l68-e365

verification filter:
label must be Food, Tableware, Bottle, Beverage can, Liquid, or Water
score must be above .955
number of chars must be below 200

label: ['Food']
label score: [0.9737876653671265]
chars detected: 0

verified!
added to verified list

https://lh3.googleusercontent.com/EiUfkEoA8xGjMYknQT8WDewGxnIhDO4C

fetching additional images:
https://i.pinimg.com/474x/1c/ca/ec/1ccaec00636dda86f5c351a22f260592.jpg
https://d1csarkz8obe9u.cloudfront.net/posterpreviews/fresh-food-menu-poster-template-37fbc38a7e9c26cce11be07ab1017631_screen.jpg?ts=1561536745

labels: [['Food'], ['Food']]
label scores: [[0.9442237615585327], [0.9612721800804138]]
chars detected: [822, 556]

not verified.
added to drop item list

searching for faucibus maximus in tellus (recipe)...

fetching image:
https://d1csarkz8obe9u.cloudfront.net/posterpreviews/daily-specials-menu-template-design-5f7d9df4f1fc9845472e3884972815af_screen.jpg?ts=1636983188

verification filter:
label must be Food, Tableware, Bottle, Beverage can, Liquid, or Water
score must be above .955
number of chars must be below 200

label: ['Food']
label score: [0.9599761366844177]
chars detected: 0

verified!
added to verified list

https://d1csarkz8obe9u.cloudfront.net/posterpreviews/daily-specials-menu-template-design-5f7d9df4f1fc9845472e3884972815af_screen.

In [16]:
import pandas as pd

In [27]:
# Load the beverage-name list (the `name` column is what
# optimized_image_fetch_and_check reads) and display it.
df = pd.read_csv('../data/stripped_drinks.csv')
df


Unnamed: 0,name
0,chevy with a white license plate
1,fukmeup
2,in the shade
3,florida bushwacker
4,belmont
...,...
766,screwdriver
767,red bull
768,redbull
769,starbucks coffee


In [35]:
import csv

# Manually add one beverage name as a new row of the drinks list.
# Note: re-running this cell appends the same row again.
with open('../data/stripped_drinks.csv', 'a', newline='') as drinks_file:
    csv.writer(drinks_file).writerow(['seltzer'])


In [36]:
# Reload the beverage-name list from disk to inspect it after the append above.
df = pd.read_csv('../data/stripped_drinks.csv')
df

Unnamed: 0,name
0,chevy with a white license plate
1,fukmeup
2,in the shade
3,florida bushwacker
4,belmont
...,...
769,starbucks coffee
770,fanta
771,coors
772,bud light
