In [85]:
from google.cloud import vision
import pandas as pd
import io
import re

### Extract text from image

In [86]:
client = vision.ImageAnnotatorClient()

def extract_text(image_path):
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations

    return texts[0].description if texts else ""


In [None]:
text = extract_text('../images/nutrition_facts3.jpg')
print(text)

Nutrition
Serving Size 3/4 cup (28g)
Facts
Servings Per Container about 22
with 12
Amount
Honey Nut
Per Serving
Cheerios
cup skim
milk
Calories
110
150
Calories from Fat
15
15
% Daily Value**
Total Fat 1.5g*
2%
2%
Saturated Fat Og
0%
0%
Trans Fat 0g
Polyunsaturated Fat 0.5g
Monounsaturated Fat 0.5g
Cholesterol Omg
0%
1%
Sodium 160mg
7%
9%
Potassium 115mg
3%
9%
Total
Carbohydrate 22g
7%
9%
Dietary Fiber 2g
8%
8%
Soluble Fiber less than 1g
Sugars 9g
Other Carbohydrate 11g
Protein 2g
Vitamin A
10%
15%
Vitamin C
10%
10%
Calcium
10%
25%
Iron
25%
25%
Vitamin D
10%
25%
Thiamin
25%
30%
Riboflavin
25%
35%
Niacin
25%
25%
Vitamin B6
25%
25%
Folic Acid
50%
50%
Vitamin B12
25%
35%
Phosphorus
8%
20%
Magnesium
6%
10%
Zinc
25%
30%
w
Amount in cereal. A serving of cereal plus skim milk
provides 1.5g total fat, less than 5mg cholesterol,
220mg sodium, 320mg potassium, 28g total
carbohydrate (15g sugars, 12g other carbohydrate), and
7g protein.
** Percent Daily Values are based on a 2,000 calorie diet.
Y

In [88]:
# Extracting the nutrition as list
nutrition_list = text.split("\n")
nutrition_list = [nutrient for nutrient in nutrition_list if nutrient]
nutrition_list

['Nutrition',
 'Serving Size 3/4 cup (28g)',
 'Facts',
 'Servings Per Container about 22',
 'with 12',
 'Amount',
 'Honey Nut',
 'Per Serving',
 'Cheerios',
 'cup skim',
 'milk',
 'Calories',
 '110',
 '150',
 'Calories from Fat',
 '15',
 '15',
 '% Daily Value**',
 'Total Fat 1.5g*',
 '2%',
 '2%',
 'Saturated Fat Og',
 '0%',
 '0%',
 'Trans Fat 0g',
 'Polyunsaturated Fat 0.5g',
 'Monounsaturated Fat 0.5g',
 'Cholesterol Omg',
 '0%',
 '1%',
 'Sodium 160mg',
 '7%',
 '9%',
 'Potassium 115mg',
 '3%',
 '9%',
 'Total',
 'Carbohydrate 22g',
 '7%',
 '9%',
 'Dietary Fiber 2g',
 '8%',
 '8%',
 'Soluble Fiber less than 1g',
 'Sugars 9g',
 'Other Carbohydrate 11g',
 'Protein 2g',
 'Vitamin A',
 '10%',
 '15%',
 'Vitamin C',
 '10%',
 '10%',
 'Calcium',
 '10%',
 '25%',
 'Iron',
 '25%',
 '25%',
 'Vitamin D',
 '10%',
 '25%',
 'Thiamin',
 '25%',
 '30%',
 'Riboflavin',
 '25%',
 '35%',
 'Niacin',
 '25%',
 '25%',
 'Vitamin B6',
 '25%',
 '25%',
 'Folic Acid',
 '50%',
 '50%',
 'Vitamin B12',
 '25%',
 '35%',
 'P

In [89]:
features = ['energy-kcal_100g', 'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g',
            'sugars_100g', 'fiber_100g', 'proteins_100g', 'sodium_100g', 'calcium_100g',
            'iron_100g', 'other_carbohydrates_100g', 'fat_100g', 'ingredients']

In [90]:
nutrition_dict = {feature: 0 for feature in features}
nutrition_dict

{'energy-kcal_100g': 0,
 'saturated-fat_100g': 0,
 'trans-fat_100g': 0,
 'cholesterol_100g': 0,
 'sugars_100g': 0,
 'fiber_100g': 0,
 'proteins_100g': 0,
 'sodium_100g': 0,
 'calcium_100g': 0,
 'iron_100g': 0,
 'other_carbohydrates_100g': 0,
 'fat_100g': 0,
 'ingredients': 0}

In [91]:
key_mapping = {
    'energy-kcal_100g': 'Calories',
    'saturated-fat_100g': 'Saturated Fat',
    'trans-fat_100g': 'Trans Fat',
    'cholesterol_100g': 'Cholesterol',
    'sugars_100g': 'Sugars',
    'fiber_100g': 'Dietary Fiber',
    'proteins_100g': 'Protein',
    'sodium_100g': 'Sodium',
    'calcium_100g': 'Calcium',
    'iron_100g': 'Iron',
    'other_carbohydrates_100g': 'Carbohydrate',
    'fat_100g': 'total Fat',
    'ingredients': 'ingredients'
}

### Extracting the nutrition values

In [92]:
def extract_values(nutrition_dict, nutrition_list, key_mapping):
    for key, search_term in key_mapping.items():
        for i, item in enumerate(nutrition_list):
            if search_term.lower() in item.lower():
                # If we find "Calories", we need to explicitly capture the next numeric value
                if search_term.lower() == 'calories':
                    if i + 1 < len(nutrition_list):
                        match = re.search(r'(\d+\.?\d*)', nutrition_list[i + 1])
                        if match:
                            value = match.group(1)
                            nutrition_dict[key] = float(value) if '.' in value else int(value)
                    break
                # Handle "Calcium" and "Iron" to capture next numeric values
                elif search_term.lower() == 'calcium' or search_term.lower() == 'iron':
                    if i + 1 < len(nutrition_list):
                        match = re.search(r'(\d+)%?', nutrition_list[i + 1])
                        if match:
                            value = match.group(1)
                            nutrition_dict[key] = float(value)/100 if '.' in value else int(value)/100
                    break

                elif search_term.lower() == 'ingredients':
                    match = re.search(r'ingredients[:\s]*([^$]+)', item, re.IGNORECASE)
                    if match:
                        ingredients_text = match.group(1).strip()  # Capture the text after "Ingredients:"
                        nutrition_dict[key] = ingredients_text
                    break

                else:
                    # For other terms, we just capture the numeric value
                    match = re.search(r'(\d+\.?\d*)\s?(g|mg|%)?', item)
                    if match:
                        value = match.group(1)
                        if value:
                            nutrition_dict[key] = float(value) if '.' in value else int(value)
                    break
    return nutrition_dict

In [93]:
updated_nutrition_dict = extract_values(nutrition_dict, nutrition_list, key_mapping)
updated_nutrition_dict

{'energy-kcal_100g': 110,
 'saturated-fat_100g': 0,
 'trans-fat_100g': 0,
 'cholesterol_100g': 0,
 'sugars_100g': 9,
 'fiber_100g': 2,
 'proteins_100g': 2,
 'sodium_100g': 160,
 'calcium_100g': 0.1,
 'iron_100g': 0.25,
 'other_carbohydrates_100g': 22,
 'fat_100g': 1.5,
 'ingredients': 'Whole Grain Oats, Sugar,'}

In [94]:
def convert_mg_to_g(nutrition_dict):
    # Conversion factor: 1 mg = 0.001 g
    conversion_factor = 0.001
    keys_to_convert = ['cholesterol_100g', 'sodium_100g']

    for key in keys_to_convert:
        if key in nutrition_dict:
            nutrition_dict[key] = nutrition_dict[key] * conversion_factor

    return nutrition_dict

In [101]:
new_nutrition_dict = convert_mg_to_g(updated_nutrition_dict)
new_nutrition_dict

{'energy-kcal_100g': 110,
 'saturated-fat_100g': 0,
 'trans-fat_100g': 0,
 'cholesterol_100g': 0.0,
 'sugars_100g': 9,
 'fiber_100g': 2,
 'proteins_100g': 2,
 'sodium_100g': 0.00016,
 'calcium_100g': 0.1,
 'iron_100g': 0.25,
 'other_carbohydrates_100g': 22,
 'fat_100g': 1.5,
 'ingredients': 'Whole Grain Oats, Sugar,'}

### Extracting serving size

In [96]:
def extract_serving_size(nutrition_list):
    for item in nutrition_list:
        match = re.search(r'\((\d+)', item)
        if match:
            return int(match.group(1))
    return None

In [97]:
serving_size = extract_serving_size(nutrition_list)
serving_size

28

### Convert data of the serving size to 100g

In [98]:
def convert_to_100g(nutrition_dict, serving_size=serving_size):
    nutrition_100g = {}
    for key, value in nutrition_dict.items():
        if isinstance(value, str):
            nutrition_100g[key] = value
        elif value != 0:
            nutrition_100g[key] = (value * 100) / serving_size
        else:
            nutrition_100g[key] = 0

    return nutrition_100g

In [99]:
new_input = convert_to_100g(new_nutrition_dict, serving_size=serving_size)
new_input

{'energy-kcal_100g': 392.85714285714283,
 'saturated-fat_100g': 0,
 'trans-fat_100g': 0,
 'cholesterol_100g': 0,
 'sugars_100g': 32.142857142857146,
 'fiber_100g': 7.142857142857143,
 'proteins_100g': 7.142857142857143,
 'sodium_100g': 0.5714285714285714,
 'calcium_100g': 0.35714285714285715,
 'iron_100g': 0.8928571428571429,
 'other_carbohydrates_100g': 78.57142857142857,
 'fat_100g': 5.357142857142857,
 'ingredients': 'Whole Grain Oats, Sugar,'}

### Create a dataframe with the new input [X]

In [100]:
df = pd.DataFrame([new_input])
df

Unnamed: 0,energy-kcal_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,sugars_100g,fiber_100g,proteins_100g,sodium_100g,calcium_100g,iron_100g,other_carbohydrates_100g,fat_100g,ingredients
0,392.857143,0,0,0,32.142857,7.142857,7.142857,0.571429,0.357143,0.892857,78.571429,5.357143,"Whole Grain Oats, Sugar,"
