## Food LLM annotation

In [1]:
import pandas as pd
import os
import base64
import requests
import re
import numpy as np

In [2]:
# Picture 008 removed for patients 004, 008, 009, 010, 016 (duplicate image).
# Picture 021 removed for patient 001 (duplicate image).

In [3]:
# Prompt text sent together with every meal picture (it is used as the "text"
# part of the chat message in the annotation loops below).
# The field labels listed under "Format your response as follows" are the same
# labels that parse_nutritional_info() searches for, so the two must stay in sync.
# NOTE(review): the format template prefixes each field with "- " while the
# example response does not; the parser's regex is not anchored to line starts,
# so both shapes are accepted.
macronutrients_instruction = '''Examine the provided meal image to analyze and estimate its nutritional content accurately. Focus on determining the amounts of simple sugars (like industrial sugar and honey), 
complex sugars (such as starch and whole grains), proteins, fats, and dietary fibers (found in fruits and vegetables), all in grams. Also estimate the total weight of the meal in grams.
To assist in accurately gauging the scale of the meal, a 1 Swiss Franc coin, which has a diameter of 23.22 mm, may be present in the picture. 
Use the size of this coin as a reference to estimate the size of the meal and the amounts of the nutrients more precisely. 
Provide your assessment of each nutritional component in grams. All estimates should be given as a single whole number. If there is no coin in the picture or the meal is covered partially, estimate anyways.
Format your response as follows:
- Simple sugars (g): 
- Complex sugars (g): 
- Proteins (g): 
- Fats (g): 
- Dietary fibers (g): 
- Weight (g): 
- Explanation: 

Example response:
Simple sugars (g): 40
Complex sugars (g): 60
Proteins (g): 25
Fats (g): 30
Dietary fibers (g): 5 
Weight (g): 750
Explanation: The pizza and cola meal, with its refined crust and toppings, is rich in carbs, fats, and proteins. The cola boosts the meal's simple sugars. 
The 1 Swiss Franc coin helps estimate the pizza at 30 cm diameter and the cola at 330 ml, indicating a significant blood sugar impact.'''

In [4]:
# Read the OpenAI API key from the environment instead of pasting it into the
# notebook, so the secret is never saved (or committed) with the file.
# Falls back to an empty string, which matches the previous placeholder value;
# requests made with an empty key will be rejected by the API.
api_key = os.environ.get("OPENAI_API_KEY", "")

# HTTP headers shared by every chat-completions request in this notebook.
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

In [5]:
def encode_image(image_path):
  """Read the file at ``image_path`` in binary mode and return its base64 text.

  The result is a ``str`` (base64 bytes decoded as UTF-8), ready to be embedded
  in a ``data:`` URL.
  """
  with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
  encoded_bytes = base64.b64encode(raw_bytes)
  return encoded_bytes.decode('utf-8')
  
def parse_nutritional_info(text):
    """Extract the six numeric estimates from a model reply.

    The reply is searched for the labels requested in the prompt
    ("Simple sugars (g)", "Complex sugars (g)", "Proteins (g)", "Fats (g)",
    "Dietary fibers (g)", "Weight (g)"), each followed by a colon and a whole
    number. Matching is case-insensitive and tolerates markdown bold markers
    (``**``) around the colon and a leading ``~`` before the number, because
    chat models frequently decorate the requested format that way and the
    values would otherwise silently come back as zeros.

    Parameters
    ----------
    text : str
        Raw reply text.

    Returns
    -------
    tuple of int
        ``(simple_sugars, complex_sugars, proteins, fats, dietary_fibers,
        weight)``. A label that is not found yields ``0``; if a label occurs
        several times the last occurrence wins.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by ``re``), so callers that wrap
        the call in ``try``/``except`` keep seeing a failure for empty replies.
    """
    labels = (
        'Simple sugars (g)',
        'Complex sugars (g)',
        'Proteins (g)',
        'Fats (g)',
        'Dietary fibers (g)',
        'Weight (g)',
    )
    label_alternatives = '|'.join(re.escape(label) for label in labels)
    # label, optional "**", colon, optional "**", optional "~", then the digits
    pattern = rf'({label_alternatives})\s*\**\s*:\s*\**\s*~?\s*(\d+)'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # Normalise the key case so "simple Sugars (g)" and "Simple sugars (g)" agree.
    nutritional_info = {label.lower(): int(value) for label, value in matches}
    return tuple(nutritional_info.get(label.lower(), 0) for label in labels)

In [7]:
# Annotate every meal picture of every patient with GPT-4 Vision and store the
# parsed estimates next to the original food log.
#
# For each patient the food log CSV is loaded, six numeric columns plus a
# 'message' column (raw model reply) are added, each picture that exists on
# disk is sent to the chat-completions endpoint, and the result is written to
# gpt4/food_data_<patient>.csv.
NUTRIENT_COLUMNS = ['simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight']
# Written to the same directory the follow-up retry cell reads from ('gpt4/');
# previously this cell wrote to 'gpt4_food_data/', which that cell never reads.
OUTPUT_DIR = 'gpt4'
REQUEST_TIMEOUT_S = 120  # vision requests can be slow, but must not hang forever

os.makedirs(OUTPUT_DIR, exist_ok=True)

for patient in ['001', '002', '004', '006', '007', '008']:
    print(f"Processing patient {patient}")
    food_data_path = f"diabetes_subset_pictures-glucose-food-insulin/{patient}/food.csv"
    food_data = pd.read_csv(food_data_path)
    food_data[NUTRIENT_COLUMNS] = 0
    # Same 0 placeholder as before, but stored as object dtype so that writing
    # the reply text into the column is not an incompatible-dtype assignment.
    food_data['message'] = pd.Series(0, index=food_data.index, dtype=object)
    for i, row in food_data.iterrows():
        image_path = f"diabetes_subset_pictures-glucose-food-insulin/{patient}/food_pictures/{row['picture']}"
        if not os.path.exists(image_path):
            continue
        base64_image = encode_image(image_path)
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": macronutrients_instruction},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
            "max_tokens": 300,
        }
        # The request itself is inside the try block: a rate limit, network
        # error or malformed reply for one picture must not abort the whole
        # run before the patient's CSV has been written.
        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=REQUEST_TIMEOUT_S,
            )
            response.raise_for_status()
            message = response.json()['choices'][0]['message']['content']
            parsed_info = parse_nutritional_info(message)
        except Exception as e:
            print(f"Picture {row['picture']} for patient {patient} could not be annotated: {e!r}")
            continue
        print(parsed_info)
        food_data.loc[i, NUTRIENT_COLUMNS] = parsed_info
        food_data.loc[i, 'message'] = message
    food_data.to_csv(f'{OUTPUT_DIR}/food_data_{patient}.csv', index=False)

Processing patient 001
(2, 5, 20, 15, 4, 300)
(5, 30, 20, 15, 5, 350)
(0, 0, 0, 8, 0, 10)
(20, 45, 10, 15, 5, 400)
(15, 30, 20, 25, 5, 350)
(15, 5, 2, 1, 3, 250)
(3, 50, 9, 2, 2, 90)
(5, 7, 3, 5, 3, 300)
(0, 25, 20, 15, 2, 300)
(1, 33, 3, 1, 1, 100)
(25, 10, 4, 3, 2, 250)
(0, 0, 0, 0, 0, 0)
(20, 15, 6, 3, 0, 150)
(5, 40, 20, 15, 7, 500)
(30, 0, 0, 0, 0, 600)
(1, 30, 20, 10, 5, 300)
(30, 60, 10, 15, 5, 500)
(30, 15, 1, 0, 4, 200)
(3, 40, 20, 15, 4, 500)
(3, 50, 8, 10, 7, 300)
(0, 0, 0, 0, 0, 0)
(2, 20, 15, 15, 3, 350)
(5, 70, 10, 10, 5, 300)
(20, 0, 5, 3, 0, 250)
(20, 30, 15, 20, 3, 300)
Processing patient 002
(5, 30, 20, 22, 4, 350)
(1, 5, 22, 25, 2, 300)
(5, 75, 20, 25, 10, 500)
(5, 15, 8, 7, 3, 300)
(2, 70, 15, 20, 4, 500)
(5, 40, 30, 20, 4, 500)
(5, 45, 30, 20, 7, 500)
(5, 20, 10, 15, 4, 300)
Processing patient 004
(3, 20, 5, 10, 4, 300)
(5, 30, 35, 35, 4, 500)
(5, 40, 15, 10, 7, 350)
(5, 70, 30, 25, 6, 500)
(0, 45, 30, 20, 5, 500)
(4, 40, 15, 20, 2, 350)
(5, 85, 12, 15, 4, 400)
(0,

In [8]:
# Display the frame currently bound to `food_data`, i.e. the one assigned in the
# most recent iteration of the annotation loop above (quick visual sanity check).
food_data

Unnamed: 0.1,Unnamed: 0,picture,description,calories,balance,quality,datetime,simple_sugars,complex_sugars,proteins,fats,dietary_fibers,weight,message
0,0,001.jpg,Pasta with champignons and parmesan cheesse,637.0,Balance,Good quality,2014:10:01 12:21:59,5,75,20,25,4,500,- Simple sugars (g): 5\n- Complex sugars (g): ...
1,1,002.jpg,Salad with olive oil,88.0,Unbalance,Good quality,2014:10:01 12:22:06,5,80,15,10,5,500,- Simple sugars (g): 5\n- Complex sugars (g): ...
2,2,003.jpg,Pasta with champignons and parmesan cheesse,637.0,Balance,Good quality,2014:10:01 12:22:15,5,75,20,15,5,600,- Simple sugars (g): 5\n- Complex sugars (g): ...
3,3,005.jpg,Pumpkin soup,93.0,Unbalance,Good quality,2014:10:01 12:22:31,5,20,10,10,2,300,- Simple sugars (g): 5\n- Complex sugars (g): ...
4,4,006.jpg,Sausages with salad and potatoes chips,1059.0,Unbalance,Medium quality,2014:10:01 21:20:45,10,40,30,35,8,800,- Simple sugars (g): 10\n- Complex sugars (g):...
5,5,007.jpg,"Soup,Salmon pastry,grilled vegestables, boiled...",1678.0,Unbalance,Medium quality,2014:10:02 13:05:33,15,70,20,25,10,900,- Simple sugars (g): 15\n- Complex sugars (g):...
6,7,011.jpg,Spaguetti with ragûot and bolognese sauce,687.0,Unbalance,Low quality,2014:10:02 20:56:36,5,70,30,20,4,500,Simple sugars (g): 5\nComplex sugars (g): 70\n...
7,8,012.jpg,Bread and natural yogurt,224.0,Balance,Medium quality,2014:10:03 10:07:46,5,30,8,10,2,250,- Simple sugars (g): 5\n- Complex sugars (g): ...
8,9,013.jpg,Ragoût beef and frites natural potatoes,476.0,Unbalance,Low quality,2014:10:03 14:07:24,5,30,25,25,5,500,- Simple sugars (g): 5\n- Complex sugars (g): ...
9,10,014.jpg,Batata,86.0,Unbalance,Good quality,2014:10:03 20:36:17,5,30,20,15,10,400,- Simple sugars (g): 5\n- Complex sugars (g): ...


In [54]:
# For every patient, print the patient id followed by the file names of the
# pictures whose six nutrient estimates add up to zero (i.e. not annotated).
for patient in ['001', '002', '004', '006', '007', '008']:
    food_data = pd.read_csv(f'gpt4/food_data_{patient}.csv')
    print(patient)
    nutrient_totals = food_data[
        ['simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight']
    ].sum(axis=1)
    for picture_name in food_data.loc[nutrient_totals == 0, 'picture']:
        print(picture_name)

# Re-annotate pictures whose nutrient estimates are still all zero.
#
# The per-patient CSVs in gpt4/ are re-read, every all-zero row with an
# existing picture is sent to the model again (up to `max_attempts` tries per
# picture), and the CSV is rewritten. The sweep over all patients is repeated
# until no such row is left, but at most `max_passes` times: the previous
# `while True` never terminated when a picture file was missing or when the
# model kept returning no usable numbers, and kept issuing paid requests.
nutrient_columns = ['simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight']
max_passes = 5
max_attempts = 3
request_timeout_s = 120

for sweep in range(max_passes):
    all_data_present = True
    for patient in ['001', '002', '004', '006', '007', '008']:
        csv_path = f'gpt4/food_data_{patient}.csv'
        food_data = pd.read_csv(csv_path)
        if 'message' in food_data.columns:
            # The column may load as int64 (all-zero placeholders); make it
            # object so reply text can be stored without a dtype conflict.
            food_data['message'] = food_data['message'].astype(object)
        print(patient)
        for i, row in food_data.iterrows():
            if food_data.loc[i, nutrient_columns].sum() != 0:
                continue
            image_path = f"diabetes_subset_pictures-glucose-food-insulin/{patient}/food_pictures/{row['picture']}"
            if not os.path.exists(image_path):
                # Retrying cannot fix a missing file, so it must not keep the
                # outer loop alive.
                continue
            all_data_present = False
            base64_image = encode_image(image_path)
            payload = {
                "model": "gpt-4-vision-preview",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": macronutrients_instruction},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
                "max_tokens": 300,
            }
            for attempt in range(1, max_attempts + 1):
                # Request, JSON decoding and parsing are all guarded so that an
                # HTTP error or an unexpected reply counts as a failed attempt
                # instead of crashing the cell.
                try:
                    response = requests.post(
                        "https://api.openai.com/v1/chat/completions",
                        headers=headers,
                        json=payload,
                        timeout=request_timeout_s,
                    )
                    response.raise_for_status()
                    message = response.json()['choices'][0]['message']['content']
                    print(message)
                    parsed_info = parse_nutritional_info(message)
                    if not any(parsed_info):
                        # The parser returns zeros rather than raising when the
                        # reply contains no numbers; treat that as a failure.
                        raise ValueError("no nutritional values found in the reply")
                except Exception as e:
                    print(f"Attempt {attempt} failed for patient {patient}, row {i}. Reason: {str(e)}")
                    continue
                food_data.loc[i, nutrient_columns] = parsed_info
                food_data.loc[i, 'message'] = message
                break
            else:
                print(f"Max attempts reached for patient {patient}, row {i}.")
        food_data.to_csv(csv_path, index=False)
    if all_data_present:
        break
else:
    print(f"Stopped after {max_passes} passes; some pictures are still unannotated.")

001
010.jpg
022.jpg
002
004
006
007
008
