In [3]:
!pip install --upgrade pymupdf


Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.0


In [4]:
import fitz  # PyMuPDF
import os
from collections import defaultdict

# Mount your drive (run this once in Colab)
# from google.colab import drive
# drive.mount('/content/drive')

# Path to your menus
menu_dir = "/content/drive/MyDrive/PNAE Gen AI Project/Menus for the State Alagoas"

# Simple Brazilian food nutrition DB (expand this!)
nutrition_db = {
    "banana": {"calories": 90, "protein": 1.1, "carbs": 23, "fat": 0.3},
    "goiaba": {"calories": 68, "protein": 2.6, "carbs": 14, "fat": 1},
    "melancia": {"calories": 30, "protein": 0.6, "carbs": 8, "fat": 0.2},
    "manga": {"calories": 60, "protein": 0.8, "carbs": 15, "fat": 0.4},
    "arroz": {"calories": 130, "protein": 2.5, "carbs": 28, "fat": 0.3},
    "feijão": {"calories": 110, "protein": 7, "carbs": 20, "fat": 0.5},
    "frango": {"calories": 165, "protein": 31, "carbs": 0, "fat": 3.6},
    "carne": {"calories": 250, "protein": 26, "carbs": 0, "fat": 17},
    "fígado": {"calories": 135, "protein": 20.4, "carbs": 3.9, "fat": 3.6},
    "peixe": {"calories": 100, "protein": 20, "carbs": 0, "fat": 2},
    "batata doce": {"calories": 86, "protein": 1.6, "carbs": 20, "fat": 0.1}
}

# Extract all text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

# Get nutrition estimate
def estimate_nutrition(food_item):
    nutrients = {"calories": 0, "protein": 0, "carbs": 0, "fat": 0}
    for name in nutrition_db:
        if name in food_item.lower():
            return nutrition_db[name]
    return nutrients

# Parse menu text and analyze nutrition
def analyze_menu(text):
    days = ["segunda-feira", "terça-feira", "quarta-feira", "quinta-feira", "sexta-feira"]
    menu_by_day = defaultdict(list)
    current_day = None

    for line in text.splitlines():
        line_clean = line.strip().lower()
        for day in days:
            if day in line_clean:
                current_day = day
                break

        if current_day and any(word in line_clean for word in nutrition_db):
            menu_by_day[current_day].append(line_clean)

    nutrition_summary = {}
    for day, items in menu_by_day.items():
        summary = {"calories": 0, "protein": 0, "carbs": 0, "fat": 0}
        for item in items:
            nutrients = estimate_nutrition(item)
            for k in summary:
                summary[k] += nutrients[k]
        nutrition_summary[day.title()] = summary

    return nutrition_summary

# Run analysis on all PDFs in folder
for file_name in os.listdir(menu_dir):
    if file_name.endswith(".pdf"):
        file_path = os.path.join(menu_dir, file_name)
        print(f"\n--- Analyzing {file_name} ---")
        try:
            text = extract_text_from_pdf(file_path)
            report = analyze_menu(text)
            for day, stats in report.items():
                print(f"{day}: {stats}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")



--- Analyzing SETEMBRO-BERCARIO(3).pdf ---
Sexta-Feira: {'calories': 14398, 'protein': 1168.8999999999999, 'carbs': 1613.1, 'fat': 375.10000000000025}

--- Analyzing SETEMBRO-EF-INTEGRAL.pdf ---
Sexta-Feira: {'calories': 22868, 'protein': 2035.599999999999, 'carbs': 2118.2000000000003, 'fat': 714.4000000000002}

--- Analyzing SETEMBRO-EF-PARCIAL.pdf ---
Sexta-Feira: {'calories': 13447, 'protein': 1376.3000000000004, 'carbs': 915.5999999999999, 'fat': 493.30000000000035}

--- Analyzing SETEMBRO-EI-INTEGRAL.pdf ---
Sexta-Feira: {'calories': 14531, 'protein': 1151.3999999999996, 'carbs': 1674.1999999999998, 'fat': 371.10000000000025}

--- Analyzing SETEMBRO-EI-PARCIAL.pdf ---
Sexta-Feira: {'calories': 15357, 'protein': 1166.3999999999992, 'carbs': 1879.1999999999998, 'fat': 376.5000000000002}

--- Analyzing JULHO-BERCARIO.xlsx-1(1).pdf ---
Sexta-Feira: {'calories': 13350, 'protein': 995.3000000000002, 'carbs': 1646.5, 'fat': 320.7000000000003}

--- Analyzing JULHO-EF-INTEGRAL.xlsx-1.pdf 

# OpenAI API

It was at this point where I realized that some calculations were going wrong and it was going to be very difficult in order to model all these foods by hand. Then I figured we could use an OpenAI Model in order to model the nutritional benefits of each meal

In [5]:
import fitz  # PyMuPDF
from openai import OpenAI
import openai
import re
import json
from collections import defaultdict

# ✅ STEP 1: Load API Key securely (replace this with your actual key)
OPENAI_API_KEY = ""
client = OpenAI(api_key=OPENAI_API_KEY)

def extract_json_from_gpt(raw_text):
    # Remove markdown fences (```json ... ```)
    cleaned = re.sub(r"```json|```", "", raw_text).strip()

    # Remove units like "g" from numbers
    cleaned = re.sub(r'(\d+(?:\.\d+)?)\s*g', r'\1', cleaned)

    return cleaned

# ✅ STEP 2: Load and extract text from one PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

# ✅ STEP 3: Separate by weekday
def split_by_day(text):
    days = ["segunda-feira", "terça-feira", "quarta-feira", "quinta-feira", "sexta-feira"]
    day_blocks = defaultdict(str)
    current_day = None

    for line in text.splitlines():
        line_clean = line.strip().lower()
        for day in days:
            if day in line_clean:
                current_day = day
                break
        if current_day:
            day_blocks[current_day] += line + "\n"

    return day_blocks

def gpt_estimate_weekly_nutrition(menu_text, school_level="berçário"):
    prompt = f"""
You are a nutritionist who specializes in early childhood school feeding.

Below is a full weekly menu (Monday to Friday) for infants aged 6 to 12 months, with meals organized by day and by time of day.

For each day of the week, estimate the total daily values of **calories**, **protein**, **carbohydrates**, and **fat**, assuming one standard portion per item appropriate for this age group.

Return your answer as a JSON object, formatted exactly like this (without explanations or markdown):

{{
  "Segunda-Feira": {{
    "calories": ...,
    "protein": ...,
    "carbs": ...,
    "fat": ...
  }},
  "Terça-Feira": {{
    ...
  }},
  ...
  "Sexta-Feira": {{
    ...
  }}
}}

Menu:
{menu_text}
"""


    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # or gpt-4-turbo if/when available
            messages=[
                {"role": "system", "content": "Você é um especialista em nutrição infantil e alimentação escolar."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
        )

        reply = response.choices[0].message.content.strip()
        cleaned = extract_json_from_gpt(reply)

        # Try parsing as JSON
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            print("\n⚠️ GPT response was not valid JSON. Here's what it said:")
            print(reply)
            return None

    except Exception as e:
        print(f"Error with GPT API: {e}")
        return None


# ✅ STEP 5: Process ONE PDF
pdf_path = "/content/drive/MyDrive/PNAE Gen AI Project/Menus for the State Alagoas/JULHO-BERCARIO_signed(1).pdf"
text = extract_text_from_pdf(pdf_path)
day_blocks = split_by_day(text)

print(f"\n--- Nutrition Estimates for: {pdf_path.split('/')[-1]} ---")
result = gpt_estimate_weekly_nutrition(text)
if result:
    for day, stats in result.items():
        print(f"\n{day}: {stats}")
else:
    print("Error in estimating food quantities")




--- Nutrition Estimates for: JULHO-BERCARIO_signed(1).pdf ---

Segunda-Feira: {'calories': 450, 'protein': 18, 'carbs': 60, 'fat': 15}

Terça-Feira: {'calories': 460, 'protein': 19, 'carbs': 65, 'fat': 16}

Quarta-Feira: {'calories': 470, 'protein': 20, 'carbs': 70, 'fat': 17}

Quinta-Feira: {'calories': 480, 'protein': 21, 'carbs': 75, 'fat': 18}

Sexta-Feira: {'calories': 440, 'protein': 17, 'carbs': 55, 'fat': 14}


In [9]:
# ✅ STEP 5: Process ONE PDF
pdf_path = "/content/drive/MyDrive/PNAE Gen AI Project/Menus for the State Alagoas/SETEMBRO-BERCARIO(3).pdf"
text = extract_text_from_pdf(pdf_path)
day_blocks = split_by_day(text)

print(f"\n--- Nutrition Estimates for: {pdf_path.split('/')[-1]} ---")
result = gpt_estimate_weekly_nutrition(text)
if result:
    for day, stats in result.items():
        print(f"\n{day}: {stats}")
else:
    print("Erro ao estimar nutrientes para a semana.")


--- Nutrition Estimates for: SETEMBRO-BERCARIO(3).pdf ---

Segunda-Feira: {'calories': 482.99, 'protein': 73.08, 'carbs': 224.86, 'fat': 18.86}

Terça-Feira: {'calories': 482.99, 'protein': 73.08, 'carbs': 224.86, 'fat': 18.86}

Quarta-Feira: {'calories': 636.11, 'protein': 218.91, 'carbs': 123.69, 'fat': 175.67}

Quinta-Feira: {'calories': 642.34, 'protein': 102.5, 'carbs': 274.7, 'fat': 27.44}

Sexta-Feira: {'calories': 742.38, 'protein': 168.52, 'carbs': 81.9, 'fat': 33.09}
