In [50]:
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Make project modules importable by adding the directory two levels above the
# current working directory to sys.path (the original note describes this as
# the project root, two levels up from tests/ -- TODO confirm the notebook's
# working directory). The membership check keeps the cell idempotent:
# re-running it does not append the same entry again.
project_root = str(Path.cwd().parents[1])
if project_root not in sys.path:
    sys.path.append(project_root)

# Load environment variables from the file named "OpenAI.env"; the relative
# path is resolved against the current working directory.
load_dotenv("OpenAI.env")
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message instead of a later client/auth error.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set: define it in OpenAI.env (looked up relative "
        f"to {Path.cwd()}) or export it in the environment."
    )

client = OpenAI(api_key=api_key)

In [None]:
# Prompt text for the sentence parser. It is passed as `instructions=` to
# `client.responses.create(...)` in the request cell of this notebook.
#
# The text asks the model to return a bare JSON list with one item per input
# sentence (sentence / subject / verb / slots), where every slot carries a
# "type", "head", "full" and a "tags" array whose first element equals "type".
# `.strip()` drops the leading and trailing newlines that come from the
# triple-quoted layout, so the prompt starts at "You are a parser".
#
# NOTE(review): the output saved in this notebook contains no "tags" keys, and
# this cell has no execution count, so that output may predate the "tags"
# rules -- re-run the notebook top to bottom to confirm.
# NOTE(review): "time" is an allowed slot type/tag but no rule below says when
# to use it -- confirm whether that is intentional.
INSTRUCTIONS = """
You are a parser for short French sentences (children / IME level).

Extract ONLY elements that can be illustrated with pictograms.
Never invent words not present in the sentence.
If nothing is found for a field, use null or [].

Return ONLY valid JSON (no surrounding text).
The output MUST be a JSON list, one item per input sentence, in the same order.

Each item schema:
{
  "sentence": string,
  "subject": string | null,
  "verb": { "lemma": string, "text": string } | null,
  "slots": [
    {
      "type": "object" | "color" | "weather" | "place" | "time",
      "head": string,
      "full": string,
      "tags": ["object" | "color" | "weather" | "place" | "time"]
    }
  ]
}

Rules:
- For every slot, "tags" MUST be a JSON array with 1 to 3 items.
- Allowed tag values are ONLY: "object", "color", "weather", "place", "time".
- The first tag MUST always equal the slot "type".
  Example: if type="color", tags=["color"].
- Put direct objects in slots with type="object" and tags=["object"].
- Put prepositional complements like "avec X", "sous X", "à X", "dans X" in slots as:
  - type="weather" (tags=["weather"]) if X is a weather word (pluie, soleil, neige, vent, orage, nuage).
  - type="place" (tags=["place"]) if X is a place word (école, maison, parc, télé, ville).
  - otherwise type="object" (tags=["object"]).
- Put color adjectives (rose, blanc, noir, rouge, bleu, vert, jaune, gris, orange, violet) as slots with type="color" and tags=["color"].
- If a sentence has two colors ("rose et blanc"), output two color slots, each with tags=["color"].

""".strip()

In [40]:
# Sample inputs: short French sentences to run through the parser prompt.
sentences = [
    "Papa a bu son café avec du sucre.",
    "Le chat noir dort avec les chatons.",
    "Mon vélo est rose et blanc.",
    "Emma a promené son chien sous la pluie.",
]

# All sentences go out in a single request, formatted as a dash-prefixed
# list with one sentence per line.
bulleted_sentences = "\n".join(["- " + sentence for sentence in sentences])

resp = client.responses.create(
    input=bulleted_sentences,
    instructions=INSTRUCTIONS,
    model="gpt-4o-mini",
)


In [41]:
import json

# Raw text returned by the model for the whole batch of sentences.
raw = resp.output_text

# Reset before parsing: if decoding fails, later cells must not silently pick
# up an `output_json` left in the kernel by an earlier run of this cell.
output_json = None

try:
    output_json = json.loads(raw)  # to verify it's valid JSON
except json.JSONDecodeError as e:
    print("Invalid JSON:", e)
    print("Raw output:", raw)
else:
    print("Valid JSON")
    print(json.dumps(output_json, indent=2, ensure_ascii=False))

    # The prompt requires a top-level JSON list with one item per input
    # sentence; report (without raising) when the reply does not match that.
    if not isinstance(output_json, list):
        print(f"Warning: expected a JSON list, got {type(output_json).__name__}")
    elif len(output_json) != len(sentences):
        print(f"Warning: expected {len(sentences)} items, got {len(output_json)}")

Valid JSON
[
  {
    "sentence": "Papa a bu son café avec du sucre.",
    "subject": "Papa",
    "verb": {
      "lemma": "boire",
      "text": "a bu"
    },
    "slots": [
      {
        "type": "object",
        "head": "café",
        "full": "son café"
      },
      {
        "type": "object",
        "head": "sucre",
        "full": "du sucre"
      }
    ]
  },
  {
    "sentence": "Le chat noir dort avec les chatons.",
    "subject": "Le chat noir",
    "verb": {
      "lemma": "dormir",
      "text": "dort"
    },
    "slots": [
      {
        "type": "object",
        "head": "chatons",
        "full": "les chatons"
      }
    ]
  },
  {
    "sentence": "Mon vélo est rose et blanc.",
    "subject": "Mon vélo",
    "verb": {
      "lemma": "être",
      "text": "est"
    },
    "slots": [
      {
        "type": "color",
        "head": "rose",
        "full": "rose"
      },
      {
        "type": "color",
        "head": "blanc",
        "full": "blanc"
      }
    ]
  }

café -> https://static.arasaac.org/pictograms/32398/32398_500.png
sucre -> https://static.arasaac.org/pictograms/32440/32440_500.png
rose -> https://static.arasaac.org/pictograms/3151/3151_500.png
pluie -> https://static.arasaac.org/pictograms/3123/3123_500.png
jonathan -> None
