In [29]:
from dotenv import load_dotenv
from pathlib import Path
import os
import sys

# add project root (two levels up from tests/)
#sys.path.append(str(Path.cwd().parents[0]))

# Load environment variables from a ".env" file. The path is relative, so it is
# resolved against the current working directory (presumably the notebook's
# OpenAI directory - confirm).
load_dotenv(".env")
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail fast with an actionable message instead of a later client/auth error.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in .env or in the environment."
    )

from openai import OpenAI

# Shared API client used by the request cell below.
client = OpenAI(api_key=api_key)

In [30]:
# Prompt text passed as `instructions` to the model request below.
# It asks for a JSON list (one item per input sentence, same order) following the
# schema spelled out in the text; the validation cells further down check the
# parsed output against that schema. `.strip()` removes the leading/trailing
# newlines introduced by the triple-quoted layout.
INSTRUCTIONS = """
You parse short French sentences (children / IME level).

For each sentence:
1) Extract elements that can be illustrated with pictograms.
2) Propose simple comprehension questions based ONLY on these elements.

Never invent words.
If missing, use null or [].
Return ONLY valid JSON.
Output is a JSON list, same order as input.

Schema (one item):

{
  "sentence": string,
  "subject": string | null,
  "verb": { "lemma": string, "text": string } | null,
  "slots": [
    {
      "type": "object" | "color" | "weather" | "place" | "time",
      "head": string,
      "full": string,
      "tags": ["object" | "color" | "weather" | "place" | "time"]
    }
  ],
  "questions": [
    {
      "qtype": "object" | "color" | "weather" | "place" | "time",
      "question": string,
      "answer_head": string
    }
  ]
}

Rules:
- tags is a JSON array (1–3 items).
- First tag MUST equal type.
- Questions:
  - use ONLY slot.head as answers
  - 1–3 per sentence max
  - short and simple
""".strip()

In [31]:
# Sample French sentences sent to the model in a single request.
sentences = [
    "Papa a bu son café avec du sucre.",
    "Le chat noir dort avec les chatons.",
    "Mon vélo est rose et blanc.",
    "Emma a promené son chien sous la pluie.",
]

# The input is a bullet list: one "- <sentence>" line per sentence, in order.
resp = client.responses.create(
    model="gpt-4o-mini",
    instructions=INSTRUCTIONS,
    input="\n".join(map("- {}".format, sentences)),
)


In [32]:
import json

# Text output of the model response; the prompt asks for a JSON list.
raw = resp.output_text

try:
    output_json = json.loads(raw)
except json.JSONDecodeError as e:
    print("Invalid JSON:", e)
    print("Raw output:", raw)
    # Re-raise after printing the diagnostics: continuing would leave
    # `output_json` undefined (fresh kernel) or stale (previous run), and the
    # following cells would silently operate on the wrong data.
    raise
else:
    print("Valid JSON")
    print(json.dumps(output_json, indent=2, ensure_ascii=False))

Valid JSON
[
  {
    "sentence": "Papa a bu son café avec du sucre.",
    "subject": "Papa",
    "verb": {
      "lemma": "boire",
      "text": "a bu"
    },
    "slots": [
      {
        "type": "object",
        "head": "café",
        "full": "son café",
        "tags": [
          "object"
        ]
      },
      {
        "type": "object",
        "head": "sucre",
        "full": "du sucre",
        "tags": [
          "object"
        ]
      }
    ],
    "questions": [
      {
        "qtype": "object",
        "question": "Qu'est-ce que Papa a bu?",
        "answer_head": "café"
      },
      {
        "qtype": "object",
        "question": "Avec quoi Papa a bu son café?",
        "answer_head": "sucre"
      }
    ]
  },
  {
    "sentence": "Le chat noir dort avec les chatons.",
    "subject": "Le chat noir",
    "verb": {
      "lemma": "dormir",
      "text": "dort"
    },
    "slots": [
      {
        "type": "object",
        "head": "chatons",
        "full": "les ch

In [33]:
# Pretty-printed JSON text of the whole parsed result.
json_out = json.dumps(output_json, ensure_ascii=False, indent=2)

# Write each sentence's entry to its own numbered file (paths are relative, so
# the files land in the current working directory).
for number, entry in enumerate(output_json, start=1):
    filename = f"sentence_{number}_data.json"
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(entry, out_file, indent=2, ensure_ascii=False)
    print(f"Saved data for sentence {number} to {filename}")

Saved data for sentence 1 to sentence_1_data.json
Saved data for sentence 2 to sentence_2_data.json
Saved data for sentence 3 to sentence_3_data.json
Saved data for sentence 4 to sentence_4_data.json


In [34]:
# Check the structure of the parsed JSON output against the schema and rules
# stated in INSTRUCTIONS. Each assertion carries a message so a failure points
# at the offending sentence/slot instead of a bare AssertionError.
for item in output_json:
    for key in ("sentence", "subject", "verb", "slots", "questions"):
        assert key in item, f"item is missing key {key!r}: {item!r}"

    # `verb` may be null; when present it must carry both forms.
    if item["verb"] is not None:
        for key in ("lemma", "text"):
            assert key in item["verb"], f"verb is missing key {key!r}: {item['verb']!r}"

    for slot in item["slots"]:
        for key in ("type", "head", "full", "tags"):
            assert key in slot, f"slot is missing key {key!r}: {slot!r}"

        tags = slot["tags"]
        assert isinstance(tags, list), f"tags must be a list: {slot!r}"
        # Prompt rule: tags is a JSON array of 1-3 items.
        assert 1 <= len(tags) <= 3, f"tags must have 1-3 items: {slot!r}"
        # Prompt rule: the first tag MUST equal type. The pictogram lookup cell
        # reads tags[0], so mere membership of type in tags is not enough.
        assert tags[0] == slot["type"], f"first tag must equal type: {slot!r}"

    for question in item["questions"]:
        for key in ("qtype", "question", "answer_head"):
            assert key in question, f"question is missing key {key!r}: {question!r}"

In [46]:
# Make the project's "src" directory importable and bring the resolver into
# scope here, so this cell does not depend on a cell that appears later in the
# notebook having been executed first.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from qcmgen.pictos.resolve import resolve_term_to_picto_strict

# For every question, check that its answer is the head of one of the
# sentence's slots, then look up a pictogram for that answer.
for item in output_json:
    for question in item["questions"]:
        print(f"  - QType: {question['qtype']}, Question: {question['question']}, Answer Head: {question['answer_head']}")

        answer_head = question["answer_head"]
        # First slot whose head matches the answer (None when there is none).
        # Binding it explicitly avoids relying on the inner loop variable
        # still pointing at the match after the search.
        matching_slot = next(
            (slot for slot in item["slots"] if slot["head"] == answer_head),
            None,
        )
        assert matching_slot is not None, f"Answer head '{answer_head}' not found in slots"

        # Fetch the pictogram for the answer, using the slot's first tag as the
        # expected type.
        print(resolve_term_to_picto_strict(answer_head, expected_type=matching_slot["tags"][0]))
       



  - QType: object, Question: Qu'est-ce que Papa a bu?, Answer Head: café
ResolvedPicto(term='café', picto_id=32398, url='https://static.arasaac.org/pictograms/32398/32398_500.png', score=13.5, tags=['place', 'building', 'catering establishment', 'hospitality industry', 'work', 'tertiary sector', 'core vocabulary'], categories=['catering establishment', 'hospitality industry', 'core vocabulary-place', 'core vocabulary-work'], keyword='bar', plural='bars', source='arasaac')
  - QType: object, Question: Avec quoi Papa a bu son café?, Answer Head: sucre
ResolvedPicto(term='sucre', picto_id=32440, url='https://static.arasaac.org/pictograms/32440/32440_500.png', score=13.5, tags=['feeding', 'food', 'taste', 'communication', 'language', 'adjective', 'qualifying adjective', 'core vocabulary'], categories=['taste', 'qualifying adjective', 'core vocabulary-feeding', 'core vocabulary-communication'], keyword='sucré', plural=None, source='arasaac')
  - QType: object, Question: Avec qui dort le cha

In [43]:
# Inspect the last parsed item explicitly rather than through a loop variable
# leaked from another cell (which breaks on a fresh kernel or reordered run).
output_json[-1]

{'sentence': 'Emma a promené son chien sous la pluie.',
 'subject': 'Emma',
 'verb': {'lemma': 'promener', 'text': 'a promené'},
 'slots': [{'type': 'object',
   'head': 'chien',
   'full': 'son chien',
   'tags': ['object']},
  {'type': 'weather',
   'head': 'pluie',
   'full': 'la pluie',
   'tags': ['weather']}],
 'questions': [{'qtype': 'object',
   'question': "Qu'est-ce qu'Emma a promené?",
   'answer_head': 'chien'},
  {'qtype': 'weather',
   'question': 'Sous quoi Emma a promené son chien?',
   'answer_head': 'pluie'}]}

In [36]:
# Print every parsed item in a readable form: the sentence-level fields first,
# then one line per slot and one line per question.
for item in output_json:
    for label, key in (("Sentence", "sentence"), ("Subject", "subject"), ("Verb", "verb")):
        print(f"{label}: {item[key]}")

    print("Slots:")
    for slot in item["slots"]:
        slot_type, head, full, tags = (slot[k] for k in ("type", "head", "full", "tags"))
        print(f"  - Type: {slot_type}, Head: {head}, Full: {full}, Tags: {tags}")

    print("Questions:")
    for question in item["questions"]:
        qtype, text, answer = (question[k] for k in ("qtype", "question", "answer_head"))
        print(f"  - QType: {qtype}, Question: {text}, Answer Head: {answer}")

Sentence: Papa a bu son café avec du sucre.
Subject: Papa
Verb: {'lemma': 'boire', 'text': 'a bu'}
Slots:
  - Type: object, Head: café, Full: son café, Tags: ['object']
  - Type: object, Head: sucre, Full: du sucre, Tags: ['object']
Questions:
  - QType: object, Question: Qu'est-ce que Papa a bu?, Answer Head: café
  - QType: object, Question: Avec quoi Papa a bu son café?, Answer Head: sucre
Sentence: Le chat noir dort avec les chatons.
Subject: Le chat noir
Verb: {'lemma': 'dormir', 'text': 'dort'}
Slots:
  - Type: object, Head: chatons, Full: les chatons, Tags: ['object']
  - Type: color, Head: noir, Full: noir, Tags: ['color']
Questions:
  - QType: object, Question: Avec qui dort le chat?, Answer Head: chatons
  - QType: color, Question: De quelle couleur est le chat?, Answer Head: noir
Sentence: Mon vélo est rose et blanc.
Subject: Mon vélo
Verb: {'lemma': 'être', 'text': 'est'}
Slots:
  - Type: color, Head: rose, Full: rose, Tags: ['color']
  - Type: color, Head: blanc, Full: bla

In [42]:
# Make the project's "src" directory (one level above the working directory)
# importable. The membership check keeps re-running this cell from stacking
# duplicate entries on sys.path.
project_root = Path.cwd().parent
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from qcmgen.pictos.resolve import resolve_term_to_picto_strict

# Smoke test of the resolver on a single term. The original call had an empty
# `expected_type =` argument, which is a syntax error. "object" is one of the
# slot types from the prompt schema that the question loop passes as
# expected_type. NOTE(review): confirm the values the resolver accepts.
resolve_term_to_picto_strict("chat", expected_type="object")

ResolvedPicto(term='chat', picto_id=7114, url='https://static.arasaac.org/pictograms/7114/7114_500.png', score=13.5, tags=['animal', 'carnivorous', 'vertebrate', 'mammal', 'viviparous', 'terrestrial animal', 'pet', 'domestic', 'core vocabulary'], categories=['carnivorous', 'mammal', 'viviparous', 'terrestrial animal', 'pet', 'domestic animal', 'core vocabulary-living being'], keyword='chat', plural='chats', source='arasaac')