In [7]:
import json
import os
import random
from typing import Any, Dict, List, Tuple, Union, Optional

def _sanitize_text(raw_text: str) -> str:
    """Remove underscores, asterisks und Zeilenumbrüche und normalisiere Leerzeichen."""
    cleaned = raw_text.replace("\n\n", " ")
    cleaned = cleaned.replace("_", "").replace("*", "")
    cleaned = " ".join(cleaned.split())
    return cleaned.strip()

def _normalize_entries(container: Any) -> List[Dict[str, Any]]:
    """
    Try to extract a list of entry dicts from various shapes:
    - already a list of dicts
    - dict with 'blocks' or 'items' that is a list
    - story-node style single dict with question/type/options
    """
    if isinstance(container, list):
        return container
    if isinstance(container, dict):
        for key in ("blocks", "items", "content"):
            if isinstance(container.get(key), list):
                return container[key]
        # Fallback: treat a single node-like dict as one entry if it looks like a QA node
        if any(k in container for k in ("Text", "Question", "Answer", "question", "options", "type", "Branch")):
            return [container]
    return []

def _normalize_answer_value(answer: Any) -> Any:
    """
    Normalize 'Answer'-Werte in konsistente Datenstrukturen und bereinige Texte.
    - string -> bereinigter String oder None
    - list -> Liste bereinigter Strings (leere Einträge werden entfernt)
    - dict -> behalte Schlüssel, bereinige String-Werte
    """
    if answer is None:
        return None
    if isinstance(answer, str):
        sanitized = _sanitize_text(answer)
        return sanitized if sanitized else None
    if isinstance(answer, list):
        sanitized_items: List[str] = []
        for element in answer:
            sanitized = _sanitize_text(str(element))
            if sanitized:
                sanitized_items.append(sanitized)
        return sanitized_items if sanitized_items else None
    if isinstance(answer, dict):
        normalized: Dict[str, Any] = {}
        for key, value in answer.items():
            if isinstance(value, str):
                sanitized = _sanitize_text(value)
                if sanitized:
                    normalized[key] = sanitized
            else:
                normalized[key] = value
        return normalized if normalized else None
    return answer

def _emit_segments_from_entry(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Break one entry dict into single-key segment dicts.

    Segments are produced in the fixed order Text, Question, Answer. Text and
    Question are skipped when absent, not a string, or empty after cleaning.
    The question is read from "Question", falling back to "question" only when
    the capitalised key is missing.

    The answer comes from the first applicable source:
      1. the "Answer" key, else the "answer" key (the key's presence decides,
         even when its value is None),
      2. an "options" list,
      3. the literal "free_text" when "type" is one of
         "free_text" / "free-text" / "text" (case-insensitive).
    No Answer segment is emitted when the normalized answer is None.
    """
    collected: List[Dict[str, Any]] = []

    labelled_sources = (
        ("Text", entry.get("Text")),
        ("Question", entry.get("Question", entry.get("question"))),
    )
    for label, candidate in labelled_sources:
        if not isinstance(candidate, str):
            continue
        cleaned = _sanitize_text(candidate)
        if cleaned:
            collected.append({label: cleaned})

    answer_source: Any = None
    for answer_key in ("Answer", "answer"):
        if answer_key in entry:
            answer_source = entry.get(answer_key)
            break
    else:
        # Neither explicit answer key exists: derive the answer from the
        # offered options or from the declared input type.
        options = entry.get("options")
        if isinstance(options, list):
            answer_source = options
        else:
            entry_type = entry.get("type")
            if isinstance(entry_type, str) and entry_type.lower() in ("free_text", "free-text", "text"):
                answer_source = "free_text"

    answer = _normalize_answer_value(answer_source)
    if answer is not None:
        collected.append({"Answer": answer})

    return collected

def _process_branch(branch: Dict[str, Any],
                    story_nodes: Dict[str, Any],
                    rng: random.Random,
                    max_depth: int = 5) -> List[Dict[str, Any]]:
    """
    Randomly choose one branch option, append the chosen answer text,
    and process the referenced story node recursively.
    Returns a flat list of segment dictionaries.
    """
    segments: List[Dict[str, Any]] = []
    if max_depth <= 0:
        return segments

    branch_items: List[Tuple[str, Any]] = []
    if isinstance(branch, dict):
        branch_items = list(branch.items())
    elif isinstance(branch, list):
        for option in branch:
            if isinstance(option, dict):
                branch_items.extend(option.items())
    if not branch_items:
        return segments

    selected_key, selected_node_name = rng.choice(branch_items)
    sanitized_choice = _sanitize_text(str(selected_key))
    if sanitized_choice:
        segments.append({"Answer": sanitized_choice})

    node_data = story_nodes.get(selected_node_name)
    if node_data is None:
        return segments

    entries = _normalize_entries(node_data)
    for nd in entries:
        segments.extend(_emit_segments_from_entry(nd))
        if "Branch" in nd and isinstance(nd["Branch"], (dict, list)):
            segments.extend(_process_branch(nd["Branch"], story_nodes, rng, max_depth=max_depth - 1))

    return segments

def get_prompt_segments_from_exercise(
    exercise_name: str,
    JSON_STRUCT_PATH: str,
    JSON_SN_STRUCT_PATH: str,
    seed: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Build the normalized prompt segments of one exercise.

    Args:
        exercise_name: Key under which the exercise is stored; it is looked up
            depth-first through nested dicts of the exercises JSON.
        JSON_STRUCT_PATH: Path of the exercises JSON file.
        JSON_SN_STRUCT_PATH: Path of the story-nodes JSON file.
        seed: Seed for the random branch selection; None gives a
            non-reproducible choice.

    Returns:
        A flat list of single-key segment dicts (Text / Question / Answer) in
        reading order, with one randomly chosen path for every branch.

    Raises:
        FileNotFoundError: If either JSON file does not exist.
        KeyError: If the exercise name is not found (or maps to null).
    """
    def _read_json(path: str) -> Any:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    def _locate(node: Any, wanted: str) -> Any:
        # Depth-first lookup of `wanted` among nested dict keys. Only dicts are
        # descended into; a None result means "keep searching elsewhere".
        if not isinstance(node, dict):
            return None
        if wanted in node:
            return node[wanted]
        for child in node.values():
            hit = _locate(child, wanted)
            if hit is not None:
                return hit
        return None

    chooser = random.Random(seed)

    # Check both inputs up front so a missing file is reported before any parsing.
    if not os.path.exists(JSON_STRUCT_PATH):
        raise FileNotFoundError(f"Exercises JSON not found: {JSON_STRUCT_PATH}")
    if not os.path.exists(JSON_SN_STRUCT_PATH):
        raise FileNotFoundError(f"Story-nodes JSON not found: {JSON_SN_STRUCT_PATH}")

    exercises = _read_json(JSON_STRUCT_PATH)
    story_nodes = _read_json(JSON_SN_STRUCT_PATH)

    found = _locate(exercises, exercise_name)
    if found is None:
        raise KeyError(f"Exercise '{exercise_name}' not found in {JSON_STRUCT_PATH}")

    collected: List[Dict[str, Any]] = []
    for item in _normalize_entries(found):
        collected.extend(_emit_segments_from_entry(item))
        follow_up = item.get("Branch")
        if isinstance(follow_up, (dict, list)):
            collected.extend(_process_branch(follow_up, story_nodes, chooser))

    return collected


def _format_answer_for_text(answer: Any) -> Optional[str]:
    if answer is None:
        return None
    if isinstance(answer, str):
        return answer
    if isinstance(answer, list):
        return "; ".join(str(x) for x in answer)
    if isinstance(answer, dict):
        mn = answer.get("min") if "min" in answer else answer.get("minText")
        mx = answer.get("max") if "max" in answer else answer.get("maxText")
        if mn is not None or mx is not None:
            mn_str = str(mn).strip() if mn is not None else ""
            mx_str = str(mx).strip() if mx is not None else ""
            if mn_str and mx_str:
                return f"{mn_str} - {mx_str}"
            return mn_str or mx_str
        return "; ".join(f"{k}: {v}" for k, v in answer.items())
    return str(answer)


def prompt_segments_to_text(segments: List[Dict[str, Any]]) -> str:
    """Render segment dicts as labelled lines, one segment per line.

    Each segment contributes at most one line, chosen by the first matching
    key: "Text" -> "TEXT: ...", "Question" -> "FRAGE: ...",
    "Answer" -> "ANTWORT: ..." (omitted when the formatted answer is empty).
    Segments with none of these keys are ignored. Line breaks inside a value
    are replaced by spaces and each line is stripped of surrounding whitespace.
    """
    rendered: List[str] = []
    for segment in segments:
        if "Text" in segment:
            line = f"TEXT: {segment['Text']}"
        elif "Question" in segment:
            line = f"FRAGE: {segment['Question']}"
        elif "Answer" in segment:
            answer_text = _format_answer_for_text(segment["Answer"])
            if not answer_text:
                continue
            line = f"ANTWORT: {answer_text}"
        else:
            continue
        # The label prefix guarantees the line is never blank, so it only needs
        # flattening onto a single row.
        rendered.append(line.replace("\n", " ").strip())
    return "\n".join(rendered)


In [9]:
# Input JSON files (story nodes and exercise structure), given as relative
# paths, so they resolve against the notebook's current working directory.
JSON_SN_STRUCT_PATH = "../../../data/processed/kiso_app_storynodes_struct.json"
JSON_STRUCT_PATH = "../../../data/processed/kiso_app_merged_structured.json"

# Collect the prompt segments of the "Kopfkino" exercise. The fixed seed makes
# the random branch selection repeatable between runs.
segments = get_prompt_segments_from_exercise(
    exercise_name="Kopfkino",
    JSON_STRUCT_PATH=JSON_STRUCT_PATH,
    JSON_SN_STRUCT_PATH=JSON_SN_STRUCT_PATH,
    seed=42
)
# Show the segments as labelled lines (TEXT / FRAGE / ANTWORT).
print(prompt_segments_to_text(segments))
# Debug aid: uncomment to inspect the raw segment dictionaries.
#print(segments)


FRAGE: Lisa bewirbt sich auf ein Praktikum. Zwei Wochen nach dem Bewerbungsgespräch hat sie immer noch keine Rückmeldung erhalten. Was könnte Lisa denken? Wähle eine der Optionen aus und schaue, was passiert.
ANTWORT: Ich habe mich wahrscheinlich blamiert. Die haben gemerkt, dass ich komisch bin. Ich bin einfach nicht geeignet für solche Sachen.
TEXT: Lisa schämt sich und fühlt sich niedergeschlagen. Sie zieht sich zurück und hat keine Lust darauf, weitere Bewerbungen zu verschicken. Ich werde sowieso nie irgendwo genommen, denkt sie sich. Ich werde nie Erfolg haben.
TEXT: Bewertungen beeinflussen wie wir denken, fühlen und handeln. Oft laufen diese Bewertungen ganz automatisch in unserem Kopf ab wie ein Film. Wir erleben etwas und schon beginnt unser Kopfkino. Dabei merken wir oft gar nicht, dass wir durch eine bestimmte Brille auf das schauen, was passiert. Diese Brille ist gefärbt durch unsere Erfahrungen, unsere Stimmung und unsere inneren Überzeugungen. Je nachdem, welche Brille w