In [13]:
import json
import math
import os
import re
import time
from pathlib import Path

import pandas as pd
from openai import OpenAI

In [2]:
# Shared API client; the key is taken from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
# Generate the prompts

In [4]:
# Item pairs that the stories are requested about.
pairs = [
    ["bells", "clocks"],
    ["mugs", "cups"],
    ["onions", "garlics"],
    ["watches", "clocks"],
]

# Prompt skeleton: {a} and {b} are the two items, {target_language} the story language.
template = "Give me a story about {a} and {b} in {target_language}."

# Languages each pair is requested in.
target_languages = ["English", "Swedish"]

In [5]:
# Helper functions

def make_prompt(pairs, target_languages, template, swap=False):
    """
    Render `template` once for every (language, pair) combination.

    Languages vary slowest: every pair is rendered for the first language,
    then every pair for the next one, and so on. The template is filled via
    `str.format` with the keywords `a`, `b` and `target_language`.

    Args:
        pairs: iterable of two-item sequences.
        target_languages: iterable of language names.
        template: format string using {a}, {b} and {target_language}.
        swap: when True, the second item of each pair is used for {a} and
            the first for {b}.

    Returns:
        list[str]: the rendered prompts.
    """
    return [
        template.format(
            a=second if swap else first,
            b=first if swap else second,
            target_language=language,
        )
        for language in target_languages
        for first, second in pairs
    ]

In [6]:
# Start with an empty prompt list.
all_prompts = list()

In [7]:
# Render every pair in its listed order (order1) and with the two items swapped (order2).
prompts_order1, prompts_order2 = (
    make_prompt(pairs, target_languages, template, swap=swapped)
    for swapped in (False, True)
)

In [8]:
# Original-order prompts first, swapped-order prompts second.
all_prompts = [*prompts_order1, *prompts_order2]

In [9]:
# Display the combined prompt list for a visual check.
all_prompts

['Give me a story about bells and clocks in English.',
 'Give me a story about mugs and cups in English.',
 'Give me a story about onions and garlics in English.',
 'Give me a story about watches and clocks in English.',
 'Give me a story about bells and clocks in Swedish.',
 'Give me a story about mugs and cups in Swedish.',
 'Give me a story about onions and garlics in Swedish.',
 'Give me a story about watches and clocks in Swedish.',
 'Give me a story about clocks and bells in English.',
 'Give me a story about cups and mugs in English.',
 'Give me a story about garlics and onions in English.',
 'Give me a story about clocks and watches in English.',
 'Give me a story about clocks and bells in Swedish.',
 'Give me a story about cups and mugs in Swedish.',
 'Give me a story about garlics and onions in Swedish.',
 'Give me a story about clocks and watches in Swedish.']

In [10]:
# Generate the model's outputs

In [11]:
def get_text_part(resp, msg_idx=0, part_idx=0):
    """
    Return one content part of a Responses API result.

    Looks up `resp.output[msg_idx]` and returns its `content[part_idx]`;
    with the defaults that is the first part of the first output item.
    """
    output_item = resp.output[msg_idx]
    return output_item.content[part_idx]

def build_topk_tables(resp, k=5, msg_idx=0, part_idx=0):
    """
    Turn the per-token logprobs of one response part into probability tables.

    Returns (wide_df, long_df)

    wide_df columns (one row per emitted token):
      i, emitted_token, emitted_p, alt1_token, alt1_p, ..., altk_token, altk_p
      Missing alternatives are filled with "" / NaN.

    long_df columns (one row per token candidate):
      i, rank (0 = emitted), token, p

    Args:
        resp: Responses API result; the part is fetched with
            `get_text_part(resp, msg_idx, part_idx)`.
        k: maximum number of alternatives kept per token. Values <= 0 keep
            none, so both tables then contain only the emitted tokens.
        msg_idx, part_idx: which output item / content part to read.

    Probabilities are `exp(logprob)`. If the part carries no logprobs
    (attribute missing or None), both frames are empty but still have
    their documented columns.
    """
    part = get_text_part(resp, msg_idx, part_idx)
    # `logprobs` is None/absent when the part carries no logprob data.
    token_infos = getattr(part, "logprobs", None) or []
    k = max(k, 0)

    long_columns = ["i", "rank", "token", "p"]
    wide_columns = ["i", "emitted_token", "emitted_p"]
    for j in range(1, k + 1):
        wide_columns += [f"alt{j}_token", f"alt{j}_p"]

    rows_wide = []
    rows_long = []

    for i, t in enumerate(token_infos):
        emitted_tok = t.token
        emitted_p = math.exp(t.logprob)

        # Candidates as (token, p), highest probability first.
        alts = sorted(
            ((a.token, math.exp(a.logprob)) for a in (getattr(t, "top_logprobs", None) or [])),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # The emitted token is always rank 0, even if it is not among the candidates.
        rows_long.append({"i": i, "rank": 0, "token": emitted_tok, "p": emitted_p})

        # Drop the candidate that merely repeats the emitted token, then cap at k.
        # Slicing (rather than "append, then check the length") keeps k == 0 exact.
        topk = [
            (tok, p)
            for tok, p in alts
            if not (tok == emitted_tok and abs(p - emitted_p) < 1e-12)
        ][:k]

        # Alternatives take ranks 1..k in the long table.
        for r, (tok, p) in enumerate(topk, start=1):
            rows_long.append({"i": i, "rank": r, "token": tok, "p": p})

        wide_row = {"i": i, "emitted_token": emitted_tok, "emitted_p": emitted_p}
        for j in range(k):
            tokj, pj = topk[j] if j < len(topk) else ("", float("nan"))
            wide_row[f"alt{j+1}_token"] = tokj
            wide_row[f"alt{j+1}_p"] = pj
        rows_wide.append(wide_row)

    # Explicit columns keep the schema stable when there are no rows at all.
    wide_df = pd.DataFrame(rows_wide, columns=wide_columns)
    long_df = pd.DataFrame(rows_long, columns=long_columns)
    return wide_df, long_df

In [23]:
def slug(s: str) -> str:
    """
    Make a filename-friendly token from `s`.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore; leading and trailing underscores are removed.
    """
    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", s)
    return collapsed.strip("_")

def id_from_prompt(prompt: str) -> str:
    """
    Derive a short identifier for `prompt`.

    Prompts of the form "... about <item1> and <item2> in <language> ..."
    (matched case-insensitively) yield "<item1>_<item2>_<language>", each
    piece passed through `slug`. Any other prompt yields the first 60
    characters of `slug(prompt)`.
    """
    match = re.search(
        r"about\s+(.+?)\s+and\s+(.+?)\s+in\s+([A-Za-zÅÄÖåäö]+)",
        prompt,
        re.IGNORECASE,
    )
    if match is None:
        return slug(prompt)[:60]
    return "_".join(slug(piece) for piece in match.groups())

def _first_text_logprobs(dumped):
    """Return the `logprobs` entry of the first `output_text` part in a dumped response."""
    for item in dumped.get("output") or []:
        # Output items that are not messages may have no (or a None) `content`.
        for part in item.get("content") or []:
            if part.get("type") == "output_text":
                return part.get("logprobs")
    raise ValueError("response contains no output_text part")


def run_trials(prompts_list, trials=5, model="gpt-4o-mini", top_k=5, out_dir="outputs",
               per_request_timeout=20.0, max_output_tokens=120, retries=2, sleep_between=1.0):
    """
    Request `trials` completions for every prompt and save each one to disk.

    Uses the module-level `client`. For prompt identifier <base>
    (from `id_from_prompt`) and trial number <tt> (two digits, 1-based),
    a successful trial writes into `out_dir`:
      <base>_<tt>.txt             the generated text (`resp.output_text`)
      <base>_<tt>_logprobs.json   the logprobs of the first output_text part

    A trial that still fails after `retries` retries writes <base>_<tt>.txt
    containing "[ERROR] <exception>" plus the prompt, and no JSON file.
    Note that this error record shares its file name with a real story.

    Args:
        prompts_list: iterable of prompt strings.
        trials: completions requested per prompt.
        model: model name passed to the API.
        top_k: value passed as `top_logprobs`.
        out_dir: output directory; created if missing.
        per_request_timeout: timeout passed to `client.with_options`.
        max_output_tokens: value passed as `max_output_tokens`.
        retries: extra attempts after the first failure of a trial.
        sleep_between: seconds to wait before each retry.

    Returns:
        None. Progress is printed; results are files in `out_dir`.
    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    for prompt in prompts_list:
        base = id_from_prompt(prompt)
        print(f"\nPrompt base: {base}")

        for t in range(1, trials + 1):
            trial_name = f"{base}_{t:02d}"
            text_file = out_path / f"{trial_name}.txt"
            logprobs_file = out_path / f"{trial_name}_logprobs.json"

            attempt = 0
            while True:
                attempt += 1
                try:
                    resp = client.with_options(timeout=per_request_timeout).responses.create(
                        model=model,
                        input=prompt,
                        max_output_tokens=max_output_tokens,
                        top_logprobs=top_k,
                        include=["message.output_text.logprobs"]
                    )

                    text = resp.output_text

                    # Find the text part by type instead of assuming it is
                    # output[0].content[0]; a different leading output item would
                    # otherwise turn a successful call into retries and an error file.
                    logprobs = _first_text_logprobs(resp.model_dump())

                    text_file.write_text(text, encoding="utf-8")
                    logprobs_file.write_text(
                        json.dumps(logprobs, ensure_ascii=False, indent=2),
                        encoding="utf-8",
                    )

                    print("saved:", trial_name)
                    break

                except Exception as e:
                    # Broad on purpose: any failure in the request, parsing or
                    # writing counts as a failed attempt for this trial.
                    if attempt > retries:
                        print(f"[!] Trial {t} failed after {retries} retries: {e}")
                        text_file.write_text(f"[ERROR] {e}\nPrompt:\n{prompt}\n", encoding="utf-8")
                        break
                    time.sleep(sleep_between)

In [21]:
# Smoke test: a single prompt, a single trial, a very short completion.

model = "gpt-4o-mini"
top_k = 2
prompts = all_prompts[:1]  # first prompt only

run_trials(
    prompts,
    trials=1,
    model=model,
    top_k=top_k,
    out_dir="stories",
    per_request_timeout=20.0,
    max_output_tokens=20,
    retries=2,
)


Prompt base: bells_clocks_English
saved: bells_clocks_English_01
