In [36]:
import json
import string
import re

from pathlib import Path
import pandas as pd

import tiktoken

In [13]:
import json

# Path to your file
file_path = "logs/2025-11-12_10-30-43_give_me_a_short_story_about_bells_and_clocks_in_english.json"

# Open and load the JSON content
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Access main fields
print("Model:", data["model"])
print("Prompt:", data["prompt"])
print("\nStory:\n", data["story"])

Model: gpt-4.1-mini
Prompt: Give me a short story about bells and clocks in English.

Story:
 In a quiet village nestled between rolling hills, the bells of the old church chimed every hour, echoing through the cobblestone streets. Nearby, the grand clock tower stood tall, its hands moving steadily, marking the passage of time for all to see.

One day, the clock stopped. The villagers gathered, worried that time itself had paused. The bells fell silent, as if mourning the stillness. A young girl named Elara, curious and brave, climbed the tower to investigate. She discovered a tiny, forgotten gear jammed between the colossal cogs.

With gentle hands, she freed the gear. The clock began to tick once more, and the bells burst into joyful song, ringing out over the village. From that day on, Elara was known as the Keeper of Time, the one who listened to the whispers of bells and clocks and kept their harmony alive.


In [16]:
# Extract the interesting words from the prompt - function

def extract_items_from_prompt(prompt: str):
    """
    Extracts item1 and item2 from prompts of the form:
    'Give me a short story about [item1] and [item2] in English.'
    """
    pattern = r"Give me a short story about (.*?) and (.*?) in English\."
    match = re.match(pattern, prompt.strip())
    if match:
        item1, item2 = match.groups()
        return item1.strip(), item2.strip()
    else:
        return None, None

In [24]:
# Find interesting logs - function

def find_item_indices(data, item):
    """
    Find all indices in data['token_positions'] where the token
    matches 'item' (case-insensitive, stripped).
    """
    item = item.strip().lower()
    indices = []

    for idx, pos in enumerate(data.get("token_positions", [])):
        for lp in pos.get("top_logprobs", []):
            token = (lp.get("token") or "").strip().lower()
            if token == item:
                indices.append(idx)
                break  # no need to check other top_logprobs at this position

    return indices

In [25]:
def get_logs_for_index(data, index):
    """
    Return all log entries (token, logprob, probability)
    for a given token position index.
    """
    try:
        pos = data["token_positions"][index]
        return pos.get("top_logprobs", [])
    except (IndexError, KeyError, TypeError):
        return []

In [30]:
def item_logs_to_df(data, item):
    """
    Finds all positions where 'item' appears and returns
    a DataFrame with all candidate tokens, their logprobs, and probabilities.
    """
    indices = find_item_indices(data, item)
    rows = []

    for idx in indices:
        logs = get_logs_for_index(data, idx)
        for lp in logs:
            rows.append({
                "item": item,
                "position": idx,
                "token": lp.get("token"),
                "logprob": lp.get("logprob"),
                "probability": lp.get("probability"),
            })

    df = pd.DataFrame(rows)
    return df

In [39]:
item1, item2 = extract_items_from_prompt(data["prompt"])
df_item1 = item_logs_to_df(data, item1)
df_item1.head(2)

Unnamed: 0,item,position,token,logprob,probability
0,bells,9,there,-0.733271,0.480335
1,bells,9,the,-0.858271,0.423894


In [48]:
enc = tiktoken.encoding_for_model("gpt-4-1-mini")  # or your model
enc.encode("whispers of bells")

[1336, 69935, 315, 61794]

In [46]:
find_item_indices(data, "whispers")

[166, 167]

In [47]:
find_item_indices(data, "bells")

[9, 10, 57, 61, 72, 134, 159, 166, 167, 169]