In [None]:
#%%
import json
import random
from collections import Counter
from pprint import pprint

import plotly.graph_objects as go


In [None]:
#%% Load the raw records, keeping only the English text of each document
data_path = "data/dev.jsonl"
data = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. an empty line at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        record = json.loads(line)
        # When "text" is a dict that has an "en" entry, drop every other entry.
        # Records that do not match are appended unchanged.
        if "text" in record and isinstance(record["text"], dict) and "en" in record["text"]:
            record["text"] = {"en": record["text"]["en"]}
        data.append(record)

print(f"Loaded {len(data)} records from {data_path}")

# Print an example record from the loaded data (skipped when nothing was loaded)
if data:
    print("Example record (pretty-printed):")
    pprint(data[0], sort_dicts=False, compact=False, width=100)


In [None]:
#%% Replace eurovoc concept codes with their English descriptors

descriptor_path = "data/eurovoc_descriptors.json"
print("Loading EuroVoc descriptors …")
with open(descriptor_path, "r", encoding="utf-8") as f_desc:
    descriptor_raw = json.load(f_desc)

# Build a simple code -> English descriptor mapping.
# A descriptor without an "en" entry maps to the empty string here.
code_to_en: dict[str, str] = {
    code: desc.get("en", "") for code, desc in descriptor_raw.items()
}
print(f"Loaded {len(code_to_en):,} EuroVoc descriptors")

# NOTE: this rewrites the lists inside `data` in place.
levels = ["level_1", "level_2", "level_3", "all_levels"]
for record in data:
    eurovoc = record.get("eurovoc_concepts", {})
    for level in levels:
        if level in eurovoc:
            # `or code` keeps the original code both when it is unknown and when
            # its English label is empty; a plain `.get(code, code)` would
            # replace the latter with "".
            eurovoc[level] = [code_to_en.get(code) or code for code in eurovoc[level]]

# Save the updated records
out_path = "data/dev_with_descriptors.jsonl"
with open(out_path, "w", encoding="utf-8") as f_out:
    for rec in data:
        f_out.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"Wrote transformed records with descriptors to {out_path}")

# Print an example record from the transformed data (with descriptors)
if data:
    print("Example record with descriptors:")
    pprint(data[0], sort_dicts=False, compact=False, width=100)


In [None]:
#%% Create train/val splits --------------------------------------------------
# Seed the global RNG so the shuffle below gives the same order on every run.
SEED = 42
random.seed(SEED)

# Number of records in each split.
num_train, num_val = 100, 50

# Fail early when the dataset cannot fill both splits.
if num_train + num_val > len(data):
    raise ValueError("Not enough records to create the desired splits")

# Shuffle record positions (not the records themselves); the first `num_train`
# positions form the train split and the next `num_val` the validation split.
indices = [*range(len(data))]
random.shuffle(indices)
train_indices = indices[:num_train]
val_indices = indices[num_train:][:num_val]

# Helper that turns one source record into a dataset item; the reference answer
# is the record's list of level_1 entries (a list, not a joined string).

def build_item(idx: int, record: dict) -> dict:
    """Build a dataset entry for one record.

    Args:
        idx: Numeric id of the entry; stored as a string.
        record: Source record. Its ``text`` and ``eurovoc_concepts`` fields
            are read; both may be missing.

    Returns:
        ``{"item": {"id": ..., "text_input": ..., "reference_answer": ...}}``
        where ``text_input`` is the ``"en"`` entry of ``text`` (or ``text``
        itself when it is a plain string, or ``""`` when unavailable) and
        ``reference_answer`` is a new list with the ``level_1`` entries.
    """
    text = record.get("text")
    if isinstance(text, dict):
        text_input = text.get("en", "")
    elif isinstance(text, str):
        # "text" may not be a dict; use a plain string as-is instead of
        # failing on `.get`.
        text_input = text
    else:
        text_input = ""

    eurovoc = record.get("eurovoc_concepts") or {}
    # Copy so the item does not share a list object with the source record.
    reference_answer: list[str] = list(eurovoc.get("level_1") or [])

    return {
        "item": {
            "id": str(idx),
            "text_input": text_input,
            "reference_answer": reference_answer,
        }
    }

# Build the items of each split; ids are numbered independently per split,
# so both splits start again at 0.
train_records = [build_item(pos, data[src]) for pos, src in enumerate(train_indices)]
val_records = [build_item(pos, data[src]) for pos, src in enumerate(val_indices)]

train_path = "data/build_hour_train.jsonl"
val_path = "data/build_hour_val.jsonl"

# Write each split as JSON Lines: one item per line, non-ASCII characters kept as-is.
for path, recs in ((train_path, train_records), (val_path, val_records)):
    with open(path, "w", encoding="utf-8") as f_split:
        for rec in recs:
            print(json.dumps(rec, ensure_ascii=False), file=f_split)
    print(f"Wrote {len(recs)} records to {path}")

# Show the guide link followed by the first train item as a sample.
print("Evals API Guide:", "https://platform.openai.com/docs/guides/evals/#uploading-test-data")
print("Example train record:")
pprint(train_records[0], sort_dicts=False, compact=False, width=100)


In [None]:
#%% Plot EuroVoc level_1 frequencies in the train split (dark mode)

# Count occurrences of each descriptor in train_records
freq_counter = Counter()
for rec in train_records:
    freq_counter.update(rec["item"]["reference_answer"])

# Sort descriptors by ascending frequency (NOT alphabetically). The sort is
# stable, so equally frequent descriptors keep the order in which they were
# first counted.
sorted_items = sorted(freq_counter.items(), key=lambda x: x[1])
descriptors = [name for name, _ in sorted_items]
counts = [count for _, count in sorted_items]

# -----------------------------------------------------------------------------
# Plot with Plotly using a single color for every bar, dark mode

bar_color = "#0071cf"

# Horizontal bars: frequency on the x axis, one descriptor per row on the y axis.
fig = go.Figure(
    go.Bar(
        x=counts,
        y=descriptors,
        orientation="h",
        marker=dict(color=bar_color),
    )
)

fig.update_layout(
    title="EuroVoc Level 1 Descriptor Frequencies – Train Split",
    xaxis_title="Frequency in train split",
    yaxis_title="",
    template="plotly_dark",  # Use dark mode template
    height=700,   # fixed height in pixels
    width=900,    # fixed width in pixels
    yaxis=dict(automargin=True),
    font=dict(color="#e6e6e6", family="Inter, sans-serif"),
    plot_bgcolor="#222222",
    paper_bgcolor="#222222",
)

fig.show()