# SDRF Extraction — Harmonizing the Data of your Data

Extract SDRF metadata from test papers using the competition baseline prompt and OpenAI, then write `submission.csv`.

---

### Secrets (for OpenAI)
In the right panel: **Add-ons → Secrets**. Add:
- **`OPENAI_API_KEY`** — your OpenAI API key (required for real extraction)
- **`OPENAI_MODEL`** (optional) — e.g. `gpt-4o-mini` or `gpt-4o`

Without `OPENAI_API_KEY`, the notebook runs in **placeholder** mode: every prediction column in `submission.csv` is filled with "Not Applicable".

In [None]:
# Kaggle environment: print the full path of every file found under /kaggle/input
import os

input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

---
## 1. Setup

In [None]:
!pip install -q openai

---
## 2. Config & paths

In [None]:
import json
import os
from pathlib import Path

import pandas as pd
from openai import OpenAI

# Load Kaggle Secrets into environment.
# Every secret that can be read is copied into os.environ under the same name, so
# the rest of the notebook only has to consult os.environ.
# NOTE(review): USE_BATCH and LLM_PROVIDER are loaded here but are not read by any
# cell visible in this notebook — confirm whether they are still needed.
try:
    # presumably kaggle_secrets is only importable inside a Kaggle kernel; when the
    # import fails, the ImportError handler below skips secret loading entirely.
    from kaggle_secrets import UserSecretsClient
    client = UserSecretsClient()
    for name in ("OPENAI_API_KEY", "OPENAI_MODEL", "USE_BATCH", "LLM_PROVIDER"):
        try:
            os.environ[name] = client.get_secret(name)
        except Exception:
            # Best effort: a secret that cannot be fetched or stored is skipped,
            # leaving any existing environment value for that name untouched.
            pass
except ImportError:
    pass

# Paths: competition inputs are read from INPUT_DIR, outputs are written to WORKING_DIR.
INPUT_DIR = Path("/kaggle/input/harmonizing-the-data-of-your-data")
WORKING_DIR = Path("/kaggle/working")

SAMPLE_SUBMISSION = INPUT_DIR / "SampleSubmission.csv"
BASELINE_PROMPT_PATH = INPUT_DIR / "BaselinePrompt.txt"
SUBMISSION_OUT = WORKING_DIR / "submission.csv"

# PubText folder: take the first nested layout that exists on disk, otherwise
# fall back to the un-nested "Test PubText" folder (whether or not it exists).
TEST_PUBTEXT = next(
    (
        candidate
        for candidate in (
            INPUT_DIR / "Test PubText" / "Test PubText",
            INPUT_DIR / "Test_PubText" / "Test_PubText",
        )
        if candidate.exists()
    ),
    INPUT_DIR / "Test PubText",
)

# Manuscript sections sent to the model, and the character cap applied to them.
MANUSCRIPT_KEYS = ("TITLE", "ABSTRACT", "METHODS")
MANUSCRIPT_MAX_CHARS = 120_000
# Template columns that are never predicted.
PREDICTION_COLUMNS_EXCLUDE = ("ID", "PXD", "Raw Data File", "Usage")

# Quick sanity report of which inputs are present.
print(f"Input: {INPUT_DIR}")
print(f"SampleSubmission: {SAMPLE_SUBMISSION.exists()}")
print(f"BaselinePrompt: {BASELINE_PROMPT_PATH.exists()}")
print(f"Test PubText: {TEST_PUBTEXT.exists()}")

---
## 3. Helper functions

In [None]:
def strip_json(text):
    """Return the JSON object embedded in an LLM reply, without its wrapping.

    Lookup order:
      1. If the reply contains ``` fences, the first fence-delimited piece that
         starts with "{" (after dropping an optional leading "json" language
         tag) is returned.
      2. Otherwise the span from the first "{" to the last "}" is returned, so
         replies such as 'Here is the result: {...} Hope this helps' still
         yield parseable JSON.
      3. If no object can be located, the input is returned unchanged and the
         caller's JSON parser decides what to do with it.

    Args:
        text: Raw model reply. Empty or None input is returned as-is.

    Returns:
        The candidate JSON object text, or the original input.
    """
    if not text:
        return text

    if "```" in text:
        for part in text.split("```"):
            part = part.strip()
            # A fenced block is usually introduced as ```json; drop that tag.
            if part.lower().startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                return part

    # No usable fenced block: fall back to the outermost pair of braces.
    start = text.find("{")
    end = text.rfind("}")
    if 0 <= start < end:
        return text[start:end + 1]
    return text


def parse_llm_response(raw_response, raw_files):
    """Parse the model reply into a {raw file: {column: values}} mapping.

    The reply is unwrapped with strip_json() and decoded as JSON. Anything that
    does not decode to a JSON object (invalid JSON, a list, a bare string, a
    missing reply) yields an empty metadata dict for every entry of
    ``raw_files``, so callers can always treat the result as a dict of dicts.

    Args:
        raw_response: Text returned by the model.
        raw_files: Raw file names used to build the empty fallback result.

    Returns:
        dict mapping each raw file key found in the reply to its metadata
        dict. String values are wrapped in a one-element list; other values
        are passed through unchanged. Per-file entries that are not JSON
        objects are dropped.
    """
    try:
        out = json.loads(strip_json(raw_response))
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply that is not text at all (e.g. None).
        out = None
    if not isinstance(out, dict):
        return {raw: {} for raw in raw_files}

    parsed = {}
    for raw, meta in out.items():
        # The model may emit null or a bare string for a file; there is no
        # column mapping to read from such an entry, so leave the file out.
        if not isinstance(meta, dict):
            continue
        parsed[raw] = {k: [v] if isinstance(v, str) else v for k, v in meta.items()}
    return parsed


def extract_openai(manuscript_text, raw_files, prompt_spec, expected_columns, model="gpt-4o-mini"):
    """Ask an OpenAI chat model for per-file SDRF metadata.

    Args:
        manuscript_text: Manuscript text; only the first MANUSCRIPT_MAX_CHARS
            characters are sent. None is treated as empty text.
        raw_files: Raw file names, listed one per line in the user message.
        prompt_spec: Text sent as the system message.
        expected_columns: Column names the model is asked to use as JSON keys;
            the hint is omitted when this is empty.
        model: Chat model name passed to the API.

    Returns:
        The result of parse_llm_response() on the model reply, or an empty
        dict per raw file when OPENAI_API_KEY is not set in the environment.
    """
    key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not key:
        # No credentials: hand back empty metadata for every file.
        return {name: {} for name in raw_files}

    excerpt = (manuscript_text or "")[:MANUSCRIPT_MAX_CHARS]
    sections = [
        f"MANUSCRIPT_TEXT:\n{excerpt}",
        "RAW_FILES:\n" + "\n".join(raw_files),
    ]
    if expected_columns:
        sections.append(
            "Use these exact column names as JSON keys when applicable: "
            + ", ".join(expected_columns)
        )
    user_message = "\n\n".join(sections)

    conversation = [
        {"role": "system", "content": prompt_spec},
        {"role": "user", "content": user_message},
    ]
    completion = OpenAI(api_key=key).chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0,
    )
    reply = (completion.choices[0].message.content or "").strip()
    return parse_llm_response(reply, raw_files)


def get_manuscript(doc):
    """Join the manuscript sections named in MANUSCRIPT_KEYS into one text.

    Sections are taken in MANUSCRIPT_KEYS order, stripped of surrounding
    whitespace and separated by a blank line. Missing, empty and
    whitespace-only sections are skipped so they do not leave stray blank
    separators in the result.

    Args:
        doc: Mapping loaded from a PubText JSON file.

    Returns:
        The concatenated section text ("" when no section has content).
    """
    sections = []
    for key in MANUSCRIPT_KEYS:
        value = doc.get(key)
        if not value:
            continue
        # NOTE(review): sections are assumed to be strings; any other truthy
        # value is stringified instead of failing on .strip() — confirm schema.
        section = (value if isinstance(value, str) else str(value)).strip()
        if section:
            sections.append(section)
    return "\n\n".join(sections)


def sdrf_to_row(raw_file, sdrf_per_file, pred_columns):
    """Build one submission row (column -> cell text) for a raw file.

    For every column in ``pred_columns`` the value extracted for ``raw_file``
    is reduced to a single string:
      * a list/tuple contributes its first element,
      * a scalar (string, number, ...) is used directly and stringified,
      * a missing, None, empty-list or blank-string value becomes
        "Not Applicable".

    Args:
        raw_file: Key to look up in ``sdrf_per_file``.
        sdrf_per_file: Mapping of raw file -> {column: value or list of values}.
        pred_columns: Columns to emit, in order.

    Returns:
        dict with exactly one string entry per column in ``pred_columns``.
    """
    meta = sdrf_per_file.get(raw_file)
    if not isinstance(meta, dict):
        # File absent from the extraction, or the model returned null/text for it.
        meta = {}

    row = {}
    for col in pred_columns:
        value = meta.get(col)
        if isinstance(value, (list, tuple)):
            # Only the first extracted value is written to the submission.
            value = value[0] if value else None
        if value is None or (isinstance(value, str) and not value.strip()):
            row[col] = "Not Applicable"
        else:
            # Numbers and other scalars are legal JSON values; write them as text
            # rather than failing on len() as a sized-container check would.
            row[col] = value if isinstance(value, str) else str(value)
    return row

---
## 4. Load template & baseline prompt

In [None]:
# Baseline prompt: passed to extract_openai() as the system message.
if BASELINE_PROMPT_PATH.exists():
    prompt_spec = BASELINE_PROMPT_PATH.read_text(encoding="utf-8")
else:
    prompt_spec = ""
    print("Warning: BaselinePrompt.txt not found")

# Submission template: the first CSV column becomes the index; every column not
# listed in PREDICTION_COLUMNS_EXCLUDE is a column to predict.
sub = pd.read_csv(SAMPLE_SUBMISSION, index_col=0)
pred_columns = [c for c in sub.columns if c not in PREDICTION_COLUMNS_EXCLUDE]
n_pxds = sub["PXD"].nunique()

# A real boolean: an API key is configured and the prompt has actual content.
use_openai = bool(os.environ.get("OPENAI_API_KEY", "").strip()) and bool(prompt_spec.strip())
# Fall back to the default model when OPENAI_MODEL is unset *or* set to an
# empty/blank string, instead of sending an empty model name to the API.
model = os.environ.get("OPENAI_MODEL", "").strip() or "gpt-4o-mini"

print(f"Mode: {'OpenAI (' + model + ')' if use_openai else 'Placeholder'}")
print(f"Template: {len(sub)} rows, {n_pxds} PXDs")

---
## 5. Extract per PXD & build submission

In [None]:
# Start from the template; only the prediction columns are overwritten below.
out_df = sub.copy()

for i, (pxd, group) in enumerate(sub.groupby("PXD"), start=1):
    print(f"[{i}/{n_pxds}] {pxd} ...", end=" ", flush=True)

    # Default to the raw files the template lists for this PXD.
    raw_files = group["Raw Data File"].unique().tolist()
    manuscript_text = ""
    notes = []  # anything worth reporting for this PXD; empty means "ok"

    path = TEST_PUBTEXT / f"{pxd}_PubText.json"
    if path.exists():
        with open(path, "r", encoding="utf-8") as f:
            doc = json.load(f)
        manuscript_text = get_manuscript(doc)
        # Prefer the file list shipped with the PubText document when present.
        if "Raw Data Files" in doc:
            raw_files = doc["Raw Data Files"]
    else:
        notes.append("no PubText")

    sdrf_per_file = {}
    if use_openai:
        try:
            sdrf_per_file = extract_openai(manuscript_text, raw_files, prompt_spec, pred_columns, model)
        except Exception as exc:
            # Top-level boundary of a long batch run: report the failure and
            # keep going so one bad request does not prevent submission.csv
            # from being written. The affected rows fall back to placeholders.
            notes.append(f"extraction failed ({type(exc).__name__}: {exc}); using placeholders")
    else:
        # Honour the mode computed from the API key and baseline prompt rather
        # than calling the API without a usable prompt.
        notes.append("placeholder")

    for idx, r in group.iterrows():
        row_vals = sdrf_to_row(r["Raw Data File"], sdrf_per_file, pred_columns)
        for col in pred_columns:
            out_df.at[idx, col] = row_vals[col]
    print("; ".join(notes) or "ok")

out_df.to_csv(SUBMISSION_OUT, index=True)
print(f"\nWrote {SUBMISSION_OUT} ({len(out_df)} rows)")

---
## 6. Submit to competition

After **Save & Run All** completes, use **Submit** (top right) to send this notebook's output to the leaderboard. The file `submission.csv` will be in the output.