### User input & data generation

In [25]:
from openai import OpenAI
import numpy as np, pandas as pd, re, json, os
from pathlib import Path
import hashlib
import subprocess, tempfile, shutil, pathlib
from dotenv import load_dotenv
import re

load_dotenv() 
experiment = {
    "context": (
        # "I am a graduate student writing an initial report on phosphate adsorption in saline soils "
        # "(Songnen Plain). I ran batch adsorption tests by varying the initial phosphate concentration (C0), "
        # "solution volume (V), sorbent mass (m), pH, temperature (T), and background salinity (NaCl). "
        # "At equilibrium I measured the liquid-phase concentration (Ce) and computed adsorption capacity q."
        "Record a concise observations-style report about batch adsorption of phosphate in saline soils "
        "(Songnen Plain). The study varies initial concentration, volume, sorbent mass, pH, temperature, "
        "and NaCl salinity, measures the equilibrium liquid concentration, and summarizes observed adsorption capacity "
        "and removal. Write as if reporting what was done and what was seen—no derivations."
    ),
    "motivation": (
        # "Our near-term goal is to derive and validate the phosphate adsorption capacity equation "
        # "q = ((C0 - Ce) * V) / m and describe how salinity, pH, and temperature influence removal."
        "Provide a clear record of the experimental conditions and numerical outcomes (counts, ranges, means/SDs, "
        "optional 95% CIs). Emphasize trends seen in the data (effects of salinity, pH, temperature) and note any "
        "limitations. Keep it practical, like a lab note intended for a follow-up meeting."
    ),

    "NUM_DATASETS": 1,
    "N_OBS": 12,

    # Inputs: choose ranges typical of batch adsorption tests in soils
    "inputs": {
        "C0_(mg/L)"     : ("mg/L", "uniform", [5.0, 150.0]),   # initial phosphate concentration
        "V_(mL)"        : ("mL",   "uniform", [24.8, 25.2]),   # ~25 mL batch volume
        "m_(g)"         : ("g",    "uniform", [1.20, 1.30]),   # ~1.25 g sorbent mass
        "pH"            : ("",     "uniform", [4.0, 9.0]),     # acidic to alkaline
        "T_(K)"         : ("K",    "uniform", [288.0, 308.0]), # 15–35 °C
        "salinity_(M)"  : ("M",    "uniform", [0.01, 0.20])    # background ionic strength proxy
    },

    # Outputs: Ce is generated via a bounded removal fraction f(pH, T, salinity), then q_e from capacity eqn
    "outputs": {
        # Equilibrium concentration in solution (Ce)
        # removal_frac = clip( 0.25 + 0.40*sal_term + 0.15*pH_term + 0.10*temp_term, 0.05, 0.90 )
        # where sal_term = (salinity-0.01)/0.19, pH_term = (clip(pH,4,9)-4)/5, temp_term = (T-298)/10
        "Ce_(mg/L)": (
            "mg/L",
            "C0_(mg/L) * (1.0 - np.minimum(0.90, np.maximum(0.05, "
            "0.25 + 0.40*((salinity_(M)-0.01)/0.19) + "
            "0.15*((np.minimum(np.maximum(pH,4.0),9.0)-4.0)/5.0) + "
            "0.10*((T_(K)-298.0)/10.0) )))",
            ("gaussian", [0.0, 0.25])  # light measurement noise on Ce (mg/L)
        ),

        # Equilibrium adsorption capacity q_e = ((C0 - Ce) * V) / m
        # Convert mL -> L by dividing V by 1000, units end up in mg/g
        "q_e_(mg/g)": (
            "mg/g",
            "((C0_(mg/L) - Ce_(mg/L)) * (V_(mL) / 1000.0)) / m_(g)",
            ("gaussian", [0.0, 0.02])  # tiny noise on computed capacity
        ),

        # Percent removal = 100 * (C0 - Ce)/C0
        "Removal_(%)": (
            "%",
            "100.0 * (C0_(mg/L) - Ce_(mg/L)) / C0_(mg/L)",
            ("gaussian", [0.0, 0.2])
        )
    },

    "paper_words": 1000,
    "wrap_words": 20,
    "model_id": "gpt-4.1",
    "tok_paper": 3500,
    "tok_titleabs": 500,
    "out_dir": "Output/Phosphate_Songnen_run"
}

# experiment["context"] = (
#     "I am a graduate student writing an initial report on phosphate adsorption in saline soils "
#     "(Songnen Plain). I ran batch adsorption tests by varying the initial phosphate concentration (C0), "
#     "solution volume (V), sorbent mass (m), pH, temperature (T), and background salinity (NaCl). "
#     "At equilibrium I measured the liquid-phase concentration (Ce) and computed adsorption capacity q.\n\n"
#     "CONSTRAINTS FOR THE MODEL (do not include this paragraph in the paper): "
#     "Write a LaTeX paper with sections (Introduction, Methods, Results, Discussion). "
#     "Never mention dataset generation, scripts, prompts, ground-truth formulas, noise, code, or language models. "
#     "Use math for symbols (e.g., $q_e$, $C_0$, $C_e$). Write percent as '\\%' (do NOT wrap percent in \\si{...}). "
#     "Use siunitx for units like \\si{mg\\per L}, \\si{mg\\per g}, \\si{mL}, \\si{K}. "
#     "Refer to the adsorption capacity equation as $q_e=\\frac{(C_0-C_e)V}{m}$ and state that V is converted from mL to L; "
#     "report $q_e$ in \\si{mg\\per g}. Use the actual sample size N="
#     f"{experiment['N_OBS']}"
#     " when referring to counts. Do not list variable-generation details or any mention of 'inputs x_i' or 'responses y_k'. "
#     "Avoid boilerplate like 'as an AI'. Use single backslashes for LaTeX macros (e.g., \\textbf, not \\\\textbf)."
# )

# experiment["motivation"] = (
#     "Our near-term goal is to derive and validate the phosphate adsorption capacity equation "
#     "$q_e=\\frac{(C_0-C_e)V}{m}$ across the tested ranges and describe how salinity, pH, and temperature influence removal. "
#     "Report concise summary statistics (means/SDs, optionally 95\\% CIs) and give a brief quantitative interpretation. "
#     "If appropriate, you may mention isotherm models (Langmuir/Freundlich) qualitatively, but do not expose any hidden data-generation formulas."
# )
experiment["constraints"] = (
    "[INSTRUCTIONS—DO NOT INCLUDE THIS PARAGRAPH IN THE PAPER]\n"
    "Style: observational log. Focus on what was done and what was observed.\n"
    "Do NOT print any equations or generating formulas anywhere (no '=' in math). Prefer plain language over symbols; "
    "if a symbol is unavoidable, keep it inline (e.g., C0, Ce) but never show a formula.\n"
    "Do NOT include an 'Inputs'/'Outputs' bullet list or any bullet list of variable definitions.\n"
    "Do NOT mention scripts, prompts, templates, ground-truth, noise, or code tokens (np., Uniform(…), clip(…), min/max).\n"
    f"When giving counts, use the actual sample size N={experiment['N_OBS']}. Report units with siunitx "
    "(\\si{{mg\\per L}}, \\si{{mg\\per g}}, \\si{{mL}}, \\si{{K}}). Percent must be written as '\\%' or "
    "\\SI{{..}}{{\\percent}} (never '\\\\%' or '\\si{{%}}').\n"
    "Sections required: Introduction, Methods, Results, Discussion. Keep each section compact and grounded in the data.\n"
    "Write section headings exactly as \section{Introduction}, \section{Methods}, \section{Results}, \section{Discussion}.\n"
    "Do not use \emph, \textbf, or extra braces inside \section or \subsection"
    
)

# experiment["motivation"] = (
#     "We aim to quantify phosphate retention in saline soils from the Songnen Plain and validate "
#     "the mass-balance capacity equation across realistic conditions (C0, V, m, pH, T, NaCl). "
#     "We summarize how salinity, pH, and temperature modulate removal efficiency and qe, providing "
#     "a baseline template for later isotherm/kinetics fits and treatment comparisons."
# )

client = OpenAI()
rng    = np.random.default_rng(2025)
OUT    = Path(experiment["out_dir"]); OUT.mkdir(exist_ok=True, parents=True)

dists  = {
    "uniform": lambda a,b,n: rng.uniform(a,b,n),
    "normal" : lambda m,s,n: rng.normal(m,s,n),
    "int"    : lambda a,b,n: rng.integers(a,b+1,n),
}

safe_pat = re.compile(r'[^0-9a-zA-Z_]')
def safe(col): return safe_pat.sub('_', col)

BAN_PATTERNS = re.compile(
    r"(constraints of this platform|not feasible|token limit|as an ai|i cannot|i can't|"
    r"model cannot|chatgpt|this interface|outline only)",
    re.IGNORECASE
)
def strip_meta(text: str) -> str:
    return "\n".join(ln for ln in text.splitlines() if not BAN_PATTERNS.search(ln)).strip()

def sample_inputs(n):
    return pd.DataFrame({
        col: dists[dist](*params, n)
        for col, (_, dist, params) in experiment["inputs"].items()
    })

def add_outputs(df):
    for name, (_, formula, noise) in experiment["outputs"].items():
        expr = formula.replace('^','**')
        env  = {"np": np}
        for orig in df.columns:
            token     = safe(orig)
            env[token] = df[orig]
            expr      = expr.replace(orig, token)
        y = eval(expr, env)
        if noise[0] == "gaussian":
            mu, sd = noise[1]
            y += rng.normal(mu, sd, len(df))
        df[name] = y
    return df.round(6)

def stats_json(df):
    return json.dumps(
        df.describe().T[["mean","std","min","max"]].round(4).to_dict(), indent=2
    )

  "Write section headings exactly as \section{Introduction}, \section{Methods}, \section{Results}, \section{Discussion}.\n"
  "Do not use \emph, \textbf, or extra braces inside \section or \subsection"


### LaTeX formatting helpers

In [26]:
def markdown_emphasis_to_tex(text: str) -> str:
    text = re.sub(r"\*\*(.+?)\*\*", r"\\textbf{\1}", text, flags=re.S)
    text = re.sub(r"(?<!\\)\*(?!\s)(.+?)(?<!\s)\*", r"\\emph{\1}", text, flags=re.S)
    return text

def latex_escape(s: str) -> str:
    if s is None: return ""
    repl = {'\\': r'\textbackslash{}','{': r'\{','}': r'\}','\$': r'\$','&': r'\&',
            '%': r'\%','#': r'\#','_': r'\_','^': r'\^{}','~': r'\~{}'}
    return "".join(repl.get(ch, ch) for ch in s)

_unit_caret_pat   = re.compile(r'([A-Za-z])(\d+)')
_unit_negexp_pat  = re.compile(r'([A-Za-z])-(\d+)')
def _unit_to_si(u: str) -> str:
    u = u.strip()
    u = _unit_caret_pat.sub(r'\1^\2', u)
    u = _unit_negexp_pat.sub(r'\1^{-\2}', u)
    u = u.replace("/", r"\per ")
    return r"\si{" + u + "}"

def _format_header_with_units(colname: str) -> str:
    pretty = colname.replace("_", " ")
    m = re.match(r"^(.*)\((.*)\)\s*$", pretty)
    if m:
        return f"{latex_escape(m.group(1).strip())} ({_unit_to_si(m.group(2).strip())})"
    return latex_escape(pretty)

def df_to_latex_table(df: pd.DataFrame,
                      caption="Generated Data",
                      label="tab:data",
                      floatfmt=6) -> str:
    df2 = df.copy()
    for c in df2.select_dtypes(include="float"):
        df2[c] = df2[c].round(floatfmt)
    cols_fmt = [_format_header_with_units(c) for c in df2.columns]
    col_spec = "c" * len(cols_fmt)
    header = " & ".join(cols_fmt) + r" \\ \midrule"
    rows = "\n".join(" & ".join(map(str, r)) + r" \\" for r in df2.values)
    return (
        "\\begin{table}[htbp]\n"
        "\\centering\n"
        f"\\caption{{{caption}}}\n"
        f"\\label{{{label}}}\n"
        "\\small\n"
        f"\\begin{{tabular}}{{{col_spec}}}\n"
        "\\toprule\n"
        f"{header}\n"
        f"{rows}\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
        "\\end{table}\n"
    )

def _dist_to_tex(dist: str, params) -> str:
    d = dist.lower()
    if d == "uniform": a,b = params;  return rf"\mathrm{{Uniform}}({a}, {b})"
    if d == "normal":  mu,sd = params; return rf"\mathcal{{N}}({mu},\,{sd})"
    if d == "int":     a,b = params;  return rf"\mathrm{{DiscreteUniform}}({a}, {b})"
    return latex_escape(f"{dist}({params})")

def _formula_to_latex(formula: str, input_names: list, output_names: list):
    in_syms  = {name: rf"x_{i}" for i, name in enumerate(input_names, 1)}
    out_syms = {name: rf"y_{j}" for j, name in enumerate(output_names, 1)}
    f = formula.replace("np.pi", r"\pi").replace("**", "^").replace("*", r" \cdot ")
    for name, sym in in_syms.items():  f = f.replace(name, sym)
    for name, sym in out_syms.items(): f = f.replace(name, sym)
    return f, in_syms, out_syms

def _topic_phrase_from_context(ctx: str) -> str:
    if not ctx:
        return "the study topic"
    s = ctx.strip()
    s = re.split(r'[.!?\n]', s, 1)[0]
    s = re.sub(r",?\s*so\s+(?:I|we)\b.*$", "", s, flags=re.I)
    s = re.sub(r"^\s*(?:I|we)\s+(?:am|are)\s+", "", s, flags=re.I)
    s = re.sub(r"^\s*(?:this\s+(?:paper|study)\s+)?(?:aims?|seeks?|tries?)\s+to\s+", "", s, flags=re.I)
    s = re.sub(r"^estimating\b",   "estimation of",  s, flags=re.I)
    s = re.sub(r"^predicting\b",   "prediction of",  s, flags=re.I)
    s = re.sub(r"^measuring\b",    "measurement of", s, flags=re.I)
    s = re.sub(r"^analy[sz]ing\b", "analysis of",    s, flags=re.I)
    s = re.sub(r"^investigating\b","investigation of", s, flags=re.I)
    s = s.strip(" ,")
    return latex_escape(s or "the study topic")

def inject_table_into_results_tex(paper: str, table_tex: str) -> str:
    mlabel = re.search(r"\\label\{([^}]+)\}", table_tex)
    label  = mlabel.group(1) if mlabel else "tab:data"
    between = re.search(r"\\midrule(.*?)\\bottomrule", table_tex, re.S)
    row_count = 0
    if between:
        row_count = len([ln for ln in between.group(1).splitlines() if "\\" in ln])
    mhdr = re.search(r"\\toprule\s*(.*?)\\\\\s*\\midrule", table_tex, re.S)
    header_cols = []
    if mhdr:
        header_cols = [re.sub(r"\s+", " ", h.strip()) for h in mhdr.group(1).split("&")]
    topic = _topic_phrase_from_context(experiment.get("context",""))
    PREVIEW_K = 6
    if len(header_cols) <= PREVIEW_K:
        cols_preview, tail = ", ".join(header_cols), ""
    else:
        cols_preview = ", ".join(header_cols[:PREVIEW_K])
        tail = f" (+{len(header_cols)-PREVIEW_K} more)"
    templates = [
        r"The dataset summarized in Table~\ref{%LABEL%} (N=%N%) addresses %TOPIC% and contains the columns: %COLS%%TAIL%.",
        r"Table~\ref{%LABEL%} (N=%N%) presents data for %TOPIC% with columns: %COLS%%TAIL%.",
        r"For %TOPIC%, Table~\ref{%LABEL%} reports N=%N% observations across the columns: %COLS%%TAIL%.",
    ]
    t_idx = (len(header_cols) + row_count + sum(map(ord, label))) % len(templates)
    lead_sentence = (templates[t_idx]
                     .replace("%LABEL%", label)
                     .replace("%N%", str(row_count))
                     .replace("%TOPIC%", topic)
                     .replace("%COLS%", cols_preview)
                     .replace("%TAIL%", tail))
    input_names  = list(experiment["inputs"].keys())
    output_names = list(experiment["outputs"].keys())
    items = []
    for i, (name, (unit, dist, params)) in enumerate(experiment["inputs"].items(), 1):
        items.append(rf"\item $x_{i}$: {_format_header_with_units(name)} sampled as ${_dist_to_tex(dist, params)}$")
    for j, (name, (unit, formula, noise)) in enumerate(experiment["outputs"].items(), 1):
        noise_tex = ""
        if isinstance(noise, tuple) and noise and str(noise[0]).lower() == "gaussian":
            mu, sd = noise[1]; noise_tex = rf"\;+\; \mathcal{{N}}({mu},\,{sd})"
        f_ltx, _, _ = _formula_to_latex(formula, input_names, output_names)
        items.append(rf"\item $y_{j}$: {_format_header_with_units(name)} via $y_{j} = {f_ltx}{noise_tex}$")
    items_block = "\\begin{itemize}\n" + "\n".join(items) + "\n\\end{itemize}\n"
    approach = (
        "Inputs $x_1,\\ldots,x_m$ are drawn from the specified distributions, and "
        "responses $y_1,\\ldots,y_k$ are computed from the ground‑truth equations "
        "with optional noise."
    )
    lead = (
        "\n\n"
        + lead_sentence + "\n "
        + approach + "\n\n"
        + items_block + "\n"
    )
    pattern = re.compile(r'^[ \t]*#{1,6}\s*Results\b.*$', re.IGNORECASE | re.MULTILINE)
    mr = pattern.search(paper)
    block = lead + table_tex + "\n"
    if mr:
        insert_at = mr.end()
        return paper[:insert_at] + block + paper[insert_at:]
    else:
        return paper + "\n\n" + block

def md_to_tex(text: str) -> str:
    text = markdown_emphasis_to_tex(text)
    out = []
    for ln in text.splitlines():
        if ln.startswith("##### "): out.append(r"\paragraph{"      + latex_escape(ln[6:]) + "}")
        elif ln.startswith("#### "): out.append(r"\subsubsection{" + latex_escape(ln[5:]) + "}")
        elif ln.startswith("### "):  out.append(r"\subsubsection{" + latex_escape(ln[4:]) + "}")
        elif ln.startswith("## "):   out.append(r"\subsection{"    + latex_escape(ln[3:]) + "}")
        elif ln.startswith("# "):    out.append(r"\section{"       + latex_escape(ln[2:]) + "}")
        else:                        out.append(ln)
    return "\n".join(out)

def clean_title_for_latex(s: str) -> str:
    if not s: return ""
    s = re.sub(r"\$\$.*?\$\$|\$[^$]*\$|\\\[.*?\\\]|\\\((?:.|\n)*?\\\)", "", s, flags=re.S)
    for cmd in ["textbf","emph","textit","textsc","texttt","textsf","mathrm"]:
        s = re.sub(rf"\\{cmd}\{{(.*?)\}}", r"\1", s, flags=re.S)
    s = re.sub(r"\\[a-zA-Z]+\*?(?:\[[^\]]*\])?\{(.*?)\}", r"\1", s, flags=re.S)
    s = re.sub(r"\\[a-zA-Z]+\*?(?:\[[^\]]*\])?", "", s)
    s = re.sub(r"\*\*(.+?)\*\*", r"\1", s, flags=re.S)
    s = re.sub(r"(?<!\\)\*(?!\s)(.+?)(?<!\s)\*", r"\1", s, flags=re.S)
    s = re.sub(r"[ \t]+", " ", s).strip()
    return s

def build_tex_doc(title: str, abstract: str, body_tex: str) -> str:
    safe_title = latex_escape(clean_title_for_latex(title))
    return (
        "\\documentclass[11pt]{article}\n"
        "\\usepackage[a4paper,margin=1in]{geometry}\n"
        "\\usepackage{booktabs}\n"
        "\\usepackage{amsmath, amssymb}\n"
        "\\usepackage{siunitx}\n"
        "\\usepackage{hyperref}\n"
        "\\usepackage{caption}\n"
        "\\captionsetup[table]{position=top}\n\n"
        "\\title{" + safe_title + "}\n"
        "\\author{}\n"
        "\\date{}\n\n"
        "\\begin{document}\n"
        "\\maketitle\n\n"
        "\\begin{abstract}\n"
        + markdown_emphasis_to_tex(strip_meta(abstract)) + "\n"
        "\\end{abstract}\n\n"
        + body_tex + "\n\n"
        "\\end{document}\n"
    )

def _json_fix_backslashes(s: str) -> str:
    return re.sub(r'\\(?![\\/"bfnrtu])', r'\\\\', s)

def strip_latex_math(text: str) -> str:
    patterns = [
        r"\$\$.*?\$\$",
        r"\$[^$]*\$",
        r"\\\[.*?\\\]",
        r"\\\((?:.|\n)*?\\\)",
        r"\\begin\{equation\*?\}.*?\\end\{equation\*?\}",
        r"\\begin\{align\*?\}.*?\\end\{align\*?\}",
        r"\\begin\{gather\*?\}.*?\\end\{gather\*?\}",
    ]
    for pat in patterns:
        text = re.sub(pat, "", text, flags=re.S)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text

def fix_malformed_environments(tex: str) -> str:
    def _clean_env(match, is_begin=True):
        env = match.group(1)
        env = re.split(r'\\', env, 1)[0]
        env = re.sub(r'[^A-Za-z*]', '', env)
        tag = "begin" if is_begin else "end"
        return fr"\{tag}{{{env}}}"
    tex = re.sub(r"\\begin\{([^}]*)\}",  lambda m: _clean_env(m, True),  tex)
    tex = re.sub(r"\\end\{([^}]*)\}",    lambda m: _clean_env(m, False), tex)
    stack, out = [], []
    token_pat = re.compile(r"\\(begin|end)\{([A-Za-z*]+)\}")
    idx = 0
    for m in token_pat.finditer(tex):
        out.append(tex[idx:m.end()])
        idx = m.end()
        typ, env = m.group(1), m.group(2)
        if typ == "begin":
            stack.append(env)
        else:
            if stack and stack[-1] == env:
                stack.pop()
            else:
                out[-1] = out[-1].replace(m.group(0), "")
    out.append(tex[idx:])
    for env in reversed(stack):
        out.append(f"\n\\end{{{env}}}\n")
    return "".join(out)

  repl = {'\\': r'\textbackslash{}','{': r'\{','}': r'\}','\$': r'\$','&': r'\&',


### Latex complier for the verification of the generated Latex code

In [27]:
def _tex_engine_cmd():
    # tries latexmk → pdflatex → lualatex
    for cmd in (['latexmk', '-pdf', '-interaction=nonstopmode', '-halt-on-error'],
                ['pdflatex', '-interaction=nonstopmode', '-halt-on-error'],
                ['lualatex', '-interaction=nonstopmode', '-halt-on-error']):
        if shutil.which(cmd[0]):
            return cmd
    return None

def validate_tex_source(tex_source: str, base_name: str, timeout_sec: int = 120):
    cmd_base = _tex_engine_cmd()
    if not cmd_base:
        return None, "TeX engine not found"

    with tempfile.TemporaryDirectory() as tmpdir:
        tex_path = pathlib.Path(tmpdir) / f"{base_name}.tex"
        tex_path.write_text(tex_source, encoding="utf-8")

        proc = subprocess.run(
            cmd_base + [tex_path.name],
            cwd=tmpdir,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            timeout=timeout_sec)
        ok  = (proc.returncode == 0)
        log = proc.stdout.decode("utf-8", errors="ignore")
        return ok, log


### LLM calls & main loop

In [28]:
def llm_paper(stats_json: str) -> str:
    N = experiment["N_OBS"]
    # prompt = (
    #     f"{experiment['context']}\n\n"
    #     f"Motivation: {experiment['motivation']}\n\n"
    #     f"{experiment.get('constraints','')}\n\n"
    #     f"Dataset summary statistics (JSON):\n{stats_json}\n\n"
    #     f"Write an ~{experiment['paper_words']}-word paper with sections:\n"
    #     "# Introduction\n# Methods\n# Results\n# Discussion\n\n"
    #     "• Use LaTeX math ($...$) and LaTeX emphasis (\\textbf{}, \\emph{}). "
    #     "Do NOT use Markdown.\n"
    #     "• Base Results on numeric trends; cite ≥3 quantitative findings.\n"
    #     "• Do NOT mention dataset generation, scripts, prompts, ground-truth formulas, or noise.\n"
    # )
    prompt = (
        f"{experiment['context']}\n\n"
        f"Motivation: {experiment['motivation']}\n\n"
        f"{experiment.get('constraints','')}\n\n"
        f"Dataset summary statistics (JSON):\n{stats_json}\n\n"
        f"Write an ~{experiment['paper_words']}-word LaTeX paper with sections:\n"
        "# Introduction\n# Methods\n# Results\n# Discussion\n\n"
        "• Use LaTeX emphasis (\\textbf{}, \\emph{}). Do NOT use Markdown.\n"
        "• Report quantitative observations (ranges, means/SDs, optional 95% CIs) and the tested condition ranges.\n"
        "• Do NOT introduce or display any equations. Do NOT include an Inputs/Outputs bullet list.\n"
        "• Do NOT mention dataset generation, scripts, prompts, ground-truth formulas, or noise.\n"
    )
    res = client.chat.completions.create(
        model=experiment["model_id"],
        messages=[{"role": "user", "content": prompt}],
        max_tokens=experiment["tok_paper"],
        temperature=0.4,
    )
    return strip_meta(res.choices[0].message.content.strip())

def llm_title_abs(paper):
    prompt = (
        "Provide JSON ONLY with keys 'title' and 'abstract'. "
        "Use LaTeX emphasis (\\textbf{}, \\emph{}) not Markdown. "
        "In JSON string values, ESCAPE EVERY BACKSLASH (e.g., write \\\\emph{}, not \\emph{}). "
        "Do NOT mention platform constraints/tokens. "
        "The ABSTRACT MUST BE PROSE ONLY with NO MATH: do not include $...$, $$...$$, \\(\\), \\[\\], "
        "or any equation environments.\n\n"
        "Paper:\n" + paper)
    res  = client.chat.completions.create(
        model=experiment["model_id"],
        messages=[{"role":"user","content":prompt}],
        max_tokens=experiment["tok_titleabs"], temperature=0.4)
    m = re.search(r"\{.*\}", res.choices[0].message.content, re.S)
    if not m:
        raise ValueError("Model did not return JSON for title/abstract.")
    block = m.group(0)
    fixed = _json_fix_backslashes(block)
    try:
        meta = json.loads(fixed)
    except json.JSONDecodeError:
        meta = json.loads(block.replace("\\", "\\\\"))
    abstract = meta.get("abstract","")
    abstract = "\n".join(
        ln for ln in abstract.splitlines()
        if not ln.lower().lstrip().startswith("title:")
    ).strip()
    abstract = strip_meta(abstract)
    abstract = strip_latex_math(abstract)
    abstract = markdown_emphasis_to_tex(abstract)
    meta["abstract"] = abstract
    candidates = meta.get("titles") if isinstance(meta.get("titles"), list) else [meta.get("title","")]
    chosen = choose_diverse_title(candidates, key=paper)
    meta["title"] = clean_title_for_latex(chosen)
    return meta

def _normalize_ws(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

USED_TITLES = set()

def llm_alternate_titles(paper, banned):
    prompt = (
        "Return JSON ONLY with key 'titles' as an array of 8 distinct candidate titles "
        "(8–14 words each). Titles must be derived from and faithful to the paper content "
        "provided below. Do not introduce topics that are not inferable from this content. "
        "Use plain text (no LaTeX/math). "
        "Avoid any title in this banned list:\n"
        + json.dumps(sorted(list(banned))) +
        "\n\nPaper:\n" + paper
    )
    res = client.chat.completions.create(
        model=experiment['model_id'],
        messages=[{"role": "user", "content": prompt}],
        max_tokens=experiment["tok_titleabs"],
        temperature=0.8, presence_penalty=0.6, frequency_penalty=0.3
    )
    m = re.search(r"\{.*\}", res.choices[0].message.content, re.S)
    if not m:
        return []
    block = _json_fix_backslashes(m.group(0))
    try:
        data = json.loads(block)
    except json.JSONDecodeError:
        data = json.loads(m.group(0).replace("\\", "\\\\"))
    raw = data.get("titles", [])
    seen, out = set(), []
    def _tokens(s: str):
        return re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", (s or "").lower())
    stop = {
        "the","and","for","with","from","into","that","this","these","those",
        "study","paper","result","results","method","methods","introduction",
        "discussion","conclusion","data","dataset","analysis","approach",
        "using","use","based","model","models","over","under","between","within",
        "across","about","without","against","toward","towards","through"
    }
    vocab = {t for t in _tokens(paper) if t not in stop}
    for t in raw:
        tt = clean_title_for_latex(_normalize_ws(t))
        if not tt:
            continue
        low = tt.lower()
        if low in seen or tt in banned:
            continue
        tt_tokens = [w for w in _tokens(tt) if w not in stop]
        if not tt_tokens:
            continue
        overlap = sum(1 for w in tt_tokens if w in vocab)
        if overlap >= max(3, len(tt_tokens) // 3):
            seen.add(low)
            out.append(tt)
    return out

def choose_diverse_title(candidates, key: str) -> str:
    uniq, seen = [], set()
    for t in candidates or []:
        tt = _normalize_ws(t)
        if not tt: continue
        low = tt.lower()
        if low in seen: continue
        seen.add(low); uniq.append(tt)
    if not uniq:
        return "Untitled"
    def score(t):
        n = len(t.split())
        return -abs(n - 11)
    uniq.sort(key=score, reverse=True)
    h = int(hashlib.sha256(key.encode("utf-8")).hexdigest(), 16)
    return uniq[h % len(uniq)]

### Main loop

In [29]:
def sanitize_tex(tex: str) -> str:
    # Fix siunitx percent
    tex = tex.replace(r'\si{%}', r'\si{\percent}')
    # Fix doubled backslashes from LLM: \\textbf, \\emph, \\si, \\num, \\qty
    tex = re.sub(r'\\{2}(textbf|emph|si|SI|num|qty)\b', r'\\\1', tex)
    # Nicer header label: "q e" -> "$q_e$" (surrounded by non-word boundaries)
    tex = re.sub(r'(\W)q\s+e(\W)', r'\1$q_e$\2', tex)
    # Escape bare % outside \si{...}
    tex = re.sub(r'(?<!\\)%', r'\\%', tex)
    tex = re.sub(
        r'The dataset summarized in Table~\\ref\{[^}]+\}.*?\\begin\{table\}',
        r'\\begin{table}',
        tex,
        flags=re.S
    )
    # 2) Kill any line that contains np.*** or code-like noise in prose
    tex = re.sub(r'^.*\bnp\.(minimum|maximum|clip)\b.*$\n?', '', tex, flags=re.M)
    # (optional) also drop Uniform(…), \mathcal{N}(…) recipe lines if they sneak in
    tex = re.sub(r'^.*\bUniform\(|\\mathcal\{N\}\s*\(.*$\n?', '', tex, flags=re.M)

    # 3) Remove the stray outline/toc block some runs append at the very end
    tex = re.sub(r'\n\s*Introduction\s+Methods.*?Conclusion\s*$', '', tex, flags=re.S)
    # collapse erroneous linebreak-before-percent
    tex = tex.replace('\\\\%', r'\%')
    return tex

def normalize_sections(tex: str) -> str:
    # Fix the common broken patterns
    tex = tex.replace(r'\section\emph{{', r'\section{')
    tex = tex.replace(r'\section}{',    r'\section{')
    tex = tex.replace(r'\section\emph{', r'\section{')
    tex = tex.replace(r'\section{{',     r'\section{')

    # Canonicalize any \section line that contains one of the standard titles
    titles = ["Introduction", "Methods", "Results", "Discussion"]
    for t in titles:
        tex = re.sub(
            rf'(?m)^\s*\\section[^\n]*{re.escape(t)}[^\n]*$',
            rf'\\section{{{t}}}',
            tex
        )
    return tex

In [30]:
MAX_RETRIES = 2   # extra attempts after the first failure

for i in range(1, experiment["NUM_DATASETS"] + 1):
    print(f"▶ Dataset {i}/{experiment['NUM_DATASETS']}")

    df        = add_outputs(sample_inputs(experiment["N_OBS"]))
    stats     = stats_json(df)

    attempt   = 0
    while True:
        attempt += 1
        paper_md = llm_paper(stats)
        label    = f"tab:data{i:02d}"
        table_tex = df_to_latex_table(df, caption="Generated dataset", label=label)
        paper_md_with_table = inject_table_into_results_tex(paper_md, table_tex)
        body_tex = md_to_tex(paper_md_with_table)

        meta      = llm_title_abs(paper_md_with_table)
        alts      = llm_alternate_titles(paper_md_with_table + "\n\nABSTRACT:\n" + meta.get("abstract", ""), USED_TITLES)
        candidates = [t for t in alts if t and t not in USED_TITLES] or \
                     (meta.get("titles") if isinstance(meta.get("titles"), list) else [meta.get("title", "")])
        title      = choose_diverse_title(candidates, key=paper_md_with_table)
        if not title or title in USED_TITLES:
            base  = clean_title_for_latex(meta.get("title", "")) or "Untitled"
            title = base if base not in USED_TITLES else f"{base} — Study {i:02d}"
        USED_TITLES.add(title)
        meta["title"] = title

        tex_doc = build_tex_doc(meta["title"], meta["abstract"], body_tex)
        tex_doc = sanitize_tex(tex_doc) 
        tex_doc = normalize_sections(tex_doc) 
        ok, log = validate_tex_source(tex_doc, f"paper{i:02d}")
        if ok is None:
            (OUT / f"paper{i:02d}.tex").write_text(tex_doc, encoding="utf-8")
            print("   ⚠ validation skipped (no TeX engine); saved", f"paper{i:02d}.tex")
            break

        if ok:
            (OUT / f"paper{i:02d}.tex").write_text(tex_doc, encoding="utf-8")
            print("   ✔ compiled & saved", f"paper{i:02d}.tex")
            break

        print(f"   ✖ compile failed (attempt {attempt}/{MAX_RETRIES + 1})")
        if attempt > MAX_RETRIES:
            (OUT / f"paper{i:02d}.tex").write_text(tex_doc, encoding="utf-8")
            log_dir = OUT / "logs"; log_dir.mkdir(exist_ok=True)
            (log_dir / f"paper{i:02d}_compile.log").write_text(log, encoding="utf-8")
            print("     saved .tex and compile log to", f"logs/paper{i:02d}_compile.log")
            break

print("🎉  Done. Files in:", OUT.resolve())


▶ Dataset 1/1


  s = re.split(r'[.!?\n]', s, 1)[0]


   ⚠ validation skipped (no TeX engine); saved paper01.tex
🎉  Done. Files in: /home/hari/aros/verification/Output/Phosphate_Songnen_run
