In [2]:
import os
# Remove proxy settings (both lower- and upper-case spellings) from this
# process's environment. Variables that are not set are ignored.
_PROXY_ENV_VARS = ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY")
for k in _PROXY_ENV_VARS:
    os.environ.pop(k, None)

In [62]:
#%pip install --upgrade openai
#%pip install --upgrade python-dotenv
#%pip install httpx

In [63]:
# Step 1: Extract the problem descriptions from the Text2Zinc dataset ("skadio/text2zinc", loaded with datasets.load_dataset)

%pip install datasets -q

from datasets import load_dataset
import json
import os
import random

# Step 1A: Load dataset

print("üîÑ Loading Text2Zinc dataset...")
dataset = load_dataset("skadio/text2zinc")

all_descriptions = []

# Step 1B: Extract all problem descriptions
# Each "input.json" cell is either a JSON string or an already-parsed dict;
# only records carrying a textual "description" are kept.

for split_name, split_data in dataset.items():
    print(f"\nüìò Processing split: {split_name}")
    print("Columns:", split_data.column_names)

    if "input.json" not in split_data.column_names:
        print(f"‚ö†Ô∏è No 'input.json' column found in {split_name}")
        continue

    for record in split_data["input.json"]:
        if isinstance(record, str):
            try:
                record = json.loads(record)
            except json.JSONDecodeError:
                # Unparseable record: skip it rather than abort the whole split.
                continue

        if isinstance(record, dict):
            description = record.get("description")
            # Step 1D calls .strip() on every entry, so non-string values
            # are dropped here instead of crashing after the JSON file has
            # already been written.
            if isinstance(description, str):
                all_descriptions.append(description)

print(f"\nüìä Total descriptions collected: {len(all_descriptions)}")

# Step 1C: Take the slice [m:n] of the collected descriptions
# (a deterministic slice in dataset order, not a random sample).

m = 0 # Start point (inclusive index)
n = 500  # End point (exclusive index)

if len(all_descriptions) < n:
    print(f"‚ö†Ô∏è Only {len(all_descriptions)} available, using all.")

# Slicing clamps to the list length, so no separate branch is needed for a
# short list, and the start point is honoured in every case.
sampled_descriptions = all_descriptions[m:n]
# Report how many descriptions were actually selected, not the requested n.
print(f"\n‚úÖ Sampling {len(sampled_descriptions)} descriptions...\n")


# Step 1D: Save to files

with open("descriptions.json", "w", encoding="utf-8") as f:
    json.dump(sampled_descriptions, f, indent=2)

with open("descriptions.txt", "w", encoding="utf-8") as f:
    for desc in sampled_descriptions:
        # One description per paragraph: inner newlines are flattened to spaces.
        f.write(desc.strip().replace("\n", " ") + "\n\n")

print("üíæ Saved 'descriptions.json' and 'descriptions.txt'")
print("\nüìÇ Files in current directory:", os.listdir())

# Preview the first description only when there is one; indexing an empty
# list would raise IndexError.
if sampled_descriptions:
    print("\nExample description:\n", sampled_descriptions[0][:400])
else:
    print("\nNo descriptions were collected; nothing to preview.")

Note: you may need to restart the kernel to use updated packages.
üîÑ Loading Text2Zinc dataset...

üìò Processing split: train
Columns: ['input.json', 'data.dzn', 'model.mzn', 'output.json', 'is_verified']

üìä Total descriptions collected: 567

‚úÖ Sampling 500 descriptions...

üíæ Saved 'descriptions.json' and 'descriptions.txt'

üìÇ Files in current directory: ['.env', '.idea', '.venv', 'Data_Generation_Pipeline.ipynb', 'descriptions.json', 'descriptions.txt', 'generated_personas', 'generated_problems', 'generated_solutions', 'veritas-solutions.zip']

Example description:
 The P-Median problem involves selecting P warehouses from a set of candidate locations to minimize the demand-weighted distance of serving all customers. We are given the demand of customers and distances between customers and warehouses. Each customer is served by exactly one warehouse. The goal is to allocate warehouses to minimize the sum of demand weighted distances of the customers to the war


In [64]:
import os, json, time  
from pathlib import Path  
from dotenv import load_dotenv  
from openai import OpenAI  
import httpx
  
# Load environment variables from a local .env file. If the file lives
# elsewhere, pass its path explicitly: load_dotenv("<path-to>/.env", override=True)
load_dotenv()

# Connection settings. Only the API key is read from the environment
# (OPENAI_API_KEY); the endpoint and deployment are fixed in this notebook.
AZURE_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_ENDPOINT = "https://oai-preprod-canadaeast-001.openai.azure.com" # your resource
API_VERSION = "" # left empty here; NOTE(review): set it if your resource requires an explicit api-version
DEPLOYMENT_NAME = "gpt-5" # your Azure deployment name

# Fail fast and name the settings that are actually missing, using the
# identifiers this cell really reads.
_missing_settings = [
    label
    for label, value in (
        ("OPENAI_API_KEY (environment variable / .env)", AZURE_API_KEY),
        ("AZURE_ENDPOINT", AZURE_ENDPOINT),
        ("DEPLOYMENT_NAME", DEPLOYMENT_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError("Missing required configuration: " + ", ".join(_missing_settings))

# Only keep a custom client if you truly need it (e.g., corporate proxy/CA)
http_client = httpx.Client(timeout=30.0)

client = OpenAI(
    api_key=AZURE_API_KEY,
    base_url=f"{AZURE_ENDPOINT}/openai/v1",          # note the /v1
    default_query={"api-version": API_VERSION},      # sent as a query parameter on every request
    http_client=http_client,
)
  
# Read the problem descriptions from descriptions.json (UTF-8).
descriptions = json.loads(Path("descriptions.json").read_text(encoding="utf-8"))
  
# Prompt that turns one problem description into a persona paragraph.
# It is filled in with str.format(description=...), so `{description}` is the
# only placeholder; any literal braces added to this text later would have to
# be doubled ({{ }}).
persona_prompt_template = """Develop a detailed, realistic persona relevant to the following problem description:

{description}

Focus the description on their occupation, incorporating how their professional background or responsibilities have led them to engage with the problem in a natural, contextual way.
Avoid generic phrases such as 'noticed' or 'began exploring.'
The persona should not include age or a specific name and must not restate the problem directly.
Ensure the background feels authentic and the connection to the problem is clearly rooted in their work or expertise.
Response must be at least 3 sentences in length in a narrative format. 
It should be exactly one paragraph long written in the most concise manner possible.  
"""
  
def prompt_chatgpt(prompt: str, model: str = DEPLOYMENT_NAME, temperature: float = 1) -> str:
    """Send a single-turn user prompt to the chat-completions endpoint.

    Transient failures (throttling, timeouts, connection drops, 5xx replies)
    are retried with exponential backoff: up to 6 attempts, starting at 0.5 s
    and capped at 8 s between attempts.

    Args:
        prompt: Text sent as the only user message.
        model: Model identifier passed to the API (the Azure deployment name).
        temperature: Sampling temperature passed through unchanged.

    Returns:
        The content of the first choice. An empty string is returned when the
        API reports no content, so callers can always treat the result as str.

    Raises:
        Exception: The last error is re-raised when it is not transient or
            when all attempts are exhausted.
    """
    max_attempts = 6
    delay = 0.5
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=model,  # IMPORTANT: this is your Azure deployment name
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            # message.content may be None; normalise to "" to honour -> str.
            return resp.choices[0].message.content or ""
        except Exception as e:
            # Give up on the last attempt or on errors that a retry cannot fix.
            if attempt == max_attempts - 1 or not _is_transient_api_error(e):
                raise
            time.sleep(delay)
            delay = min(delay * 2, 8.0)


def _is_transient_api_error(exc) -> bool:
    """Return True when *exc* looks like a throttling or temporary failure."""
    # Prefer the HTTP status when the exception exposes one.
    status = getattr(exc, "status_code", None)
    if isinstance(status, int) and (status in (408, 409, 429) or status >= 500):
        return True
    # Fall back to a case-insensitive scan of the message text.
    message = str(exc).lower()
    markers = (
        "429",
        "503",
        "rate limit",
        "temporarily unavailable",
        "timed out",
        "timeout",
        "connection error",
    )
    return any(marker in message for marker in markers)
  
# Output directory
output_dir = Path("generated_personas")
output_dir.mkdir(parents=True, exist_ok=True)

# Generate one persona per description. If the run is cut short (API failure
# or a kernel interrupt), whatever was produced so far is written to a
# separate "personas_partial.json" so completed calls are not lost and any
# previously saved full result stays untouched; the error is then re-raised.
personas = []
try:
    for i, desc in enumerate(descriptions, start=1):
        print(f"\nüîÆ Generating persona {i} of {len(descriptions)}...\n")
        prompt = persona_prompt_template.format(description=desc)
        persona_text = prompt_chatgpt(prompt)
        personas.append({"id": i, "description": desc, "persona": persona_text})
        # Guard the preview slice in case no text came back.
        print((persona_text or "")[:400], "...\n")
except BaseException:
    if personas:
        partial_path = output_dir / "personas_partial.json"
        partial_path.write_text(json.dumps(personas, indent=2), encoding="utf-8")
        print(f"Stopped after {len(personas)} of {len(descriptions)} personas; partial results saved to {partial_path}")
    raise

# Save results
(output_dir / "personas.json").write_text(json.dumps(personas, indent=2), encoding="utf-8")
with open(output_dir / "personas.txt", "w", encoding="utf-8") as f:
    for p in personas:
        f.write(f"=== Persona {p['id']} ===\n{p['persona']}\n\n")

print("\n‚úÖ Successfully generated and saved all personas:")
print(f"üìÇ {output_dir / 'personas.json'}")
print(f"üìÇ {output_dir / 'personas.txt'}")


üîÆ Generating persona 1 of 500...

A senior network planning analyst at an omnichannel retail chain, they lead the design of the distribution footprint that feeds hundreds of stores and parcel fulfillment nodes under strict service-level and capital constraints. With a background in operations research and carrier route engineering, they maintain a demand-by-location forecast, build travel-time matrices from carrier telemetry, and  ...


üîÆ Generating persona 2 of 500...

A municipal lighting engineer overseeing an adaptive LED retrofit on arterial roads, this professional is accountable for tuning dimming levels so corridor segments meet mandated illuminance classes while controlling energy spend and mitigating glare complaints. Using manufacturer photometry and a streetscape model, they assemble an influence matrix that quantifies how each luminaire contributes t ...


üîÆ Generating persona 3 of 500...

A refinery planning manager at a large integrated site, this professional 

In [None]:
# =========================================================
# Step 3 (ChatGPT-only version): Generate JSON + Python solutions
# =========================================================

import os
import json
import re
import time
from openai import OpenAI
from dotenv import load_dotenv

# httpx is used below for the custom HTTP client; import it here so this cell
# does not depend on an earlier cell having imported it.
import httpx

# Load API key
load_dotenv()

# Connection settings. Only the API key is read from the environment
# (OPENAI_API_KEY); the endpoint and deployment are fixed in this notebook.
AZURE_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_ENDPOINT = "https://oai-preprod-canadaeast-001.openai.azure.com" # your resource
API_VERSION = "" # left empty here; NOTE(review): set it if your resource requires an explicit api-version
DEPLOYMENT_NAME = "gpt-5" # your Azure deployment name

# Fail fast and name the settings that are actually missing, using the
# identifiers this cell really reads.
_missing_settings = [
    label
    for label, value in (
        ("OPENAI_API_KEY (environment variable / .env)", AZURE_API_KEY),
        ("AZURE_ENDPOINT", AZURE_ENDPOINT),
        ("DEPLOYMENT_NAME", DEPLOYMENT_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError("Missing required configuration: " + ", ".join(_missing_settings))

# Only keep a custom client if you truly need it (e.g., corporate proxy/CA).
# The timeout is 300 s for this step.
http_client = httpx.Client(timeout=300.0)

client = OpenAI(
    api_key=AZURE_API_KEY,
    base_url=f"{AZURE_ENDPOINT}/openai/v1",          # note the /v1
    default_query={"api-version": API_VERSION},      # sent as a query parameter on every request
    http_client=http_client,
)

# Load personas created in Step 2
with open("generated_personas/personas.json", "r",encoding="utf-8") as f:
    personas = json.load(f)

# Ensure output directories
os.makedirs("generated_problems", exist_ok=True)
os.makedirs("generated_solutions", exist_ok=True)

# =====================================================================
# PROMPTS
# =====================================================================

from string import Template

# Prompt for the first model call of the main loop: asks for a JSON problem
# definition derived from one persona. It is a string.Template, so only
# `$persona` is replaced by substitute(); the literal braces of the JSON
# schema below are left untouched.
json_prompt_template = Template("""You are an expert in Operations Research and Constraint Programming.
Given the following persona, produce a strictly valid JSON object describing an advanced real-world optimization problem.

Persona:
$persona

Return ONLY a valid JSON that matches EXACTLY this schema:

{
  "Problem": {
    "Title": "Concise title",
    "Formal Problem Statement": "Detailed natural-language description",
    "Objective": "Clear optimization objective",
    "DecisionVariables": ["List and describe variables"],
    "Constraints": ["At least 3 explicit mathematical constraints"],
    "Parameters": ["Define parameters and constants"],
    "ModelType": "MILP / LP / MINLP / CP / other",
    "Complexity": "Why the problem is advanced",
    "ExpectedOutput": "What the model should return"
  }
}

Rules:
- Output ONLY the JSON.
- The JSON MUST be valid and parseable by python json.loads().
""")

# Prompt for the second model call: asks for a runnable Python script that
# models the JSON problem substituted for `$json_text`. The reply is requested
# inside a ```python fence, which is what extract_python() searches for.
python_prompt_template = Template("""You are an expert in mathematical optimization using Python and all constraint/optimization modeling packages.
Given the following JSON problem, write a complete executable Python script using the appropriate constraint/optimization modeling python package/module.

Rules:
- Import the appropriate package/module based on the appropriate method needed to execute and solve the python script successfully.
- Create model.
- Define decision variables with correct types.
- Add and represent the objective function accurately.
- Add all constraints explicitly.
- Include a small numeric example instance so the script runs immediately.
- After optimization/constraint is satisfied, print model status, objective value, and all variable values.
- Code must be returned INSIDE a ```python fenced block.

JSON Problem:
$json_text

Return ONLY the Python code inside a fenced ```python block.
""")

# =====================================================================
# Helper functions
# =====================================================================

def call_chatgpt(prompt, model="gpt-5", temperature=1, max_attempts=3, base_delay=2.0):
    """Send a single-turn prompt and return the reply text, retrying on errors.

    Any exception raised by the API call is printed and retried. The wait
    between attempts doubles each time (base_delay, 2 * base_delay, ...), and
    no wait happens after the final attempt.

    Args:
        prompt: Text sent as the only user message.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed through unchanged.
        max_attempts: Total number of attempts before giving up.
        base_delay: Seconds to wait after the first failed attempt.

    Returns:
        The content of the first choice, or None when every attempt failed.
    """
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature
            )
            return resp.choices[0].message.content
        except Exception as e:
            print("‚ö†Ô∏è ChatGPT API error:", e)
            # Back off only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(base_delay * (2 ** attempt))
    return None


def extract_json(text):
    """Pull a JSON value out of a model reply.

    The whole (stripped) reply is parsed first. If that fails, every `{` in
    the text is tried as the start of a JSON object and the longest object
    that parses is returned, so surrounding prose, code fences, or stray
    braces after the payload do not defeat extraction.

    Args:
        text: Raw reply text; may be None or empty.

    Returns:
        The parsed JSON value, or None when no valid JSON is found.
    """
    if not text:
        return None
    text = text.strip()

    # Fast path: the reply is nothing but JSON.
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass

    # Salvage path: decode an object at each candidate "{" and keep the
    # longest one that parses.
    decoder = json.JSONDecoder()
    best_obj = None
    best_len = 0
    start = text.find("{")
    while start != -1:
        try:
            obj, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not a valid object here; move on to the next brace.
            start = text.find("{", start + 1)
            continue
        if end - start > best_len:
            best_obj, best_len = obj, end - start
        # Skip past the decoded object; nested braces were already consumed.
        start = text.find("{", end)
    return best_obj


def extract_python(text):
    """Return the code inside the first Python-tagged fenced block of *text*.

    The fence tag is matched case-insensitively and may be `python`,
    `python3` or `py`.

    Args:
        text: Raw reply text; may be None or empty.

    Returns:
        The fenced code with surrounding whitespace stripped, or None when
        the text contains no such block.
    """
    if not text:
        return None
    # \b keeps the tag whole, so e.g. the "3" of ```python3 never leaks into
    # the captured code.
    m = re.search(r"```(?:python3?|py)\b([\s\S]*?)```", text, re.IGNORECASE)
    if m:
        return m.group(1).strip()
    return None

# =====================================================================
# MAIN LOOP
# =====================================================================

def _write_utf8(path, content):
    """Write *content* to *path* as UTF-8 text, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(content)


_persona_total = len(personas)

for i, persona_obj in enumerate(personas, start=1):
    persona_text = persona_obj.get("persona", "")
    print(f"\nüß© Persona {i}/{_persona_total} ‚Äî generating JSON using ChatGPT")

    # --- Stage 1: ask the model for the problem definition as JSON ---
    json_prompt = json_prompt_template.substitute(persona=persona_text)
    json_output = call_chatgpt(json_prompt)

    # The raw reply is always kept, even when it cannot be parsed.
    raw_json_path = f"generated_problems/problem_{i}_raw.txt"
    _write_utf8(raw_json_path, json_output or "")
    print(f"üìÑ Saved raw ChatGPT JSON to {raw_json_path}")

    parsed_json = extract_json(json_output)
    if not parsed_json:
        # Without a usable problem definition there is nothing to model.
        print(f"‚ùå JSON parsing failed for persona {i}. Check raw file.")
        continue

    json_path = f"generated_problems/problem_{i}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(parsed_json, f, indent=2)
    print(f"‚úÖ Parsed JSON saved to {json_path}")

    # --- Stage 2: ask the model for a Python script solving that problem ---
    print(f"üíª Generating Python Gurobi model for Persona {i} using ChatGPT")

    python_prompt = python_prompt_template.substitute(
        json_text=json.dumps(parsed_json, indent=2)
    )
    python_output = call_chatgpt(python_prompt)

    raw_py_path = f"generated_solutions/solution_{i}_raw.txt"
    _write_utf8(raw_py_path, python_output or "")
    print(f"üìÑ Saved raw Python output to {raw_py_path}")

    python_code = extract_python(python_output)
    if not python_code:
        print(f"‚ö†Ô∏è Could not extract code for persona {i} ‚Äî inspect raw file.")
        continue

    py_path = f"generated_solutions/solution_{i}.py"
    _write_utf8(py_path, python_code)
    print(f"‚úÖ Extracted Python script saved to {py_path}")

print("\nüéâ Done! All problems + solutions generated using ChatGPT.")



üß© Persona 1/500 ‚Äî generating JSON using ChatGPT
üìÑ Saved raw ChatGPT JSON to generated_problems/problem_1_raw.txt
‚úÖ Parsed JSON saved to generated_problems/problem_1.json
üíª Generating Python Gurobi model for Persona 1 using ChatGPT
üìÑ Saved raw Python output to generated_solutions/solution_1_raw.txt
‚úÖ Extracted Python script saved to generated_solutions/solution_1.py

üß© Persona 2/500 ‚Äî generating JSON using ChatGPT
üìÑ Saved raw ChatGPT JSON to generated_problems/problem_2_raw.txt
‚úÖ Parsed JSON saved to generated_problems/problem_2.json
üíª Generating Python Gurobi model for Persona 2 using ChatGPT
üìÑ Saved raw Python output to generated_solutions/solution_2_raw.txt
‚úÖ Extracted Python script saved to generated_solutions/solution_2.py

üß© Persona 3/500 ‚Äî generating JSON using ChatGPT
üìÑ Saved raw ChatGPT JSON to generated_problems/problem_3_raw.txt
‚úÖ Parsed JSON saved to generated_problems/problem_3.json
üíª Generating Python Gurobi model for Perso