In [None]:
import requests
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import requests
import urllib3
import random

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Requests in this notebook are sent with verify=False; without this call
# every one of them would emit an InsecureRequestWarning.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

In [None]:

# Base URL of the Ollama server. Endpoint paths are appended at the call
# site; query_ollama() adds "/api/generate" to whatever it receives, so this
# must not already contain the endpoint path.
url = "https://ollama.it"
data = {
    "model": "deepseek-r1:7b",
    "prompt": "What is the capital of France?",
    "stream": False
}

# Connectivity smoke test: one non-streaming generation request.
# verify=False skips TLS certificate verification for this server.
response = requests.post(f"{url}/api/generate", json=data, verify=False)
print(response.json())

In [None]:

# Installed models are listed by GET /api/tags, not by the generate endpoint.
# Derive that URL from `url` whether it holds the bare server address or a
# full ".../api/<endpoint>" path.
tags_url = url.split("/api/")[0].rstrip("/") + "/api/tags"
response = requests.get(tags_url, verify=False)

if response.status_code == 200:
    models = response.json()
    print("Available Models:", models)
    available_models = [entry["name"] for entry in models["models"]]
else:
    # The error body is not guaranteed to be JSON, so show it as raw text,
    # and fall back to an empty list instead of reading an undefined `models`.
    print("Failed to fetch models:", response.status_code, response.text)
    available_models = []
available_models

In [None]:
# https://ollama.com/library/wizard-math
# https://ollama.com/library/qwen2-math
# https://ollama.com/library/mathstral
# https://ollama.com/t1c/deepseek-math-7b-rl
# https://ollama.com/ima/deepseek-math

In [None]:
# Load the "deepmind/aqua_rat" dataset; the cells below read its "train" split.
dataset = load_dataset(path="deepmind/aqua_rat")

In [None]:
# Peek at the first training record: its question text and labelled answer.
first_row = dataset["train"][0]
print(first_row["question"])
print(first_row["correct"])

In [None]:
# "Answer first" prompt template: instructs the model to output `### X ###`
# before anything else, with no reasoning ahead of it. The {question} and
# {options} placeholders are filled in with str.format() by query_ollama().
prompt_strict = """You are a strict mathematical assistant. Your goal is to answer the given multiple-choice question **without providing any reasoning or explanation before the answer**.

### **Follow these rules carefully:**
- **Your answer must always be the first thing you output.**
- **The final answer must be enclosed in `### ###`.**
- **Inside `### ###`, include only the letter of the selected option (e.g., `### B ###`, `### E ###`).**
- **Do not include any reasoning, calculations, or commentary before or inside the `### ###`.**
- **After the `### ###`, you may optionally confirm the answer in one short sentence.**

---

### **Question**
{question}

### **Options**
{options}

---

### **Allowed Output Format (Examples):**
✅ **Correct:** `### C ###`
✅ **Correct:** `### A ### That is the correct answer.`

🚫 **Incorrect:** `"The correct answer is ### B ###."` (Wrong – reasoning comes before)
🚫 **Incorrect:** `"After solving the problem, I found the answer is ### D ###."` (Wrong – reasoning comes before)
🚫 **Incorrect:** `"### C ### because we solve it by..."` (Wrong – reasoning comes after the answer inside `### ###`)

**⚠️ WARNING:**
- **If you fail to follow the format, your answer will be ignored.**
- **Do NOT generate any extra text before the `### X ###` answer.**

Now, answer the question **immediately**:

"""

# "Reason first" prompt template: instructs the model to explain step by step
# and put `### X ###` last. The {question} and {options} placeholders are
# filled in with str.format() by query_ollama().
# The ```plaintext example block is explicitly closed so that the "Incorrect"
# examples and the warning are not read as part of the correct example.
prompt_reasoning = """
You are a helpful mathematical assistant. Your goal is to **explain the reasoning behind your answer step-by-step before revealing the final answer**.

### **Follow these rules carefully:**
- **First, break the problem down into logical, structured steps.**
- **Explain your thought process clearly and concisely before stating the final answer.**
- **At the end of your response, provide the final answer enclosed in `### ###`.**
- **Inside `### ###`, include only the letter of the selected option (e.g., `### B ###`, `### E ###`).**
- **The answer must ALWAYS be the last thing you output.**
- **Do NOT state or hint at the final answer before reaching the conclusion.**

---

### **Question**
{question}

### **Options**
{options}

---

### **Allowed Output Format (Example):**
✅ **Correct:**
```plaintext
Step 1: Identify the key values from the question.
Step 2: Apply the correct mathematical formula.
Step 3: Compute the result and match it with the given options.
Step 4: Verify the correctness.

Final answer: ### D ###
```

🚫 Incorrect: "The correct answer is ### C ###. Here’s why..." (Wrong – answer must come last)
🚫 Incorrect: "Step 1: Solve... Step 2: Compute... Answer: D" (Wrong – lacks ### D ###)
🚫 Incorrect: "### A ### because first we multiply..." (Wrong – reasoning must come before the answer)

⚠️ WARNING:

Do NOT reveal the final answer early.
Your reasoning must always appear BEFORE the ### X ### format answer.
If you fail to follow this format, your response will be considered invalid.
Now, solve the problem step by step before revealing the answer:
"""
# Prompt template used for the DeepSeek model runs further down: it only
# constrains where the `### X ###` answer goes (at the end of the response).
# The {question} and {options} placeholders are filled in with str.format()
# by query_ollama().
deep_seek_prompt = """You are a mathematical assistant. Your goal is to provide the final answer in the correct format.

### Question
{question}

### Options
{options}

### **Formatting Rules:**
- Place the **final answer** inside `### ###` at the **end** of your response.
- Inside `### ###`, include **only** the letter of the selected option (e.g., `### B ###`, `### E ###`).
- **Do not place the answer at the beginning or in the middle of your response.**

### **Correct Output Examples:**
✅ `"After analyzing the question carefully... [EXPLANATION] ... The final answer is ### D ###."`
✅ `"By applying the correct formula, we find the correct choice: ### A ###."`

### **Incorrect Output Examples:**
🚫 `"The correct answer is ### C ###. Here’s why..."` (❌ Wrong: answer must come last)
🚫 `"Step 1: Solve... Step 2: Compute... Answer: D"` (❌ Wrong: lacks `### D ###`)
🚫 `"### A ### is the correct choice because..."` (❌ Wrong: reasoning must come first)

**Now, provide your response while ensuring the answer follows the correct format.**

"""


In [None]:
import re


def query_ollama(prompt_template: str, question: str, options: list, model: str, correct_answer: str, ollama_url: str, timeout: "float | None" = None):
    """
    Send one multiple-choice question to an Ollama model and grade the reply.

    Args:
        prompt_template (str): Template with ``{question}`` and ``{options}``
            placeholders; it is filled in with ``str.format``.
        question (str): The question text.
        options (list): The answer options, in order. Each one is labelled
            ``A)``, ``B)``, ... unless it already starts with its own label.
        model (str): The name of the Ollama model.
        correct_answer (str): The correct option letter (e.g., "E").
        ollama_url (str): Address of the Ollama server. Either the bare server
            URL or the full ``.../api/generate`` endpoint is accepted.
        timeout (float | None): Seconds to wait for the HTTP response, passed
            to ``requests.post``. ``None`` (the default) waits indefinitely.

    Returns:
        dict: On success, the keys ``model_output``, ``model_answer``
        (the letter found inside ``### ###`` or ``None``), ``is_correct``,
        ``correct_answer``, ``model`` and ``prompt_used``.
        On failure, an ``error`` message (plus ``status_code`` for a non-200
        reply) together with ``correct_answer``, ``model`` and ``prompt_used``
        so the failed record can still be traced back to its query.
    """

    # Label the options A), B), C), ... An option that already starts with
    # its own label (e.g. "A)21") is passed through unchanged so the prompt
    # does not end up with a doubled "A) A)21".
    labelled_options = []
    for i, opt in enumerate(options):
        letter = chr(65 + i)
        text = str(opt)
        if text.lstrip().upper().startswith(f"{letter})"):
            labelled_options.append(text)
        else:
            labelled_options.append(f"{letter}) {text}")
    formatted_prompt = prompt_template.format(question=question, options="\n".join(labelled_options))

    # Accept both "https://host" and "https://host/api/generate" so the
    # endpoint path is never appended twice.
    endpoint = "/api/generate"
    base_url = ollama_url.rstrip("/")
    if base_url.endswith(endpoint):
        base_url = base_url[: -len(endpoint)]
    url = f"{base_url}{endpoint}"

    payload = {
        "model": model,
        "prompt": formatted_prompt,
        # Sampling parameters must be nested under "options"; a top-level
        # "temperature" key is not a field of the generate request.
        "options": {"temperature": 0.05},  # Low temperature for more deterministic responses
        "stream": False,  # Disable streaming for easier parsing
    }

    # Attached to error records as well, so failures stay attributable.
    context = {
        "correct_answer": correct_answer,
        "model": model,
        "prompt_used": formatted_prompt,
    }

    try:
        # NOTE: verify=False disables TLS certificate verification.
        response = requests.post(url, json=payload, verify=False, timeout=timeout)

        if response.status_code != 200:
            return {"error": "Failed to query Ollama", "status_code": response.status_code, **context}

        model_output = response.json().get("response", "").strip()

        # Extract the first answer letter enclosed in ### ### (A-E only).
        match = re.search(r"###\s*([A-E])\s*###", model_output)
        model_answer = match.group(1) if match else None

        # A missing answer counts as incorrect.
        is_correct = model_answer is not None and model_answer.upper() == correct_answer.upper()

        return {
            "model_output": model_output,
            "model_answer": model_answer,
            "is_correct": is_correct,
            "correct_answer": correct_answer,
            "model": model,
            "prompt_used": formatted_prompt
        }

    except requests.exceptions.RequestException as e:
        return {"error": f"Request failed: {e}", **context}

In [None]:
# Pick one training record to try the prompt on; show its question and answer.
row = dataset["train"][6]
print(row["question"])
print(row["correct"])

If (x^2 + 4x - 11)/5 ≤ x + 1, then x could be represented by which of the following?
A


In [None]:
# Try the reasoning prompt on the single record selected above.
query_ollama(
    prompt_template=prompt_reasoning,
    question=row["question"],
    options=row["options"],
    model="qwen2:7b",
    correct_answer=row["correct"],
    ollama_url=url,
)

{'model_output': "Step 1: First, we need to solve the inequality `(x^2 + 4x - 11)/5 ≤ x + 1`. Let's start by moving all terms to one side of the inequality:\n\n`x^2 + 4x - 11/5 - (5x + 5) ≤ 0`\n\nStep 2: Simplify the equation:\n\n`x^2 - x - 26/5 ≤ 0`\n\nStep 3: Now, solve this quadratic inequality. First, let's find the roots by solving `x^2 - x - 26/5 = 0`. We can use the quadratic formula:\n\nFor any equation of form `ax^2 + bx + c = 0`, its roots are given by `(b ± sqrt(b^2-4ac))/2a`.\n\nHere, a = 1, b = -1, and c = -26/5.\n\nSo the roots will be:\n\n`x = (1 ± sqrt((-1)^2-4*1*(-26/5)))/2`\n\n`x = (1 ± sqrt(1+104/5))/2`\n\n`x = (1 ± sqrt((5 + 104)/5))/2`\n\n`x = (1 ± sqrt(109/5))/2`\n\nStep 4: Now, we have the roots `sqrt(109/5)` and `-sqrt(109/5)`. This means that the inequality holds for values of x between these two points because a quadratic expression changes sign at its roots.\n\nStep 5: Converting the square root back to decimal or simplifying further:\n\n`x = (1 ± sqrt(109))/

In [None]:
# Prompt variants sent for every sampled question.
prompts = [prompt_reasoning, prompt_strict]
questions = list(dataset["train"])
# Math-specialised models; defined for reference, the run below iterates `models`.
math_models = [
    "qwen2-math:latest",
    "wizard-math:latest",
    "mathstral:latest",
]
# Draw the sample from a dedicated, seeded RNG so the same 150 questions are
# selected on every fresh kernel run; an unseeded random.sample() picks a
# different subset each time, which makes results impossible to reproduce.
sampled_questions = random.Random(42).sample(questions, 150)
# Alternative model lists kept for reference:
#   ["phi3:latest", "mistral-nemo:latest", "llama33:latest", "phi4:latest"]
#   ["deepseek-r1:7b"]
models = ["gemma2:9b"]


In [None]:
# Display the first sampled record as a quick sanity check of the sample.
sampled_questions[0]

In [None]:
# Query each model with every prompt variant for every sampled question and
# collect one result record per (model, question, prompt) combination.
answers = []
for model in models:
    print(model)
    # One progress bar over the questions is enough: wrapping the short
    # `prompts` list in tqdm too started a new nested bar for every question.
    for t in tqdm(sampled_questions):
        for prompt in prompts:
            answer = query_ollama(prompt, t["question"], t["options"], model, t["correct"], url)
            answers.append(answer)

In [None]:
# Tabulate the collected result records and save them as a spreadsheet.
df = pd.DataFrame(data=answers)
df.to_excel(excel_writer="results_math_gemma.xlsx", index=False)

In [None]:
questions = list(dataset["train"])

# Seeded RNG: the 150-question evaluation set is the same on every run
# instead of being re-drawn at random each time this cell executes.
sampled_questions = random.Random(42).sample(questions, 150)
models = ["deepseek-r1:7b", "deepseek-math:latest"]

# These models are queried with the dedicated deep_seek_prompt only.
answers = []
for model in models:
    print(model)
    for t in tqdm(sampled_questions):
        answer = query_ollama(deep_seek_prompt, t["question"], t["options"], model, t["correct"], url)
        answers.append(answer)

In [None]:
# Tabulate the collected result records and save them as a spreadsheet.
df = pd.DataFrame(data=answers)
df.to_excel(excel_writer="results_2402.xlsx", index=False)