### Install swi-prolog

In [None]:
!apt-get install swi-prolog

### System prompt

In [None]:
# System message placed at the start of every prompt built by get_gsm8k_questions.
# It asks the model for a <reasoning> section followed by an <answer> section
# containing CLP(Q) Prolog code that binds the result in solve(X).
SYSTEM_PROMPT = """
You are a specialized Prolog code-generating assistant.

Your task is to solve math problems by providing a structured answer in two clearly defined sections:

1. <reasoning>
   - Provide a clear, concise step-by-step explanation of how you arrive at the solution.

2. <answer>
   - Provide executable Prolog code using constraint logic programming to compute the numeric answer.
   - Always start with: ':- use_module(library(clpq)).'
   - Define any necessary numeric constants or intermediate values using predicates.
   - Final answer should be unified explicitly in solve(X) using curly-brace constraints, without printing commands.

Use this XML format strictly:
<reasoning>
(Your step-by-step reasoning here)
</reasoning>
<answer>
:- use_module(library(clpq)).

(Any predicates/constants defined here)

solve(X) :-
    (Intermediate computations using curly braces)
    {X = final constraint logic}.
</answer>
"""

### Helper functions

In [None]:
import re
from datasets import load_dataset
import subprocess

# ----------------------
# Helper Functions
# ----------------------

def extract_xml_answer(text: str) -> str:
    try:
        start = text.rfind("<answer>")
        end = text.rfind("</answer>")
        if start == -1 or end == -1 or end < start:
            return None
        return text[start + len("<answer>"): end].strip()
    except Exception:
        return None

def execute_prolog_code(prolog_code: str) -> str:
    """
    Executes the given Prolog code in SWI-Prolog, calling solve(X),
    and returns the printed solution as a string (e.g., "12000").
    Returns None if there's an error or no output.
    """
    try:
        # Write the Prolog code to a temporary file
        with open("temp.pl", "w") as f:
            f.write(prolog_code)

        # Run SWI-Prolog: load 'temp.pl', call solve(X), print X, then halt
        result = subprocess.run(
            ["swipl", "-q", "-f", "temp.pl", "-g", "solve(X), writeln(X), halt"],
            capture_output=True,
            text=True,
            timeout=5,  # optional: 5-second timeout
        )

        # If there's any error output, we can check result.stderr or result.returncode
        if result.returncode != 0 or not result.stdout:
            return None

        # result.stdout is whatever got printed by writeln(X)
        lines = result.stdout.strip().splitlines()
        return lines[-1].strip() if lines else None

    except Exception as e:
        print(f"Error executing Prolog code: {e}")
        return None

### Preprocess dataset and push to HF

In [None]:
from datasets import load_dataset

def get_gsm8k_questions(split="train"):
    """Load a split of 'Thomas-X-Yang/gsm8k-prolog' and add the derived columns.

    Every row keeps its "instruction", "input" and "output" fields and gains:
    "prompt" (system + user chat messages), "answer" (a copy of "output") and
    "numerical_result" (the stringified result of executing "output" with
    execute_prolog_code).

    Args:
        split: Name of the dataset split to load.

    Returns:
        The mapped dataset split.
    """
    def build_example(x):
        reference_code = x["output"]
        # Ground-truth value comes from running the reference Prolog solution.
        numerical_result = execute_prolog_code(reference_code)
        chat_messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"{x['instruction']}\n{x['input']}"},
        ]
        return {
            "instruction": x["instruction"],
            "input": x["input"],
            "output": reference_code,
            "prompt": chat_messages,
            "answer": reference_code,
            # Stored as text; a failed execution therefore becomes "None".
            "numerical_result": str(numerical_result),
        }

    return load_dataset('Thomas-X-Yang/gsm8k-prolog')[split].map(build_example)

# Build the processed dataset for the default split ("train") and print the
# first example as a quick sanity check of the derived columns.
dataset = get_gsm8k_questions()
print(dataset[0])

In [None]:
# Push the processed dataset to the Hugging Face Hub as a public dataset repo.
# To publish under a different account, change the repo id below and pass your
# own access token via `token`.
# NOTE(review): `token` is an empty string here — presumably scrubbed before
# sharing; confirm how authentication is meant to be supplied.
# NOTE(review): the comparison cell loads "niklasm222/gsm8k-prolog-extended",
# which differs from this repo id — confirm which name is intended.
dataset.push_to_hub("niklasm222/gsm8k-prolog-prover-extended", token="", private=False)

### Compare preprocessed dataset with openai/gsm8k

In [None]:
import re
from datasets import load_dataset

# ----------------------
# Helper Function to extract the final numeric answer from the OpenAI GSM8K "answer" column.
# ----------------------
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    # Remove commas to facilitate conversion to float.
    return text.split("####")[1].replace(",", "").strip()

# ----------------------
# Load both datasets
# ----------------------
# Extended Prolog dataset carrying the precomputed "numerical_result" column (change split as needed)
dataset_extended = load_dataset("niklasm222/gsm8k-prolog-extended", split="train")
# Reference openai/gsm8k dataset (change split as needed)
dataset_openai = load_dataset("openai/gsm8k", "main", split="train")

# Rows are paired by position, so only the common prefix of both datasets is compared.
total = min(len(dataset_extended), len(dataset_openai))

# ----------------------
# Compare numerical_result from gsm8k-prolog-extended with the numeric answer from openai/gsm8k
# ----------------------
matches = 0
skipped = 0       # Samples whose value could not be parsed on either side
differences = []  # Absolute difference for every sample that was compared
mismatches = []   # Details of every compared sample whose values differ

# zip() stops at the shorter dataset, i.e. after exactly `total` pairs.
for i, (ext_row, openai_row) in enumerate(zip(dataset_extended, dataset_openai)):
    # Value produced by executing the reference Prolog program.
    ext_val_str = ext_row["numerical_result"]
    try:
        ext_val = float(ext_val_str)
    except (TypeError, ValueError) as e:
        print(f"Skipping sample {i} from extended dataset due to conversion error: {e}")
        skipped += 1
        continue

    # Final numeric answer written after "####" in the openai/gsm8k answer text.
    extracted_str = extract_hash_answer(openai_row["answer"])
    if extracted_str is None:
        print(f"Skipping sample {i} from openai dataset because '####' not found.")
        skipped += 1
        continue
    try:
        openai_val = float(extracted_str)
    except (TypeError, ValueError) as e:
        print(f"Skipping sample {i} from openai dataset due to conversion error: {e}")
        skipped += 1
        continue

    diff = abs(ext_val - openai_val)
    differences.append(diff)
    if diff < 1e-6:
        matches += 1
    else:
        # Record mismatched sample details: index, question, both values and their difference.
        mismatches.append({
            "index": i,
            "question": ext_row.get("input", "N/A"),
            "extended_value": ext_val,
            "openai_value": openai_val,
            "difference": diff,
        })

# Skipped samples stay in the denominator, so they count against the accuracy.
# Guard against empty datasets to avoid a ZeroDivisionError.
accuracy = matches / total * 100 if total else 0.0
print(f"Compared {total} samples. Match accuracy: {accuracy:.2f}%")
if skipped:
    print(f"Skipped {skipped} of {total} samples that could not be parsed.")

if mismatches:
    print("\nMismatched Samples:")
    for m in mismatches:
        print("-" * 40)
        print(f"Index: {m['index']}")
        print(f"Question: {m['question']}")
        print(f"Extended Value: {m['extended_value']}")
        print(f"OpenAI Value: {m['openai_value']}")
        print(f"Difference: {m['difference']}")


README.md:   0%|          | 0.00/536 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Compared 7473 samples. Match accuracy: 99.80%

Mismatched Samples:
----------------------------------------
Index: 167
Question: Janet filmed a new movie that is 60% longer than her previous 2-hour long movie.  Her previous movie cost $50 per minute to film, and the newest movie cost twice as much per minute to film as the previous movie.  What was the total amount of money required to film Janet's entire newest film?
Extended Value: 19200.0
OpenAI Value: 1920.0
Difference: 17280.0
----------------------------------------
Index: 228
Question: A company has 200 employees.  60% of the employees drive to work.  Of the employees who don't drive to work, half take public transportation. How many more employees drive to work than take public transportation?
Extended Value: 80.0
OpenAI Value: 40.0
Difference: 40.0
----------------------------------------
Index: 356
Question: Students at Highridge High earn 2 points for each correct answer during a quiz bowl If a student correctly answers all 

### Updates for indexes in openai/gsm8k

In [None]:
import re
from datasets import load_dataset

# Define the new answer texts for each error by index.
# Corrected answer texts for openai/gsm8k train rows, keyed by row index.
# Each text keeps the GSM8K conventions: <<expr=value>> calculator annotations
# and a final "#### <answer>" line.
updates = {
    167: """The first movie was 2*60=<<2*60=120>>120 minutes
So this movie is 120*0.6=<<120*0.6=72>>72 minutes longer
So this movie is 120+72=<<120+72=192>>192 minutes
It also cost 50*2=$<<50*2=100>>100 per minute to film
So it cost 192*100=$<<192*100=19200>>19200
#### 19200""",

    228: """A company has 200 employees. 60% of the employees drive to work, so 200 * 0.60 = <<200*0.60=120>>120 drive to work.
The remaining employees who don't drive are 200 - 120 = <<200-120=80>>80.
Of these 80 employees, half take public transportation: 80 * 0.50 = <<80*0.50=40>>40.
The difference between those who drive and those who take public transportation is 120 - 40 = <<120-40=80>>80 employees.
#### 80""",

    356: """If James only missed one question in all five rounds of five questions, he correctly answered 5*5 - 1 = <<5*5-1=24>>24 questions.
Before the bonus, James earned 24 * 2 = <<24*2=48>>48 points.
Since missing one question disqualifies him from receiving the bonus in that round, he earns the bonus in 5 - 1 = <<5-1=4>>4 rounds.
Each bonus round is worth 4 points, so his bonus totals 4 * 4 = <<4*4=16>>16 points.
Including his bonus, James scored 48 + 16 = <<48+16=64>>64 points.
#### 64""",

    474: """Two-fifths of 30 = <<30*2/5=12>>12 members ordered lemon juice.
Remaining members = 30 - 12 = <<30-12=18>>18.
One-third of the remaining 18 = <<18*1/3=6>>6 members ordered mango juice.
Thus, the number of members who ordered orange juice = 18 - 6 = <<18-6=12>>12.
#### 12""",

    1081: """The first hive has 1000 bees producing 500 liters of honey, so each bee produces 500/1000 = <<500/1000=0.5>>0.5 liters.
The second hive has 20% fewer bees than the first hive: 1000 - (20% of 1000) = 1000 - 200 = <<1000-200=800>>800 bees.
Each bee in the second hive produces 40% more honey than a bee in the first hive: 0.5 * 1.4 = <<0.5*1.4=0.7>>0.7 liters per bee.
Thus, the second hive produces 800 * 0.7 = <<800*0.7=560>>560 liters of honey.
The total honey produced is 500 + 560 = <<500+560=1060>>1060 liters.
#### 1060""",

    1776: """Let x be the score that William needs to achieve.
Since there are 30 students in the class, there are 30 - 1 = <<30-1=29>>29 students besides William.
The total required score for a 75% average is 30 * 75 = <<30*75=2250>>2250.
The total score of the other 29 students is 29 * 74 = <<29*74=2146>>2146.
Therefore, William must score at least 2250 - 2146 = <<2250-2146=104>>104.
#### 104""",

    2620: """Ryan started with 36 tokens.
He used 36/3 = <<36/3=12>>12 tokens on Pac-Man.
He used 36/4 = <<36/4=9>>9 tokens on Candy Crush.
He used 7 tokens on Ski-ball.
Total tokens used = 12 + 9 + 7 = <<12+9+7=28>>28.
Remaining tokens = 36 - 28 = <<36-28=8>>8 tokens.
His parents bought him 7 times the tokens he spent on Ski-ball, which is 7 * 7 = <<7*7=49>>49 tokens.
Thus, Ryan ended up with 8 + 49 = <<8+49=57>>57 tokens.
#### 57""",

    2770: """Five boxes of pizza cost 5 x $10 = $<<5*10=50>>50.
Ten cans of soft drinks cost 10 x $2 = $<<10*2=20>>20.
So, Robert spends $50 + $20 = $<<50+20=70>>70.
Six hamburgers cost 6 x $3 = $<<6*3=18>>18.
Ten additional cans of soft drinks cost 10 x $2 = $<<10*2=20>>20.
Thus, Teddy spends $18 + $20 = $<<18+20=38>>38.
The total amount spent is $70 + $38 = $<<70+38=108>>108.
#### 108""",

    3263: """First, find how many candy canes Andy receives from his teachers:
3 canes/teacher * 4 teachers = <<3*4=12>>12 canes.
Then, add the number of candy canes he gets from his parents: 12 + 2 = <<12+2=14>>14 canes.
Then, he uses his allowance to buy 1/7 as many candy canes as he was given: 14 / 7 = <<14/7=2>>2 canes.
The total number of candy canes is 14 + 2 = <<14+2=16>>16.
Since Andy gets a cavity for every 4 candy canes he eats, the number of cavities is 16 / 4 = <<16/4=4>>4.
#### 4""",

    3529: """Forty percent of 50 = 50 * 0.40 = <<50*0.40=20>>20 supporters for the first team.
Thirty-four percent of 50 = 50 * 0.34 = <<50*0.34=17>>17 supporters for the second team.
Total supporters = 20 + 17 = <<20+17=37>>37.
Thus, the number of people who did not support either team = 50 - 37 = <<50-37=13>>13.
#### 13""",

    4099: """Let R be the total ore output of the company.
Nickel is 10% of the total output. Since Big Dig mines 720 tons of nickel daily,
the total ore output is 720 / 0.10 = 7200 tons.
Copper constitutes the remainder, i.e., 100 - 10 - 60 = 30% of the output.
Thus, the amount of copper mined daily is 7200 * 0.30 = <<7200*0.30=2160>>2160 tons.
#### 2160""",

    4796: """There are 5 * 20 = <<5*20=100>>100 tanks in the yard.
Altogether, there are 100 + 20 = <<100+20=120>>120 trucks and tanks in the yard.
#### 120""",

    7182: """Let x be the original number.
2*x + 5 = 20 + x/2
Multiplying both sides by 2: 4*x + 10 = 40 + x
Rearranging: 3*x = 30
Thus, x = 10
#### 10""",

    7401: """The capacity of the first tank is 7000 gallons, and if it is filled up to 3/4 full, it carries 3/4*7000 = 5250 gallons.
When the second tank is filled up to 4/5 of its capacity, it carries 4/5*5000 = <<4/5*5000=4000>>4000 gallons.
The total amount of water in the first two tanks is 5250+4000 = <<5250+4000=9250>>9250 gallons.
If Mr. Finnegan fills the third tank up to half its capacity, the tank fills up with 1/2*3000 = <<1500=1500>>1500 gallons.
In total, the three tanks have 9250+1500 = <<9250+1500=10750>>10750 gallons of water.
#### 10750"""
}

# Load the original openai/gsm8k train split ("main" config); the corrections
# in `updates` are applied to this dataset.
dataset_openai = load_dataset("openai/gsm8k", "main", split="train")

# Define a single update function that updates an example if its index is in the updates dict.
def update_answers(example, idx):
    """Replace the "answer" of *example* when *idx* has an entry in `updates`.

    Rows whose index is not a key of `updates` are returned unchanged.
    """
    try:
        corrected_answer = updates[idx]
    except KeyError:
        # No correction registered for this row.
        return example
    example["answer"] = corrected_answer
    return example

# Apply the corrections row by row; with_indices=True makes map() call
# update_answers(example, idx) with each row's index.
dataset_openai = dataset_openai.map(update_answers, with_indices=True)

# Publish the corrected dataset to the Hugging Face Hub as a public repo.
# NOTE(review): `token` is an empty string here — presumably scrubbed before
# sharing; confirm how authentication is meant to be supplied.
dataset_openai.push_to_hub("niklasm222/gsm8k-prover", token="", private=False)


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/niklasm222/gsm8k-prover/commit/e174834fc8175348d7474e3576129758b9d02001', commit_message='Upload dataset', commit_description='', oid='e174834fc8175348d7474e3576129758b9d02001', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/niklasm222/gsm8k-prover', endpoint='https://huggingface.co', repo_type='dataset', repo_id='niklasm222/gsm8k-prover'), pr_revision=None, pr_num=None)

### Updates for indexes in gsm8k-prolog-prover

In [None]:
# Index: 1983
# Question: Janet buys 3 pounds of broccoli for $4 a pound, 3 oranges for $0.75 each, a cabbage for $3.75, a pound of bacon for $3, and two pounds of chicken for $3 a pound. What percentage of her grocery budget did she spend on meat, rounded to the nearest percent?
# Extended Value: 33.33333333333333
# OpenAI Value: 33.0
# Difference: 0.3333333333333286

# ----------------------
# Replacement Prolog program for the 'output' field of the sample at index 1983.
# It rounds the meat-cost percentage with round/1, matching the question's
# "rounded to the nearest percent".
# NOTE(review): percentage/1 uses plain `is` arithmetic on literals; the cost/3
# facts and both library imports are not referenced by it — confirm whether
# they are kept on purpose.
# ----------------------
NEW_OUTPUT_TEXT_1983 = """:- use_module(library(clpq)).
:- use_module(library(clpfd)).

cost(broccoli, 3, 4).
cost(oranges, 3, 0.75).
cost(cabbage, 1, 3.75).
cost(bacon, 1, 3).
cost(chicken, 2, 3).

percentage(Rounded_percentage) :-
    Total_vegetable_cost is 3 * 4 + 3 * 0.75 + 3.75,
    Total_meat_cost is 1 * 3 + 2 * 3,
    Total_cost is Total_vegetable_cost + Total_meat_cost,
    Percentage is (Total_meat_cost / Total_cost) * 100,
    Rounded_percentage is round(Percentage).

solve(Rounded_percentage) :-
    percentage(Rounded_percentage).
"""

# ----------------------
# Load the train split of the GSM8K-Prolog-Extended dataset
# ("niklasm222/gsm8k-prolog-extended"), the dataset the index-1983 fix is applied to.
# ----------------------
dataset_extended = load_dataset("niklasm222/gsm8k-prolog-extended", split="train")

# ----------------------
# Define a function to update the output for sample at index 1983.
# ----------------------
def update_output(example, idx):
    """Swap in the corrected Prolog program for the sample at index 1983.

    Besides "output", the columns derived from it are refreshed when present:
    "answer" (a copy of the program) and "numerical_result" (the stringified
    result of executing the program). Updating "output" alone would leave the
    old result in place, and the later comparison reads only
    "numerical_result".

    Args:
        example: One dataset row (a mapping of column name to value).
        idx: Position of the row in the dataset.

    Returns:
        The row, modified in place when ``idx == 1983`` and unchanged otherwise.
    """
    if idx == 1983:
        example["output"] = NEW_OUTPUT_TEXT_1983
        if "answer" in example:
            example["answer"] = NEW_OUTPUT_TEXT_1983
        if "numerical_result" in example:
            recomputed = execute_prolog_code(NEW_OUTPUT_TEXT_1983)
            # Keep the stored value if execution fails rather than
            # overwriting it with "None".
            if recomputed is not None:
                example["numerical_result"] = str(recomputed)
    return example

# ----------------------
# Apply update_output to every row; with_indices=True makes map() pass each
# row's index as the second argument.
# ----------------------
dataset_extended = dataset_extended.map(update_output, with_indices=True)

# Publish the updated dataset to the Hugging Face Hub as a public repo.
# NOTE(review): `token` is an empty string here — presumably scrubbed before
# sharing; confirm how authentication is meant to be supplied.
dataset_extended.push_to_hub("niklasm222/gsm8k-prolog-prover", token="", private=False)


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/niklasm222/gsm8k-prolog-prover/commit/9bc0e4716472a162edc52ef0038dddfddde153b5', commit_message='Upload dataset', commit_description='', oid='9bc0e4716472a162edc52ef0038dddfddde153b5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/niklasm222/gsm8k-prolog-prover', endpoint='https://huggingface.co', repo_type='dataset', repo_id='niklasm222/gsm8k-prolog-prover'), pr_revision=None, pr_num=None)

### Comparison after cleaning

In [None]:
import re
from datasets import load_dataset

# ----------------------
# Helper Function to extract the final numeric answer from the OpenAI GSM8K "answer" column.
# ----------------------
def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    # Remove commas to facilitate conversion to float.
    return text.split("####")[1].replace(",", "").strip()

# ----------------------
# Load both cleaned datasets
# ----------------------
# Cleaned Prolog dataset, niklasm222/gsm8k-prolog-prover (change split as needed)
dataset_extended = load_dataset("niklasm222/gsm8k-prolog-prover", split="train")
# Corrected GSM8K answers, niklasm222/gsm8k-prover (change split as needed)
dataset_openai = load_dataset("niklasm222/gsm8k-prover", split="train")

# Rows are paired by position, so only the common prefix of both datasets is compared.
total = min(len(dataset_extended), len(dataset_openai))

# ----------------------
# Compare numerical_result from gsm8k-prolog-prover with the numeric answer from gsm8k-prover
# ----------------------
matches = 0
skipped = 0       # Samples whose value could not be parsed on either side
differences = []  # Absolute difference for every sample that was compared
mismatches = []   # Details of every compared sample whose values differ

# zip() stops at the shorter dataset, i.e. after exactly `total` pairs.
for i, (ext_row, openai_row) in enumerate(zip(dataset_extended, dataset_openai)):
    # Value produced by executing the reference Prolog program.
    ext_val_str = ext_row["numerical_result"]
    try:
        ext_val = float(ext_val_str)
    except (TypeError, ValueError) as e:
        print(f"Skipping sample {i} from extended dataset due to conversion error: {e}")
        skipped += 1
        continue

    # Final numeric answer written after "####" in the GSM8K answer text.
    extracted_str = extract_hash_answer(openai_row["answer"])
    if extracted_str is None:
        print(f"Skipping sample {i} from openai dataset because '####' not found.")
        skipped += 1
        continue
    try:
        openai_val = float(extracted_str)
    except (TypeError, ValueError) as e:
        print(f"Skipping sample {i} from openai dataset due to conversion error: {e}")
        skipped += 1
        continue

    diff = abs(ext_val - openai_val)
    differences.append(diff)
    if diff < 1e-6:
        matches += 1
    else:
        # Record mismatched sample details: index, question, both values and their difference.
        mismatches.append({
            "index": i,
            "question": ext_row.get("input", "N/A"),
            "extended_value": ext_val,
            "openai_value": openai_val,
            "difference": diff,
        })

# Skipped samples stay in the denominator, so they count against the accuracy.
# Guard against empty datasets to avoid a ZeroDivisionError.
accuracy = matches / total * 100 if total else 0.0
print(f"Compared {total} samples. Match accuracy: {accuracy:.2f}%")
if skipped:
    print(f"Skipped {skipped} of {total} samples that could not be parsed.")

if mismatches:
    print("\nMismatched Samples:")
    for m in mismatches:
        print("-" * 40)
        print(f"Index: {m['index']}")
        print(f"Question: {m['question']}")
        print(f"Extended Value: {m['extended_value']}")
        print(f"OpenAI Value: {m['openai_value']}")
        print(f"Difference: {m['difference']}")


README.md:   0%|          | 0.00/536 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Compared 7473 samples. Match accuracy: 100.00%
