In [1]:
!pip install groq datasets



In [2]:
# Load dataset
from datasets import load_dataset
import random
from datasets import Dataset
from groq import Groq

ds = load_dataset("openai/gsm8k", "main")

In [3]:
# Print format
ds["train"][:5]

{'question': ['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
  'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',
  'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
  'James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?'],
 'answer': ['Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips alt

In [4]:
df = ds['train'].to_pandas()[["question", "answer"]]


df.to_csv("gsm8k_train_questions_answers.csv", index=False)

In [5]:
df.head()

Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...


In [6]:
import pandas as pd
import random
from groq import Groq

In [None]:
def mess_up_answer(question: str, original_answer: str) -> str:
    """
    Uses the LLM to create a new answer by introducing random calculation mistakes,
    returning only the messed-up answer text (no extra disclaimers or explanations).
    """
    client = Groq(api_key='')

    system_prompt = (
        "You are a helpful assistant, but you must occasionally introduce random arithmetic mistakes. "
        "Your entire response must be ONLY the incorrect or messed-up answer—nothing else. "
        "Do not add extra explanations, disclaimers, or phrases like 'here is the messed-up answer.' "
        "Just provide the final text of the messed-up solution."
    )

    user_prompt = (
        f"Question: {question}\n\n"
        f"Original Answer:\n{original_answer}\n\n"
        "Rewrite the answer so it contains 1-2 calculation mistakes. "
        "Do NOT mention you are inserting mistakes or provide any disclaimers. "
        "Return only the messed-up answer text."
    )

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        model="llama3-70b-8192",  # or your preferred model
        temperature=0.7,
        max_completion_tokens=512,
        top_p=1,
        stop=None,
        stream=False,
    )

    messed_answer = response.choices[0].message.content.strip()
    return messed_answer


In [8]:
# Sample test

messed_up = mess_up_answer("Ann can skate 6 miles an hour. Her friend Glenda can skate 8 miles an hour. If they start in the same place and skate in straight lines in opposite directions for 3 hours, how many miles apart do they end up?","""First find how far Glenda goes in 3 hours by multiplying her speed by the number of hours she travels: 3 hours * 8 miles/hour = <<3*8=24>>24 miles
Then do the same thing for Ann: 3 hours * 6 miles/hour = <<3*6=18>>18 miles
Now add the number of miles both people skated to find the total distance between them: 18 miles + 24 miles = <<18+24=42>>42 miles #### 42""" )

In [9]:
messed_up

'First find how far Glenda goes in 3 hours by multiplying her speed by the number of hours she travels: 3 hours * 8 miles/hour = 27 miles\nThen do the same thing for Ann: 3 hours * 6 miles/hour = 17 miles\nNow add the number of miles both people skated to find the total distance between them: 17 miles + 27 miles = 51 miles'

In [10]:
# Shuffling the dataset
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Ensure the column exists

per_20 = int(len(df_shuffled) * 0.2)

if "changed_answer" not in df_shuffled.columns:
    df_shuffled["changed_answer"] = None

# Iterate row by row
for i in range(per_20):
    # If 'changed_answer' is already set for this row, skip it
    if df_shuffled.at[i, "changed_answer"] is not None:
        print(f"Row {i} skipped (already has changed_answer).")
        continue

    question = df_shuffled.at[i, "question"]
    original_answer = df_shuffled.at[i, "answer"]

    # Call the LLM exactly once for each row
    messed_up = mess_up_answer(question, original_answer)

    # Store the messed-up answer in the DataFrame
    df_shuffled.at[i, "changed_answer"] = messed_up

    # Save the entire DataFrame to CSV after each update
    df_shuffled.to_csv('ModifiedDataset.csv', index=False)
    print(f"Row {i} processed and saved.")

Row 0 skipped (already has changed_answer).
Row 1 skipped (already has changed_answer).
Row 2 skipped (already has changed_answer).
Row 3 skipped (already has changed_answer).
Row 4 skipped (already has changed_answer).
Row 5 skipped (already has changed_answer).
Row 6 skipped (already has changed_answer).
Row 7 skipped (already has changed_answer).
Row 8 skipped (already has changed_answer).
Row 9 skipped (already has changed_answer).
Row 10 skipped (already has changed_answer).
Row 11 skipped (already has changed_answer).
Row 12 skipped (already has changed_answer).
Row 13 skipped (already has changed_answer).
Row 14 skipped (already has changed_answer).
Row 15 skipped (already has changed_answer).
Row 16 skipped (already has changed_answer).
Row 17 skipped (already has changed_answer).
Row 18 skipped (already has changed_answer).
Row 19 skipped (already has changed_answer).
Row 20 skipped (already has changed_answer).
Row 21 skipped (already has changed_answer).
Row 22 skipped (alre

In [63]:
def omit_steps_answer(question: str, original_answer: str) -> str:
    """
    Uses the LLM to rewrite a correct answer but omit or skip
    several important steps in the explanation. The final numeric
    result or conclusion should remain correct. Only return
    the partial solution text, with no extra disclaimers.
    """
    client = Groq(api_key='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')

    system_prompt = (
        "You are a helpful assistant. Rewrite the given correct answer by omitting "
        "or glossing over a few critical steps in the reasoning. The final numeric "
        "result or conclusion must remain correct, but the explanation should be partial. "
        "Your response must ONLY be the incomplete solution—no additional disclaimers or notes."
    )

    user_prompt = (
        f"Question: {question}\n\n"
        f"Original Correct Answer:\n{original_answer}\n\n"
        "Rewrite the answer so that it omits some key steps in the middle, but still ends "
        "with the correct final result. Do NOT mention that you omitted steps or provide "
        "any disclaimers. Return only the incomplete explanation text."
    )

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        model="llama-3.3-70b-versatile",  # adjust to your preferred model
        temperature=0.7,
        max_completion_tokens=512,
        top_p=1,
        stop=None,
        stream=False,
    )

    omitted_steps_answer = response.choices[0].message.content.strip()
    return omitted_steps_answer


In [20]:
# Ensure the column exists

# Calculate 20% of the total rows
per_20 = int(len(df_shuffled) * 0.2)

# Make sure the column exists
if "changed_answer" not in df_shuffled.columns:
    df_shuffled["changed_answer"] = None

# Define your start and end indices for the *second* 20% block
start_idx = per_20         # e.g. 1494 if your first block ended at 1493
end_idx = 2 * per_20       # e.g. 1494 * 2 = 2988, but range() stops at end_idx - 1

# Iterate row by row over [per_20 .. 2*per_20-1]
for i in range(start_idx, end_idx):
    # Skip if 'changed_answer' is already set
    if df_shuffled.at[i, "changed_answer"] is not None:
        print(f"Row {i} skipped (already has changed_answer).")
        continue

    question = df_shuffled.at[i, "question"]
    original_answer = df_shuffled.at[i, "answer"]

    # Call your function (e.g., omit_steps_answer or mess_up_answer)
    omitted_answer = omit_steps_answer(question, original_answer)

    # Store the new answer
    df_shuffled.at[i, "changed_answer"] = omitted_answer

    # Save the DataFrame each time
    df_shuffled.to_csv('ModifiedDataset.csv', index=False)
    print(f"Row {i} processed and saved.")


Row 1494 skipped (already has changed_answer).
Row 1495 skipped (already has changed_answer).
Row 1496 skipped (already has changed_answer).
Row 1497 skipped (already has changed_answer).
Row 1498 skipped (already has changed_answer).
Row 1499 skipped (already has changed_answer).
Row 1500 skipped (already has changed_answer).
Row 1501 skipped (already has changed_answer).
Row 1502 skipped (already has changed_answer).
Row 1503 skipped (already has changed_answer).
Row 1504 skipped (already has changed_answer).
Row 1505 skipped (already has changed_answer).
Row 1506 skipped (already has changed_answer).
Row 1507 skipped (already has changed_answer).
Row 1508 skipped (already has changed_answer).
Row 1509 skipped (already has changed_answer).
Row 1510 skipped (already has changed_answer).
Row 1511 skipped (already has changed_answer).
Row 1512 skipped (already has changed_answer).
Row 1513 skipped (already has changed_answer).
Row 1514 skipped (already has changed_answer).
Row 1515 skip

In [86]:
def grade_changed_answer(question: str, original_answer: str, changed_answer: str) -> str:
    """
    Compares original_answer with changed_answer using an LLM to see what mistakes
    or omissions are present. Provides point deductions and a final score out of 10,
    with only minimal feedback.

    Rules:
      - If changed_answer is empty or null => "Good answer, covers all points" + "Score: 10/10".
      - If the final answer is correct but steps are missing => mention missing steps
        without deducting points.
      - If there are real mistakes (incorrect steps or final result) => mention them,
        deduct points, and give final score.
      - Output only short bullet points / lines for errors or observations and
        the final "Score: X/10".
      - No extra disclaimers, intros, or text beyond this.
    """
    import pandas as pd
    from groq import Groq

    # 1) If no changed_answer is provided, return a perfect score immediately.
    if not changed_answer or pd.isnull(changed_answer):
        return "Good answer, covers all points.\nScore: 10/10"

    client = Groq(api_key='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')

    system_prompt = (
        "You are a strict but succinct grader. Compare the original correct answer with the changed answer. "
        "Only deduct points if the changed answer has incorrect arithmetic or conclusions and make the point deduction, Be reasonable with the deduction and do not remove too many points for small mistakes. "
        "If the final result is correct but some steps are omitted, do not deduct points—just note the missing steps. "
        "Provide short bullet points or lines for each observation, then a final line: 'Score: X/10'. "
        "No disclaimers, no introductions, no additional commentary."
    )

    user_prompt = (
        f"Question: {question}\n\n"
        f"Original Answer:\n{original_answer}\n\n"
        f"Changed Answer:\n{changed_answer}\n\n"
        "Please produce only short bullet-style notes or deductions, then a final line: 'Score: X/10'. "
        "If final answer is correct but some steps are omitted, do not deduct points—just note it."
    )

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        model="llama3-70b-8192",  # Or your preferred model
        temperature=0.7,
        max_completion_tokens=512,
        top_p=1,
        stop=None,
        stream=False,
    )

    feedback = response.choices[0].message.content.strip()
    return feedback


In [78]:
# Re-load the dataset
df_shuffled = pd.read_csv('ModifiedDataset.csv')

df_shuffled.head()

Unnamed: 0,question,answer,changed_answer
0,In Professor Plum's biology class there are 40...,"We start with the initial numbers of students,...","We start with the initial numbers of students,..."
1,Diane bought twenty more apples than Cecile. I...,Diane bought 15 + 20 = <<15+20=35>>35 apples.\...,"Diane bought 15 + 20 = 32 apples.\nTherefore, ..."
2,Ann can skate 6 miles an hour. Her friend Glen...,First find how far Glenda goes in 3 hours by m...,First find how far Glenda goes in 3 hours by m...
3,"Running for 2 hours, Jonah burnt 30 calories e...","When Jonah ran for 2 hours, burning 30 calorie...","When Jonah ran for 2 hours, burning 30 calorie..."
4,The city of Richmond has 1000 more people than...,Victoria has 3000-1000=<<3000-1000=2000>>2000 ...,Victoria has 3000-1000=1900 people.\nBeacon ha...


In [80]:
df_shuffled.drop(columns=['grading'])

KeyError: "['grading'] not found in axis"

In [82]:
df_shuffled.columns

Index(['question', 'answer', 'changed_answer'], dtype='object')

In [50]:
feedback = grade_changed_answer(df_shuffled.at[0,'question'],df_shuffled.at[0,'answer'],df_shuffled.at[0,'changed_answer'])

In [51]:
feedback

'• Incorrect calculation: 40 * 0.8 = 36 (should be 32)\n• Incorrect percentage: multiplied 36 by 0.27 instead of 0.25\n• Final answer is incorrect: 10 instead of 8\nScore: 2/10'

In [43]:
feedback = grade_changed_answer(df_shuffled.at[2100,'question'],df_shuffled.at[2100,'answer'],df_shuffled.at[2100,'changed_answer'])

feedback

'• Omitted step: Number of bouquets is not explicitly stated.\nScore: 10/10'

In [44]:
feedback = grade_changed_answer(df_shuffled.at[2200,'question'],df_shuffled.at[2200,'answer'],df_shuffled.at[2200,'changed_answer'])

feedback

'• Omitted step: calculation of 15% increase in November\n• Omitted step: calculation of December visitors\nScore: 10/10'

In [52]:
feedback = grade_changed_answer(df_shuffled.at[3000,'question'],df_shuffled.at[3000,'answer'],df_shuffled.at[3000,'changed_answer'])

feedback

'Good answer, covers all points.\nScore: 10/10'

In [89]:
# 1) Ensure there's a 'grading' column
if "grading" not in df_shuffled.columns:
    df_shuffled["grading"] = None

# 2) Loop over *all* rows in df_shuffled
for i in range(len(df_shuffled)):
    # If 'grading' is already set for this row, skip it
    if df_shuffled.at[i, "grading"] is not None:
        print(f"Row {i} skipped (already has grading).")
        continue

    question = df_shuffled.at[i, "question"]
    original_answer = df_shuffled.at[i, "answer"]
    changed_answer = df_shuffled.at[i, "changed_answer"]

    # 3) Call your grading function
    feedback = grade_changed_answer(question, original_answer, changed_answer)

    # 4) Store the grading feedback
    df_shuffled.at[i, "grading"] = feedback

    # 5) Save to CSV each iteration (for partial progress)
    df_shuffled.to_csv("GradedDataset.csv", index=False)
    print(f"Row {i} processed and saved.")


Row 0 skipped (already has grading).
Row 1 skipped (already has grading).
Row 2 skipped (already has grading).
Row 3 skipped (already has grading).
Row 4 skipped (already has grading).
Row 5 skipped (already has grading).
Row 6 skipped (already has grading).
Row 7 skipped (already has grading).
Row 8 skipped (already has grading).
Row 9 skipped (already has grading).
Row 10 skipped (already has grading).
Row 11 skipped (already has grading).
Row 12 skipped (already has grading).
Row 13 skipped (already has grading).
Row 14 skipped (already has grading).
Row 15 skipped (already has grading).
Row 16 skipped (already has grading).
Row 17 skipped (already has grading).
Row 18 skipped (already has grading).
Row 19 skipped (already has grading).
Row 20 skipped (already has grading).
Row 21 skipped (already has grading).
Row 22 skipped (already has grading).
Row 23 skipped (already has grading).
Row 24 skipped (already has grading).
Row 25 skipped (already has grading).
Row 26 skipped (alread

In [91]:
def make_qa(row):
    """
    Build the 'Q) ...\nA) ...' string depending on whether changed_answer is present.
    """
    question = row["question"]
    changed = row["changed_answer"]
    original = row["answer"]
    
    # If there's a changed_answer, use that; otherwise, use the original answer
    if pd.notna(changed):  # checks for not None/NaN
        return f"Q) {question}\nA) {changed}"
    else:
        return f"Q) {question}\nA) {original}"

In [92]:
df_shuffled["QA"] = df_shuffled.apply(make_qa, axis=1)

# 3) Build the final 2-column DataFrame
new_df = df_shuffled[["QA", "grading"]]

# 4) Save to a new CSV
new_df.to_csv("qa_and_grading_only.csv", index=False)

print("Done! Saved 2-column dataset to 'qa_and_grading_only.csv'.")

Done! Saved 2-column dataset to 'qa_and_grading_only.csv'.
