In [67]:
import pprint as pp
import os
from transformers import AutoTokenizer
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer


In [68]:
df_train = pd.read_csv("train.csv").fillna(-1).iloc[:500]
df_test = pd.read_csv("test.csv")
misconception_mapping = pd.read_csv("misconception_mapping.csv").MisconceptionName.values

In [89]:
TOKENIZER = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct")
misconception_mapping = pd.read_csv("misconception_mapping.csv").MisconceptionName.values

def fromat_prompt(row, targetCol):
    PROMPT  = """{Question} 
I believe {IncorrectAnswer}.
Explain why I am wrong.
Think about {ConstructName} in your answer. 
Why is {SubjectName} relevant?
Be specific in regards to {SubjectName}.
Explain why {IncorrectAnswer} is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Summarize my main misunderstanding in one sentance at the end.""" 
    
    return PROMPT.format(ConstructName=row["ConstructName"],
                     SubjectName=row["SubjectName"],
                     Question=row["QuestionText"],
                     IncorrectAnswer=row[f"Answer{targetCol}Text"],
                     CorrectAnswer=row[f"Answer{row.CorrectAnswer}Text"])

def prepare_features(df_questions):

    def apply_template(prompt):
        messages = [
            {
                "role": "user", 
                "content": prompt
            }
        ]
        text = TOKENIZER.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        return text
    
    features = {}
    for idx, row in df_questions.iterrows():
        for option in ["A", "B", "C", "D"]:
            if (row.CorrectAnswer!=option) & (row[f"Misconception{option}Id"]!=-1):
                features[f"{row.QuestionId}_{option}"] = {
                    "misconception_id": row[f"Misconception{option}Id"], 
                    "mismisconception_text": misconception_mapping[int(row[f"Misconception{option}Id"])],
                    "formatted_prompt": apply_template(fromat_prompt(row, option)),  
                    "plaintext_prompt": fromat_prompt(row, option),
                    "construct_name": row["ConstructName"],
                    "question_text": row["QuestionText"]
                    
                }
    return features

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [90]:
features = prepare_features(df_train)
ids = list(features.keys())

In [91]:
print(features[ids[0]]['plaintext_prompt'])

\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? 
I believe Does not need brackets.
Explain why I am wrong.
Think about Use the order of operations to carry out calculations involving powers in your answer. 
Why is BIDMAS relevant?
Be specific in regards to BIDMAS.
Explain why Does not need brackets is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Keep it short and concise. Try to explain the misunderstanding as tersely as possible.


In [92]:
print(features[ids[0]]['mismisconception_text'])

Confuses the order of operations, believes addition comes before multiplication 


In [93]:
print(features[ids[100]]['plaintext_prompt'])

A line has a gradient of \( 5 \)

A line parallel to this line would have a gradient of... 
I believe Not enough information.
Explain why I am wrong.
Think about Given a gradient, state the gradient of a line parallel to it in your answer. 
Why is Parallel Lines relevant?
Be specific in regards to Parallel Lines.
Explain why Not enough information is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Keep it short and concise. Try to explain the misunderstanding as tersely as possible.


In [94]:
print(features[ids[100]]['mismisconception_text'])

Does not know parallel lines have equal gradients 


In [95]:

#model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"

device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [96]:
print(features[ids[0]]['formatted_prompt'])

<|im_start|>system
Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
<|im_start|>user
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? 
I believe Does not need brackets.
Explain why I am wrong.
Think about Use the order of operations to carry out calculations involving powers in your answer. 
Why is BIDMAS relevant?
Be specific in regards to BIDMAS.
Explain why Does not need brackets is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Keep it short and concise. Try to explain the misunderstanding as tersely as possible.<|im_end|>
<|im_start|>assistant



In [97]:
df = pd.DataFrame(features)

In [98]:
df = pd.DataFrame.from_dict(features, orient='index').reset_index()


In [99]:
df.head(3)

Unnamed: 0,index,misconception_id,mismisconception_text,formatted_prompt,plaintext_prompt,construct_name,question_text
0,0_D,1672.0,"Confuses the order of operations, believes add...",<|im_start|>system\nPlease reason step by step...,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,Use the order of operations to carry out calcu...,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...
1,1_A,2142.0,Does not know that to factorise a quadratic ex...,<|im_start|>system\nPlease reason step by step...,"Simplify the following, if possible: \( \frac{...",Simplify an algebraic fraction by factorising ...,"Simplify the following, if possible: \( \frac{..."
2,1_B,143.0,Thinks that when you cancel identical terms fr...,<|im_start|>system\nPlease reason step by step...,"Simplify the following, if possible: \( \frac{...",Simplify an algebraic fraction by factorising ...,"Simplify the following, if possible: \( \frac{..."


In [100]:
print(first_row['formatted_prompt'])

<|im_start|>system
Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
<|im_start|>user
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? 
I believe Does not need brackets.
Explain why I am wrong.
Think about Use the order of operations to carry out calculations involving powers in your answer. 
Why is BIDMAS relevant?
Be specific in regards to BIDMAS.
Explain why Does not need brackets is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Summarize my main misunderstanding in one sentance at the end.<|im_end|>
<|im_start|>assistant



In [101]:
# Select the first row using iloc
first_row = df.iloc[0]

# Print the first row
print(first_row['formatted_prompt'])

<|im_start|>system
Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
<|im_start|>user
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? 
I believe Does not need brackets.
Explain why I am wrong.
Think about Use the order of operations to carry out calculations involving powers in your answer. 
Why is BIDMAS relevant?
Be specific in regards to BIDMAS.
Explain why Does not need brackets is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Keep it short and concise. Try to explain the misunderstanding as tersely as possible.<|im_end|>
<|im_start|>assistant



In [102]:
print(f"Response: {first_row['mismisconception_text']}\n")


Response: Confuses the order of operations, believes addition comes before multiplication 



In [103]:
type(features)

dict

In [105]:
device = "mps"  # Set to CPU since CUDA is not available


# Loop over each prompt and generate a response
for index, prompt in df.head(1).iterrows():
    inputs = tokenizer(prompt['formatted_prompt'], return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=500)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Print each response
    print("--------------------------------------------------------------------------------")
    print(f"Misconception: {prompt['mismisconception_text']}\n")
    print(f"Response: {response}\n")

--------------------------------------------------------------------------------
Misconception: Confuses the order of operations, believes addition comes before multiplication 

Response: system
Please reason step by step, and put your final answer within \boxed{}.
user
\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? 
I believe Does not need brackets.
Explain why I am wrong.
Think about Use the order of operations to carry out calculations involving powers in your answer. 
Why is BIDMAS relevant?
Be specific in regards to BIDMAS.
Explain why Does not need brackets is a misconception.
Summarize my main misunderstanding in one sentance at the end.
Keep it short and concise. Try to explain the misunderstanding as tersely as possible.
assistant
To solve the problem \(3 \times 22+4-5\) and determine where brackets are needed to make the answer equal to 13, let's follow the order of operations (BIDMAS/BODMAS rules), which stand for:

1. **B**rackets


In [None]:
df_misconception_mapping

In [None]:
print(three[ids[0]]['misconception_id'])

In [None]:
df = pd.read_parquet("submission.parquet")
misconception_mapping = pd.read_csv("misconception_mapping.csv").MisconceptionName.values

In [None]:
three[ids[0]]['misconception_id']

In [None]:
df_misconception_mapping[int(three[ids[0]]['misconception_id'])]

In [None]:
df_misconception_mapping.MisconceptionName.values[1672]

In [None]:
one, two, three = prepare_features(df_train)

In [None]:
import os
from transformers import AutoTokenizer
import pandas as pd

IS_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))

df_train = pd.read_csv("train.csv").fillna(-1).iloc[:500]
df_test = pd.read_csv("test.csv")
tokenizer = AutoTokenizer.from_pretrained("hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4")

PROMPT  = """Question: {Question}
Incorrect Answer: {IncorrectAnswer}
Correct Answer: {CorrectAnswer}
Construct Name: {ConstructName}
Subject Name: {SubjectName}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."""

def apply_template(row, tokenizer, targetCol):
    messages = [
        {
            "role": "user", 
            "content": PROMPT.format(
                 ConstructName=row["ConstructName"],
                 SubjectName=row["SubjectName"],
                 Question=row["QuestionText"],
                 IncorrectAnswer=row[f"Answer{targetCol}Text"],
                 CorrectAnswer=row[f"Answer{row.CorrectAnswer}Text"])
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text

df = {}
df_label = {}
for idx, row in df_train.iterrows():
    for option in ["A", "B", "C", "D"]:
        if (row.CorrectAnswer!=option) & (row[f"Misconception{option}Id"]!=-1):
            df[f"{row.QuestionId}_{option}"] = apply_template(row, tokenizer, option)
            df_label[f"{row.QuestionId}_{option}"] = [row[f"Misconception{option}Id"]]
df_label = pd.DataFrame([df_label]).T.reset_index()
df_label.columns = ["QuestionId_Answer", "MisconceptionId"]
df_label.to_parquet("label.parquet", index=False)


In [None]:
df['0_D']

In [None]:
df = pd.DataFrame([df]).T.reset_index()
df.columns = ["QuestionId_Answer", "text"]
df.to_parquet("submission.parquet", index=False)

In [None]:
def prepare_features(df_questions):
    PROMPT  = """Question: {Question}
    Incorrect Answer: {IncorrectAnswer}
    Correct Answer: {CorrectAnswer}
    Construct Name: {ConstructName}
    Subject Name: {SubjectName}
    
    Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.
    Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."""
    
    def apply_template(row, tokenizer, targetCol):
        messages = [
            {
                "role": "user", 
                "content": PROMPT.format(
                     ConstructName=row["ConstructName"],
                     SubjectName=row["SubjectName"],
                     Question=row["QuestionText"],
                     IncorrectAnswer=row[f"Answer{targetCol}Text"],
                     CorrectAnswer=row[f"Answer{row.CorrectAnswer}Text"])
            }
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        return text
    
    df_features = {}
    df_label = {}
    for idx, row in df_questions.iterrows():
        for option in ["A", "B", "C", "D"]:
            if (row.CorrectAnswer!=option) & (row[f"Misconception{option}Id"]!=-1):
                df_features[f"{row.QuestionId}_{option}"] = apply_template(row, tokenizer, option)
                df_label[f"{row.QuestionId}_{option}"] = [row[f"Misconception{option}Id"]]
    df_label = pd.DataFrame([df_label]).T.reset_index()
    df_label.columns = ["QuestionId_Answer", "MisconceptionId"]
    df_label.to_parquet("label.parquet", index=False)
    df_features = pd.DataFrame([df_features]).T.reset_index()
    df_features.columns = ["QuestionId_Answer", "text"]
    df_features.to_parquet("submission.parquet", index=False)
    return df_features, df_label

df_features, df_label = prepare_features(df_train)

In [5]:
df_features.head(2)

Unnamed: 0,QuestionId_Answer,text
0,0_D,<|begin_of_text|><|start_header_id|>system<|en...
1,1_A,<|begin_of_text|><|start_header_id|>system<|en...


In [6]:
df_label.head(2)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,0_D,[1672.0]
1,1_A,[2142.0]


In [7]:
df_features['text'][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n    Incorrect Answer: Does not need brackets\n    Correct Answer: \\( 3 \\times(2+4)-5 \\)\n    Construct Name: Use the order of operations to carry out calculations involving powers\n    Subject Name: BIDMAS\n    \n    Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.\n    Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [None]:
Question: 
\\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n    Incorrect Answer: Does not need brackets\n    Correct Answer: \\( 3 \\times(2+4)-5 \\)\n    Construct Name: Use the order of operations to carry out calculations involving powers\n    Subject Name: BIDMAS\n    \n    
Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.\n    
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'






