In [2]:
import pandas as pd
import numpy as np
from eval.math_equivalence import is_equiv

In [3]:
# Load the generations (one JSON record per line) and flag, row by row,
# whether the extracted prediction is judged equivalent to the reference answer.
df = pd.read_json("MATH_hard_train_proxy_tuning.jsonl", lines=True)
df["is_equiv"] = [
    is_equiv(reference, predicted)
    for reference, predicted in zip(df["answer"], df["prediction"])
]
df

Unnamed: 0,question,answer,model_output,prediction,is_equiv
0,A sequence of ten $0$s and/or $1$s is randomly...,73,"Let $n$ be a positive integer, and let $a_0, ...",73,True
1,When $0.42\overline{153}$ is expressed as a fr...,42111,"Hmm, okay, so I need to convert a decimal tha...",42111,True
2,Square $BCFE$ is inscribed in right triangle $...,1624,"Alright, let me try to figure out this square...",58,False
3,"How many integers, $x$, satisfy $|5x - 3| \le 7$?",3,"Okay, so I need to find all integer values of...",3,True
4,Define \[f(x) = \frac{1}{x + \frac{1}{x}}\]and...,\sqrt5,Put your final final in reduced common common...,2,False
...,...,...,...,...,...
6266,The diameter of a circle is 16. By what number...,4,"Hmm, okay, let me break this down step by ste...",4,True
6267,Three aluminum cans can be recycled to make a ...,121,Let me think hard. Stream of consciousness: F...,121,True
6268,"Find $\frac{1}{a-1}+\frac{1}{b-1},$ where $a$ ...",-1,"\nAnswer:\nAlright, let me try to solve this p...",-1,True
6269,The graph of the quadratic $y = ax^2 + bx + c$...,3,To determine the coefficients of the quadrati...,1,False


In [4]:
def truncate_to_last_answer(model_output, answer):
    r"""Cut ``model_output`` immediately after its last ``\boxed{<answer>}``.

    Args:
        model_output: Generated text to search.
        answer: Value expected inside the box; it is converted with ``str()``.

    Returns:
        The prefix of ``model_output`` up to and including the final
        ``\boxed{<answer>}`` occurrence, or ``None`` when that marker does
        not occur anywhere in the text.

    Raises:
        ValueError: If either argument is missing according to ``pd.isna``.
    """
    if pd.isna(model_output) or pd.isna(answer):
        raise ValueError("model_output and answer cannot be NaN")

    # The marker always starts with a backslash and ends with a brace, so
    # there is never surrounding whitespace to trim.
    boxed = "\\boxed{" + str(answer) + "}"
    cut = model_output.rfind(boxed)
    return None if cut == -1 else model_output[:cut + len(boxed)]


def dedup_repeated_ngram(text, min_n=3, max_n=20):
    """Collapse a trailing run of back-to-back repeated n-grams to one copy.

    The text is split on single spaces. For every n from ``min_n`` to
    ``max_n`` the last n tokens are taken as a pattern, and the scan walks
    backwards in strides of n for as long as each window equals that pattern.
    The text is then cut right after the earliest matching window; across all
    n, the shortest resulting prefix wins.

    Args:
        text: String to clean. Missing values (per ``pd.isna``) are returned
            unchanged.
        min_n: Smallest n-gram length to try. Values below 1 are treated as 1.
        max_n: Largest n-gram length to try (inclusive).

    Returns:
        ``text`` unchanged when no trailing repetition is found, otherwise the
        shortened prefix. A one-line notice is printed whenever tokens are
        dropped.
    """
    if pd.isna(text):
        return text
    tokens = text.split(" ")
    total = len(tokens)

    # An n-gram needs at least one token. Previously n == 0 made the backward
    # scan spin forever (the cursor never moved) and n < 0 tripped an assert.
    smallest_n = max(min_n, 1)

    best_end_idx = None
    for n in range(smallest_n, max_n + 1):
        if total < n:
            break
        tail = tokens[total - n:]
        # Step back one window at a time while the preceding window still
        # equals the tail. Tokens never contain a space, so comparing the
        # slices is equivalent to comparing their space-joined strings.
        start = total - n
        while start - n >= 0 and tokens[start - n:start] == tail:
            start -= n
        end = start + n
        if best_end_idx is None or end < best_end_idx:
            best_end_idx = end

    # best_end_idx == total means only the tail itself matched: nothing to cut.
    if best_end_idx is None or best_end_idx == total:
        return text

    print(f"Truncated from {total} to {best_end_idx} tokens.")
    return " ".join(tokens[:best_end_idx])


# Smoke test: a short prefix followed by the same sentence nine times in a row.
dedup_repeated_ngram("hi " + " ".join(["you are my friend."] * 9))

Truncated from 37 to 5 tokens.


'hi you are my friend.'

In [6]:
# Clean every generation, keep the rows that end up with a final output,
# tag a train/dev split, and write both the full and the filtered tables.
DEV_SIZE = 500  # the last DEV_SIZE rows of the filtered table become 'dev'

# Cut each output right after the last \boxed{<prediction>}; None when absent.
df["model_output_truncated"] = df.apply(
    lambda row: truncate_to_last_answer(row["model_output"], row["prediction"]),
    axis=1
)

# Collapse trailing back-to-back repetition in the truncated text.
df["model_output_truncated_dedup"] = df["model_output_truncated"].apply(
    lambda x: dedup_repeated_ngram(x, min_n=3, max_n=20)
)

# Only rows whose `is_equiv` flag is truthy receive a final output.
df["model_output_final"] = df.apply(
    lambda row: row["model_output_truncated_dedup"] if row["is_equiv"] else None,
    axis=1
)

df.to_json("MATH_hard_train_proxy_tuning_cleaned_full.jsonl", lines=True, orient="records")

# Use notna() rather than an `is not None` check: depending on dtype inference
# pandas may store the missing entries as NaN/NA instead of None, and an
# identity test against None would then silently keep every row.
has_final_output = df["model_output_final"].notna()
df_filtered = (
    df.loc[has_final_output]
    .drop(columns=["model_output_truncated", "model_output_truncated_dedup"])
    .reset_index(drop=True)
)

# Positional split: after the index reset, the trailing DEV_SIZE rows are 'dev'.
df_filtered['split'] = np.where(df_filtered.index >= len(df_filtered) - DEV_SIZE, 'dev', 'train')
df_filtered.to_json("MATH_hard_train_proxy_tuning_cleaned.jsonl", lines=True, orient="records")
df_filtered

Truncated from 3984 to 146 tokens.
Truncated from 1302 to 743 tokens.
Truncated from 3851 to 3027 tokens.


Unnamed: 0,question,answer,model_output,prediction,is_equiv,model_output_final,split
0,A sequence of ten $0$s and/or $1$s is randomly...,73,"Let $n$ be a positive integer, and let $a_0, ...",73,True,"Let $n$ be a positive integer, and let $a_0, ...",train
1,When $0.42\overline{153}$ is expressed as a fr...,42111,"Hmm, okay, so I need to convert a decimal tha...",42111,True,"Hmm, okay, so I need to convert a decimal tha...",train
2,"How many integers, $x$, satisfy $|5x - 3| \le 7$?",3,"Okay, so I need to find all integer values of...",3,True,"Okay, so I need to find all integer values of...",train
3,How many positive three-digit integers are con...,48,"\n\nAlright, let's figure out this problem ste...",48,True,"\n\nAlright, let's figure out this problem ste...",train
4,What is the area enclosed by the graph of $|3x...,24,"To get started, divide both sides of the equa...",24,True,"To get started, divide both sides of the equa...",train
...,...,...,...,...,...,...,...
3640,What is the remainder when $5^{207}$ is divide...,6,"Answer: $\boxed{7}$\n\nWait, hold on! That an...",6,True,"Answer: $\boxed{7}$\n\nWait, hold on! That an...",dev
3641,The diameter of a circle is 16. By what number...,4,"Hmm, okay, let me break this down step by ste...",4,True,"Hmm, okay, let me break this down step by ste...",dev
3642,Three aluminum cans can be recycled to make a ...,121,Let me think hard. Stream of consciousness: F...,121,True,Let me think hard. Stream of consciousness: F...,dev
3643,"Find $\frac{1}{a-1}+\frac{1}{b-1},$ where $a$ ...",-1,"\nAnswer:\nAlright, let me try to solve this p...",-1,True,"\nAnswer:\nAlright, let me try to solve this p...",dev


In [5]:
# def dedup_repeated_ngram(text, ngram_len=10):
#     if pd.isna(text):
#         return text
#     tokens = text.split(" ")
#     if len(tokens) < ngram_len:
#         return text  # too short; return as-is
    
#     ngrams = [' '.join(tokens[i:i+ngram_len]) for i in range(len(tokens) - ngram_len + 1)]
#     if not ngrams:
#         return text
    
#     last_ngram = ngrams[-1]
#     repeat_start = len(tokens) - ngram_len
#     while repeat_start - ngram_len >= 0:
#         prev_ngram = ' '.join(tokens[repeat_start - ngram_len:repeat_start])
#         if prev_ngram != last_ngram:
#             break
#         repeat_start -= ngram_len
    
#     clean_tokens = tokens[:repeat_start + ngram_len]
#     if len(clean_tokens) == len(tokens):
#         assert ' '.join(clean_tokens) == text
#         return text
#     else:
#         print(f"Truncated from {len(tokens)} to {len(clean_tokens)} tokens.")
#         return ' '.join(clean_tokens)