# フィルタリングしていないjsonlの作成

jsonでの形式
```json
{
    "id": 1,
    "question": "What is 1+1?",
    "output": "<think>Okay, I need to calculate the sum of 1 and 1.</think>2",
    "answer": "2",
    "domain": "math",
    "source": "numina_math"
}
```

## 共通モジュール

In [1]:
from datasets import load_dataset
import random
import re
from pydantic import BaseModel

In [2]:
class NoFilterJsonModel(BaseModel):
    id: int
    question: str
    output: str
    answer: str
    domain: str
    source: str


def extract_latex(text, pattern):
    last_start = text.rfind(pattern+"{")
    if last_start == -1:
        return None
    i = last_start + len(pattern+"{")
    count = 1
    start = i
    while i < len(text):
        if text[i] == '{':
            count += 1
        elif text[i] == '}':
            count -= 1
            if count == 0:
                return text[start:i]
        i += 1
    return None

## Math

In [3]:
replace_dict_first = {
    " ": "",
    "\n": "",
    "\r": "",
    "\\,": "",
    "\\>": "",
    "\\:": "",
    "\\;": "",
    "\\qquad": "",
    "\\quad": "",

    "\\cdot": "",
    "\\times": "",
    "\\dfrac": "\\frac",
    "\\leqslant": "\\leq",
    "\\leqq": "\\leq",
    "\\geqslant": "\\geq",
    "\\geqq": "\\geq",
    "\\left": "",
    "\\right": "",
    "^\\circ": "",
    "^{\\circ}": "",

    "¥": "",
    "$": "",
    "%": "",
    "\\": "",
}

unit_list = ["text", "textbf", "texttt", "mathrm", "mathbf", "mathbb", "mathcal"]

replace_dict_second = {
    "textbf": "",
    "texttt": "",
    "text": "",
    "mathrm": "",
    "mathbf": "",
    "mathbb": "",
    "mathcal": "",
    "{": "",
    "}": "",
    "(": "",
    ")": "",
    "[": "",
    "]": "",
    ";": "",
    ",": "",
    ".": "",
    "~": ""
}


def extract_between_markers(s, marker_groups):
    result = {}
    positions = []
    # 各マーカーの位置を取得
    for key, markers in marker_groups.items():
        for marker in markers:
            pos = s.rfind(marker)
            if pos != -1:
                positions.append((pos, key, marker))
                break
    positions.sort()
    # 各区間の文字列を抽出
    for i in range(len(positions)):
        start = positions[i][0] + len(positions[i][2])
        if i < len(positions) - 1:
            end = positions[i + 1][0]
            result[positions[i][1]] = s[start:end]
        else:
            result[positions[i][1]] = s[start:]
    return result


def calculate(text):
    text_replace = text.replace("\\dfrac", "\\frac")
    frac_match = re.match(r'\\frac\{(-?\d+)\}\{(-?\d+)\}', text_replace)
    try:
      if frac_match:
          numerator = int(frac_match.group(1))
          denominator = int(frac_match.group(2))
          return numerator / denominator
      elif ":" in text:
          numerator = int(text.split(":")[0])
          denominator = int(text.split(":")[1])
          return numerator / denominator
      else:
          return float(text)
    except:
        return None


def math_judge(problem, deepseek_answer, ground_truth_answer):
    # answerのいずれかがNoneの場合、False
    if deepseek_answer is None or ground_truth_answer is None:
        return False

    # 分数を実際に計算
    deepseek_calculate = calculate(deepseek_answer)
    ground_truth_calculate = calculate(ground_truth_answer)
    if deepseek_calculate and ground_truth_calculate and abs(deepseek_calculate - ground_truth_calculate) < 1e-3:
        return True

    # latex表現を置換(一段階目)
    deepseek_replace = deepseek_answer
    ground_truth_replace = ground_truth_answer
    for old, new in replace_dict_first.items():
        deepseek_replace = deepseek_replace.replace(old, new)
        ground_truth_replace = ground_truth_replace.replace(old, new)

    # 単位を除外
    for pattern in unit_list:
        deepseek_unit = extract_latex(deepseek_replace, pattern)
        ground_truth_unit = extract_latex(ground_truth_replace, pattern)
        if deepseek_unit is not None:
            deepseek_unit_with_pattern = pattern + "{" + deepseek_unit + "}"
            # deepseek_unit_with_patternの次の文字が"^"か判定（ex. \text{cm}^2)
            idx = deepseek_replace.find(deepseek_unit_with_pattern)
            if idx != -1 and idx + len(deepseek_unit_with_pattern) < len(deepseek_replace):
                if deepseek_replace[idx + len(deepseek_unit_with_pattern)] == "^":
                    next_char = deepseek_replace[idx + len(deepseek_unit_with_pattern) + 1]
                    deepseek_unit_with_pattern += "^" + next_char
            deepseek_no_unit = deepseek_replace.replace(deepseek_unit_with_pattern, "")
            if deepseek_no_unit == ground_truth_replace:
                return True
        if ground_truth_unit is not None:
            ground_truth_unit_with_pattern = pattern + "{" + ground_truth_unit + "}"
            idx = ground_truth_replace.find(ground_truth_unit_with_pattern)
            if idx != -1 and idx + len(ground_truth_unit_with_pattern) < len(ground_truth_replace):
                if ground_truth_replace[idx + len(ground_truth_unit_with_pattern)] == "^":
                    next_char = ground_truth_replace[idx + len(ground_truth_unit_with_pattern) + 1]
                    ground_truth_unit_with_pattern += "^" + next_char
            ground_truth_no_unit =  ground_truth_replace.replace(ground_truth_unit_with_pattern, "")
            if deepseek_replace == ground_truth_no_unit:
                return True

    # latex表現を置換(二段階目)
    for old, new in replace_dict_second.items():
        deepseek_replace = deepseek_replace.replace(old, new)
        ground_truth_replace = ground_truth_replace.replace(old, new)
    if deepseek_replace == ground_truth_replace:
        return True

    # 計算前の数式が "=" の前に示されているものを削除
    if deepseek_replace == ground_truth_replace.split("=")[-1] or deepseek_replace.split("=")[-1] == ground_truth_replace:
        return True

    # 選択肢の場合の正誤判定
    problem_replace = problem
    for old, new in replace_dict_first.items():
        problem_replace = problem_replace.replace(old, new)
    marker_groups = {
        "A": ["(A)", "A)", "A.", "(a)", "a)", "a."],
        "B": ["(B)", "B)", "B.", "(b)", "b)", "b."],
        "C": ["(C)", "C)", "C.", "(c)", "c)", "c."],
        "D": ["(D)", "D)", "D.", "(d)", "d)", "d."],
        "E": ["(E)", "E)", "E.", "(e)", "e)", "e."],
        "F": ["(F)", "F)", "F.", "(f)", "f)", "f."],
        "G": ["(G)", "G)", "G.", "(g)", "g)", "g."],
        "H": ["(H)", "H)", "H.", "(h)", "h)", "h."],
    }
    option_dict = extract_between_markers(problem_replace, marker_groups)
    for key, value in option_dict.items():
        value_replace = value
        for old, new in replace_dict_second.items():
            value_replace = value_replace.replace(old, new)
        option_dict[key] = value_replace
    if len(deepseek_replace) > 0:
        deepseek_option = deepseek_replace[0].upper()
        if deepseek_option in option_dict and (deepseek_option==ground_truth_replace.upper() or option_dict[deepseek_option]==ground_truth_replace):
            return True
    if len(ground_truth_replace) > 0:
        ground_truth_option = ground_truth_replace[0].upper()
        if ground_truth_option in option_dict and (ground_truth_option==deepseek_replace.upper() or option_dict[ground_truth_option]==deepseek_replace):
            return True

    # bool値が両方に含まれているか
    true_words = {"true", "yes"}
    false_words = {"false", "no"}
    deepseek_lower = deepseek_replace.lower()
    ground_truth_lower = ground_truth_replace.lower()
    if any(word in deepseek_lower for word in true_words) and any(word in ground_truth_lower for word in true_words):
        return True
    if any(word in deepseek_lower for word in false_words) and any(word in ground_truth_lower for word in false_words):
        return True

    return False


def make_math_json_list(train_dataset, start_id):
    id = start_id
    no_filter_json_list = []
    for train in train_dataset:
        problem = train["problem"]

        # 証明問題は除外
        if any(word in problem.lower() for word in ["prove", "proof", "show"]):
            continue

        # \boxed{}内の文字列を取得
        deepseek_answer = extract_latex(train["deepseek_solution"], "\\boxed")
        ground_truth_answer = extract_latex(train["ground_truth_solution"], "\\boxed")

        # \boxed{}が存在しない場合、Answer以降の文字列を取得
        if deepseek_answer is None:
            for answer_pattern in ["**Answer**:", "**Answer:**", "**Final Answer**:", "**Final Answer:**", ]:
                after_answer = train["deepseek_solution"].split(answer_pattern)
                if len(after_answer) > 1:
                    deepseek_answer = after_answer[-1]
                    break

        # 正誤判定
        if math_judge(problem, deepseek_answer, ground_truth_answer):
            no_filter_json = NoFilterJsonModel(
                id=id,
                question=problem,
                output=f'<think>{train["deepseek_reasoning"]}</think>{deepseek_answer}',
                answer=deepseek_answer,
                domain=train["domain"],
                source=train["source"]
            )
            no_filter_json_list.append(no_filter_json)
            id += 1
    print(f"math: {len(no_filter_json_list)} / {len(train_dataset)}")

    return no_filter_json_list

## Code

In [4]:
def extract_python_code_block(text):
    match = re.search(r"(```python.*?```)", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


def make_code_json_list(train_dataset, start_id):
    id = start_id
    no_filter_json_list = []
    for train in train_dataset:
        problem = train["problem"]
        deepseek_answer = extract_python_code_block(train["deepseek_solution"])
        if deepseek_answer:
            no_filter_json = NoFilterJsonModel(
                id=id,
                question=problem,
                output=f'<think>{train["deepseek_reasoning"]}</think>{deepseek_answer}',
                answer=deepseek_answer,
                domain=train["domain"],
                source=train["source"]
            )
            no_filter_json_list.append(no_filter_json)
            id += 1
    print(f"code: {len(no_filter_json_list)} / {len(train_dataset)}")

    return no_filter_json_list

## Science (chemistry, biology, physics)


In [5]:
def make_science_json_list(train_dataset, start_id):
    id = start_id
    no_filter_json_list = []
    for train in train_dataset:
        problem = train["problem"]
        deepseek_solution = train["deepseek_solution"][-150:]
        deepseek_answer = extract_latex(deepseek_solution, "\\boxed")
        if deepseek_answer is None:
            matches = list(re.finditer(r"\*\*(.*?)\*\*", deepseek_solution))
            if matches:
                last_match = matches[-1]
                end_pos = last_match.end()
                bold_text = last_match.group(1)
                if (end_pos >= len(deepseek_solution) or deepseek_solution[end_pos] != ":") and "answer" not in bold_text.lower() and ":" not in bold_text:
                    deepseek_answer = bold_text
        if deepseek_answer:
            no_filter_json = NoFilterJsonModel(
                id=id,
                question=problem,
                output=f'<think>{train["deepseek_reasoning"]}</think>{deepseek_answer}',
                answer=deepseek_answer,
                domain=train["domain"],
                source=train["source"]
            )
            no_filter_json_list.append(no_filter_json)
            id += 1
    print(f"science: {len(no_filter_json_list)} / {len(train_dataset)}")

    return no_filter_json_list


## Puzzle

In [6]:
import re

def extract_letter(s):
    # "Answer"の大文字・小文字を問わず最も後ろの位置を取得
    idx = s.lower().rfind("answer")
    if idx == -1:
        return None
    # "Answer"の直後から検索
    after = s[idx + len("answer"):]
    # 記号等を除き、最初に現れる大文字アルファベット1文字を抽出
    match = re.search(r'[A-Z]', after)
    return match.group(0) if match else None

def find_second_last_double_star_upper(s):
    # "**"の出現位置をすべて取得
    positions = [i for i in range(len(s)-1) if s[i:i+2] == "**"]
    if len(positions) < 2:
        return None
    # 後ろから2回目の"**"の次の文字
    idx = positions[-2] + 2
    if idx < len(s):
        c = s[idx]
        if c.isupper() and c.isalpha():
            return c
    return None


def make_puzzle_json_list(train_dataset, start_id):
    id = start_id
    no_filter_json_list = []
    for train in train_dataset:
        problem = train["problem"]
        deepseek_answer = extract_letter(train["deepseek_solution"])
        if deepseek_answer is None:
            deepseek_answer = find_second_last_double_star_upper(train["deepseek_solution"])
        if deepseek_answer == train["ground_truth_solution"]:
            no_flter_jsonl = NoFilterJsonModel(
                id=id,
                question=problem,
                output=f'<think>{train["deepseek_reasoning"]}</think>{deepseek_answer}',
                answer=deepseek_answer,
                domain=train["domain"],
                source=train["source"]
            )
            no_filter_json_list.append(no_flter_jsonl)
            id += 1
    print(f"puzzle: {len(no_filter_json_list)} / {len(train_dataset)}")

    return no_filter_json_list

## make

In [7]:
def make_no_filter_jsonl(train_dataset, jsonl_path="no_filter.jsonl"):
    start_id = 0
    print("Process Math")
    math_dataset = train_dataset.filter(lambda example: example["domain"] == "math")
    math_json_list = make_math_json_list(math_dataset, start_id)
    start_id += len(math_json_list)

    print("Process Code")
    code_dataset = train_dataset.filter(lambda example: example["domain"] == "code")
    code_json_list = make_code_json_list(code_dataset, start_id)
    start_id += len(code_json_list)

    print("Process Science")
    science_dataset = train_dataset.filter(lambda example: example["domain"] in ["chemistry", "biology", "physics"])
    science_json_list = make_science_json_list(science_dataset, start_id)
    start_id += len(science_json_list)

    print("Process Puzzle")
    puzzle_dataset = train_dataset.filter(lambda example: example["domain"] == "puzzle")
    puzzle_json_list = make_puzzle_json_list(puzzle_dataset, start_id)
    start_id += len(puzzle_json_list)

    json_list = math_json_list + code_json_list + science_json_list + puzzle_json_list
    # print(json_list[0])
    with open(jsonl_path, "w") as f:
        for json_data in json_list:
            json_str = json_data.model_dump_json()
            f.write(json_str + "\n")

In [8]:
dataset = load_dataset("open-thoughts/OpenThoughts-114k", "metadata")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

metadata/train-00000-of-00012.parquet:   0%|          | 0.00/95.8M [00:00<?, ?B/s]

metadata/train-00001-of-00012.parquet:   0%|          | 0.00/95.1M [00:00<?, ?B/s]

metadata/train-00002-of-00012.parquet:   0%|          | 0.00/97.0M [00:00<?, ?B/s]

metadata/train-00003-of-00012.parquet:   0%|          | 0.00/96.2M [00:00<?, ?B/s]

metadata/train-00004-of-00012.parquet:   0%|          | 0.00/96.5M [00:00<?, ?B/s]

metadata/train-00005-of-00012.parquet:   0%|          | 0.00/96.9M [00:00<?, ?B/s]

metadata/train-00006-of-00012.parquet:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

metadata/train-00007-of-00012.parquet:   0%|          | 0.00/70.7M [00:00<?, ?B/s]

metadata/train-00008-of-00012.parquet:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

metadata/train-00009-of-00012.parquet:   0%|          | 0.00/370M [00:00<?, ?B/s]

metadata/train-00010-of-00012.parquet:   0%|          | 0.00/976M [00:00<?, ?B/s]

metadata/train-00011-of-00012.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/113957 [00:00<?, ? examples/s]

In [9]:
make_no_filter_jsonl(dataset["train"])

Process Math


Filter:   0%|          | 0/113957 [00:00<?, ? examples/s]

math: 51057 / 89120
Process Code


Filter:   0%|          | 0/113957 [00:00<?, ? examples/s]

code: 19903 / 19904
Process Science


Filter:   0%|          | 0/113957 [00:00<?, ? examples/s]

science: 637 / 3714
Process Puzzle


Filter:   0%|          | 0/113957 [00:00<?, ? examples/s]

puzzle: 944 / 1219


## アップロード

In [12]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
dataset = load_dataset("json", data_files="no_filter.jsonl", split="train")
dataset.push_to_hub("Ty-Yuki/SFT_OpenThoughts-114k_no-filter_no-proof")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Ty-Yuki/SFT_OpenThoughts-114k_no-filter_no-proof/commit/6f771e17ad3b9ac2b007aef55da34fe5184ca21b', commit_message='Upload dataset', commit_description='', oid='6f771e17ad3b9ac2b007aef55da34fe5184ca21b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Ty-Yuki/SFT_OpenThoughts-114k_no-filter_no-proof', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Ty-Yuki/SFT_OpenThoughts-114k_no-filter_no-proof'), pr_revision=None, pr_num=None)