In [46]:
# 定义要合并的文件列表（请确保路径正确）
file_paths = [
    '/Users/tangluoxi/Desktop/Education/单选题',
    '/Users/tangluoxi/Desktop/Education/三空题',
    '/Users/tangluoxi/Desktop/Education/双空题',
    '/Users/tangluoxi/Desktop/Education/六选二'
]

# 定义输出文件路径
output_path = '/Users/tangluoxi/Desktop/Education/合并结果.txt'

# 执行合并操作
with open(output_path, 'w', encoding='utf-8') as output_file:
    for path in file_paths:
        try:
            with open(f"{path}.txt", 'r', encoding='utf-8') as input_file:  # 自动添加.txt扩展名
                output_file.write(input_file.read())
                output_file.write("\n\n")  # 添加两个换行作为文件分隔符
            print(f"✅ 已合并: {path}.txt")
        except FileNotFoundError:
            print(f"⚠️ 文件不存在: {path}.txt")
        except Exception as e:
            print(f"❌ 读取错误: {path} - {str(e)}")

print("----------------------------------")
print(f"合并完成 → 输出文件: {output_path}")

✅ 已合并: /Users/tangluoxi/Desktop/Education/单选题.txt
✅ 已合并: /Users/tangluoxi/Desktop/Education/三空题.txt
✅ 已合并: /Users/tangluoxi/Desktop/Education/双空题.txt
✅ 已合并: /Users/tangluoxi/Desktop/Education/六选二.txt
----------------------------------
合并完成 → 输出文件: /Users/tangluoxi/Desktop/Education/合并结果.txt


In [47]:
import json
import re
from collections import OrderedDict

def parse_questions(file_path):
    questions = []
    current_question = None
    blank_mode = False
    current_blank = None

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f.readlines() if line.strip()]  # 过滤空行

    for line in lines:
        # 检测问题开始
        if re.match(r'^\d+\.', line):
            if current_question is not None:
                questions.append(current_question)
            
            current_question = OrderedDict([
                ('question_number', int(re.search(r'\d+', line).group())),
                ('content', re.sub(r'^\d+\.\s*', '', line)),
                ('options', OrderedDict()),
                ('answer', None),
                ('question_type', '')
            ])
            blank_mode = False
            current_blank = None
            continue

        # 确保current_question已初始化
        if current_question is None:
            continue  # 跳过所有在第一个问题编号之前的内容

        # 检测答案行
        if line.lower().startswith('answer'):
            answer_part = re.sub(r'answer\s*[:：]+\s*', '', line, flags=re.IGNORECASE)
            current_question['answer'] = [x.strip() for x in answer_part.split(',')] if ',' in answer_part else answer_part.strip()
            continue

        # 检测空白选项
        if line.startswith('Blank('):
            blank_mode = True
            current_blank = line.split()[0]
            current_question['options'][current_blank] = []
            continue

        # 处理选项内容
        if blank_mode:
            if current_blank and line:
                current_question['options'][current_blank].append(line)
        elif re.match(r'^[A-F]\.', line):
            option_key = line[0]
            option_value = line[3:].strip()
            if option_key.isalpha() and len(option_key) == 1:
                current_question['options'][option_key] = option_value

    # 添加最后一个问题
    if current_question is not None:
        questions.append(current_question)

    # 确定题目类型
    for q in questions:
        if 'Blank(i)' in q['options']:
            blank_count = len(q['options'])
            q['question_type'] = f'array of {blank_count} answers'
        elif len(q['options']) == 6:
            q['question_type'] = 'array of two options from 6 answers'
        else:
            q['question_type'] = 'single answer'

        # 转换单一答案为字符串
        if isinstance(q['answer'], list) and len(q['answer']) == 1:
            q['answer'] = q['answer'][0]

    return questions

# 使用示例
input_file = '/Users/tangluoxi/Desktop/Education/合并结果.txt'
output_file = '/Users/tangluoxi/Desktop/Education/questions.json'

questions = parse_questions(input_file)

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(questions, f, ensure_ascii=False, indent=2)

print(f'成功转换 {len(questions)} 道题目 → {output_file}')

成功转换 410 道题目 → /Users/tangluoxi/Desktop/Education/questions.json


In [44]:
# 指定文件路径
file_path = "/Users/tangluoxi/Desktop/Education/单选题.txt"

# 读取文件内容
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read().lower()  # 转换为小写，便于不区分大小写的搜索

# 统计 "answer" 出现的次数
count = text.count("answer")
print("The word 'answer' appears", count, "times in the file.")


The word 'answer' appears 110 times in the file.


In [17]:
import re

file_path = "/Users/tangluoxi/Desktop/Education/单选题.txt"  # 请替换为你的文件路径

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 使用正则表达式匹配以数字开头后跟句点的题目编号
questions = re.findall(r"^\d+\.", content, flags=re.MULTILINE)
print("Total number of questions:", len(questions))


Total number of questions: 110


In [40]:
import re

file_path = "/Users/tangluoxi/Desktop/Education/单选题.txt"  # 替换为你的文件路径

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 找出所有以数字后跟点开始的题号
question_numbers = re.findall(r"^(\d+)\.", content, flags=re.MULTILINE)
question_numbers = [int(num) for num in question_numbers]
print("Found question numbers:", question_numbers)

# 预期题号从 1 到 100
all_numbers = set(range(1, 101))
missing = sorted(all_numbers - set(question_numbers))
print("Missing question numbers:", missing)


Found question numbers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110]
Missing question numbers: []


In [41]:
import re

file_path = "/Users/tangluoxi/Desktop/Education/单选题.txt"

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 按每个题号分割题目（假设题号格式为 "数字." 开头）
questions = re.split(r"(?<=\n)(?=\d+\.)", content)

missing_answers = []

for q in questions:
    # 尝试提取题号
    match = re.match(r"(\d+)\.", q)
    if match:
        qnum = int(match.group(1))
        # 检查题块中是否包含 "Answer:" 或 "Answer：" 字样
        if ("Answer:" not in q) and ("Answer：" not in q):
            missing_answers.append(qnum)

print("Missing answer for question(s):", missing_answers)


Missing answer for question(s): []


In [48]:
import json

input_file = "/Users/tangluoxi/Desktop/Education/questions.json"  # 原文件路径
output_file = "/Users/tangluoxi/Desktop/Education/questions_updated.json"  # 输出文件路径

# 读取 JSON 数据
with open(input_file, "r", encoding="utf-8") as f:
    questions = json.load(f)

# 为每个题目重新分配连续的题号，从1开始
for idx, question in enumerate(questions, start=1):
    question["question_number"] = idx

# 将更新后的数据写入新的 JSON 文件（你也可以选择覆盖原文件）
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(questions, f, ensure_ascii=False, indent=2)

print(f"题目编号已更新为1到{len(questions)}")


题目编号已更新为1到410


In [3]:
import fitz  # PyMuPDF

# 输入PDF文件路径
pdf_paths = [
    "/Users/tangluoxi/Desktop/Education/GRE阅读机经320题+30题.pdf",
    "/Users/tangluoxi/Desktop/Education/GRE阅读机经320题+30题答案解析.pdf"
]

# 输出PDF路径
merged_pdf_path = "/Users/tangluoxi/Desktop/Education/GRE阅读机经_合并版.pdf"
text_output_path = "/Users/tangluoxi/Desktop/Education/GRE阅读机经_合并版.txt"

# 1. 合并PDF文件
def merge_pdfs(input_pdfs, output_pdf):
    merged_document = fitz.open()
    
    for pdf in input_pdfs:
        doc = fitz.open(pdf)
        merged_document.insert_pdf(doc)
    
    merged_document.save(output_pdf)
    merged_document.close()
    print(f"PDF 合并完成，文件保存至: {output_pdf}")

# 2. 将PDF转换为TXT文件
def pdf_to_text(pdf_path, txt_output_path):
    doc = fitz.open(pdf_path)
    text = ""
    
    for page in doc:
        text += page.get_text("text") + "\n\n"
    
    with open(txt_output_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)
    
    print(f"PDF 转换为TXT完成，文件保存至: {txt_output_path}")

# 执行合并和转换
merge_pdfs(pdf_paths, merged_pdf_path)
pdf_to_text(merged_pdf_path, text_output_path)


PDF 合并完成，文件保存至: /Users/tangluoxi/Desktop/Education/GRE阅读机经_合并版.pdf
PDF 转换为TXT完成，文件保存至: /Users/tangluoxi/Desktop/Education/GRE阅读机经_合并版.txt


In [2]:
pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-macosx_10_9_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-macosx_10_9_x86_64.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3
Note: you may need to restart the kernel to use updated packages.


In [26]:
import json
import re

# 设置输入和输出文件路径
input_file_path = "/Users/tangluoxi/Desktop/Education/1.txt"
output_file_path = "/Users/tangluoxi/Desktop/Education/output.json"

# 从文件中读取全部内容
with open(input_file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# 根据连续的空行分割文本，获取各个题目的块
questions = [q.strip() for q in raw_text.strip().split("\n\n") if q.strip()]

result = []
question_number = 111  # 题号从 111 开始

for q in questions:
    lines = q.splitlines()
    
    # 处理第一行，去掉题号和点，提取题干内容
    first_line = lines[0]
    match = re.match(r"^\d+\.\s*(.*)", first_line)
    content = match.group(1).strip() if match else first_line.strip()
    
    options = {}
    answer = None
    
    # 遍历剩余行，解析选项和答案
    for line in lines[1:]:
        line = line.strip()
        if line.startswith("Answer:"):
            answer = line.split("Answer:")[1].strip()
        else:
            opt_match = re.match(r"^([A-Z])\.\s*(.*)", line)
            if opt_match:
                key = opt_match.group(1)
                value = opt_match.group(2).strip()
                options[key] = value

    # 构造每道题的字典
    question_dict = {
        "question_number": question_number,
        "content": content,
        "options": options,
        "answer": answer,
        "question_type": "single answer"
    }
    result.append(question_dict)
    question_number += 1

# 将结果写入 JSON 文件
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=4, ensure_ascii=False)

print(f"JSON 数据已保存到 {output_file_path}")


JSON 数据已保存到 /Users/tangluoxi/Desktop/Education/单选题2.json


In [33]:
import json

# JSON 文件路径
file_path = "/Users/tangluoxi/Desktop/Education/GRE Verbal.json"

# 读取 JSON 文件
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 直接统计 JSON 列表长度
actual_count = len(data)
print(f"实际解析到的题目数量：{actual_count}")

# 基于题号范围计算（假设题号是连续的）
question_numbers = [q['question_number'] for q in data if 'question_number' in q]
if question_numbers:
    min_num = min(question_numbers)
    max_num = max(question_numbers)
    expected_count = max_num - min_num + 1
    print(f"题号范围从 {min_num} 到 {max_num}，预期题目数量：{expected_count}")
else:
    print("未找到题号数据")



实际解析到的题目数量：410
题号范围从 1 到 410，预期题目数量：410


In [19]:
import json

# JSON 文件路径
file_path = "/Users/tangluoxi/Desktop/Education/单选题2.json"

# 读取 JSON 文件
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 提取所有题目的 question_number，并排序
question_numbers = sorted([q["question_number"] for q in data if "question_number" in q])

if not question_numbers:
    print("没有找到题号数据。")
else:
    # 以最小和最大题号为范围，找出缺失的题号
    start = question_numbers[0]
    end = question_numbers[-1]
    missing_numbers = [num for num in range(start, end + 1) if num not in question_numbers]
    if missing_numbers:
        print("缺少的题号：", missing_numbers)
    else:
        print("题号连续，无缺失。")


题号连续，无缺失。


In [35]:
import json

# 设置输入和输出文件路径
input_file = "/Users/tangluoxi/Desktop/Education/3.json"
output_file = "/Users/tangluoxi/Desktop/Education/4.json"

# 读取 JSON 文件
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# 遍历每个题目，将 question_number 重新赋值，从 1 开始
for index, item in enumerate(data, start=1):
    item["question_number"] = index

# 将更新后的数据写入输出文件
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print("题号已重新排序，并保存到文件：", output_file)


题号已重新排序，并保存到文件： /Users/tangluoxi/Desktop/Education/4.json


In [34]:
import json

# 设置文件路径
file_path = "/Users/tangluoxi/Desktop/Education/GRE Verbal.json"

# 读取 JSON 文件
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 初始化计数器
counts = {
    "single answer": 0,
    "array of 3 answers": 0,
    "array of 2 answers": 0,
    "array of two options from 6 answers": 0
}

# 遍历所有题目
for item in data:
    q_type = item.get("question_type", "").strip()
    if q_type in counts:
        counts[q_type] += 1

# 输出统计结果
print("各题型题目数量统计：")
for k, v in counts.items():
    print(f"{k}: {v}")


各题型题目数量统计：
single answer: 110
array of 3 answers: 100
array of 2 answers: 100
array of two options from 6 answers: 100


In [36]:
import re

file_path = "/Users/tangluoxi/Desktop/Education/4.json"  # 替换为你的文件路径

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 找出所有以数字后跟点开始的题号
question_numbers = re.findall(r"^(\d+)\.", content, flags=re.MULTILINE)
question_numbers = [int(num) for num in question_numbers]
print("Found question numbers:", question_numbers)

# 预期题号从 1 到 100
all_numbers = set(range(1, 101))
missing = sorted(all_numbers - set(question_numbers))
print("Missing question numbers:", missing)

Found question numbers: []
Missing question numbers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [37]:
import json

# 设置文件路径
file_path = "/Users/tangluoxi/Desktop/Education/4.json"

# 读取 JSON 文件
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 初始化计数器
counts = {
    "single answer": 0,
    "array of 3 answers": 0,
    "array of 2 answers": 0,
    "array of two options from 6 answers": 0
}

# 遍历所有题目
for item in data:
    q_type = item.get("question_type", "").strip()
    if q_type in counts:
        counts[q_type] += 1

# 输出统计结果
print("各题型题目数量统计：")
for k, v in counts.items():
    print(f"{k}: {v}")


各题型题目数量统计：
single answer: 0
array of 3 answers: 99
array of 2 answers: 0
array of two options from 6 answers: 0


In [40]:
import json

input_file = "/Users/tangluoxi/Desktop/Education/GRE Verbal.json"

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

print("加载后的题目数量:", len(data))


加载后的题目数量: 410


In [41]:
import json
import os

# 设置输入文件路径
input_file = "/Users/tangluoxi/Desktop/Education/GRE Verbal.json"
# 设置输出目录
output_dir = "/Users/tangluoxi/Desktop/Education/"

# 读取 JSON 文件
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# 根据 question_type 分组
separated = {}
for item in data:
    q_type = item.get("question_type", "").strip()
    if q_type not in separated:
        separated[q_type] = []
    separated[q_type].append(item)

# 对每个题型的数据，重新设置 question_number 并保存为单独的文件
for q_type, items in separated.items():
    # 重新设置题号，从1开始
    for idx, item in enumerate(items, start=1):
        item["question_number"] = idx
    # 为防止文件名中出现空格或特殊字符，将题型字符串处理成安全的格式
    safe_q_type = q_type.replace(" ", "_")
    filename = f"GRE_Verbal_{safe_q_type}.json"
    output_path = os.path.join(output_dir, filename)
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(items, f_out, indent=4, ensure_ascii=False)
    print(f"题型 '{q_type}' 包含 {len(items)} 道题目，已保存到 {output_path}")


题型 'single answer' 包含 110 道题目，已保存到 /Users/tangluoxi/Desktop/Education/GRE_Verbal_single_answer.json
题型 'array of 3 answers' 包含 100 道题目，已保存到 /Users/tangluoxi/Desktop/Education/GRE_Verbal_array_of_3_answers.json
题型 'array of 2 answers' 包含 100 道题目，已保存到 /Users/tangluoxi/Desktop/Education/GRE_Verbal_array_of_2_answers.json
题型 'array of two options from 6 answers' 包含 100 道题目，已保存到 /Users/tangluoxi/Desktop/Education/GRE_Verbal_array_of_two_options_from_6_answers.json


In [3]:
import os
import json

# 路径设置
image_dir = "/Users/tangluoxi/Desktop/Education/GRE Math Hard"
txt_file = "/Users/tangluoxi/Desktop/Education/GRE Math Hard.txt"
output_json = os.path.join(image_dir, "GRE Math Hard.json")

# 读取答案，每行格式为 "编号. 答案"
with open(txt_file, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

exercises = []
for line in lines:
    # 假设格式为 "编号. 答案"，用点分隔
    if '.' in line:
        num, ans = line.split('.', 1)
        question_number = num.strip()
        answer = ans.strip()
    else:
        # 如果没有找到分隔符，全部作为答案
        question_number = ""
        answer = line.strip()
    
    # 根据题号生成图片路径
    image_filename = f"{question_number}.png" if question_number else ""
    image_path = os.path.join(image_dir, image_filename)
    
    exercises.append({
        "question_number": question_number,
        "image": image_path,
        "answer": answer
    })

# 生成 JSON 文件
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump({"GRE Math Hard.json": exercises}, f, ensure_ascii=False, indent=4)

print("JSON 文件已生成，路径为：", output_json)


JSON 文件已生成，路径为： /Users/tangluoxi/Desktop/Education/GRE Math Hard/GRE Math Hard.json


In [4]:
import os
import json

# 文件路径设置
txt_file = "/Users/tangluoxi/Desktop/Education/1.txt"
output_json = "/Users/tangluoxi/Desktop/Education/output.json"  # 输出 JSON 文件路径

# 读取整个文本文件内容
with open(txt_file, "r", encoding="utf-8") as f:
    content = f.read()

# 假设每个题目块之间用两个换行符分隔
question_blocks = content.strip().split("\n\n")

questions = []
question_number_start = 101

for i, block in enumerate(question_blocks):
    lines = block.strip().splitlines()
    if len(lines) < 5:
        print(f"题目块 {i+1} 格式不符合要求，跳过")
        continue

    # 第一行为题干
    question_content = lines[0].strip()
    
    # 第二行：Blank(i) 选项，格式 "Blank(i): option1, option2, option3"
    blank_i_line = lines[1].strip()
    if ":" in blank_i_line:
        _, blank_i_options_str = blank_i_line.split(":", 1)
        blank_i_options = [opt.strip() for opt in blank_i_options_str.split(",")]
    else:
        blank_i_options = []
    
    # 第三行：Blank(ii) 选项
    blank_ii_line = lines[2].strip()
    if ":" in blank_ii_line:
        _, blank_ii_options_str = blank_ii_line.split(":", 1)
        blank_ii_options = [opt.strip() for opt in blank_ii_options_str.split(",")]
    else:
        blank_ii_options = []
    
    # 第四行：答案，格式 "Answer: answer1, answer2"
    answer_line = lines[3].strip()
    if ":" in answer_line:
        _, answer_str = answer_line.split(":", 1)
        answers = [ans.strip() for ans in answer_str.split(",")]
    else:
        answers = []
    
    # 第五行：题型，格式 "Question type: some type"
    question_type_line = lines[4].strip()
    if ":" in question_type_line:
        _, qtype = question_type_line.split(":", 1)
        qtype = qtype.strip()
    else:
        qtype = ""
    
    # 构造当前题目的 JSON 对象
    question_json = {
        "question_number": question_number_start + i,
        "content": question_content,
        "options": {
            "Blank(i)": blank_i_options,
            "Blank(ii)": blank_ii_options
        },
        "answer": answers,
        "question_type": qtype
    }
    questions.append(question_json)

# 构造最终 JSON 数据
output_data = {"questions": questions}

# 输出到 JSON 文件
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

print("JSON 文件已生成，路径为：", output_json)


题目块 10 格式不符合要求，跳过
题目块 13 格式不符合要求，跳过
题目块 18 格式不符合要求，跳过
题目块 27 格式不符合要求，跳过
题目块 29 格式不符合要求，跳过
题目块 33 格式不符合要求，跳过
题目块 35 格式不符合要求，跳过
题目块 43 格式不符合要求，跳过
题目块 47 格式不符合要求，跳过
题目块 53 格式不符合要求，跳过
题目块 55 格式不符合要求，跳过
题目块 61 格式不符合要求，跳过
题目块 76 格式不符合要求，跳过
题目块 78 格式不符合要求，跳过
题目块 83 格式不符合要求，跳过
题目块 87 格式不符合要求，跳过
题目块 90 格式不符合要求，跳过
题目块 93 格式不符合要求，跳过
题目块 102 格式不符合要求，跳过
JSON 文件已生成，路径为： /Users/tangluoxi/Desktop/Education/output.json
