## 章节

In [None]:
import os
import json

def extract_review_tips(text):
    """Return the review-tip text that follows the "【复习提示】" marker.

    The tip runs from just after the first marker up to the next section
    heading, recognised as a newline followed by "<digits>.<digits>"
    (for example "1.1"). When no such heading follows, the rest of the
    text is used.

    Args:
        text: Raw page text.

    Returns:
        The stripped tip text, or None when the marker is absent.
    """
    import re

    _, marker, remainder = text.partition("【复习提示】")
    if not marker:
        return None

    # The first "\n<digits>.<digits>" after the marker ends the tip.
    next_heading = re.search(r'\n\d+\.\d+', remainder)
    cutoff = next_heading.start() if next_heading else len(remainder)
    return remainder[:cutoff].strip()



# 使用示例
if __name__ == "__main__":
    # Build one chapter record per chapter-opening page and dump them all to JSON.
    directory = "/root/autodl-tmp/EasyDS/data/DS2026_extracted/text"
    # chapter_page[k] is the page file number paired with titles[k].
    chapter_page = ['13', '24', '74', '121', '136', '210', '281', '349']
    titles = ['绪论', '线性表', '栈和队列和数组', '串', '树和二叉树', '图', '查找', '排序']
    chapters = []
    for i, (page, title) in enumerate(zip(chapter_page, titles), 1):
        file_path = os.path.join(directory, f'page_{page}.txt')
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        review_tip = extract_review_tips(text)
        if not review_tip:
            # A page with no (or an empty) review tip yields no chapter record.
            continue

        chapters.append({
            "id": f"c0{i}",                                # unique chapter identifier
            "title": title,                                # chapter title
            "parent_id": "",                               # parent chapter ID (optional)
            "order": "integer",                            # sort order (schema placeholder)
            "description": review_tip,                     # chapter description
            "knowledge_points": ["knowledge_point_id"],    # knowledge point IDs (placeholder)
            "sub_chapters": ["chapter_id"],                # sub-chapter IDs (placeholder)
        })

    with open('/root/autodl-tmp/EasyDS/data/questions/chapters.json', 'w', encoding='utf-8') as f:
        json.dump(chapters, f, ensure_ascii=False, indent=4)


## 知识点

In [5]:
import os
import json

def extract_knowledge_point(text, chapter_number):
    """Split chapter text into third-level sections ("<chapter>.<m>.<n>").

    Every occurrence of a section number starts a section that runs up to
    the next section number (or the end of the text). Sections containing
    "本节试题精选" or "答案与解析" are skipped.

    Args:
        text: Text to scan.
        chapter_number: Chapter number used as the first component of the
            section number (str or int).

    Returns:
        A list of dicts with keys 'section' (the matched number), 'title'
        (rest of the first line) and 'description' (remaining lines).
    """
    import re

    # The look-behind stops the chapter number from matching in the middle of
    # a longer number or dotted reference (chapter "4" inside "14.1.1" or
    # "3.4.1.2"); re.escape keeps the chapter number literal.
    section_pattern = rf'(?<![\d.]){re.escape(str(chapter_number))}\.\d+\.\d+'

    matches = list(re.finditer(section_pattern, text))
    sections = []
    for index, current_match in enumerate(matches):
        current_section = current_match.group()
        start_pos = current_match.start()
        end_pos = matches[index + 1].start() if index + 1 < len(matches) else len(text)
        content = text[start_pos:end_pos].strip()

        # Exercise and answer blocks are not knowledge points.
        if "本节试题精选" in content or "答案与解析" in content:
            continue

        title_line, _, body = content.partition('\n')
        # content always begins with the section number, so drop it by length
        # rather than by splitting on a string that may recur in the line.
        title = title_line[len(current_section):].strip()

        sections.append({
            'section': current_section,
            'title': title,
            'description': body.strip()
        })

    return sections

def process_file(file_path, chapter_number):
    """Read one UTF-8 text file and return the sections found in it.

    Args:
        file_path: Path of the page text file.
        chapter_number: Chapter number forwarded to extract_knowledge_point.

    Returns:
        Whatever extract_knowledge_point returns for the file's text.
    """
    with open(file_path, 'r', encoding='utf-8') as page_file:
        page_text = page_file.read()
    return extract_knowledge_point(page_text, chapter_number)


# 使用示例
directory = "/root/autodl-tmp/EasyDS/data/DS2026_extracted/text"
chapter_number = "4"  # chapter whose knowledge points are extracted
knowledge_points = []
# Pages are processed one file at a time, so a section whose text continues on
# the following page only keeps the part on the page where its number appears.
for i in range(121, 136):
    file_path = os.path.join(directory, f'page_{i}.txt')
    sections = process_file(file_path, chapter_number)
    for section in sections:
        # "4.1.2" -> "412"
        section_digits = ''.join(section['section'].split('.'))
        knowledgepoint = {
            "id": "kc0" + section_digits,           # unique knowledge point identifier
            "title": section['title'],              # knowledge point title
            # No leading space here: the value must equal chapter ids such as "c04".
            "chapter_id": "c0" + chapter_number,    # owning chapter ID
            "description": section['description'],  # detailed description
            "related_points": [                     # related knowledge points (placeholder)
                {
                    "id": "string",                 # related knowledge point ID
                    "relation_type": "string"       # relation type: prerequisite, related, extension
                }
            ],
            "questions": [                          # related question IDs (placeholder)
                "question_id"
            ]
        }
        knowledge_points.append(knowledgepoint)

output_dir = "/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints"
# Create the target directory so the dump does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, f'chapter_{chapter_number}.json'), 'w', encoding='utf-8') as f:
    json.dump(knowledge_points, f, ensure_ascii=False, indent=4)


In [8]:
import os
import json

def combine_chapter_content(directory, start_page, end_page):
    """Concatenate the page_<n>.txt files for n in [start_page, end_page].

    Each page's text is followed by a newline. Pages whose file does not
    exist are skipped without error.

    Args:
        directory: Directory holding the page files.
        start_page: First page number (inclusive).
        end_page: Last page number (inclusive).

    Returns:
        The concatenated text ("" when no page file exists).
    """
    pieces = []
    for page_no in range(start_page, end_page + 1):
        page_path = os.path.join(directory, f'page_{page_no}.txt')
        if not os.path.exists(page_path):
            continue
        with open(page_path, 'r', encoding='utf-8') as page_file:
            pieces.append(page_file.read() + "\n")
    return "".join(pieces)

def extract_knowledge_point(text, chapter_number):
    """Split chapter text into third-level sections ("<chapter>.<m>.<n>").

    Every occurrence of a section number starts a section that runs up to
    the next section number (or the end of the text). Sections containing
    "本节试题精选" or "答案与解析" are skipped.

    Args:
        text: Text to scan.
        chapter_number: Chapter number used as the first component of the
            section number (str or int).

    Returns:
        A list of dicts with keys 'section' (the matched number), 'title'
        (rest of the first line) and 'description' (remaining lines).
    """
    import re

    # The look-behind stops the chapter number from matching in the middle of
    # a longer number or dotted reference (chapter "4" inside "14.1.1" or
    # "3.4.1.2"); re.escape keeps the chapter number literal.
    section_pattern = rf'(?<![\d.]){re.escape(str(chapter_number))}\.\d+\.\d+'

    matches = list(re.finditer(section_pattern, text))
    sections = []
    for index, current_match in enumerate(matches):
        current_section = current_match.group()
        start_pos = current_match.start()
        end_pos = matches[index + 1].start() if index + 1 < len(matches) else len(text)
        content = text[start_pos:end_pos].strip()

        # Exercise and answer blocks are not knowledge points.
        if "本节试题精选" in content or "答案与解析" in content:
            continue

        title_line, _, body = content.partition('\n')
        # content always begins with the section number, so drop it by length
        # rather than by splitting on a string that may recur in the line.
        title = title_line[len(current_section):].strip()

        sections.append({
            'section': current_section,
            'title': title,
            'description': body.strip()
        })

    return sections

def process_chapter(directory, chapter_number, start_page, end_page):
    """Extract a chapter's knowledge points and save them as JSON.

    The pages start_page..end_page (inclusive) are merged first, then split
    into sections; each section becomes one knowledge-point record. The
    records are written to chapter_<chapter_number>.json in the
    knowledgepoints output directory, which is created if needed.

    Args:
        directory: Directory holding the page text files.
        chapter_number: Chapter number as a string (concatenated into IDs).
        start_page: First page of the chapter.
        end_page: Last page of the chapter.

    Returns:
        The list of knowledge-point dicts that was written.
    """
    chapter_text = combine_chapter_content(directory, start_page, end_page)

    knowledge_points = [
        {
            # "8.7.1" -> "kc0871"
            "id": "kc0" + section['section'].replace('.', ''),
            "title": section['title'],
            "chapter_id": "c0" + chapter_number,
            "description": section['description'],
            # Placeholder entries from the record schema.
            "related_points": [
                {
                    "id": "string",
                    "relation_type": "string"
                }
            ],
            "questions": ["question_id"]
        }
        for section in extract_knowledge_point(chapter_text, chapter_number)
    ]

    output_dir = "/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints"
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, f'chapter_{chapter_number}.json'), 'w', encoding='utf-8') as out_file:
        json.dump(knowledge_points, out_file, ensure_ascii=False, indent=4)

    return knowledge_points

# 使用示例
directory = "/root/autodl-tmp/EasyDS/data/DS2026_extracted/text"
chapter_number = "8"
start_page, end_page = 349, 406  # first and last page of the chapter

knowledge_points = process_chapter(
    directory=directory,
    chapter_number=chapter_number,
    start_page=start_page,
    end_page=end_page,
)

In [None]:
# Hand-written sample of a question record, showing the fields a question entry carries.
# NOTE(review): `content` is not defined in this cell; it has to exist in the
# kernel already or this cell raises NameError — confirm where it comes from.
question = {
  "id": "q023001",            # unique question identifier
  "title": "线性表存储",         # question title
  "content": content,       # question text
  "difficulty": "integer",   # difficulty level (1-5); still a placeholder string here
  "type": "concept",          # question type ("concept", "calculation", "application")
  "knowledge_points": [      # IDs of the related knowledge points
    "kc0221",
    "kc0231"
  ],
  "related_questions": [     # related questions (for extension)
    {
      "id": "string",        # related question ID
      "relation_type": "string" # relation type: "extension", "application", "contrast"
    }
  ],
  "reference_answer": {      # reference answer
    "content": "B",     # answer content
    "key_points": [          # list of key points
      "string"
    ],
    "explanation": """两种存储结构适用于不同的场合，不能简单地说谁好谁坏，选项1错误。链式存储用指针表示逻辑结构，而指针的设置是任意的，因此比顺序存储结构能更方便地表示各种逻辑结构，选项
ⅡI正确。在顺序存储中，插入和删除结点需要移动大量元素，效率较低，选项IⅢI的描述刚好相反。
顺序存储结构既能随机存取又能顺序存取，而链式结构只能顺序存取，选项IV正确。"""  # detailed explanation
  }
}

## 题目

In [3]:
import os
import re
import json

def combine_text_files(file_paths):
    """Concatenate several UTF-8 text files, each followed by a newline.

    Paths that do not exist are skipped without error.

    Args:
        file_paths: Iterable of file paths, read in the given order.

    Returns:
        The concatenated text ("" when nothing could be read).
    """
    chunks = []
    for path in file_paths:
        if not os.path.exists(path):
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            chunks.append(handle.read() + "\n")
    return "".join(chunks)

def extract_choice_questions(text):
    """Return the multiple-choice part of the text.

    The part starts at the first "一、单项选择题" (marker included) and ends
    just before the first "二、" that follows it, or at the end of the text
    when no "二、" follows.

    Args:
        text: Combined page text.

    Returns:
        The extracted part, or "" when the start marker is missing.
    """
    _, marker, tail = text.partition("一、单项选择题")
    if not marker:
        return ""

    # Keep the marker itself in the result; cut at the next top-level heading.
    section, _, _ = (marker + tail).partition("二、")
    return section

def parse_questions(questions_text):
    """Parse numbered questions out of the multiple-choice part of the text.

    Only the text after the first "一、单项选择题" is considered. A line
    starting with two digits and a dot ("01. ...") opens a new question;
    later non-blank lines are appended to it (stripped) until the next
    numbered line.

    Args:
        questions_text: Text containing the "一、单项选择题" marker.

    Returns:
        Dict mapping the two-digit number (str) to the question text, with
        lines joined by newlines. Empty when the marker is missing, which is
        what extract_choice_questions yields ("") for a text without it.
    """
    _, marker, content = questions_text.partition("一、单项选择题")
    if not marker:
        # Previously this case raised IndexError from split(...)[1].
        return {}

    questions = {}
    current_question = []
    current_number = None

    for line in content.split('\n'):
        # Question number such as "01." or "02."
        match = re.match(r'^(\d{2})\.\s*(.+)', line)
        if match:
            # Store the question collected so far before starting the next.
            if current_number:
                questions[current_number] = '\n'.join(filter(None, current_question))
            current_number = match.group(1)
            current_question = [match.group(2)]
        elif current_number and line.strip():
            current_question.append(line.strip())

    # Store the last question.
    if current_number:
        questions[current_number] = '\n'.join(filter(None, current_question))

    return questions

def parse_answers(answers_text):
    """Parse answers and explanations out of the multiple-choice answer text.

    Only the text after the first "一、单项选择题" is considered. A line made
    of a two-digit number, a dot and a single letter A-D ("01.B" / "01. B")
    opens a new answer; later non-blank lines (stripped) form its
    explanation until the next answer line.

    Args:
        answers_text: Text containing the "一、单项选择题" marker.

    Returns:
        Dict mapping the two-digit number (str) to
        {"答案": <letter>, "解析": <explanation>}. Empty when the marker is
        missing, which is what extract_choice_questions yields ("") for a
        text without it.
    """
    _, marker, content = answers_text.partition("一、单项选择题")
    if not marker:
        # Previously this case raised IndexError from split(...)[1].
        return {}

    answers = {}
    current_number = None
    current_answer = None
    current_explanation = []

    for line in content.split('\n'):
        # Answer line such as "01.B" or "01. B"
        match = re.match(r'^(\d{2})\.\s*([A-D])\s*$', line)
        if match:
            # Store the previous answer before starting the next.
            if current_number:
                answers[current_number] = {
                    "答案": current_answer,
                    "解析": '\n'.join(filter(None, current_explanation))
                }
            current_number = match.group(1)
            current_answer = match.group(2)
            current_explanation = []
        elif current_number and line.strip():
            current_explanation.append(line.strip())

    # Store the last answer.
    if current_number:
        answers[current_number] = {
            "答案": current_answer,
            "解析": '\n'.join(filter(None, current_explanation))
        }

    return answers

def process_questions_and_answers(question_files, answer_files):
    """Pair parsed multiple-choice questions with their answers.

    Args:
        question_files: Paths of the text files holding the questions.
        answer_files: Paths of the text files holding the answers.

    Returns:
        A list of dicts with keys "题号", "题目", "答案", "解析", ordered by
        question number. Only numbers 01-36 are kept; a question without a
        parsed answer gets the "未找到" / "未找到解析" fallbacks.
    """
    questions = parse_questions(
        extract_choice_questions(combine_text_files(question_files))
    )
    answers = parse_answers(
        extract_choice_questions(combine_text_files(answer_files))
    )

    fallback = {"答案": "未找到", "解析": "未找到解析"}
    result = []
    for number in sorted(questions):
        in_range = number.isdigit() and 1 <= int(number) <= 36
        if not in_range:
            continue
        answer_info = answers.get(number, fallback)
        result.append({
            "题号": number,
            "题目": questions[number].strip(),
            "答案": answer_info["答案"],
            "解析": answer_info["解析"]
        })

    return result

def final_process(id, kpid, qleft, qright, alleft, aright, output_path):
    """Extract one section's multiple-choice questions and append them to a JSON file.

    Args:
        id: Question ID prefix; the two-digit question number is appended to it.
        kpid: List of knowledge point IDs stored on every question.
        qleft: First question page (inclusive).
        qright: End of the question pages (exclusive).
        alleft: First answer page (inclusive).
        aright: End of the answer pages (exclusive).
        output_path: JSON file holding a list of questions; created when missing.

    Returns:
        None. The questions are appended to the list stored at output_path.

    Note:
        Entries are appended unconditionally, so calling this twice with the
        same arguments stores every question twice.
    """
    text_dir = "/root/autodl-tmp/EasyDS/data/DS2026_extracted/text"
    question_files = [os.path.join(text_dir, f"page_{i}.txt") for i in range(qleft, qright)]
    answer_files = [os.path.join(text_dir, f"page_{i}.txt") for i in range(alleft, aright)]

    results = process_questions_and_answers(question_files, answer_files)

    questions = [
        {
            "id": id + q["题号"],             # unique question identifier
            "title": "title",                 # question title (placeholder)
            "content": q["题目"],             # question text
            "difficulty": "integer",          # difficulty level (1-5), placeholder
            "type": "concept",                # "concept", "calculation" or "application"
            "knowledge_points": kpid,         # related knowledge point IDs
            "related_questions": [            # related questions (placeholder)
                {
                    "id": "string",
                    "relation_type": "string"  # "extension", "application", "contrast"
                }
            ],
            "reference_answer": {
                "content": q["答案"],         # answer letter
                "key_points": [               # key points (placeholder)
                    "string"
                ],
                "explanation": q["解析"]      # detailed explanation
            }
        }
        for q in results
    ]

    # Create the target directory so a first run does not fail on a missing folder.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Start from the existing list when the file is already there.
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []

    data += questions

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


In [34]:
# Knowledge point IDs use the "kc0<section digits>" form (e.g. "kc0871" for 8.7.1),
# the same form process_chapter generates, so the later ID matching can link them.
final_process("q0870", ["kc0871", "kc0872", "kc0873", "kc0874", "kc0875"], 404, 406, 406, 410, "/root/autodl-tmp/EasyDS/data/ds_data/questions/chapter_8.json")

### 题目起名

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_deepseek import ChatDeepSeek
from langchain_core.prompts import ChatPromptTemplate
from typing import TypedDict

class QuestionName(TypedDict):
    """Structured-output schema for the model reply that names a question."""
    # Generated title; copied into each question's "title" field by the loop below.
    question_name: str


# Chat model; the API key is read from the environment.
model = ChatDeepSeek(model="deepseek-chat", api_key=os.getenv("DEEPSEEK_API_KEY"))

# The system message asks for a short title (about 15 characters) summarising the question.
prompt = ChatPromptTemplate([
    ("system", "请根据以下题目内容，起一个可以概括题目内容，并且简洁的题目名称, 尽可能在15个字以内。"),
    ("human", "题目内容：{content}"),
])

chain = prompt | model.with_structured_output(QuestionName)

for i in range(8, 9):
    questions_path = f"/root/autodl-tmp/EasyDS/data/ds_data/questions/chapter_{i}.json"
    with open(questions_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # One model call per question; the file is rewritten only after all calls succeed.
    for question in data:
        question["title"] = chain.invoke({"content": question["content"]})["question_name"]
        print("ok")

    with open(questions_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)



## 题目对应章节信息

In [None]:
import json

directory = "/root/autodl-tmp/EasyDS/data/ds_data/questions"

# Stamp every question of chapters 2-8 with its chapter id ("c02" ... "c08").
for i in range(2, 9):
    chapter_path = f"{directory}/chapter_{i}.json"
    with open(chapter_path, "r", encoding="utf-8") as f:
        questions = json.load(f)

    chapter_tag = f"c0{i}"
    for question in questions:
        question["chapter"] = chapter_tag

    with open(chapter_path, "w", encoding="utf-8") as f:
        json.dump(questions, f, ensure_ascii=False, indent=4)
    print(f"第{i}章节处理完成")
    print("="*20)

## 章节信息补充

In [4]:
import json

# Replace every chapter's "knowledge_points" with an empty list and save the file back.
chapters_path = "/root/autodl-tmp/EasyDS/data/ds_data/chapters.json"
with open(chapters_path, "r", encoding="utf-8") as f:
    chapters = json.load(f)

for chapter in chapters:
    chapter.update(knowledge_points=[])

with open(chapters_path, "w", encoding="utf-8") as f:
    json.dump(chapters, f, ensure_ascii=False, indent=4)


In [5]:
# Attach to every chapter the IDs of the knowledge points whose "chapter_id" matches it.
# all_knowledgepoints.json is produced by the merge cell further down this
# notebook, so that cell has to have run before this one.
with open("/root/autodl-tmp/EasyDS/data/ds_data/chapters.json", "r", encoding="utf-8") as f:
    chapters = json.load(f)
with open("/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/all_knowledgepoints.json", "r", encoding="utf-8") as f:
    all_knowledgepoints = json.load(f)
for chapter in chapters:
    linked_ids = chapter["knowledge_points"]
    for kp in all_knowledgepoints:
        # Skip IDs already present so re-running the cell does not add duplicates.
        if kp["chapter_id"] == chapter["id"] and kp["id"] not in linked_ids:
            linked_ids.append(kp["id"])
with open("/root/autodl-tmp/EasyDS/data/ds_data/chapters.json", "w", encoding="utf-8") as f:
    json.dump(chapters, f, ensure_ascii=False, indent=4)


## 补充知识点的问题列表

In [12]:
import json

# For chapters 2-8, record on each knowledge point the IDs of the questions that reference it.
for i in range(2, 9):
    questions_path = f"/root/autodl-tmp/EasyDS/data/ds_data/questions/chapter_{i}.json"
    knowledgepoints_path = f"/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/chapter_{i}.json"
    with open(questions_path, "r", encoding="utf-8") as f:
        questions = json.load(f)
    with open(knowledgepoints_path, "r", encoding="utf-8") as f:
        knowledgepoints = json.load(f)
    for question in questions:
        for kp in question["knowledge_points"]:
            for kp_item in knowledgepoints:
                # Skip IDs already recorded so re-running the cell does not add duplicates.
                if kp_item["id"] == kp and question["id"] not in kp_item["questions"]:
                    kp_item["questions"].append(question["id"])
    with open(knowledgepoints_path, "w", encoding="utf-8") as f:
        json.dump(knowledgepoints, f, ensure_ascii=False, indent=4)


## 对知识点进行关联

In [3]:
import json

# Merge the per-chapter knowledge point files (chapters 1-8) into one list,
# clearing every "related_points" list on the way, and save the result.
all_knowledgepoints = []
for i in range(1, 9):
    with open(f"/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/chapter_{i}.json", "r", encoding="utf-8") as f:
        knowledgepoints = json.load(f)
    for kp in knowledgepoints:
        kp["related_points"] = []
    all_knowledgepoints.extend(knowledgepoints)

with open("/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/all_knowledgepoints.json", "w", encoding="utf-8") as f:
    json.dump(all_knowledgepoints, f, ensure_ascii=False, indent=4)



In [None]:
# Display how many knowledge points the merged list holds.
len(all_knowledgepoints)

In [None]:
from langchain_deepseek import ChatDeepSeek
from langchain_core.prompts import ChatPromptTemplate
from typing import TypedDict, Literal
import os
from dotenv import load_dotenv
load_dotenv()
import json

# Relation types are fixed to "prerequisite", "related" and "extension"; when
# there is no relation the value is the string "no_relation" (not None).
class KnowledgePointRelation(TypedDict):
    """Structured-output schema for the relation between two knowledge points."""
    related_knowledge_points: Literal["prerequisite", "related", "extension", "no_relation"]

# The system message carries the current knowledge point; the human message
# carries the candidate whose relation to it is being judged.
prompt = ChatPromptTemplate([
    ("system", "当前的知识点为\ntitle: {title}\ndescription: {description}\n判断用户输入的知识点与当前知识点之间的关系，关系类型固定为\"prerequisite\", \"extension\", 如果没有很明确的前后置关系则返回\"no_relation\"。"),
    ("human", "用户输入的知识点为\ntitle: {user_title}\ndescription: {user_description}"),
])

model = ChatDeepSeek(model="deepseek-chat", api_key=os.getenv("DEEPSEEK_API_KEY"))
chain = prompt | model.with_structured_output(KnowledgePointRelation)

ALL_KNOWLEDGEPOINTS_PATH = "/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/all_knowledgepoints.json"
# Relation stored on the other knowledge point for each relation reported by
# the model; "related" is symmetric, so it is mirrored unchanged.
INVERSE_RELATION = {
    "prerequisite": "extension",
    "extension": "prerequisite",
    "related": "related",
}
# Index of the first knowledge point still to be processed (earlier ones were
# handled in previous runs); set to 0 to process everything.
RESUME_FROM = 96

# Take the bounds from the data instead of hard-coding the list size.
with open(ALL_KNOWLEDGEPOINTS_PATH, "r", encoding="utf-8") as f:
    total_knowledgepoints = len(json.load(f))

for i in range(RESUME_FROM, total_knowledgepoints - 1):
    # Reload and save once per outer step so finished steps persist if a later call fails.
    with open(ALL_KNOWLEDGEPOINTS_PATH, "r", encoding="utf-8") as f:
        all_knowledgepoints = json.load(f)
    current = all_knowledgepoints[i]
    for j in range(i + 1, len(all_knowledgepoints)):
        candidate = all_knowledgepoints[j]
        relation = chain.invoke({
            "title": current["title"],
            "description": current["description"],
            "user_title": candidate["title"],
            "user_description": candidate["description"],
        })
        relation_type = relation["related_knowledge_points"]
        if relation_type == "no_relation":
            continue
        current["related_points"].append({
            "id": candidate["id"],
            "relation_type": relation_type
        })
        inverse_type = INVERSE_RELATION.get(relation_type)
        if inverse_type is not None:
            candidate["related_points"].append({
                "id": current["id"],
                "relation_type": inverse_type
            })

    with open(ALL_KNOWLEDGEPOINTS_PATH, "w", encoding="utf-8") as f:
        json.dump(all_knowledgepoints, f, ensure_ascii=False, indent=4)

    print(f"第{i+1}个知识点处理完成")
    print("="*20)

    


## 知识点速览

In [10]:
import json

# Load the merged knowledge point list for the clean-up in the next cells.
all_knowledgepoints_path = "/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/all_knowledgepoints.json"
with open(all_knowledgepoints_path, "r", encoding="utf-8") as f:
    all_knowledgepoints = json.load(f)



In [11]:
# Tidy the "summry" text of every knowledge point that has one.
for knowledgepoint in all_knowledgepoints:
    if "summry" not in knowledgepoint:
        continue
    # Replace each pair of consecutive newlines with one, then strip every line.
    collapsed = knowledgepoint["summry"].replace("\n\n", "\n")
    knowledgepoint["summry"] = "\n".join(map(str.strip, collapsed.split("\n")))

In [12]:
# Write the cleaned knowledge points back to the merged file.
all_knowledgepoints_path = "/root/autodl-tmp/EasyDS/data/ds_data/knowledgepoints/all_knowledgepoints.json"
with open(all_knowledgepoints_path, "w", encoding="utf-8") as f:
    json.dump(all_knowledgepoints, f, ensure_ascii=False, indent=4)

## 索引

In [12]:
from index_builder import KnowledgeIndexSystem

# Load the prebuilt index system from disk.
# NOTE(review): the ".pkl" suffix suggests a pickle file — if so, only load files from a trusted source; confirm in index_builder.
system = KnowledgeIndexSystem.load_indices('/root/autodl-tmp/EasyDS/data/ds_data/ds_indices.pkl')

In [11]:
# Display the "summry" text stored for knowledge point "kc0111" (the key is spelled "summry" in the data).
system.get_knowledge_point("kc0111")['summry']

'**数据**  \n- **定义**：信息的载体，可被计算机程序处理的符号集合（数、字符等），是程序加工的原料。  \n\n**数据元素**  \n- **定义**：数据的基本单位，由若干**数据项**（不可分割的最小单位）组成。  \n- **示例**：学生记录（数据元素）包含学号、姓名、性别等数据项。  \n\n**数据对象**  \n- **定义**：相同性质数据元素的集合，是数据的子集。  \n- **示例**：整数数据对象为集合 `N={0, ±1, ±2, ...}`。  \n\n**数据类型**  \n- **定义**：值的集合及其上定义的操作总称。  \n- **分类**：  \n  1. **原子类型**：值不可再分（如`int`）。  \n  2. **结构类型**：值可分解为多个成分（如`struct`）。  \n  3. **抽象数据类型（ADT）**：数学模型及操作集合（如栈、队列的接口定义）。  \n\n**数据结构**  \n- **定义**：数据元素的集合及其间的特定关系，包含三方面：  \n  1. **逻辑结构**：数据间的抽象关系（线性、树形、图等）。  \n  2. **存储结构**：逻辑结构在计算机中的实现（顺序存储、链式存储等）。  \n  3. **数据运算**：对数据的基本操作（插入、删除、查找等）。  \n- **关键点**：算法设计依赖逻辑结构，实现依赖存储结构。  \n\n**记忆提示**：  \n- 数据元素≈实体，数据项≈属性；ADT≈接口，数据结构≈接口+实现。'

In [9]:
# Map every chapter id in the index to that chapter's knowledge point list.
chapters_knowledge_points = {
    chapter_id: chapter_info['knowledge_points']
    for chapter_id, chapter_info in system.chapter_index.items()
}

In [10]:
# Display the knowledge point ids stored under chapter key "01".
chapters_knowledge_points["01"]

['kc0111', 'kc0112', 'kc0121', 'kc0122']