In [None]:
import os
import shutil

def copy_markdown_files(src_dir, dest_dir):
    """Recursively copy every Markdown file under ``src_dir`` into ``dest_dir``.

    The source tree is flattened: all ``.md`` files land directly in
    ``dest_dir``, so a file copied later overwrites an earlier one with the
    same name.

    Args:
        src_dir: Root directory that is walked recursively.
        dest_dir: Directory that receives the copies; created if missing.
            It may live inside ``src_dir``; it is then excluded from the walk.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(dest_dir, exist_ok=True)
    dest_abs = os.path.abspath(dest_dir)

    for root, dirs, files in os.walk(src_dir):
        # Prune the destination from the walk: otherwise files that were just
        # copied would be copied onto themselves and shutil.copy would raise
        # SameFileError.
        dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) != dest_abs]
        if os.path.abspath(root) == dest_abs:
            continue

        for file in files:
            if not file.endswith(".md"):  # only Markdown files are copied
                continue
            print(file)
            src_file = os.path.join(root, file)
            dest_file = os.path.join(dest_dir, file)
            # An existing destination file is overwritten.
            shutil.copy(src_file, dest_file)
            print(f"复制文件: {src_file} 到 {dest_file}")

# Usage example
src_directory = "addition//"  # source directory path
dest_directory = "outputs/"  # destination directory path

# The copy itself is left disabled; uncomment the call below to run it.
#copy_markdown_files(src_directory, dest_directory)


In [None]:
import os
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from multiprocessing import Pool, Manager
from tqdm import tqdm  # 进度条库

# (heading marker, label) pairs handed to MarkdownHeaderTextSplitter;
# covers Markdown heading levels 1 through 4.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

def _section_header(chunk):
    """Return the lower-cased heading of the section a chunk belongs to ('' if none)."""
    # The splitter records every enclosing heading under the labels declared in
    # headers_to_split_on, so the chunk's own heading is the deepest level present.
    for _, level_label in reversed(headers_to_split_on):
        heading = chunk.metadata.get(level_label)
        if heading:
            return heading.strip().lower()
    return ""


def process_markdown_file(file_path):
    """Extract the introduction and discussion sections of one Markdown file.

    The file is split on its headings. Sections are searched from the third
    chunk onwards (the first two chunks are skipped, as before) for a heading
    that is exactly "introduction" (falling back to "background") and one that
    is exactly "discussion", compared case-insensitively.

    Args:
        file_path: Path of the UTF-8 Markdown file to read.

    Returns:
        A tuple ``(results1, results2)`` of dicts shaped
        ``{"text": ..., "meta": {"title", "name", "type"}}`` for the
        introduction and the discussion. ``text`` is ``' '`` when the section
        was not found. ``(None, None)`` is returned if anything fails; the
        error is printed rather than raised so that one bad file does not
        abort a batch.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            markdown_content = file.read()

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
        chunks = markdown_splitter.split_text(markdown_content)

        # Map heading -> index of the first chunk carrying it (chunks 0 and 1 skipped).
        section_index = {}
        for i in range(2, len(chunks)):
            section_index.setdefault(_section_header(chunks[i]), i)

        index_1 = section_index.get('introduction', section_index.get('background'))
        index_2 = section_index.get('discussion')
        title = 'none'  # title extraction is currently disabled
        # basename works for any directory depth and any OS path separator.
        name = os.path.basename(file_path)

        results1 = {
            "text": chunks[index_1].page_content if index_1 is not None else ' ',
            "meta": {"title": title, "name": name, "type": "introduction"}
        }
        # results2 must be built: it is returned below and consumed by the caller.
        results2 = {
            "text": chunks[index_2].page_content if index_2 is not None else ' ',
            "meta": {"title": title, "name": name, "type": "discussion"}
        }

        return results1, results2
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None, None

def write_to_jsonl(results, output_file):
    """Append each item of ``results`` to ``output_file`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding and non-ASCII
    characters are written as-is (no ``\\uXXXX`` escaping).
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        for record in results:
            line = json.dumps(record, ensure_ascii=False)
            sink.write(line)
            sink.write("\n")

def worker(args):
    """Process one file in a pool worker and store its results in the shared lists.

    ``args`` is a ``(file_path, result_list1, result_list2)`` tuple. Nothing is
    appended unless both results returned by ``process_markdown_file`` are truthy.
    """
    path, intro_sink, discussion_sink = args
    intro, discussion = process_markdown_file(path)
    if not (intro and discussion):
        return
    intro_sink.append(intro)
    discussion_sink.append(discussion)

def process_files_in_parallel(input_dir, output_file1, output_file2, num_workers=4):
    """Process the Markdown files of a directory in parallel and write two JSONL files.

    Only ``.md`` entries directly inside ``input_dir`` are handled. The first
    results of all files are appended to ``output_file1`` and the second
    results to ``output_file2``.
    """
    # Collect the paths of all Markdown files in the directory.
    markdown_files = []
    for entry in os.listdir(input_dir):
        if entry.endswith(".md"):
            markdown_files.append(os.path.join(input_dir, entry))

    # Manager-backed lists are shared between the worker processes.
    with Manager() as manager:
        result_list1 = manager.list()
        result_list2 = manager.list()

        # The pool runs the workers; tqdm advances once per finished file.
        with Pool(processes=num_workers) as pool:
            with tqdm(total=len(markdown_files), desc="Processing files") as pbar:
                tasks = [(path, result_list1, result_list2) for path in markdown_files]
                for _ in pool.imap_unordered(worker, tasks):
                    pbar.update()

        # Write the results while the manager (and its lists) is still alive.
        write_to_jsonl(result_list1, output_file1)
        write_to_jsonl(result_list2, output_file2)

In [None]:
if __name__ == "__main__":
    # Directory scanned for .md files, and the two JSONL outputs
    # (first result -> output_jsonl1, second result -> output_jsonl2).
    input_directory = 'all_md/'
    output_jsonl1 = 'introduction.jsonl'
    output_jsonl2 = 'discussion.jsonl'
    
    # NOTE: write_to_jsonl opens its target in append mode, so re-running this
    # cell adds to the existing output files instead of replacing them.
    process_files_in_parallel(input_directory, output_jsonl1, output_jsonl2, num_workers=64)

In [4]:
import os
import tiktoken
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from multiprocessing import Pool, Manager
from tqdm import tqdm  # 进度条库

# (heading marker, label) pairs handed to MarkdownHeaderTextSplitter;
# covers Markdown heading levels 1 through 4. This repeats the value assigned
# to the same name in an earlier cell.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]
def write_to_json(result, output_file):
    """Append a single ``result`` object to ``output_file`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding and non-ASCII
    characters are written as-is (no ``\\uXXXX`` escaping).
    """
    serialized = json.dumps(result, ensure_ascii=False)
    with open(output_file, "a", encoding="utf-8") as sink:
        sink.write(serialized)
        sink.write("\n")

In [5]:
def convert(file_path, output_file):
    """Split one Markdown file on its headings and append every chunk to a JSONL file.

    Each chunk is written as
    ``{"text": <chunk text>, "meta": {"book_name": <file name>, "num_token": <count>}}``.

    Args:
        file_path: Path of the UTF-8 Markdown file to read.
        output_file: File the chunks are appended to, one JSON object per line.

    Returns:
        The total number of ``o200k_base`` tokens over all chunks of the file
        (0 when the file yields no chunks).
    """
    with open(file_path, "r", encoding='utf-8') as f:
        markdown_document = f.read()

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_document)

    # basename gives the file name for any directory depth; the previous
    # split('/')[1] only did so for paths of the exact form "dir/file".
    book_name = os.path.basename(file_path)

    num = 0
    for doc in md_header_splits:
        content = doc.page_content
        token_num = num_tokens_from_string(content, "o200k_base")
        results = {"text": content,
                   "meta": {"book_name": book_name, 'num_token': token_num}
        }
        write_to_json(results, output_file)
        num += token_num
    # Return only after the loop: returning inside it stopped after the first chunk.
    return num

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
#num_tokens_from_string("tiktoken is great!", "o200k_base")

In [None]:
# Token-count every Markdown file under all_md/ and append one JSON line per
# heading-delimited chunk to pretrained_data.json.
nums = 0    # running total of tokens over all chunks
number = 0  # running total of chunks written
for root, dirs, files in os.walk('all_md/'):
    for file in tqdm(files):
        if not file.endswith(".md"):
            continue

        # Join with root so that files os.walk finds in sub-directories are
        # opened from where they actually live.
        with open(os.path.join(root, file), "r", encoding='utf-8') as f:
            markdown_document = f.read()

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        md_header_splits = markdown_splitter.split_text(markdown_document)

        contents = []
        for doc in md_header_splits:
            content = doc.page_content
            token_num = num_tokens_from_string(content, "o200k_base")
            results = {"text": content,
                       "meta": {"book_name": file, 'num_token': token_num}
                      }
            nums += token_num
            contents.append(results)

        # A distinct name for the output handle keeps the loop variable `file`
        # from being rebound to a file object.
        with open('pretrained_data.json', "a", encoding="utf-8") as out_file:
            number += len(contents)
            for result in contents:
                json.dump(result, out_file, ensure_ascii=False)
                out_file.write("\n")

 14%|██████████▉                                                                  | 4683/33083 [01:18<07:09, 66.09it/s]

In [10]:
nums

25765870

In [11]:
number

73140