In [None]:
import os
import shutil

def copy_markdown_files(src_dir, dest_dir):
    """Recursively copy every Markdown file under ``src_dir`` into ``dest_dir``.

    The directory tree is flattened: each ``.md`` file is copied straight into
    ``dest_dir`` under its own base name, so a file met later in the walk
    overwrites an earlier one that has the same name.

    Args:
        src_dir: Root directory that is walked recursively.
        dest_dir: Directory receiving the copies; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest_dir, exist_ok=True)

    for root, dirs, files in os.walk(src_dir):
        for file in files:
            if not file.endswith(".md"):  # only Markdown files are copied
                continue
            src_file = os.path.join(root, file)
            dest_file = os.path.join(dest_dir, file)
            # If dest_dir is located inside src_dir, the walk also reaches files
            # that were already copied; copying a file onto itself would raise
            # shutil.SameFileError, so such files are skipped.
            if os.path.abspath(src_file) == os.path.abspath(dest_file):
                continue
            shutil.copy(src_file, dest_file)
            print(f"复制文件: {src_file} 到 {dest_file}")

# Example usage
src_directory = "papers/"  # source directory that is scanned recursively
dest_directory = "all_md/"  # destination directory receiving the copies

copy_markdown_files(src_dir=src_directory, dest_dir=dest_directory)


In [None]:
import os
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from multiprocessing import Pool, Manager
from tqdm import tqdm  # 进度条库

# Split on level-1 headings only; the heading text is read back later from the
# "Header" metadata key of each chunk.
headers_to_split_on = [("#", "Header")]

def process_markdown_file(file_path):
    """Extract the introduction and discussion sections of one Markdown file.

    The file is split with ``MarkdownHeaderTextSplitter`` using the module-level
    ``headers_to_split_on``. Starting from the third chunk, the first header that
    equals "introduction" (falling back to "background") and the first header
    that equals "discussion" are located; the comparison is case-insensitive.

    Args:
        file_path: Path of the UTF-8 Markdown file to read.

    Returns:
        A pair ``(results1, results2)`` of dicts with the keys "text" and "meta",
        for the introduction and the discussion respectively. "text" is a single
        space when the section was not found. ``(None, None)`` is returned when
        reading or splitting the file raises.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            markdown_content = file.read()

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
        chunks = markdown_splitter.split_text(markdown_content)

        # The first two chunks are excluded from the search (presumably title /
        # front matter -- TODO confirm). .get() keeps a chunk without a header
        # from aborting the whole file with a KeyError.
        header_key = [chunk.metadata.get('Header', '').lower() for chunk in chunks[2:]]

        def find_section(*names):
            """Return the chunk index of the first matching header name, or None."""
            for name in names:
                if name in header_key:
                    # +2 maps a position in header_key back to an index in chunks.
                    return header_key.index(name) + 2
            return None

        index_1 = find_section('introduction', 'background')
        index_2 = find_section('discussion')
        title = 'none'  # title extraction is currently disabled

        # basename works for nested, absolute and OS-specific paths, whereas
        # splitting on '/' and taking element 1 only works for "dir/file.md".
        name = os.path.basename(file_path)

        results1 = {
            "text": chunks[index_1].page_content if index_1 is not None else ' ',
            "meta": {"title": title, "name": name, "type": "introduction"}
        }
        results2 = {
            "text": chunks[index_2].page_content if index_2 is not None else ' ',
            "meta": {"title": title, "name": name, "type": "discussion"}
        }

        return results1, results2
    except Exception as e:
        # Best effort: a file that cannot be processed is reported and skipped.
        print(f"Error processing file {file_path}: {e}")
        return None, None

def write_to_jsonl(results, output_file):
    """Append every item of ``results`` to ``output_file`` as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in results
        )

def worker(args):
    """Process one file and append its two records to the shared result lists.

    ``args`` is a ``(file_path, result_list1, result_list2)`` tuple. Nothing is
    appended unless both records returned by ``process_markdown_file`` are truthy.
    """
    file_path, result_list1, result_list2 = args
    intro_record, discussion_record = process_markdown_file(file_path)
    if not (intro_record and discussion_record):
        return
    result_list1.append(intro_record)
    result_list2.append(discussion_record)

def process_files_in_parallel(input_dir, output_file1, output_file2, num_workers=4):
    """Process every Markdown file of a folder in parallel and write two JSONL files.

    Args:
        input_dir: Folder whose ``.md`` files (non-recursive) are processed.
        output_file1: JSONL file receiving the first record of each file.
        output_file2: JSONL file receiving the second record of each file.
        num_workers: Number of worker processes in the pool.

    Files for which ``process_markdown_file`` returns ``(None, None)`` are
    skipped. Records are appended to the output files in completion order.
    """
    # Collect the paths of all Markdown files in the folder.
    markdown_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".md")]

    # Each (record1, record2) pair is received in the parent process and stored
    # together. Appending to two shared lists from the workers can interleave
    # between processes, which breaks the row alignment of the two output files.
    intro_records = []
    discussion_records = []

    with Pool(processes=num_workers) as pool:
        # tqdm wraps the result stream to show a progress bar.
        with tqdm(total=len(markdown_files), desc="Processing files") as pbar:
            for result1, result2 in pool.imap_unordered(process_markdown_file, markdown_files):
                if result1 and result2:
                    intro_records.append(result1)
                    discussion_records.append(result2)
                pbar.update()

    # Write the collected records to the two JSONL files.
    write_to_jsonl(intro_records, output_file1)
    write_to_jsonl(discussion_records, output_file2)

In [None]:
if __name__ == "__main__":
    # Input folder and the two JSONL outputs (introduction / discussion).
    input_directory = 'all_md/'
    output_jsonl1 = 'introduction.jsonl'
    output_jsonl2 = 'discussion.jsonl'

    process_files_in_parallel(
        input_dir=input_directory,
        output_file1=output_jsonl1,
        output_file2=output_jsonl2,
        num_workers=64,
    )

In [343]:
import os
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from multiprocessing import Pool, Manager
from tqdm import tqdm  # 进度条库

# Split on heading levels 1-4; each level is stored under its own metadata key.
# NOTE(review): this rebinds the module-level headers_to_split_on that
# process_markdown_file reads at call time; that function looks up the
# 'Header' metadata key, which this configuration does not produce.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

# Load one sample book and split it into chunks; later cells read `chunks`.
with open('pretrained_data/outputs_books/Cancer Prevention and Management through Exercise and Weight Control (Nutrition and Disease Prevention). (Anne McTiernan) (Z-Library).md', "r", encoding="utf-8") as file:
    markdown_content = file.read()
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
chunks = markdown_splitter.split_text(markdown_content)


In [344]:
import re

def process_lines(input_string, min_words=20):
    """Clean a text line by line and keep only the lines that remain substantial.

    For every line of ``input_string``:

    * the line is dropped when it looks like an author affiliation (contains
      "department of", "university", "school of" or "md," in any case, or the
      case-sensitive token "PhD");
    * editorial phrases, journal citation stubs, bracketed and parenthesised
      content, content between vertical bars, runs of three or more dashes,
      URLs, e-mail addresses, ``**bold**`` spans and leftover bars are deleted;
    * runs of whitespace are collapsed and the line is stripped;
    * the line is kept only if it still has more than ``min_words`` words.

    Args:
        input_string: Text to clean; it is split with ``str.splitlines``.
        min_words: A line must contain strictly more words than this to be kept.

    Returns:
        The surviving lines joined with "\\n" (an empty string if none survive).
    """
    affiliation_markers = ('department of', 'university', 'school of', 'md,')

    # Applied in this order; every match is deleted from the line.
    removal_patterns = tuple(re.compile(expr) for expr in (
        r"(Received in original form|revised form|accepted|Published online)",
        r"(Journal of [A-Za-z\s]+, Vol\.\s+\d+,\s+No\.\s+\d+,\s+[A-Za-z]+(?: \d{4}), pp\s+\d+-DOI:\s+\d+\.\d+\.\w+).*?",
        r"\(.*?\)|\[.*?\]",   # content inside () and []
        r"\|.*?\|",           # content between vertical bars
        r"---+",              # runs of three or more dashes
        r"https?://\S+|www\.\S+",  # URLs
        # E-mail addresses. The TLD class is [A-Za-z]; a '|' inside the class
        # would be a literal bar and let the match run on past "x@y.com|".
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        r"\*{2}.*?\*{2}",     # content enclosed in ** **
        r"\|<\d+\.\d+\s*\|",
        r"\|+",               # any leftover bars
    ))
    multi_space = re.compile(r"\s{2,}")

    processed_lines = []
    for line in input_string.splitlines():
        lowered = line.lower()
        if 'PhD' in line or any(marker in lowered for marker in affiliation_markers):
            continue

        for pattern in removal_patterns:
            line = pattern.sub("", line)

        # Whitespace is normalised last so that gaps left behind by the
        # removals above are collapsed as well.
        line = multi_space.sub(" ", line).strip()

        if len(line.split()) > min_words:
            processed_lines.append(line)

    return "\n".join(processed_lines)


In [348]:
import re
import json


def remove_brackets_content(input_string):
    """
    Clean a chunk of text and discard it entirely when it looks like noise.

    The chunk is rejected up front (empty string) if any of its lines starts
    with a dash ('-'). Otherwise it is cleaned line by line with
    ``process_lines`` and the surviving lines are merged into a single line.
    The merged text is then rejected if, relative to its word count,

    * the number of line breaks before merging exceeds 0.05, or
    * the number of '-' characters exceeds 0.3, or
    * the number of digit characters exceeds 0.3.

    Args:
        input_string (str): The input string.

    Returns:
        str: The cleaned single-line text, or an empty string when the chunk
             was rejected or nothing survived the cleaning.
    """
    # Reject the whole chunk if any line starts with '-'.
    if any(line.strip().startswith('-') for line in input_string.splitlines()):
        return ""

    cleaned = process_lines(input_string)

    # Line breaks must be counted before they are flattened away; counting
    # them afterwards would always give zero.
    newline_count = cleaned.count("\n")
    # Join with a space so the last word of one line and the first word of the
    # next are not glued together.
    result = cleaned.replace('\n', ' ')

    total_words = len(result.split())
    if total_words == 0:
        return result

    dash_count = result.count("-")
    numeric_count = sum(c.isdigit() for c in result)  # digit characters, not numbers

    too_many_breaks = (newline_count / total_words) > 0.05
    too_many_dashes = (dash_count / total_words) > 0.3
    too_many_digits = (numeric_count / total_words) > 0.3
    if too_many_breaks or too_many_dashes or too_many_digits:
        return ""  # Clear the text if any condition is met
    return result

    
#remove_brackets_content(chunks[87].page_content)

In [349]:
# Display the raw text of the first chunk.
chunks[0].page_content

'Editorial Advisory Board  \nCAROLYN D. BERDANIER, PH.D. University of Georgia. Athens. Georgia. U.S.A.'

In [350]:
# Print the cleaned text of every chunk except the last one.
n=0
for i, doc in enumerate(chunks[:-1]):
    print("-------------------------------------------------------")
    print(f"Document {i+1}:")
   # print("Page content:")
    print(remove_brackets_content(doc.page_content))
    # NOTE(review): the guard below is disabled, so n is incremented on every
    # iteration instead of only for chunks whose cleaned text is empty.
    #if remove_brackets_content(doc.page_content) == '':
    n +=1
    # Disabled metadata dump:
   # print("Metadata:")
    #for key, value in doc.metadata.items():
     #   print(f"{key}: {value}")
    
    print("\n")


-------------------------------------------------------
Document 1:



-------------------------------------------------------
Document 2:



-------------------------------------------------------
Document 3:



-------------------------------------------------------
Document 4:



-------------------------------------------------------
Document 5:



-------------------------------------------------------
Document 6:



-------------------------------------------------------
Document 7:



-------------------------------------------------------
Document 8:



-------------------------------------------------------
Document 9:



-------------------------------------------------------
Document 10:
A CRC title, part of the Taylor & Francis imprint, a member of the Taylor & Francis Group, the academic division of T&F Informa plc.Published in 2006 by CRC Press Taylor & Francis Group 6000 Broken Sound Parkway NW, Suite 300 Boca Raton, FL 33487-2742No claim to original U.S. Government work

In [351]:
# Print the raw text of every chunk except the last one.
n=0
for i, doc in enumerate(chunks[:-1]):
    print("-------------------------------------------------------")
    print(f"Document {i+1}:")
   # print("Page content:")
    print(doc.page_content)
    # NOTE(review): the guard below is disabled, so n is incremented on every
    # iteration instead of only for chunks whose cleaned text is empty.
    #if remove_brackets_content(doc.page_content) == '':
    n +=1
    # Disabled metadata dump:
   # print("Metadata:")
    #for key, value in doc.metadata.items():
     #   print(f"{key}: {value}")
    
    print("\n")


-------------------------------------------------------
Document 1:
Editorial Advisory Board  
CAROLYN D. BERDANIER, PH.D. University of Georgia. Athens. Georgia. U.S.A.


-------------------------------------------------------
Document 2:
Pennington Biomedical Research Center, Louisiana State University, Baton Rouge, Louisiana, U.S.A.


-------------------------------------------------------
Document 3:
University of Toronto Medical School, Toronto, Canada


-------------------------------------------------------
Document 4:
The University at Buffalo. The State University of New York. Buffalo, New York, U.S.A.


-------------------------------------------------------
Document 5:
Cornell University, Ithaca, New York, U.S.A.


-------------------------------------------------------
Document 6:
Carolyn D. Berdanier, Ph.D., Professor Emerita, University of Georgia, Athens, Watkinsville, Georgia  
Naima Moustaid-Moussa, Ph.D., University of Tennessee, Knoxville, Tennessee


---------------

In [334]:
import re

def remove_non_english(text):
    """Return ``text`` with every character removed that is neither an ASCII letter nor whitespace."""
    return re.sub(r'[^a-zA-Z\s]', '', text)

# Sample text
text = "111"
cleaned_text = remove_non_english(text)
pattern = re.compile(r'[^a-zA-Z\s]')
# True when text contains at least one character that is neither an ASCII
# letter nor whitespace.
pattern.search(text) is not None

True