In [4]:
import os
import openai
from typing import List

def identify_non_content_lines(text_lines: List[str], start_line: int = 0) -> List[int]:
    """
    Use LLM to identify non-content lines in text
    
    Args:
        text_lines: List of text lines
        start_line: Starting line number
    
    Returns:
        List of line numbers to be removed
    """
    # Combine text lines into a string with line numbers
    numbered_text = ""
    for i, line in enumerate(text_lines, start=start_line):
        numbered_text += f"Line {i}: {line}\n"
    
    # Construct prompt
    prompt = f"""Please identify all non-content lines in the following text. Keep only main paragraphs and headings/titles.
Remove all other elements (including headers, footers, figure/table captions, references, author information, etc.).
Return only the line numbers to be removed, separated by commas. For example: 1,5,8

Text content:
{numbered_text}"""

    # Call API using new interface
    client = openai.OpenAI()
    openai.api_key = os.getenv("SPARE_OPENAI_API_KEY")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant helping to identify non-content lines in scientific literature."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    print(response.choices[0].message.content)
    # Parse returned line numbers
    try:
        line_numbers = [int(n.strip()) for n in response.choices[0].message.content.split(',')]
        return line_numbers
    except:
        return []

def process_text_file(file_path: str) -> List[int]:
    """
    Process text file by calling LLM every 20 lines to identify non-content
    
    Args:
        file_path: Path to text file
    
    Returns:
        List of all line numbers to be removed
    """
    lines_to_remove = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        all_lines = f.readlines()
        
    # Process in batches of 20 lines
    batch_size = 20
    for i in range(0, len(all_lines), batch_size):
        batch_lines = all_lines[i:i+batch_size]
        batch_results = identify_non_content_lines(batch_lines, start_line=i)
        print(f"Batch {i} results: {batch_results}")
        lines_to_remove.extend(batch_results)
    
    return sorted(lines_to_remove)

def remove_lines(file_path: str, line_numbers: tuple[int]) -> None:
    """
    从文本文件中删除指定行号的内容
    
    Args:
        file_path: 文本文件路径
        line_numbers: 需要删除的行号列表
    """
    # 读取所有行
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # 过滤掉需要删除的行
    filtered_lines = [line for idx, line in enumerate(lines) if idx not in line_numbers]
    
    # 写回文件
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(filtered_lines)



In [13]:
# Usage example
file_path = r"D:\CursorProj\Chem-Ontology-Constructor\tests\Supramolecular\data\supramolecule\barrow-et-al-2015-cucurbituril-based-molecular-recognition.txt" 
non_content_lines = process_text_file(file_path)
print(f"Lines to remove: {non_content_lines}")

0,1,2,3,4,6,8,10,12,14,16,18
Batch 0 results: [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
20,22,24,26,28,31,33,35,37,39
Batch 20 results: [20, 22, 24, 26, 28, 31, 33, 35, 37, 39]
41, 43, 45, 47, 50, 52, 54, 57
Batch 40 results: [41, 43, 45, 47, 50, 52, 54, 57]
60,63,65,68,71,74,77
Batch 60 results: [60, 63, 65, 68, 71, 74, 77]
80,83,86,88,90,92,94,96,98
Batch 80 results: [80, 83, 86, 88, 90, 92, 94, 96, 98]
100,102,105,107,109,111,113,115,116,117,118,119
Batch 100 results: [100, 102, 105, 107, 109, 111, 113, 115, 116, 117, 118, 119]
120,121,122,123,124,125,126,127,128,129,130,131,132,133
Batch 120 results: [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133]
144,145,146,147,148,149,150,151,152,153,154
Batch 140 results: [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154]
There are no non-content lines in the provided text. Therefore, no lines need to be removed.
Batch 160 results: []
There are no non-content lines in the provided text.
Batch 180 results: []
213, 214

In [10]:
for value in [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]:
    non_content_lines.remove(value)


In [14]:
# 检查重复值
print("所有行号:", non_content_lines)
print("重复的行号:", [x for x in non_content_lines if non_content_lines.count(x) > 1])
print("去重后的行号数量:", len(set(non_content_lines)), "原始行号数量:", len(non_content_lines))

所有行号: [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 31, 33, 35, 37, 39, 41, 43, 45, 47, 50, 52, 54, 57, 60, 63, 65, 67, 68, 68, 69, 70, 71, 71, 72, 73, 74, 74, 77, 80, 83, 86, 88, 90, 92, 94, 96, 96, 98, 100, 102, 105, 107, 109, 111, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 213, 214, 215, 216, 217, 218, 219, 220, 243, 256, 260, 261, 262, 263, 264, 266, 267, 268, 269, 271, 272, 273, 275, 276, 277, 278, 279, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 313, 314, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 352, 353, 355, 356, 357, 358, 359, 365, 394, 428, 430, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 480, 500, 501, 509, 510, 513, 514, 518, 

In [15]:
# 使用示例
remove_lines(file_path, tuple(non_content_lines))
print(f"已删除 {len(non_content_lines)} 行内容")


已删除 1326 行内容
