In [83]:
import os
import pymupdf  # PyMuPDF
import re
import json


In [122]:
def normalize_text(text):
        """标准化文本，去除特殊字符并转换为小写"""
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # 去除特殊字符
        text = text.lower()  # 转换为小写
        return text

def find_title(texts, filename):
    
    filename_without_ext = filename[:-4] if filename.endswith('.pdf') else filename # 移除 .pdf 扩展名
    if " - " in filename_without_ext:
        filename_title = filename_without_ext.split(" - ")[-1] # 提取 " - " 之后的部分
    else:
        filename_title = filename_without_ext # 如果没有 " - "，则使用移除扩展名后的文件名
    
    # for text in texts:
    #     print(text)
    # print(filename)

    normalized_filename = normalize_text(filename_title)
    best_match_index = -1
    max_similarity = 0

    for index, text in enumerate(texts):
        normalized_text = normalize_text(text)

        # 计算相似度 (这里使用简单的字符串包含判断，可以根据需要替换为更复杂的相似度算法)
        similarity_score = 0
        filename_words = normalized_filename.split()
        text_words = normalized_text.split()

        common_words = set(filename_words) & set(text_words) # 计算共同词汇
        similarity_score = len(common_words) / len(filename_words) if filename_words else 0 # 相似度定义为共同词汇占filename词汇的比例


        if similarity_score > max_similarity:
            max_similarity = similarity_score
            best_match_index = index

    if best_match_index != -1:
        return texts[best_match_index]
    else:        
        if filename_title:
            return '(filename)'+filename_title
        else:
            return None # 如果文件名提取也失败，则返回 None
    


def extract_title_and_abstract(pdf_path, filename):
    with pymupdf.open(pdf_path) as doc:
        temp = [page.get_text() for page in doc]
        # 查找摘要和引言之间的内容
        content = ""
        in_abstract = False
        for page in temp:
            lines = page.splitlines()
            for line in lines:
                if 'abstract' in line.lower() or 'a b s t r a c t' in line.lower():  # 允许 'a b s t r a c t' 形式
                    in_abstract = True
                elif 'introduction' in line.lower():
                    in_abstract = False
                if in_abstract:
                    content += line + "\n"
        
        abstract = content.strip()

        # 如果摘要为空，则选取引言前的内容作为摘要
        if abstract == "''":
            content = ""
            in_introduction = False
            for page in temp:
                lines = page.splitlines()
                for line in lines:
                    if 'introduction' in line.lower():
                        in_introduction = True
                        break
                    content += line + "\n"
                if in_introduction:
                    break
            abstract = content.strip()

        potential_title = [item[4] for item in doc[0].get_text("blocks")]
        title = find_title(potential_title, filename)
        return title, abstract

def convert_pdfs_in_directory(directory,save_path):
    results = []
    pdf_files = [filename for filename in os.listdir(directory) if filename.endswith('.pdf')] # 获取所有 PDF 文件名
    total_pdfs = len(pdf_files) # 计算 PDF 文件总数
    processed_pdfs = 0 # 初始化已处理 PDF 计数器
    error_list = [] # 初始化错误列表

    for filename in pdf_files: # 遍历 PDF 文件列表
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory, filename)
            try:
                title, abstract = extract_title_and_abstract(pdf_path, filename)
                results.append({"Title": repr(title), "Abstract": repr(abstract)})
            except Exception as e: # 捕获 pymupdf 可能抛出的任何异常
                print(f"处理 PDF 文件 {filename} 时发生错误: {e}") # 打印错误信息
                error_list.append(filename) # 将文件名添加到错误列表
                continue # 跳过当前 PDF 文件，继续处理下一个

            processed_pdfs += 1 # 处理完一个 PDF，计数器加 1
            percentage_processed = (processed_pdfs / total_pdfs) * 100 # 计算已处理百分比
            print(f"已处理 PDF: {percentage_processed:.2f}% ({processed_pdfs}/{total_pdfs})") # 打印进度信息

    # 保存为 JSON 文件
    with open(f"{save_path}.json", "w", encoding="utf-8") as json_file:
        json.dump(results, json_file, ensure_ascii=False, indent=4)

    return error_list # 返回错误列表

In [116]:
test = convert_pdfs_in_directory(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\temp',r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\temp\result')

CO2 capture using aqueous 1-(2-Hydroxyethyl) piperidine and its
blends with piperazine: Solubility and enthalpy
Shubhashis Adak , Madhusree Kundu *
Department of Chemical Engineering, National Institute of Technology, Rourkela, India
a r t i c l e i n f o
Article history:
Received 4 August 2019
Received in revised form
3 January 2020
Accepted 6 January 2020
Available online 13 January 2020
Keywords:
CO2 capture
1-(2-Hydroxyethyl) piperidine
Piperazine
Activity coefﬁcient model
pKa
Enthalpy
a b s t r a c t
In this article, equilibrium CO2 solubility in aqueous 1-(2-Hydroxyethyl) piperidine (HEP) solutions
pertaining concentrations (1, 2, 3) mol/L in the temperature range (303.15e323.15) K, and CO2 partial
pressure range 0.1e100 kPa are presented. HEP; being a tertiary alkanolamine, its rate of CO2 absorption
is comparatively slower than primary or secondary alkanolamine. Piperazine (PZ), a rate promoter was
added to HEP in 1:4 mol ratio to form blends having enhance rate of CO2 absorpti

In [123]:

# 使用示例
error_list = convert_pdfs_in_directory(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\pdfs',r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\result\result')


已处理 PDF: 0.18% (1/569)
已处理 PDF: 0.35% (2/569)
已处理 PDF: 0.53% (3/569)
已处理 PDF: 0.70% (4/569)
已处理 PDF: 0.88% (5/569)
已处理 PDF: 1.05% (6/569)
已处理 PDF: 1.23% (7/569)
已处理 PDF: 1.41% (8/569)
已处理 PDF: 1.58% (9/569)
已处理 PDF: 1.76% (10/569)
已处理 PDF: 1.93% (11/569)
已处理 PDF: 2.11% (12/569)
已处理 PDF: 2.28% (13/569)
已处理 PDF: 2.46% (14/569)
已处理 PDF: 2.64% (15/569)
已处理 PDF: 2.81% (16/569)
已处理 PDF: 2.99% (17/569)
已处理 PDF: 3.16% (18/569)
已处理 PDF: 3.34% (19/569)
已处理 PDF: 3.51% (20/569)
已处理 PDF: 3.69% (21/569)
已处理 PDF: 3.87% (22/569)
已处理 PDF: 4.04% (23/569)
已处理 PDF: 4.22% (24/569)
已处理 PDF: 4.39% (25/569)
已处理 PDF: 4.57% (26/569)
已处理 PDF: 4.75% (27/569)
已处理 PDF: 4.92% (28/569)
已处理 PDF: 5.10% (29/569)
已处理 PDF: 5.27% (30/569)
已处理 PDF: 5.45% (31/569)
已处理 PDF: 5.62% (32/569)
已处理 PDF: 5.80% (33/569)
已处理 PDF: 5.98% (34/569)
已处理 PDF: 6.15% (35/569)
已处理 PDF: 6.33% (36/569)
已处理 PDF: 6.50% (37/569)
已处理 PDF: 6.68% (38/569)
已处理 PDF: 6.85% (39/569)
已处理 PDF: 7.03% (40/569)
已处理 PDF: 7.21% (41/569)
已处理 PDF: 7.38% (42/569)
已

In [119]:
error_list

[]

In [124]:
import json

def read_json_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    return data

result_data = read_json_file(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\result\result.json')
print(result_data[12])  # 打印读取的数据以进行验证



{'Title': "'An integrated experimental and theoretical\\napproach to probe Cr(VI) uptake using decorated\\nhalloysite nanotubes for eﬃcient water treatment†\\n'", 'Abstract': '"f-HNTs\' abstract a proton from the HCrO4\\n−ion. This results in\\nthe creation of NH3\\n+ species. Consequently, an eﬃcient\\nadsorption process occurs due to the electrostatic attraction\\nbetween the negatively charged chromate ion and the positively\\ncharged NH3\\n+ groups. As the pH increases, the ability of the\\namine groups to accept protons diminishes, which results in\\nFig. 4\\nEDX spectra of (a) pristine HNTs (H) and (b–e) f-HNTs (H1–H4) samples.\\n© 2024 The Author(s). Published by the Royal Society of Chemistry\\nRSC Adv., 2024, 14, 2947–2960 | 2953\\nPaper\\nRSC Advances\\nOpen Access Article. Published on 18 January 2024. Downloaded on 10/15/2024 10:30:11 AM. \\n This article is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported Licence.\\nView Article Online\\na reduced u

In [126]:
long_entries = [entry for entry in result_data if len(entry["Abstract"]) > 5000]
empty_entries = [entry for entry in result_data if entry["Abstract"] == "''"]
normal_entries = [entry for entry in result_data if len(entry["Abstract"]) <= 5000 and entry["Abstract"] != "''"]

with open(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\result\long.json', 'w', encoding='utf-8') as long_file:
    json.dump(long_entries, long_file, ensure_ascii=False, indent=4)

with open(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\result\empty.json', 'w', encoding='utf-8') as empty_file:
    json.dump(empty_entries, empty_file, ensure_ascii=False, indent=4)

with open(r'D:\CursorProj\Chem-Ontology-Constructor\tests\Carbon\result\result_final.json', 'w', encoding='utf-8') as normal_file:
    json.dump(normal_entries, normal_file, ensure_ascii=False, indent=4)

print(f"长条目数量: {len(long_entries)}")
print(f"空条目数量: {len(empty_entries)}")
print(f"正常条目数量: {len(normal_entries)}")



长条目数量: 39
空条目数量: 59
正常条目数量: 471


In [104]:
print(result_data[12]["Abstract"]) 

"f-HNTs' abstract a proton from the HCrO4\n−ion. This results in\nthe creation of NH3\n+ species. Consequently, an eﬃcient\nadsorption process occurs due to the electrostatic attraction\nbetween the negatively charged chromate ion and the positively\ncharged NH3\n+ groups. As the pH increases, the ability of the\namine groups to accept protons diminishes, which results in\nFig. 4\nEDX spectra of (a) pristine HNTs (H) and (b–e) f-HNTs (H1–H4) samples.\n© 2024 The Author(s). Published by the Royal Society of Chemistry\nRSC Adv., 2024, 14, 2947–2960 | 2953\nPaper\nRSC Advances\nOpen Access Article. Published on 18 January 2024. Downloaded on 10/15/2024 10:30:11 AM. \n This article is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported Licence.\nView Article Online\na reduced uptake of Cr(VI) ions which subsequently reduces the\nadsorption eﬃciency. When using hydrazone-modi\ue103ed HNTs\n(H2), the adsorption eﬃciency peaked at 94% within 30 minutes\nat a pH of 3 as sh