In [6]:
import json
import langid
import re

In [3]:
# Load the pid -> title mapping; its keys define the set of records to process.
with open('pid_title.json') as title_file:
    pid_title = json.load(title_file)

pid_list = [*pid_title.keys()]

In [5]:
def _read_raw_text(pid):
    """Return the full contents of the raw text file stored for ``pid``."""
    with open(f'raw_data/{pid}.txt') as raw_file:
        return raw_file.read()


# Raw record text keyed by pid, in the same order as pid_list.
pid_raw_text = {pid: _read_raw_text(pid) for pid in pid_list}

In [39]:
# Pick one abstract candidate per record.
#   pid_abstract       -> pid: the single surviving chunk
#   zero_part          -> pids for which no chunk survived the filters
#   more_than_one_part -> pids that need manual review (the English filter
#                         did not leave exactly one chunk)
pid_abstract = {}
more_than_one_part = []
zero_part = []

for pid, raw_text in pid_raw_text.items():
    # Records are split on blank lines; the first chunk is always discarded.
    chunks = raw_text.split('\n\n')[1:]

    has_author_info = re.search(r'^Author information:', raw_text, re.MULTILINE) is not None
    # When the record has an author block, every chunk up to and including
    # that block is ignored; without one, nothing is skipped on this ground.
    awaiting_author_info = has_author_info

    candidates = []
    for chunk in chunks:
        if has_author_info and re.search(r'^Author information:', chunk, re.MULTILINE):
            awaiting_author_info = False
            continue
        if awaiting_author_info:
            continue

        # Chunks recognised as metadata rather than abstract text.
        if chunk.startswith((
            'DOI: ',
            'PMID: ',
            'Conflict of interest statement:',
            'Comment on',
            'Comment in',
            '[',
            'Publisher:',
            'Plain Language Summary:',
            'The Author(s)',
            'Collaborators:',
        )):
            continue
        if '©' in chunk:
            continue

        # Chunks of five lines or fewer (at most four newlines) are dropped.
        if chunk.count('\n') < 5:
            continue

        candidates.append(chunk)

    if not candidates:
        zero_part.append(pid)
        continue

    if len(candidates) > 1:
        # Several candidates: keep only the ones langid classifies as English.
        candidates = [chunk for chunk in candidates if langid.classify(chunk)[0] == 'en']

    if len(candidates) == 1:
        pid_abstract[pid] = candidates[0]
    else:
        # Zero or several English candidates remain: leave for manual review.
        more_than_one_part.append(pid)

In [46]:
# Drop abstracts that are not English.
# Classify each abstract once, then split the pids by the detected language.
abstract_lang = {pid: langid.classify(text)[0] for pid, text in pid_abstract.items()}

pid_abstract_en = {
    pid: pid_abstract[pid]
    for pid, lang in abstract_lang.items()
    if lang == 'en'
}
non_en = [pid for pid, lang in abstract_lang.items() if lang != 'en']

In [47]:
# Sizes of: all abstracts, English abstracts, non-English pids,
# pids needing manual review, pids with no candidate chunk.
tuple(map(len, (pid_abstract, pid_abstract_en, non_en, more_than_one_part, zero_part)))

(11738, 11721, 17, 18, 3139)

In [50]:
# Save the extraction results.

# All extracted abstracts, before the English filter. This previously dumped
# pid_abstract_en, which made the file a duplicate of pid_abstract_en.json and
# left the unfiltered result unsaved.
with open('pid_abstract.json', 'w') as f:
    json.dump(pid_abstract, f)

# English-only abstracts.
with open('pid_abstract_en.json', 'w') as f:
    json.dump(pid_abstract_en, f)

# Pids whose single extracted abstract was classified as non-English.
with open('pid_abstract_non_en.json', 'w') as f:
    json.dump(non_en, f)

# Pids that need manual review.
with open('pid_abstract_more_than_one_part.json', 'w') as f:
    json.dump(more_than_one_part, f)

# Pids for which no candidate chunk was found.
with open('pid_abstract_zero_part.json', 'w') as f:
    json.dump(zero_part, f)

# One plain-text file per English abstract.
for pid, abstract in pid_abstract_en.items():
    with open(f'raw_data_clean/{pid}.txt', 'w') as f:
        f.write(abstract)

In [52]:
# Records left for manual review: write out the untouched raw text of each
# so the abstract can be extracted by hand.
for pid in more_than_one_part:
    review_path = f'manually_extract/{pid}.txt'
    with open(review_path, 'w') as review_file:
        review_file.write(pid_raw_text[pid])