In [None]:
# Location of the RELISH v1 JSON file that the loading cell opens.
# NOTE(review): the path points into a Colab Google Drive mount; the mounting
# step is not visible in this notebook, so confirm Drive is mounted first.
path = "/content/drive/MyDrive/RELISH_v1.json"

In [None]:
import xml.etree.ElementTree as ET
import requests
import time
from tqdm import tqdm
import pandas as pd
import json

# Read RELISH and collect every PMID it mentions: the query article of each
# entry plus all PMIDs listed under its "relevant" / "partial" / "irrelevant"
# responses.
# JSON is UTF-8 by specification, so do not rely on the platform's default
# text encoding when opening the file.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

pmid_set = set()
for item in data:
    pmid_set.add(item["pmid"])
    # Entries without a "response" (or without one of the labels) contribute
    # only their own PMID.
    response = item.get("response", {})
    for label in ("relevant", "partial", "irrelevant"):
        pmid_set.update(response.get(label, []))

# Sort so that the batch composition and the row order of the resulting CSV
# are identical from run to run (set iteration order is not stable).
pmid_list = sorted(pmid_set)


In [None]:
# Sanity check: number of distinct PMIDs collected from RELISH.
print(len(pmid_list))

163289


In [None]:
def batch_list(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# NCBI E-utilities efetch endpoint used to download PubMed records.
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

def fetch_batch(pmids):
    """Download the PubMed XML for one batch of PMIDs.

    Args:
        pmids: Iterable of PMIDs; strings and integers are both accepted.

    Returns:
        The raw response body (bytes), or ``None`` when ``pmids`` is empty or
        the request fails. Failures are printed, not raised, so that a single
        bad batch does not abort a long-running download loop.
    """
    # Coerce every id to str: joining integer PMIDs would otherwise raise a
    # TypeError before the try block and crash the caller.
    ids = ",".join(str(pmid) for pmid in pmids)
    if not ids:
        # Nothing to fetch; avoid a pointless round trip to the server.
        return None
    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml"
    }
    try:
        r = requests.get(url, params=params, timeout=15)
        # Turn HTTP error statuses into exceptions so they are reported below.
        r.raise_for_status()
        return r.content
    except Exception as e:
        print(f"Error fetching {ids}: {e}")
        return None

def _element_text(elem):
    """Return all text inside ``elem``, including text nested in child tags.

    ``elem.text`` (and ``findtext``) stop at the first child element, so a
    title such as ``Role of <i>E. coli</i> in ...`` would be cut off after
    "Role of ". ``itertext`` walks the whole subtree instead.
    """
    if elem is None:
        return ""
    return "".join(elem.itertext())


def parse_xml(xml_content):
    """Extract basic metadata from a PubMed efetch XML document.

    Args:
        xml_content: XML document (bytes or str) containing ``PubmedArticle``
            elements.

    Returns:
        A list with one dict per ``PubmedArticle``, with the keys ``pmid``,
        ``title``, ``abstract`` (all ``AbstractText`` sections joined by a
        space) and ``keywords`` (joined by ``"; "``). If the document cannot
        be parsed, the error is printed and the records collected so far
        (possibly none) are returned.
    """
    records = []
    try:
        root = ET.fromstring(xml_content)
        for article in root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID", default="")
            title = _element_text(article.find(".//ArticleTitle"))

            # Keep only non-empty abstract sections.
            abstract_parts = (_element_text(elem) for elem in article.findall(".//AbstractText"))
            abstract = " ".join(part for part in abstract_parts if part)

            # try to get from KeywordList
            keyword_list = [
                text
                for text in (_element_text(kw) for kw in article.findall(".//Keyword"))
                if text
            ]

            # if no keyword, fall back to the descriptor names in MeshHeadingList
            if not keyword_list:
                keyword_list = [
                    text
                    for text in (
                        _element_text(mh.find("DescriptorName"))
                        for mh in article.findall(".//MeshHeading")
                    )
                    if text
                ]

            keywords = "; ".join(keyword_list)

            records.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "keywords": keywords
            })
    except Exception as e:
        print(f"Error parsing XML: {e}")
    return records


# Download metadata for every PMID in fixed-size batches, then save it as CSV.
BATCH_SIZE = 200         # PMIDs sent in a single efetch request
REQUEST_INTERVAL = 0.34  # seconds to pause after each request (~2.9 requests/s)
# NOTE(review): the pause is presumably chosen to stay under NCBI's limit of
# 3 requests/second for clients without an API key; confirm before changing.

all_records = []
failed_pmids = []  # PMIDs belonging to batches that returned no content
# request BATCH_SIZE pmid a time
for batch_pmids in tqdm(list(batch_list(pmid_list, BATCH_SIZE)), desc="Fetching batches"):
    xml_content = fetch_batch(batch_pmids)
    if xml_content:
        batch_records = parse_xml(xml_content)
        all_records.extend(batch_records)
    else:
        # Remember what was lost instead of dropping the batch silently.
        failed_pmids.extend(batch_pmids)
    time.sleep(REQUEST_INTERVAL)

df_full = pd.DataFrame(all_records)
df_full.to_csv("pubmed_metadata_sample_full.csv", index=False)
print("采集完毕，保存为 pubmed_metadata_sample_full.csv")
if failed_pmids:
    # Make incomplete downloads visible; `failed_pmids` can be re-fetched later.
    print(f"Warning: {len(failed_pmids)} PMIDs were in batches that failed to download and are missing from the CSV")
df_full.head()

Fetching batches: 100%|██████████| 817/817 [38:46<00:00,  2.85s/it]


采集完毕，保存为 pubmed_metadata_sample_full.csv


Unnamed: 0,pmid,title,abstract,keywords
0,19082600,"The ornamental variety, Japanese striped corn,...","Phenylalanine ammonia-lyase [PAL, EC 4.3.1.24 ...",Anthocyanins; Enzyme Stability; Freeze Drying;...
1,28936401,Assessment of the Visual Analogue Score in the...,,Cholestasis; Primary biliary cholangitis; Prur...
2,23790829,Toxicological characterization of the landfill...,"In this research, toxicological safety of two ...",AO; APDC; Allium cepa; BOD; CBMN; COD; Chemica...
3,25174527,Geographic differences in the distribution of ...,To compare the distribution of the intrinsic m...,"Adolescent; Adult; Age Factors; Aged; Aged, 80..."
4,18493761,Phase I dose escalation study of docetaxel wit...,The primary objectives of this study were to e...,Adult; Aged; Antineoplastic Combined Chemother...


In [None]:
# Colab-only: hand the CSV written by the fetch cell to google.colab's
# file-download helper so it can be saved outside the Colab runtime.
from google.colab import files
files.download('pubmed_metadata_sample_full.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Final table size as (rows, columns). The row count can be lower than
# len(pmid_list): batches whose download fails are skipped, and only
# PubmedArticle elements are turned into records.
df_full.shape

(163243, 4)