In [None]:
!pip install biopython openpyxl requests beautifulsoup4 pandas pycountry country_converter

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting country_converter
  Downloading country_converter-1.2-py3-none-any.whl.metadata (24 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading country_converter-1.2-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry, biopython, country_converter
Successfully installed biopython-1.84 c

In [None]:
# Colab-only setup: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


PubMed Searching

In [None]:
import os
import time
from datetime import datetime
import pandas as pd
from Bio import Entrez, Medline

def search_pubmed(query, start_date, end_date, email):
    """Search PubMed via ESearch and return the matching PMIDs.

    Args:
        query: Search term, passed to ESearch as ``term``.
        start_date: Lower bound of the date window, passed as ``mindate``
            (the caller in this file uses ``YYYY/MM/DD`` strings).
        end_date: Upper bound of the date window, passed as ``maxdate``.
        email: Contact address; assigned to ``Entrez.email`` before the request.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Warns:
        UserWarning: if the response's ``Count`` is larger than the number of
            IDs actually returned, i.e. the result set was truncated.

    Note:
        No ``datetype`` is sent, so the date field the window applies to is
        whatever the server defaults to.
    """
    import warnings

    Entrez.email = email
    handle = Entrez.esearch(db="pubmed",
                            term=query,
                            mindate=start_date,
                            maxdate=end_date,
                            retmax=100000)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the HTTP response, even if parsing fails.
        handle.close()

    id_list = record["IdList"]

    # A large retmax does not guarantee every hit comes back (the server may cap
    # the number of UIDs per request). Surface truncation instead of silently
    # working with a partial result set.
    try:
        total_hits = int(record.get("Count", len(id_list)))
    except (TypeError, ValueError):
        total_hits = len(id_list)
    if total_hits > len(id_list):
        warnings.warn(
            f"PubMed reports {total_hits} matches but only {len(id_list)} IDs were "
            "returned; narrow the query or split the date range to retrieve all of them.",
            stacklevel=2,
        )

    return id_list

def fetch_details(id_list, batch_size=500):
    """Download MEDLINE records for the given PMIDs.

    The IDs are requested from EFetch in consecutive batches rather than in a
    single call, so large ID lists do not end up in one oversized request.

    Args:
        id_list: Iterable of PMID strings.
        batch_size: Maximum number of PMIDs sent per EFetch request.

    Returns:
        A list of the records yielded by ``Medline.parse``, in the order the
        batches were fetched. Empty when ``id_list`` is empty (no request is made).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pmids = list(id_list)
    records = []
    for start in range(0, len(pmids), batch_size):
        batch = pmids[start:start + batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch), rettype="medline", retmode="text")
        try:
            # Medline.parse is lazy; consume it fully before the handle is closed.
            records.extend(Medline.parse(handle))
        finally:
            handle.close()
    return records

def extract_info(record):
    """Flatten one parsed MEDLINE record into a dict of spreadsheet columns.

    Missing tags yield empty strings. The list-valued tags ``AU`` (authors) and
    ``AD`` (affiliations) are joined into single strings.

    Args:
        record: Mapping of MEDLINE tags to values, as produced by ``Medline.parse``.

    Returns:
        Dict with the keys ``PMID``, ``Title``, ``Abstract``, ``Publication Date``,
        ``PubMed Date``, ``Authors``, ``Affiliations`` and ``Journal``.
    """
    def _text(tag):
        # Single-valued tag; empty string when the record lacks it.
        return record.get(tag, "")

    def _joined(tag, separator):
        # Multi-valued tag collapsed into one delimited string.
        return separator.join(record.get(tag, []))

    return {
        "PMID": _text("PMID"),
        "Title": _text("TI"),
        "Abstract": _text("AB"),
        "Publication Date": _text("DP"),   # DP: publication date
        "PubMed Date": _text("EDAT"),      # EDAT: Entrez date (when added to PubMed)
        "Authors": _joined("AU", ", "),
        "Affiliations": _joined("AD", "; "),
        "Journal": _text("TA"),            # TA: journal title abbreviation
    }

def main():
    """Search PubMed for psoriasis papers and export their metadata to Excel.

    Runs the search for the hard-coded query and date window, fetches the
    MEDLINE records for the returned PMIDs, flattens each record with
    ``extract_info`` and writes the result to a timestamped ``.xlsx`` file
    under ``/content``. When the search returns no PMIDs, a sheet containing
    only the header row is written.
    """
    query = "psoriasis"
    start_date = "2022/01/01"
    end_date = "2023/12/31"
    email = "your_email@example.com"  # Replace with your own email address

    print("searching PubMed...")
    id_list = search_pubmed(query, start_date, end_date, email)
    print(f"find {len(id_list)} papers")

    print("details...")
    # Skip the fetch entirely when there is nothing to download.
    records = fetch_details(id_list) if id_list else []

    print("scraping...")
    data = [extract_info(record) for record in records]

    # Column order for the output sheet. Passing it to the constructor (instead of
    # indexing afterwards) also keeps an empty result from raising KeyError.
    columns_order = ['PMID', 'Publication Date', 'PubMed Date', 'Journal', 'Title', 'Abstract', 'Authors', 'Affiliations']
    df = pd.DataFrame(data, columns=columns_order)

    # Output location. Note this is /content itself, not the Drive mount at /content/drive.
    save_path = "/content"
    output_file = os.path.join(save_path, f"psoriasis_papers_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx")

    df.to_excel(output_file, index=False)
    print(f"data has saved to {output_file}")

# Entry point: run the search-and-export pipeline when executed as the main module.
if __name__ == "__main__":
    main()