<a href="https://colab.research.google.com/github/pattangeumdduck/stack_study_record/blob/main/pubmed_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install Bio



In [21]:
import os
import time
import warnings
from datetime import datetime, timedelta
from xml.etree import ElementTree as ET

import pandas as pd
from Bio import Entrez

# Contact address sent to NCBI with every Entrez request.
# Set the ENTREZ_EMAIL environment variable to override the default below
# without editing the notebook.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "684259317@naver.com")

In [22]:
def daterange(start_date, end_date):
    """Return the period start dates from ``start_date`` up to ``end_date``.

    The first entry is ``start_date`` itself; every following entry is the
    first day of the next calendar month. Entries are emitted while they are
    ``<= end_date``, so the list is empty when ``start_date > end_date``.
    """
    month_starts = []
    cursor = start_date
    while not cursor > end_date:
        month_starts.append(cursor)
        # Day 1 plus 32 days always lands in the following month; snapping
        # back to day 1 gives that month's first day.
        first_of_month = cursor.replace(day=1)
        cursor = (first_of_month + timedelta(days=32)).replace(day=1)
    return month_starts

# Fetch the list of PMIDs (e.g. 2024/02/01 ~ 2024/03/29)
def get_pubmed_ids(query, start_date, end_date, retmax=100000):
    """Search PubMed and return the IDs published inside a date window.

    Args:
        query: PubMed search term, e.g. ``"Korea[Affiliation]"``.
        start_date: First publication date; inserted verbatim into the query
            (the caller in this notebook passes ``YYYY/MM/DD`` strings).
        end_date: Last publication date, same format as ``start_date``.
        retmax: Maximum number of IDs requested from ESearch.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Warns:
        UserWarning: when the response reports more matches than IDs
            returned, i.e. the result was truncated.
    """
    date_query = f'("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication])'
    full_query = f'{query} AND {date_query}'
    handle = Entrez.esearch(db="pubmed", term=full_query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    id_list = record["IdList"]

    # NCBI documents a 10,000-record ceiling for PubMed ESearch, so a large
    # `retmax` does not guarantee a complete list. Surface truncation instead
    # of silently returning a partial result.
    try:
        total_matches = int(record.get("Count", len(id_list)))
    except (TypeError, ValueError):
        total_matches = len(id_list)
    if total_matches > len(id_list):
        warnings.warn(
            f"ESearch matched {total_matches} records but returned only "
            f"{len(id_list)} IDs for {start_date} - {end_date}; "
            "use a narrower date window to avoid missing articles."
        )
    return id_list


# Fetch detailed metadata
def fetch_pubmed_details(id_list):
    """Download and parse the PubMed XML records for the given IDs.

    Args:
        id_list: Iterable of PubMed IDs (strings or integers).

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch response.
    """
    # str() lets callers pass integer IDs as well as strings.
    ids = ",".join(str(pmid) for pmid in id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    try:
        return Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

In [23]:
# Define the function that extracts the article fields
# Lower-case three-letter English month abbreviations, in calendar order.
_MONTH_ABBREVIATIONS = ("jan", "feb", "mar", "apr", "may", "jun",
                        "jul", "aug", "sep", "oct", "nov", "dec")


def _zero_padded(value):
    """Left-pad a purely numeric date part to two digits; leave other text as-is."""
    text = str(value).strip()
    return text.zfill(2) if text.isdigit() else text


def _month_number(value):
    """Map a month given as a number or an English name to a two-digit number."""
    text = str(value).strip()
    prefix = text[:3].lower()
    if prefix in _MONTH_ABBREVIATIONS:
        return f"{_MONTH_ABBREVIATIONS.index(prefix) + 1:02d}"
    # Numeric months are padded; unrecognised spellings are kept unchanged.
    return _zero_padded(text)


def _publication_date(article_info):
    """Build a ``YYYY-MM-DD`` string from the journal issue's ``PubDate``.

    Returns ``""`` when the date is absent or carries no ``Year``; without
    this check a year-less entry would produce the bogus value ``"-01-01"``.
    """
    try:
        pub_date = article_info['Journal']['JournalIssue']['PubDate']
        year = str(pub_date.get('Year', '')).strip()
        month = _month_number(pub_date.get('Month', '01'))
        day = _zero_padded(pub_date.get('Day', '01'))
    except (KeyError, TypeError, AttributeError):
        return ""
    if not year:
        return ""
    return f"{year}-{month}-{day}"


def extract_metadata(records):
    """Flatten parsed PubMed records into one DataFrame row per article.

    Args:
        records: Mapping with a ``'PubmedArticle'`` list; each entry must
            provide ``['MedlineCitation']['PMID']`` and
            ``['MedlineCitation']['Article']``.

    Returns:
        pandas.DataFrame with the columns ``PMID``, ``Title``, ``Journal``,
        ``PubDate``, ``Article Type`` and ``MeSH Terms``. Optional fields
        that are missing become empty strings. Missing month/day default to
        ``01``; month names are converted to two-digit numbers.

    Raises:
        KeyError: if ``'PubmedArticle'``, ``'MedlineCitation'``, ``'PMID'``
            or ``'Article'`` is missing.
    """
    articles = []
    for article in records['PubmedArticle']:
        citation = article['MedlineCitation']
        article_info = citation['Article']

        pmid = citation['PMID']
        title = article_info.get('ArticleTitle', "")

        # A missing journal title should not abort the whole batch.
        try:
            journal = article_info['Journal']['Title']
        except (KeyError, TypeError):
            journal = ""

        pub_date_str = _publication_date(article_info)

        try:
            mesh_terms = [mh['DescriptorName'] for mh in citation.get('MeshHeadingList', [])]
        except (KeyError, TypeError, AttributeError):
            mesh_terms = []

        # Only the first listed publication type is kept.
        try:
            pub_type = article_info['PublicationTypeList'][0]
        except (KeyError, IndexError, TypeError):
            pub_type = ""

        articles.append({
            'PMID': str(pmid),
            'Title': title,
            'Journal': journal,
            'PubDate': pub_date_str,
            'Article Type': pub_type,
            'MeSH Terms': ", ".join(str(m) for m in mesh_terms)
        })

    return pd.DataFrame(articles)

In [24]:
def collect_range(query, start_str, end_str, out_filename, batch_size=200, pause=1):
    """Collect PubMed metadata window by window and save it as one CSV file.

    The range is split with ``daterange``; for each window the matching IDs
    are searched, their records are fetched ``batch_size`` at a time, and all
    extracted rows are written to ``out_filename``.

    Args:
        query: PubMed search term applied to every window.
        start_str: First day of the range, ``YYYY-MM-DD``.
        end_str: Last day of the range, ``YYYY-MM-DD``.
        out_filename: Path of the CSV file to write.
        batch_size: Number of IDs sent per fetch request.
        pause: Seconds to sleep after every fetch attempt.

    Returns:
        List with the number of IDs found in each window, in order.

    Raises:
        ValueError: if a date string does not match ``YYYY-MM-DD``.
    """
    start_date = datetime.strptime(start_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_str, "%Y-%m-%d")

    chunk_frames = []
    monthly_counts = []  # number of articles found per window
    for start in daterange(start_date, end_date):
        month_end = (start.replace(day=1) + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        # Do not search past the requested end when the range stops mid-month.
        end = min(month_end, end_date)
        print(f"수집 중: {start.date()} ~ {end.date()}")
        ids = get_pubmed_ids(query, start.strftime("%Y/%m/%d"), end.strftime("%Y/%m/%d"))
        print(f" → {len(ids)}편 검색됨")
        monthly_counts.append(len(ids))
        for offset in range(0, len(ids), batch_size):
            chunk_ids = ids[offset:offset + batch_size]
            try:
                records = fetch_pubmed_details(chunk_ids)
                chunk_frames.append(extract_metadata(records))
            except Exception as exc:
                # Best effort: keep going, but report the batch that was lost
                # instead of dropping it silently.
                warnings.warn(
                    f"Skipped {len(chunk_ids)} records at offset {offset} "
                    f"({start.date()} ~ {end.date()}): {exc!r}"
                )
            # Pause after failures too, so retries do not hammer the service.
            time.sleep(pause)

    # pd.concat raises on an empty list; fall back to an empty frame.
    if chunk_frames:
        final_df = pd.concat(chunk_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame()
    final_df.to_csv(out_filename, index=False)
    print(f"저장 완료: {out_filename}")
    return monthly_counts

In [25]:
# Actual run
if __name__ == "__main__":
    # Calendar year 2023.
    collect_range(
        query="Korea[Affiliation]",
        start_str="2023-01-01",
        end_str="2023-12-31",
        out_filename="pubmed_korea_2023.csv",
    )

    # April 2024 through March 2025.
    collect_range(
        query="Korea[Affiliation]",
        start_str="2024-04-01",
        end_str="2025-03-31",
        out_filename="pubmed_korea_2024_04_to_2025_03.csv",
    )

📅 수집 중: 2023-01-01 ~ 2023-01-31
 → 8926편 검색됨
📅 수집 중: 2023-02-01 ~ 2023-02-28
 → 4677편 검색됨
📅 수집 중: 2023-03-01 ~ 2023-03-31
 → 5155편 검색됨
📅 수집 중: 2023-04-01 ~ 2023-04-30
 → 4527편 검색됨
📅 수집 중: 2023-05-01 ~ 2023-05-31
 → 4799편 검색됨
📅 수집 중: 2023-06-01 ~ 2023-06-30
 → 4851편 검색됨
📅 수집 중: 2023-07-01 ~ 2023-07-31
 → 4722편 검색됨
📅 수집 중: 2023-08-01 ~ 2023-08-31
 → 4731편 검색됨
📅 수집 중: 2023-09-01 ~ 2023-09-30
 → 4995편 검색됨
📅 수집 중: 2023-10-01 ~ 2023-10-31
 → 5011편 검색됨
📅 수집 중: 2023-11-01 ~ 2023-11-30
 → 5125편 검색됨
📅 수집 중: 2023-12-01 ~ 2023-12-31
 → 5217편 검색됨
✅ 저장 완료: pubmed_korea_2023.csv
📅 수집 중: 2024-04-01 ~ 2024-04-30
 → 4823편 검색됨
📅 수집 중: 2024-05-01 ~ 2024-05-31
 → 4811편 검색됨
📅 수집 중: 2024-06-01 ~ 2024-06-30
 → 4784편 검색됨