<a href="https://colab.research.google.com/github/pattangeumdduck/stack_study_record/blob/main/pubmed_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install Bio



In [21]:
from Bio import Entrez
from xml.etree import ElementTree as ET
import pandas as pd
import time
from datetime import datetime, timedelta

# Contact e-mail handed to Biopython's Entrez module; it is sent along with
# the E-utilities requests so NCBI can identify the client.
Entrez.email = "684259317@naver.com"

In [22]:
def daterange(start_date, end_date):
    """Return the start dates of consecutive monthly windows.

    The first entry is ``start_date`` itself; every following entry is the
    first day of the next calendar month. Entries are produced while they do
    not exceed ``end_date``, so the result is empty when ``start_date`` is
    already later than ``end_date``.
    """
    month_starts = []
    cursor = start_date
    while True:
        if cursor > end_date:
            break
        month_starts.append(cursor)
        # First-of-month + 32 days always falls in the following month;
        # snapping back to day 1 yields that month's first day.
        first_of_month = cursor.replace(day=1)
        cursor = (first_of_month + timedelta(days=32)).replace(day=1)
    return month_starts

# Fetch the list of PMIDs (e.g. 2024/02/01 ~ 2024/03/29)
def get_pubmed_ids(query, start_date, end_date, retmax=100000):
    """Search PubMed and return the PMIDs matching ``query`` in a date window.

    Args:
        query: PubMed search term, combined with the date filter via ``AND``.
        start_date: First publication date of the window, as a string that is
            inserted verbatim into the query (callers pass ``YYYY/MM/DD``).
        end_date: Last publication date of the window, same format.
        retmax: Maximum number of IDs requested from ESearch.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.
    """
    date_query = f'("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication])'
    full_query = f'{query} AND {date_query}'
    handle = Entrez.esearch(db="pubmed", term=full_query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the underlying HTTP response, even if parsing fails.
        handle.close()

    id_list = record["IdList"]
    # ESearch reports the total hit count separately from the returned IDs.
    # NOTE(review): NCBI documents a 10,000-record ceiling for PubMed ESearch,
    # so a larger ``retmax`` may still yield a truncated list - surface that
    # instead of silently dropping records.
    total = int(record.get("Count", len(id_list)))
    if total > len(id_list):
        print(f"WARNING: {total} records matched but only {len(id_list)} IDs were returned")
    return id_list


# Fetch detailed metadata
def fetch_pubmed_details(id_list):
    """Download and parse the PubMed XML records for the given PMIDs.

    Args:
        id_list: Iterable of PMID strings; they are joined with commas into a
            single EFetch request.

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch XML response.
    """
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Always release the underlying HTTP response, even if parsing fails.
        handle.close()
    return records

In [23]:
# Article field extraction
_METADATA_COLUMNS = ['PMID', 'Title', 'Journal', 'PubDate', 'Article Type', 'MeSH Terms']


def _format_pub_date(article_info):
    """Return the journal issue's publication date as a string.

    Produces ``Year-Month-Day`` (month and day default to ``01``) when a
    ``Year`` is present, otherwise the free-text ``MedlineDate`` if there is
    one, otherwise an empty string.
    """
    try:
        pub_date = article_info['Journal']['JournalIssue']['PubDate']
        year = pub_date.get('Year', '')
        if not year:
            # Without a year, "Y-M-D" would degrade to a bogus "-01-01".
            return str(pub_date.get('MedlineDate', ''))
        month = pub_date.get('Month', '01')
        day = pub_date.get('Day', '01')
    except (KeyError, TypeError, AttributeError):
        return ""
    return f"{year}-{month}-{day}"


def extract_metadata(records):
    """Flatten parsed PubMed records into a DataFrame, one row per article.

    Args:
        records: Mapping with a ``'PubmedArticle'`` list whose items carry
            ``'MedlineCitation'`` -> ``'Article'`` data, as consumed below.

    Returns:
        A ``pandas.DataFrame`` with the columns ``PMID``, ``Title``,
        ``Journal``, ``PubDate``, ``Article Type`` and ``MeSH Terms``. The
        columns are present even when there are no articles.

    Raises:
        KeyError: If an article lacks ``MedlineCitation``, ``Article``,
            ``PMID`` or the journal ``Title``.
    """
    articles = []
    for article in records['PubmedArticle']:
        citation = article['MedlineCitation']
        article_info = citation['Article']

        pmid = citation['PMID']
        title = article_info.get('ArticleTitle', "")
        journal = article_info['Journal']['Title']
        pub_date_str = _format_pub_date(article_info)

        # A malformed heading discards the whole list rather than the row.
        try:
            mesh_terms = [mh['DescriptorName'] for mh in citation.get('MeshHeadingList', [])]
        except (KeyError, TypeError):
            mesh_terms = []

        # Only the first listed publication type is kept.
        try:
            pub_type = article_info['PublicationTypeList'][0]
        except (KeyError, IndexError, TypeError):
            pub_type = ""

        articles.append({
            'PMID': str(pmid),
            'Title': title,
            'Journal': journal,
            'PubDate': pub_date_str,
            'Article Type': pub_type,
            'MeSH Terms': ", ".join(str(m) for m in mesh_terms)
        })

    return pd.DataFrame(articles, columns=_METADATA_COLUMNS)

In [24]:
def collect_range(query, start_str, end_str, out_filename):
    """Collect PubMed metadata month by month and save it as one CSV file.

    Args:
        query: PubMed search term passed to ``get_pubmed_ids``.
        start_str: First day of the collection range, ``YYYY-MM-DD``.
        end_str: Last day of the collection range, ``YYYY-MM-DD``.
        out_filename: Path of the CSV file to write.

    Returns:
        A list with the number of PMIDs found for each monthly window, in
        chronological order.

    Raises:
        ValueError: If ``start_str`` or ``end_str`` is not ``YYYY-MM-DD``.
    """
    start_date = datetime.strptime(start_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_str, "%Y-%m-%d")
    month_starts = daterange(start_date, end_date)

    all_df = []
    monthly_counts = []  # number of articles found per month
    for start in month_starts:
        # Last day of the month containing ``start``.
        month_end = (start.replace(day=1) + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        # Never query past the requested end of the range.
        end = min(month_end, end_date)
        print(f"수집 중: {start.date()} ~ {end.date()}")
        ids = get_pubmed_ids(query, start.strftime("%Y/%m/%d"), end.strftime("%Y/%m/%d"))
        print(f" → {len(ids)}편 검색됨")
        monthly_counts.append(len(ids))
        # Fetch details in batches of 200 IDs per request.
        for j in range(0, len(ids), 200):
            chunk_ids = ids[j:j+200]
            try:
                records = fetch_pubmed_details(chunk_ids)
                all_df.append(extract_metadata(records))
            except Exception as exc:
                # Best effort: skip the failed batch, but say so instead of
                # silently losing up to 200 records.
                print(f"WARNING: skipped batch at offset {j} ({len(chunk_ids)} ids): {exc!r}")
            # Pause after every request, including failed ones, so errors do
            # not turn into a burst of back-to-back calls.
            time.sleep(1)

    # pd.concat raises on an empty list; fall back to an empty frame so a
    # range with no results still produces an output file.
    final_df = pd.concat(all_df, ignore_index=True) if all_df else pd.DataFrame()
    final_df.to_csv(out_filename, index=False)
    print(f"저장 완료: {out_filename}")
    return monthly_counts

In [25]:
# Actual run
if __name__ == "__main__":
    jobs = (
        # All of 2023
        ("2023-01-01", "2023-12-31", "pubmed_korea_2023.csv"),
        # April 2024 through March 2025
        ("2024-04-01", "2025-03-31", "pubmed_korea_2024_04_to_2025_03.csv"),
    )
    for first_day, last_day, csv_path in jobs:
        collect_range("Korea[Affiliation]", first_day, last_day, csv_path)

📅 수집 중: 2023-01-01 ~ 2023-01-31
 → 8926편 검색됨
📅 수집 중: 2023-02-01 ~ 2023-02-28
 → 4677편 검색됨
📅 수집 중: 2023-03-01 ~ 2023-03-31
 → 5155편 검색됨
📅 수집 중: 2023-04-01 ~ 2023-04-30
 → 4527편 검색됨
📅 수집 중: 2023-05-01 ~ 2023-05-31
 → 4799편 검색됨
📅 수집 중: 2023-06-01 ~ 2023-06-30
 → 4851편 검색됨
📅 수집 중: 2023-07-01 ~ 2023-07-31
 → 4722편 검색됨
📅 수집 중: 2023-08-01 ~ 2023-08-31
 → 4731편 검색됨
📅 수집 중: 2023-09-01 ~ 2023-09-30
 → 4995편 검색됨
📅 수집 중: 2023-10-01 ~ 2023-10-31
 → 5011편 검색됨
📅 수집 중: 2023-11-01 ~ 2023-11-30
 → 5125편 검색됨
📅 수집 중: 2023-12-01 ~ 2023-12-31
 → 5217편 검색됨
✅ 저장 완료: pubmed_korea_2023.csv
📅 수집 중: 2024-04-01 ~ 2024-04-30
 → 4823편 검색됨
📅 수집 중: 2024-05-01 ~ 2024-05-31
 → 4811편 검색됨
📅 수집 중: 2024-06-01 ~ 2024-06-30
 → 4784편 검색됨
📅 수집 중: 2024-07-01 ~ 2024-07-31
 → 5002편 검색됨
📅 수집 중: 2024-08-01 ~ 2024-08-31
 → 4811편 검색됨
📅 수집 중: 2024-09-01 ~ 2024-09-30
 → 5086편 검색됨
📅 수집 중: 2024-10-01 ~ 2024-10-31
 → 5270편 검색됨
📅 수집 중: 2024-11-01 ~ 2024-11-30
 → 5203편 검색됨
📅 수집 중: 2024-12-01 ~ 2024-12-31
 → 5324편 검색됨
📅 수집 중: 2025-01-01 ~ 202