In [1]:
# requests 라이브러리 설치여부 확인
!pip install requests



In [2]:
# beautifulsoup4 라이브러리 설치여부 확인
!pip install beautifulsoup4



In [3]:
# requests, bs4 import
import requests
import bs4
# BeautifulSoup 클래스 import
from bs4 import BeautifulSoup

In [11]:
!pip install feedparser

Collecting feedparser
  Using cached feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser)
  Using cached sgmllib3k-1.0.0-py3-none-any.whl
Using cached feedparser-6.0.12-py3-none-any.whl (81 kB)
Installing collected packages: sgmllib3k, feedparser
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [feedparser]
[1A[2KSuccessfully installed feedparser-6.0.12 sgmllib3k-1.0.0


In [17]:
!pip install apscheduler

Collecting apscheduler
  Downloading apscheduler-3.11.2-py3-none-any.whl.metadata (6.4 kB)
Downloading apscheduler-3.11.2-py3-none-any.whl (64 kB)
Installing collected packages: apscheduler
Successfully installed apscheduler-3.11.2


## Google

In [65]:
import requests
import feedparser
import json
import time
import urllib.parse
import re
from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup

# =========================
# 1) 입력: 카테고리/키워드
# =========================
# Category name -> list of search keywords. The driver below issues one
# Google News RSS lookup per keyword and groups results by category.
CATEGORIES = {
    "기후": ["날씨", "울산날씨", "강풍 경보", "부산날씨", "날씨예보", "내일 날씨", "일기예보", "대구 날씨"],
    "엔터테인먼트": ["유튜브 문제가 발생했습니다", "문상민", "파반느", "부산찬란한 너의 계절에", "김태리", "아너 그녀들의 법정", "윤영경", "정동원", "금잔디", "홍자"],
    "비즈니스 및 금융": ["김인호", "비트코인", "케이뱅크 공모주", "하이닉스 주가", "엔화", "주식", "에어로케이", "에스팀", "신영자", "액스비스"],
    "스포츠": ["엘에이 fc 대 인터 마이애미", "mls", "챔피언스리그", "인테르", "토트넘 대 아스널", "노시환", "레알 에스파냐 대 엘에이 fc", "psg 대 메스", "울브스 대 아스널", "맨 시티 대 뉴캐슬"]
}

# COUNTRY and LANG fill the gl / hl / ceid parameters of the RSS search URL.
COUNTRY = "KR"
LANG = "ko"
# Path of the JSON file written at the end of the run.
OUT_FILE = "google_news_grouped_by_category_keyword.json"

# Headers sent with every HTTP request made by this cell (RSS feed and article pages).
REQ_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Referer": "https://news.google.com/",
}

# Fixed UTC+9 offset used for all timestamps emitted by this cell.
KST = timezone(timedelta(hours=9))

# =========================
# 2) 시간 전처리 (YYYY-MM-DD HH:MM)
# =========================
def format_hhmm(dt: datetime) -> str:
    """Render ``dt`` as ``YYYY-MM-DD HH:MM`` in KST.

    A naive datetime is taken to already be KST wall time; an aware one is
    converted to KST before formatting.
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=KST)
    return aware.astimezone(KST).strftime("%Y-%m-%d %H:%M")

def normalize_published_to_hhmm(raw: str) -> str:
    """Convert a feed timestamp to ``YYYY-MM-DD HH:MM`` in KST.

    Accepts ISO 8601 strings (a trailing ``Z`` is read as UTC) and
    RFC 822 / RFC 2822 dates such as ``Tue, 25 Feb 2026 04:10:00 GMT``.
    RFC dates are parsed with the standard library's e-mail date parser, so
    zone abbreviations other than GMT, a missing weekday or missing seconds
    are accepted and the result does not depend on the process locale.

    Args:
        raw: Timestamp text taken from the feed entry; may be empty or None.

    Returns:
        The formatted KST time, or "" when ``raw`` is empty or unparseable.
    """
    # Function-scope import: the cell's import block does not bring this in.
    from email.utils import parsedate_to_datetime

    if not raw:
        return ""
    s = raw.strip()

    # ISO 8601
    if "T" in s and "-" in s:
        try:
            # format_hhmm treats a naive value as KST and converts aware ones.
            return format_hhmm(datetime.fromisoformat(s.replace("Z", "+00:00")))
        except (ValueError, OverflowError):
            pass  # not ISO after all (e.g. "Tue, ... -0500"); try RFC 822 next

    # RFC 822 / RFC 2822
    try:
        dt = parsedate_to_datetime(s)
        if dt.tzinfo is None:
            # A "-0000" zone yields a naive datetime whose fields are UTC.
            dt = dt.replace(tzinfo=timezone.utc)
        return format_hhmm(dt)
    except (TypeError, ValueError, OverflowError):
        # TypeError is what older Python versions raise for unparseable text.
        return ""

# =========================
# 3) 원문 링크에서 publisher / og:image 보강(선택)
# =========================
def abs_url(url: str) -> str:
    """Return ``url`` stripped of surrounding whitespace.

    Protocol-relative URLs (starting with ``//``) get an ``https:`` prefix.
    An empty or None input yields "".
    """
    cleaned = url.strip() if url else ""
    return "https:" + cleaned if cleaned.startswith("//") else cleaned

def fetch_publisher_and_image(url: str, timeout=8):
    """Fetch an article page and extract its publisher name and lead image.

    The image comes from ``og:image``. The publisher comes from
    ``og:site_name`` and, if that is missing, from the first JSON-LD object
    whose ``publisher`` is a dict with a non-empty ``name``.

    Args:
        url: Article URL; an empty value short-circuits to the empty result.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        ``{"publisher": str, "image_url": str}``; fields are "" when not
        found. Never raises: on any failure the values collected so far are
        returned.
    """
    out = {"publisher": "", "image_url": ""}
    if not url:
        return out

    try:
        res = requests.get(url, headers=REQ_HEADERS, timeout=timeout, allow_redirects=True)
        if not res.ok:
            return out

        # requests falls back to ISO-8859-1 for text/* responses that carry no
        # charset in the Content-Type header, which garbles Korean text in
        # res.text. Parse the raw bytes instead and only force an encoding
        # when the server actually declared one; otherwise BeautifulSoup
        # sniffs it from the document (<meta charset>, BOM, ...).
        content_type = res.headers.get("Content-Type", "")
        declared = res.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(res.content, "html.parser", from_encoding=declared)

        og_img = soup.select_one('meta[property="og:image"]')
        if og_img and og_img.get("content"):
            out["image_url"] = abs_url(og_img["content"].strip())

        og_site = soup.select_one('meta[property="og:site_name"]')
        if og_site and og_site.get("content"):
            out["publisher"] = og_site["content"].strip()

        # Fall back to JSON-LD publisher.name when og:site_name is absent.
        if not out["publisher"]:
            for script in soup.select('script[type="application/ld+json"]'):
                try:
                    txt = script.get_text(strip=True)
                    if not txt:
                        continue
                    data = json.loads(txt)
                except ValueError:
                    # Malformed JSON-LD in one script must not stop the scan.
                    continue

                candidates = data if isinstance(data, list) else [data]
                for obj in candidates:
                    if isinstance(obj, dict):
                        pub = obj.get("publisher")
                        if isinstance(pub, dict):
                            name = pub.get("name")
                            if isinstance(name, str) and name.strip():
                                out["publisher"] = name.strip()
                                break
                if out["publisher"]:
                    break

        return out
    except Exception:
        # Deliberate best-effort: enrichment failures (network, parsing) must
        # never abort the crawl, so return whatever was gathered.
        return out

# =========================
# 4) Google RSS url + 폴백
# =========================
def build_google_rss_url(query: str, days: int | None):
    """Build the Google News RSS search URL for ``query``.

    When ``days`` is given, a ``when:<days>d`` term is appended to the query
    before it is percent-encoded; ``None`` means no such term is added.
    """
    search_terms = f"{query} when:{days}d" if days is not None else query
    encoded_terms = urllib.parse.quote(search_terms)
    return f"https://news.google.com/rss/search?q={encoded_terms}&hl={LANG}&gl={COUNTRY}&ceid={COUNTRY}:{LANG}"

def collect_google_top1_and_count(keyword: str):
    """Return the first Google News RSS entry for ``keyword`` and the entry count.

    Query variants are tried in order until one returns at least one entry:
      1) "keyword" when:1d
      2) keyword when:1d
      3) "keyword" when:7d
      4) keyword when:7d
      5) "keyword" with no time restriction
      6) keyword with no time restriction

    Returns:
        ``(news, total_count)`` where ``news`` is a dict with the keys
        title, url, publisher, published_at, image_url, and ``total_count``
        is the number of entries in the feed that produced ``news``. If every
        variant fails, ``news`` has all-empty strings and ``total_count`` is
        the largest entry count seen (normally 0).
    """
    quoted, plain = f'"{keyword}"', f"{keyword}"
    attempts = [(phrase, window) for window in (1, 7, None) for phrase in (quoted, plain)]

    max_seen = 0

    for phrase, window in attempts:
        feed_url = build_google_rss_url(phrase, window)
        try:
            resp = requests.get(feed_url, headers=REQ_HEADERS, timeout=10)
            if resp.status_code != 200:
                continue

            items = feedparser.parse(resp.text).entries or []
            max_seen = max(max_seen, len(items))
            if not items:
                continue

            top = items[0]
            headline = (top.get("title") or "").strip()
            article_url = (top.get("link") or "").strip()

            stamp_raw = (top.get("published") or top.get("updated") or "").strip()
            stamp = normalize_published_to_hhmm(stamp_raw)

            # Prefer the source name carried by the feed entry itself.
            source = top.get("source")
            if isinstance(source, dict) and source.get("title"):
                outlet = str(source["title"]).strip()
            elif isinstance(source, str):
                outlet = source.strip()
            else:
                outlet = ""

            # The linked page supplies the image and a fallback publisher.
            page_meta = fetch_publisher_and_image(article_url)
            if not outlet:
                outlet = (page_meta.get("publisher") or "").strip()
            picture = (page_meta.get("image_url") or "").strip()

            found = {
                "title": headline,
                "url": article_url,
                "publisher": outlet,
                "published_at": stamp,
                "image_url": picture,
            }
            return found, len(items)

        except Exception:
            # Any failure in this variant just moves on to the next one.
            continue

    # Every variant failed or came back empty.
    nothing = {"title": "", "url": "", "publisher": "", "published_at": "", "image_url": ""}
    return nothing, max_seen

# =========================
# 5) 메인: 카테고리 > 키워드 > { total_count, articles }
#    - articles 내부 row는 DB 컬럼 9키만 유지
# =========================
if __name__ == "__main__":
    # One collection timestamp (KST, minute precision) shared by every row of this run.
    collected_at = format_hhmm(datetime.now(KST))

    # Temporary run_id: one per category, numbered from 1 in CATEGORIES order.
    run_id_by_category = {cat: idx + 1 for idx, cat in enumerate(CATEGORIES.keys())}

    # Temporary keyword_id: unique across all categories; a keyword that appears
    # more than once keeps the id assigned at its first occurrence.
    keyword_id_by_text = {}
    kid = 1
    for cat, kws in CATEGORIES.items():
        for kw in kws:
            if kw not in keyword_id_by_text:
                keyword_id_by_text[kw] = kid
                kid += 1

    # Output structure: category -> keyword -> {"total_count": ..., "articles": [...]}
    grouped = {}
    # Sequential id; it advances once per keyword even when no article was found.
    article_id = 1

    for cat, kws in CATEGORIES.items():
        grouped.setdefault(cat, {})
        run_id = run_id_by_category[cat]

        for i, kw in enumerate(kws, start=1):
            # Progress line rewritten in place (carriage return, trailing spaces as padding).
            print(f" 진행 중: [GoogleRSS] [{cat}] ({i}/{len(kws)}) {kw}" + " " * 30, end="\r")

            news, total_count = collect_google_top1_and_count(kw)

            # Empty strings from the collector are stored as None.
            row = {
                "article_id": article_id,
                "run_id": run_id,
                "keyword_id": keyword_id_by_text[kw],
                "title": news.get("title") or None,
                "url": news.get("url") or None,
                "publisher": news.get("publisher") or None,
                "published_at": news.get("published_at") or None,
                "image_url": news.get("image_url") or None,
                "collected_at": collected_at
            }

            # Keep only the nine DB column keys in the article row
            # (row already contains exactly these keys, so this is a safeguard).
            row = {k: row[k] for k in [
                "article_id", "run_id", "keyword_id",
                "title", "url", "publisher", "published_at",
                "image_url", "collected_at"
            ]}

            # total_count.google is stored at keyword level, not inside the article row.
            grouped[cat][kw] = {
                "total_count": {"google": int(total_count)},
                "articles": [row]
            }

            article_id += 1
            # Pause between keywords.
            time.sleep(0.5)

    # Write the grouped result as UTF-8 JSON without ASCII-escaping the Korean text.
    with open(OUT_FILE, "w", encoding="utf-8") as f:
        json.dump(grouped, f, ensure_ascii=False, indent=4)

    print(f"\n✅ 완료: {OUT_FILE} 저장")
    print("   - 구조: 카테고리 > 키워드 > { total_count: {google}, articles: [기사row] }")
    print("   - published_at/collected_at: YYYY-MM-DD HH:MM")

 진행 중: [GoogleRSS] [스포츠] (10/10) 맨 시티 대 뉴캐슬                                    
✅ 완료: google_news_grouped_by_category_keyword.json 저장
   - 구조: 카테고리 > 키워드 > { total_count: {google}, articles: [기사row] }
   - published_at/collected_at: YYYY-MM-DD HH:MM


# Daum

In [62]:
import json
import time
import requests
import urllib.parse
import re
from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup

# =========================
# 1) 입력: 카테고리/키워드
# =========================
# Category name -> list of search keywords. The driver below runs one Daum
# news search per keyword and groups results by category.
CATEGORIES = {
    "기후": ["날씨", "울산날씨", "강풍 경보", "부산날씨", "날씨예보", "내일 날씨", "일기예보", "대구 날씨"],
    "엔터테인먼트": ["유튜브 문제가 발생했습니다", "문상민", "파반느", "부산찬란한 너의 계절에", "김태리", "아너 그녀들의 법정", "윤영경", "정동원", "금잔디", "홍자"],
    "비즈니스 및 금융": ["김인호", "비트코인", "케이뱅크 공모주", "하이닉스 주가", "엔화", "주식", "에어로케이", "에스팀", "신영자", "액스비스"],
    "스포츠": ["엘에이 fc 대 인터 마이애미", "mls", "챔피언스리그", "인테르", "토트넘 대 아스널", "노시환", "레알 에스파냐 대 엘에이 fc", "psg 대 메스", "울브스 대 아스널", "맨 시티 대 뉴캐슬"]
}

# Headers sent with every HTTP request made by this cell (search page and article pages).
REQ_HEADER = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Referer": "https://www.daum.net/",
}

# Fixed UTC+9 offset used for all timestamps emitted by this cell.
KST = timezone(timedelta(hours=9))

# =========================
# 2) 유틸
# =========================
def now_kst():
    """Return the current time as a timezone-aware datetime in KST."""
    return datetime.now(KST)

def format_hhmm(dt: datetime) -> str:
    """Format ``dt`` as ``YYYY-MM-DD HH:MM`` expressed in KST.

    Naive datetimes are interpreted as KST wall time; aware datetimes are
    converted to KST first.
    """
    if dt.tzinfo is None:
        localized = dt.replace(tzinfo=KST)
    else:
        localized = dt
    return localized.astimezone(KST).strftime("%Y-%m-%d %H:%M")

def abs_url(url: str) -> str:
    """Trim ``url`` and turn a protocol-relative ``//host/...`` into ``https://host/...``.

    Returns "" for an empty or None input; any other URL is returned trimmed
    but otherwise unchanged.
    """
    trimmed = (url or "").strip()
    if trimmed[:2] == "//":
        return "https:" + trimmed
    return trimmed

def first_attr(soup, selectors, attr):
    """Return the first non-blank ``attr`` value found via ``selectors``.

    Selectors are tried in order with ``soup.select_one``; the stripped
    attribute value of the first match that has a non-blank value is
    returned. Returns "" when ``soup`` is falsy or nothing qualifies.
    """
    if soup:
        for selector in selectors:
            node = soup.select_one(selector)
            raw_value = node.get(attr) if node else None
            if raw_value:
                stripped = raw_value.strip()
                if stripped:
                    return stripped
    return ""

def first_text(soup, selectors):
    """Return the text of the first selector match that has non-empty text.

    Selectors are tried in order with ``soup.select_one``; text is taken with
    ``get_text(" ", strip=True)``. Returns "" when ``soup`` is falsy or no
    match carries text.
    """
    if not soup:
        return ""
    # Generators keep the lookup lazy: later selectors are only queried when
    # earlier ones yielded nothing.
    nodes = (soup.select_one(selector) for selector in selectors)
    texts = (node.get_text(" ", strip=True) for node in nodes if node)
    return next((text for text in texts if text), "")

def parse_ldjson(soup: BeautifulSoup):
    """Extract a publisher name and a publication date from JSON-LD scripts.

    Every ``<script type="application/ld+json">`` is parsed and walked
    recursively. The first non-empty ``datePublished`` / ``dateCreated`` /
    ``dateModified`` string and the first ``publisher`` name (publisher given
    as a dict or as a list of dicts) are kept. Scanning stops after the first
    script that yields either value.

    A script that fails to parse is skipped on its own, so one malformed
    JSON-LD block no longer hides valid blocks that follow it.

    Returns:
        ``(publisher, date_published)``; each is "" when not found. Never
        raises.
    """
    publisher = ""
    date_published = ""

    def walk(obj):
        """Depth-first search that fills in whichever value is still missing."""
        nonlocal publisher, date_published
        if isinstance(obj, dict):
            for k in ("datePublished", "dateCreated", "dateModified"):
                v = obj.get(k)
                if not date_published and isinstance(v, str) and v.strip():
                    date_published = v.strip()

            pub = obj.get("publisher")
            if not publisher:
                if isinstance(pub, dict):
                    name = pub.get("name")
                    if isinstance(name, str) and name.strip():
                        publisher = name.strip()
                elif isinstance(pub, list):
                    for it in pub:
                        if isinstance(it, dict):
                            name = it.get("name")
                            if isinstance(name, str) and name.strip():
                                publisher = name.strip()
                                break

            for v in obj.values():
                walk(v)

        elif isinstance(obj, list):
            for it in obj:
                walk(it)

    try:
        for script in soup.select('script[type="application/ld+json"]'):
            txt = script.get_text(strip=True)
            if not txt:
                continue
            try:
                walk(json.loads(txt))
            except (ValueError, RecursionError):
                # Malformed or pathologically nested JSON-LD: skip this script
                # only. Values found before the error are kept.
                pass

            if publisher or date_published:
                break
    except Exception:
        # Best-effort contract: callers expect a tuple, never an exception.
        pass

    return publisher, date_published

# =========================
# 3) 시간 전처리: published_at 을 YYYY-MM-DD HH:MM 로
# =========================
def to_dt_from_yyyymmddhhmmss(raw: str):
    """Build a KST datetime from the digits of ``raw`` (YYYYMMDDHHMM[SS]).

    All non-digit characters are ignored. Returns None when fewer than 12
    digits are present; seconds default to 0 unless at least 14 digits exist.
    Invalid field values raise whatever ``datetime`` raises.
    """
    digits = "".join(filter(str.isdigit, raw or ""))
    if len(digits) < 12:
        return None

    field_bounds = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12))
    year, month, day, hour, minute = (int(digits[start:stop]) for start, stop in field_bounds)
    second = int(digits[12:14]) if len(digits) >= 14 else 0
    return datetime(year, month, day, hour, minute, second, tzinfo=KST)

def normalize_published_to_hhmm(raw: str) -> str:
    """Convert a publication timestamp to ``YYYY-MM-DD HH:MM`` in KST.

    Handled inputs:
      - ISO 8601 strings; an explicit UTC offset (or trailing ``Z``) is
        honoured and converted to KST, a naive value is taken as KST.
      - Digit-based stamps such as og:regDate ``YYYYMMDDHHMMSS`` (12 or more
        digits), interpreted as KST wall time.

    ISO parsing is attempted before the digit-based path. Every ISO timestamp
    contains at least 12 digits, so checking digits first would treat e.g.
    ``2026-02-25T04:10:00Z`` as 04:10 KST instead of 13:10 KST.

    Args:
        raw: Timestamp text; may be empty or None.

    Returns:
        The formatted KST time, or "" when ``raw`` is empty, unparseable, or
        holds impossible date fields.
    """
    if not raw:
        return ""

    s = raw.strip()

    # 1) ISO 8601 (keeps the UTC offset)
    if "T" in s and "-" in s:
        try:
            dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
            # format_hhmm treats naive values as KST and converts aware ones.
            return format_hhmm(dt)
        except (ValueError, OverflowError):
            # Not a form fromisoformat accepts on this Python version;
            # fall back to the digit-based reading below.
            pass

    # 2) 14- or 12-digit stamp (YYYYMMDDHHMM[SS]), read as KST
    digits = "".join(ch for ch in s if ch.isdigit())
    if len(digits) >= 12:
        try:
            dt = to_dt_from_yyyymmddhhmmss(digits)
        except ValueError:
            # Impossible field values (month 13, hour 25, ...).
            return ""
        return format_hhmm(dt) if dt else ""

    return ""

# =========================
# 4) 기사(또는 v.daum.net 뷰어)에서 메타 추출
# =========================
def fetch_article_meta(url: str, timeout=8):
    """Fetch an article (or v.daum.net viewer) page and extract its metadata.

    Extracted fields:
      - ``image_url``: og:image.
      - ``published_at`` (``YYYY-MM-DD HH:MM``), first hit of: og:regDate,
        article:published_time and similar meta tags, JSON-LD date.
      - ``publisher``, first hit of: JSON-LD publisher name, og:site_name,
        and for v.daum.net URLs a set of DOM selectors.

    Args:
        url: Page URL; an empty value short-circuits to the empty result.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Dict with the keys publisher, published_at, image_url; values are ""
        when not found. Never raises: on any failure the values collected so
        far are returned.
    """
    meta = {"publisher": "", "published_at": "", "image_url": ""}

    if not url:
        return meta

    try:
        res = requests.get(url, headers=REQ_HEADER, timeout=timeout, allow_redirects=True)
        if not res.ok:
            return meta

        # requests falls back to ISO-8859-1 for text/* responses that carry no
        # charset in the Content-Type header, which garbles Korean publisher
        # names in res.text. Parse the raw bytes instead and only force an
        # encoding when the server declared one; otherwise BeautifulSoup
        # sniffs it from the document (<meta charset>, BOM, ...).
        content_type = res.headers.get("Content-Type", "")
        declared = res.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(res.content, "html.parser", from_encoding=declared)

        # image_url: og:image first
        og_image = first_attr(soup, ['meta[property="og:image"]', 'meta[name="og:image"]'], "content")
        meta["image_url"] = abs_url(og_image)

        # published_at priority:
        # 1) og:regDate
        regdate = first_attr(soup, ['meta[property="og:regDate"]'], "content")
        if regdate:
            meta["published_at"] = normalize_published_to_hhmm(regdate)

        # 2) article:published_time and equivalents
        if not meta["published_at"]:
            apub = first_attr(
                soup,
                [
                    'meta[property="article:published_time"]',
                    'meta[name="article:published_time"]',
                    'meta[property="og:article:published_time"]',
                    'meta[name="pubdate"]',
                    'meta[name="publishdate"]',
                    'meta[name="date"]',
                ],
                "content",
            )
            meta["published_at"] = normalize_published_to_hhmm(apub)

        # 3) JSON-LD datePublished
        ld_publisher, ld_date = parse_ldjson(soup)
        if not meta["published_at"] and ld_date:
            meta["published_at"] = normalize_published_to_hhmm(ld_date)

        # publisher priority:
        # 1) JSON-LD publisher.name
        meta["publisher"] = ld_publisher.strip() if ld_publisher else ""

        # 2) og:site_name
        if not meta["publisher"]:
            site_name = first_attr(soup, ['meta[property="og:site_name"]', 'meta[name="og:site_name"]'], "content")
            meta["publisher"] = site_name.strip() if site_name else ""

        # 3) DOM fallback, only for Daum viewer pages
        if (not meta["publisher"]) and ("v.daum.net" in url):
            meta["publisher"] = first_text(
                soup,
                [
                    "em.info_cp a",
                    "span.info_cp a",
                    "a.link_cp",
                    "span.txt_cp",
                    "em.txt_cp",
                ],
            )

        return meta

    except Exception:
        # Deliberate best-effort: enrichment failures must not abort the
        # crawl, so return whatever was gathered.
        return meta

# =========================
# 5) Daum 검색에서 top1 기사 추출
# =========================
def collect_daum_top1(keyword: str):
    """Return one news article for ``keyword`` from Daum news search.

    The title and URL come from the first usable title link on the search
    result page; publisher, published_at and image_url are then filled in
    from the linked page via ``fetch_article_meta``.

    Returns:
        Dict with the keys title, url, publisher, published_at, image_url.
        All values are "" when the search fails or yields no usable link.
    """
    search_url = f"https://search.daum.net/search?w=news&q={urllib.parse.quote(keyword)}"

    def _blank():
        return {"title": "", "url": "", "publisher": "", "published_at": "", "image_url": ""}

    try:
        response = requests.get(search_url, headers=REQ_HEADER, timeout=8)
        if not response.ok:
            return _blank()

        page = BeautifulSoup(response.text, "html.parser")

        # Primary title-link selector, then the alternative selectors.
        anchors = page.select("div.item-title a") or page.select("a.f_link_b, a.link_tit")

        for anchor in anchors:
            headline = anchor.get_text(strip=True)
            href = abs_url(anchor.get("href", ""))

            # Skip placeholder links and very short titles.
            usable = bool(href) and href != "#" and len(headline) >= 5
            if usable:
                # Enrich from the linked page (image_url / publisher / published_at).
                details = fetch_article_meta(href)
                return {
                    "title": headline,
                    "url": href,
                    "publisher": details.get("publisher") or "",
                    "published_at": details.get("published_at") or "",
                    "image_url": details.get("image_url") or "",
                }

        return _blank()

    except Exception:
        return _blank()

# =========================
# 6) 메인: (카테고리 > 키워드 > 기사row) + 시간 전처리 동일
#     기사 row는 DB 컬럼 키만 유지
# =========================
if __name__ == "__main__":
    # One collection timestamp (KST, YYYY-MM-DD HH:MM) shared by every row of this run.
    collected_at = format_hhmm(now_kst())

    # Temporary ID mappings (for wiring foreign keys before the DB INSERT step).
    # run_id: one run per category, numbered from 1 in CATEGORIES order.
    run_id_by_category = {}
    rid = 1
    for cat in CATEGORIES.keys():
        run_id_by_category[cat] = rid
        rid += 1

    # keyword_id: 1..N, unique across all categories; a repeated keyword keeps
    # the id assigned at its first occurrence.
    keyword_id_by_text = {}
    kid = 1
    for cat, kws in CATEGORIES.items():
        for kw in kws:
            if kw not in keyword_id_by_text:
                keyword_id_by_text[kw] = kid
                kid += 1

    # Output structure: category -> keyword -> [article row, ...]
    grouped = {}
    # Sequential id; it advances once per keyword even when no article was found.
    article_id = 1

    for cat, kws in CATEGORIES.items():
        grouped.setdefault(cat, {})
        run_id = run_id_by_category[cat]

        for i, kw in enumerate(kws, start=1):
            # Progress line rewritten in place (carriage return, trailing spaces as padding).
            print(f" 진행 중: [{cat}] ({i}/{len(kws)}) {kw}" + " " * 30, end="\r")

            news = collect_daum_top1(kw)

            # Empty strings from the collector are stored as None.
            row = {
                "article_id": article_id,
                "run_id": run_id,
                "keyword_id": keyword_id_by_text[kw],
                "title": news.get("title") or None,
                "url": news.get("url") or None,
                "publisher": news.get("publisher") or None,
                "published_at": news.get("published_at") or None,   # YYYY-MM-DD HH:MM
                "image_url": news.get("image_url") or None,
                "collected_at": collected_at                         # YYYY-MM-DD HH:MM
            }

            # Safeguard: keep only the DB column keys
            # (row already contains exactly these keys).
            row = {k: row[k] for k in [
                "article_id", "run_id", "keyword_id",
                "title", "url", "publisher", "published_at",
                "image_url", "collected_at"
            ]}

            grouped[cat].setdefault(kw, []).append(row)

            article_id += 1
            # Pause between keywords.
            time.sleep(0.7)

    # Write the grouped result as UTF-8 JSON without ASCII-escaping the Korean text.
    out_file = "daum_news_grouped_by_category_keyword.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(grouped, f, ensure_ascii=False, indent=4)

    print(f"\n✨ 완료: {out_file} 저장 (카테고리>{'키워드'}>기사row), collected_at/published_at=YYYY-MM-DD HH:MM")

 진행 중: [스포츠] (10/10) 맨 시티 대 뉴캐슬                                    
✨ 완료: daum_news_grouped_by_category_keyword.json 저장 (카테고리>키워드>기사row), collected_at/published_at=YYYY-MM-DD HH:MM


# Naver

모든 카테고리

In [61]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import re
import json
from datetime import datetime, timezone, timedelta

# =========================
# 설정
# =========================
# Category name -> list of search keywords. The driver below runs one Naver
# news search per keyword and groups results by category.
CATEGORIES = {
    "기후": ["날씨", "울산날씨", "강풍 경보", "부산날씨", "날씨예보", "내일 날씨", "일기예보", "대구 날씨"],
    "엔터테인먼트": ["유튜브 문제가 발생했습니다", "문상민", "파반느", "부산찬란한 너의 계절에", "김태리", "아너 그녀들의 법정", "윤영경", "정동원", "금잔디", "홍자"],
    "비즈니스 및 금융": ["김인호", "비트코인", "케이뱅크 공모주", "하이닉스 주가", "엔화", "주식", "에어로케이", "에스팀", "신영자", "액스비스"],
    "스포츠": ["엘에이 fc 대 인터 마이애미", "mls", "챔피언스리그", "인테르", "토트넘 대 아스널", "노시환", "레알 에스파냐 대 엘에이 fc", "psg 대 메스", "울브스 대 아스널", "맨 시티 대 뉴캐슬"]
}

# Headers sent with every HTTP request made by this cell (search page and article pages).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Referer": "https://www.naver.com/",
}

# Fixed UTC+9 offset used for all timestamps emitted by this cell.
KST = timezone(timedelta(hours=9))

# =========================
# collected_at 전처리 (HH:MM까지만)
# =========================
def format_collected_at(dt: datetime) -> str:
    """Format the collection time as ``YYYY-MM-DD HH:MM`` in KST.

    A naive ``dt`` is treated as KST wall time; an aware one is converted.
    """
    in_kst = (dt if dt.tzinfo is not None else dt.replace(tzinfo=KST)).astimezone(KST)
    return in_kst.strftime("%Y-%m-%d %H:%M")

# =========================
# 유틸: 발행일로 보이는 텍스트 판별
# =========================
def looks_like_time_or_date(txt: str) -> bool:
    """Tell whether ``txt`` looks like a publication time.

    Recognised forms: a Korean relative time ("N분 전", "N시간 전", ...), a
    dotted date (``YYYY.MM.DD``) or a dashed date (``YYYY-MM-DD``) anywhere in
    the text. Empty or None input is never a match.
    """
    if not txt:
        return False
    candidate = txt.strip()
    patterns = (
        r"\d+\s*(분|시간|일|주|개월|년)\s*전",
        r"\d{4}\.\d{2}\.\d{2}\.?",
        r"\d{4}-\d{2}-\d{2}",
    )
    return any(re.search(pattern, candidate) for pattern in patterns)

# =========================
# 발행일 정규화: 'YYYY-MM-DD HH:MM' (KST)  ✅ 그대로 유지
# =========================
def normalize_published(published_raw: str):
    """Normalise a publication time string to ``YYYY-MM-DD HH:MM`` (KST).

    Tried in order:
      1) ISO 8601 (trailing ``Z`` read as UTC; naive values taken as KST)
      2) dotted date with time, e.g. ``2026.02.25. 13:10``
      3) dotted date only, e.g. ``2026.02.25`` (time becomes 00:00)
      4) Korean relative time, e.g. ``3시간 전``, measured from now in KST
         (a month counts as 30 days, a year as 365 days)
      5) a string already in ``YYYY-MM-DD HH:MM`` form

    Returns:
        The normalised string, or None when the input is empty or matches
        none of the forms.
    """
    if not published_raw:
        return None

    text = published_raw.strip()
    stamp = "%Y-%m-%d %H:%M"

    # 1) ISO 8601
    try:
        iso_dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
        localized = iso_dt if iso_dt.tzinfo is not None else iso_dt.replace(tzinfo=KST)
        return localized.astimezone(KST).strftime(stamp)
    except Exception:
        pass

    # 2) date + time (2026.02.25. 13:10)
    dated = re.search(r"(\d{4})\.(\d{2})\.(\d{2})\.?\s*(\d{1,2}):(\d{2})", text)
    if dated:
        year, month, day, hour, minute = (int(part) for part in dated.groups())
        return datetime(year, month, day, hour, minute, tzinfo=KST).strftime(stamp)

    # 3) date only (2026.02.25)
    dated = re.search(r"(\d{4})\.(\d{2})\.(\d{2})", text)
    if dated:
        year, month, day = (int(part) for part in dated.groups())
        return datetime(year, month, day, 0, 0, tzinfo=KST).strftime(stamp)

    # 4) relative time (3시간 전)
    relative = re.search(r"(\d+)\s*(분|시간|일|주|개월|년)\s*전", text)
    if relative:
        amount = int(relative.group(1))
        now = datetime.now(KST)
        one_unit = {
            "분": timedelta(minutes=1),
            "시간": timedelta(hours=1),
            "일": timedelta(days=1),
            "주": timedelta(weeks=1),
            "개월": timedelta(days=30),
            "년": timedelta(days=365),
        }.get(relative.group(2))
        if one_unit is None:
            return None
        return (now - amount * one_unit).strftime(stamp)

    # 5) already in the target format
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}", text):
        return text

    return None

# =========================
# 기사 페이지에서 발행일 추출 보조
# =========================
def parse_published_from_jsonld(soup: BeautifulSoup):
    """Return the first publication date found in the page's JSON-LD scripts.

    Only top-level objects (or the items of a top-level list) are inspected.
    For each one the first truthy value among ``datePublished``,
    ``dateCreated`` and ``dateModified`` is returned as-is. Scripts that fail
    to parse are skipped. Returns None when nothing is found.
    """
    date_keys = ("datePublished", "dateCreated", "dateModified")
    for script in soup.select('script[type="application/ld+json"]'):
        try:
            payload = script.get_text(strip=True)
            if not payload:
                continue
            parsed = json.loads(payload)
            top_level = parsed if isinstance(parsed, list) else [parsed]
            for entry in top_level:
                if not isinstance(entry, dict):
                    continue
                found = next((entry.get(key) for key in date_keys if entry.get(key)), None)
                if found:
                    return found
        except Exception:
            continue
    return None

def parse_published_from_meta(soup: BeautifulSoup):
    """Return the stripped content of the first publication-date ``<meta>`` tag.

    Tags are looked up in a fixed priority order (article:published_time
    variants, then pubdate, then date). Returns None when none of them is
    present with a non-empty ``content``.
    """
    lookups = (
        ("property", "article:published_time"),
        ("property", "og:article:published_time"),
        ("name", "article:published_time"),
        ("name", "pubdate"),
        ("name", "date"),
    )
    for attr_name, attr_value in lookups:
        node = soup.find("meta", attrs={attr_name: attr_value})
        content = node.get("content") if node else None
        if content:
            return content.strip()
    return None

def fetch_published_raw_from_article(link: str):
    """Download the article at ``link`` and return its raw publication time text.

    Sources, in order: publication ``<meta>`` tags, JSON-LD, then the first
    dotted date-with-time in the page text, then the first dotted date.
    Returns None when nothing matches or the request/parsing fails.
    """
    try:
        response = requests.get(link, headers=HEADERS, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        structured = parse_published_from_meta(page) or parse_published_from_jsonld(page)
        if structured:
            return structured

        # Last resort: scan the visible text, preferring a date that carries a time.
        body_text = page.get_text(" ", strip=True)
        for pattern in (r"\d{4}\.\d{2}\.\d{2}\.?\s*\d{1,2}:\d{2}", r"\d{4}\.\d{2}\.\d{2}\.?"):
            hit = re.search(pattern, body_text)
            if hit:
                return hit.group(0)
    except Exception:
        pass
    return None

# =========================
# 언론사 추출
# =========================
def parse_publisher_from_jsonld(soup: BeautifulSoup):
    """Return the first publisher name found in the page's JSON-LD scripts.

    Only top-level objects (or the items of a top-level list) are inspected.
    ``publisher`` may be a dict or a list of dicts; the first non-blank
    string ``name`` is returned stripped. Scripts that fail to parse are
    skipped. Returns None when nothing is found.
    """
    for script in soup.select('script[type="application/ld+json"]'):
        try:
            payload = script.get_text(strip=True)
            if not payload:
                continue
            parsed = json.loads(payload)
            for entry in (parsed if isinstance(parsed, list) else [parsed]):
                if not isinstance(entry, dict):
                    continue
                declared = entry.get("publisher")
                # Treat the single-dict and list-of-dicts shapes uniformly.
                if isinstance(declared, dict):
                    publishers = [declared]
                elif isinstance(declared, list):
                    publishers = declared
                else:
                    publishers = []
                for candidate in publishers:
                    if isinstance(candidate, dict):
                        label = candidate.get("name")
                        if isinstance(label, str) and label.strip():
                            return label.strip()
        except Exception:
            continue
    return None

def fetch_publisher_from_article(link: str):
    """Download the article at ``link`` and return its publisher name.

    The JSON-LD publisher name is preferred; ``og:site_name`` is the fallback.

    Args:
        link: Article URL.

    Returns:
        The publisher name, or None when it cannot be determined or the
        request/parsing fails. Never raises.
    """
    try:
        res = requests.get(link, headers=HEADERS, timeout=10)
        res.raise_for_status()

        # requests falls back to ISO-8859-1 for text/* responses that carry no
        # charset in the Content-Type header, which garbles Korean publisher
        # names in res.text. Parse the raw bytes instead and only force an
        # encoding when the server declared one; otherwise BeautifulSoup
        # sniffs it from the document (<meta charset>, BOM, ...).
        content_type = res.headers.get("Content-Type", "")
        declared = res.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(res.content, "html.parser", from_encoding=declared)

        pub = parse_publisher_from_jsonld(soup)
        if pub:
            return pub

        tag = soup.find("meta", attrs={"property": "og:site_name"})
        if tag and tag.get("content"):
            return tag["content"].strip()

        return None
    except Exception:
        # Deliberate best-effort: a failed enrichment just means "unknown".
        return None

# =========================
# 이미지 추출
# =========================
def fetch_og_image(link: str):
    """Download the article at ``link`` and return its social preview image URL.

    ``og:image`` is preferred, ``twitter:image`` is the fallback. Returns
    None when neither is present or the request/parsing fails.
    """
    try:
        response = requests.get(link, headers=HEADERS, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        for attr_name, attr_value in (("property", "og:image"), ("name", "twitter:image")):
            node = page.find("meta", attrs={attr_name: attr_value})
            content = node.get("content") if node else None
            if content:
                return content.strip()
    except Exception:
        pass
    return None

# =========================
# 네이버 뉴스 검색: 키워드별 1개
# =========================
def fetch_one_naver_news(keyword: str):
    """Return the first news result for ``keyword`` from Naver news search.

    Title and link come from the first title anchor on the result page.
    Publication time, publisher and image are read from the surrounding
    result card when available; each missing piece is then looked up on the
    article page itself.

    Returns:
        Dict with the keys title, url, publisher, published_at
        (``YYYY-MM-DD HH:MM`` or None), image_url; or None when the result
        page has no title anchor.

    Raises:
        Whatever ``requests`` raises for the search request itself
        (including HTTP error statuses via ``raise_for_status``).
    """
    search_url = (
        f"https://search.naver.com/search.naver?where=news&sm=tab_jum&query={urllib.parse.quote(keyword)}"
    )

    response = requests.get(search_url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    anchor = page.select_one('a[data-heatmap-target=".tit"]') or page.select_one("a.news_tit")
    if not anchor:
        return None

    headline = anchor.get_text(strip=True)
    href = anchor.get("href")

    # The result card: the news_area div if present, else the nearest div/li ancestor.
    card = anchor.find_parent("div", class_="news_area") or anchor.find_parent(["div", "li"])

    when_raw = None
    outlet = None
    picture = None

    if card:
        info_texts = (
            span.get_text(strip=True)
            for span in card.select("div.info_group span.info, span.info")
        )
        when_raw = next((text for text in info_texts if looks_like_time_or_date(text)), None)

        press_node = card.select_one("a.info.press, span.info.press, a.press, span.press")
        if press_node:
            outlet = press_node.get_text(" ", strip=True)

    # Publication time: card first, article page second.
    if when_raw is None and href:
        when_raw = fetch_published_raw_from_article(href)
    when = normalize_published(when_raw)

    # Publisher: card first, article page second.
    if href and not outlet:
        outlet = fetch_publisher_from_article(href)

    # Image: card thumbnail (lazy-load attributes first), then the article's og:image.
    thumb = card.select_one("img") if card else None
    if thumb:
        picture = (
            thumb.get("data-lazy-src")
            or thumb.get("data-src")
            or thumb.get("src")
            or thumb.get("data-original")
        )
        if picture and picture.startswith("//"):
            picture = "https:" + picture

    if href and not picture:
        picture = fetch_og_image(href)

    return {
        "title": headline,
        "url": href,
        "publisher": outlet,
        "published_at": when,
        "image_url": picture,
    }

# =========================
# 실행: 카테고리 > 키워드 > [news_article row]
# =========================
if __name__ == "__main__":
    # One collection timestamp (KST, minute precision) shared by every row of this run.
    collected_at = format_collected_at(datetime.now(KST))

    # Temporary run_id: one run per category, numbered from 1 in CATEGORIES order.
    run_id_by_category = {}
    rid = 1
    for cat in CATEGORIES.keys():
        run_id_by_category[cat] = rid
        rid += 1

    # Temporary keyword_id: unique across all categories; a repeated keyword
    # keeps the id assigned at its first occurrence.
    keyword_id_by_text = {}
    kid = 1
    for cat, kws in CATEGORIES.items():
        for kw in kws:
            if kw not in keyword_id_by_text:
                keyword_id_by_text[kw] = kid
                kid += 1

    # Output structure: category -> keyword -> [article row, ...]
    grouped = {}
    # Sequential id; it advances once per keyword even when no article was found.
    article_id = 1

    for category, keywords in CATEGORIES.items():
        grouped.setdefault(category, {})
        run_id = run_id_by_category[category]

        for i, kw in enumerate(keywords, start=1):
            # Progress line rewritten in place (carriage return, trailing spaces as padding).
            print(f" 진행 중: [{category}] ({i}/{len(keywords)}) {kw}" + " " * 20, end="\r")

            # Any failure for a keyword is recorded as "no article" (all-None fields)
            # rather than stopping the run.
            item = None
            try:
                item = fetch_one_naver_news(kw)
            except Exception:
                item = None

            row = {
                "article_id": article_id,
                "run_id": run_id,
                "keyword_id": keyword_id_by_text[kw],
                "title": (item.get("title") if item else None),
                "url": (item.get("url") if item else None),
                "publisher": (item.get("publisher") if item else None),
                "published_at": (item.get("published_at") if item else None),
                "image_url": (item.get("image_url") if item else None),
                "collected_at": collected_at
            }

            # Keep only the DB column keys (row already contains exactly these keys).
            row = {k: row[k] for k in [
                "article_id", "run_id", "keyword_id",
                "title", "url", "publisher", "published_at",
                "image_url", "collected_at"
            ]}

            grouped[category].setdefault(kw, []).append(row)

            article_id += 1
            # Pause between keywords.
            time.sleep(0.5)

    # Write the grouped result as UTF-8 JSON without ASCII-escaping the Korean text.
    OUT_FILE = "naver_news_grouped_by_category_keyword.json"
    with open(OUT_FILE, "w", encoding="utf-8") as f:
        json.dump(grouped, f, ensure_ascii=False, indent=4)

    print(f"\n✅ JSON 저장 완료: {OUT_FILE}")

 진행 중: [스포츠] (10/10) 맨 시티 대 뉴캐슬                          
✅ JSON 저장 완료: naver_news_grouped_by_category_keyword.json
